summary | refs | log | tree | commit | diff | stats
path: root/generic
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-01 08:51:09 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-01 08:51:09 (GMT)
commit: 62c00ac54a6f93ad1324d7e7aa5ef43623ca2415 (patch)
tree: 1504e408d8e107d384113d4109a8ffcb37c982cd /generic
parent: 9a8559d4cb683fe7f03d28704ec74cbc08835fb2 (diff)
download: tcl-62c00ac54a6f93ad1324d7e7aa5ef43623ca2415.zip
tcl-62c00ac54a6f93ad1324d7e7aa5ef43623ca2415.tar.gz
tcl-62c00ac54a6f93ad1324d7e7aa5ef43623ca2415.tar.bz2
Fix [ed29806baf] by introducing TclUCS4Complete(). All other calls of Tcl_UtfToUniChar() are suspicious, because they cannot reliably handle 4-byte UTF-8 sequences.
So, there's more work to do, but this part can already be backported to Tcl 8.6 to see where we get.
Diffstat (limited to 'generic')
-rw-r--r-- generic/tclDecls.h 2
-rw-r--r-- generic/tclEncoding.c 14
-rw-r--r-- generic/tclInt.h 5
3 files changed, 12 insertions, 9 deletions
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 4531be3..c713469 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -4181,7 +4181,7 @@ extern const TclStubs *tclStubsPtr;
#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX > 3)
# undef Tcl_UtfCharComplete
# define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \
- ? 4 : tclStubsPtr->tcl_UtfCharComplete((src), (length)))
+ ? ((length) >= 4) : tclStubsPtr->tcl_UtfCharComplete((src), (length)))
#endif
#endif /* _TCLDECLS */
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 4789b7f..422627b 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2300,7 +2300,7 @@ UtfToUtfProc(
const char *srcStart, *srcEnd, *srcClose;
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
- Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ int *chPtr = (int *) statePtr;
if (flags & TCL_ENCODING_START) {
*statePtr = 0;
@@ -2321,7 +2321,7 @@ UtfToUtfProc(
dstEnd = dst + dstLen - TCL_UTF_MAX;
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
- if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+ if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
/*
* If there is more string to follow, this will ensure that the
* last UTF-8 character in the source buffer hasn't been cut off.
@@ -2349,9 +2349,9 @@ UtfToUtfProc(
*dst++ = 0;
src += 2;
- } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
+ } else if (!TclUCS4Complete(src, srcEnd - src)) {
/*
- * Always check before using TclUtfToUniChar. Not doing can so
+ * Always check before using TclUtfToUCS4. Not doing can so
* cause it run beyond the end of the buffer! If we happen such an
* incomplete char its bytes are made to represent themselves.
*/
@@ -2360,11 +2360,11 @@ UtfToUtfProc(
src += 1;
dst += Tcl_UniCharToUtf(*chPtr, dst);
} else {
- src += TclUtfToUniChar(src, chPtr);
+ src += TclUtfToUCS4(src, chPtr);
if ((*chPtr | 0x7FF) == 0xDFFF) {
/* A surrogate character is detected, handle especially */
- Tcl_UniChar low = *chPtr;
- size_t len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0;
+ int low = *chPtr;
+ size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low | 0x3FF) != 0xDFFF) || (*chPtr & 0x400)) {
*dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
*dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 2ff644e..5f660e3 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3252,8 +3252,11 @@ MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int TclUtfCount(int ch);
#if TCL_UTF_MAX > 3
# define TclUtfToUCS4 Tcl_UtfToUniChar
+# define TclUCS4Complete Tcl_UtfCharComplete
#else
- MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+ MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \
+ ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length)))
#endif
MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr);