From 2473a591bfbd5b346e1900e3c1088496b0d17590 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 18 Feb 2019 20:48:59 +0000 Subject: Proposed fix for [bd94500678]: SEGFAULT by conversion of unicode (out of BMP) to byte-array. --- generic/tclCmdMZ.c | 8 +++--- generic/tclEncoding.c | 8 +++--- generic/tclScan.c | 4 +-- generic/tclUtf.c | 75 ++++++++++++++++++++++++++------------------------- 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index dac82b8..c17c4f1 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1221,8 +1221,8 @@ Tcl_SplitObjCmd( fullchar = ch; #if TCL_UTF_MAX <= 4 - if (!len) { - len += TclUtfToUniChar(stringPtr, &ch); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(stringPtr + len, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif @@ -1854,8 +1854,8 @@ StringIsCmd( length2 = TclUtfToUniChar(string1, &ch); fullchar = ch; #if TCL_UTF_MAX <= 4 - if (!length2) { - length2 = TclUtfToUniChar(string1, &ch); + if ((length2 == 1) && ((ch & 0xFC00) == 0xD800)) { + length2 += TclUtfToUniChar(string1 + length2, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index e601c3a..b5517bc 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2384,8 +2384,8 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(*chPtr, dst); #if TCL_UTF_MAX <= 4 - if (!len) { - src += TclUtfToUniChar(src, chPtr); + if ((len == 1) && ((*chPtr & 0xFC00) == 0xD800)) { + src += TclUtfToUniChar(src + len, chPtr); dst += Tcl_UniCharToUtf(*chPtr, dst); } #endif @@ -3006,7 +3006,7 @@ Iso88591FromUtfProc( if (ch > 0xff #if TCL_UTF_MAX <= 4 - || !len + || ((len == 1) && ((ch & 0xFC00) == 0xD800)) #endif ) { if (flags & TCL_ENCODING_STOPONERROR) { @@ -3014,7 +3014,7 @@ Iso88591FromUtfProc( break; } #if TCL_UTF_MAX <= 4 - if (!len) len = 4; + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) len = 4; #endif /* * Plunge on, using '?' as a fallback character. diff --git a/generic/tclScan.c b/generic/tclScan.c index fbfba2d..45035f1 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -882,8 +882,8 @@ Tcl_ScanObjCmd( offset = TclUtfToUniChar(string, &sch); i = (int)sch; #if TCL_UTF_MAX == 4 - if (!offset) { - offset = TclUtfToUniChar(string, &sch); + if ((offset == 1) && ((sch & 0xFC00) == 0xD800)) { + offset += TclUtfToUniChar(string+offset, &sch); i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); } #endif diff --git a/generic/tclUtf.c b/generic/tclUtf.c index ce67db7..2227d45 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -312,6 +312,20 @@ Tcl_UtfToUniChar( * characters representing themselves. */ +#if TCL_UTF_MAX <= 4 + /* If *chPtr contains a high surrogate (produced by a previous + * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation + * bytes, then we must produce a follow-up low surrogate. We only + * do that if the high surrogate matches the bits we encounter. + */ + if ((byte >= 0x80) + && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) + && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) + && ((src[2] & 0xC0) == 0x80)) { + *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; + return 3; + } +#endif if ((unsigned)(byte-0x80) < (unsigned) 0x20) { *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; } else { @@ -358,21 +372,14 @@ Tcl_UtfToUniChar( * Four-byte-character lead byte followed by three trail bytes. */ #if TCL_UTF_MAX <= 4 - Tcl_UniChar surrogate; - - byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; - surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); - if (byte & 0x100000) { + byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + | ((src[2] & 0x3F) >> 4)) - 0x40; + if ((unsigned) byte >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ - } else if (*chPtr != surrogate) { - /* produce high surrogate, but don't advance source pointer */ - *chPtr = surrogate; - return 0; } else { - /* produce low surrogate, and advance source pointer */ - *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); - return 4; + /* produce high surrogate, advance source pointer */ + *chPtr = 0xD800 + byte; + return 1; } #else *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) @@ -582,8 +589,8 @@ Tcl_UtfFindFirst( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX <= 4 - if (!len) { - len += TclUtfToUniChar(src, &find); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif @@ -630,8 +637,8 @@ Tcl_UtfFindLast( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX <= 4 - if (!len) { - len += TclUtfToUniChar(src, &find); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif @@ -673,8 +680,8 @@ Tcl_UtfNext( int len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX <= 4 - if (len == 0) { - len = TclUtfToUniChar(src, &ch); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(src + len, &ch); } #endif return src + len; @@ -755,7 +762,7 @@ Tcl_UniCharAtIndex( Tcl_UniChar ch = 0; int fullchar = 0; #if TCL_UTF_MAX <= 4 - int len = 1; + int len = 0; #endif while (index-- >= 0) { @@ -767,9 +774,9 @@ Tcl_UniCharAtIndex( } fullchar = ch; #if TCL_UTF_MAX <= 4 - if (!len) { + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { /* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */ - (void)TclUtfToUniChar(src, &ch); + (void)TclUtfToUniChar(src + len, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif @@ -801,14 +808,14 @@ Tcl_UtfAtIndex( register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; - int len = 1; + int len = 0; while (index-- > 0) { len = TclUtfToUniChar(src, &ch); src += len; } #if TCL_UTF_MAX <= 4 - if (!len) { + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { /* Index points at character following High Surrogate */ src += TclUtfToUniChar(src, &ch); } @@ -905,9 +912,8 @@ Tcl_UtfToUpper( bytes = TclUtfToUniChar(src, &ch); upChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -968,9 +974,8 @@ Tcl_UtfToLower( bytes = TclUtfToUniChar(src, &ch); lowChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -1034,9 +1039,8 @@ Tcl_UtfToTitle( bytes = TclUtfToUniChar(src, &ch); titleChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -1055,9 +1059,8 @@ Tcl_UtfToTitle( bytes = TclUtfToUniChar(src, &ch); lowChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } -- cgit v0.12 From 9589c85462da7e8d01fe0154de892c6d30d92f0d Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 19 Feb 2019 19:38:10 +0000 Subject: Minor optimizations --- generic/tclCmdMZ.c | 20 ++++----- generic/tclEncoding.c | 14 +++--- generic/tclExecute.c | 4 +- generic/tclParse.c | 6 +-- generic/tclScan.c | 2 +- generic/tclStringObj.c | 8 ++-- generic/tclStubInit.c | 4 +- generic/tclUtf.c | 113 +++++++++++++++++++++++++------------------------ generic/tclZipfs.c | 22 +--------- win/tclWin32Dll.c | 8 +++- 10 files changed, 95 insertions(+), 106 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index c17c4f1..a289a5c 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1221,7 +1221,7 @@ Tcl_SplitObjCmd( fullchar = ch; #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + if ((ch >= 0xD800) && (len < 3)) { len += TclUtfToUniChar(stringPtr + len, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -1447,8 +1447,8 @@ StringIndexCmd( char buf[4]; length = Tcl_UniCharToUtf(ch, buf); - if (!length) { - length = Tcl_UniCharToUtf(-1, buf); + if ((ch >= 0xD800) && (length < 3)) { + length = Tcl_UniCharToUtf(-1, buf + length); } Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length)); } @@ -1854,7 +1854,7 @@ StringIsCmd( length2 = TclUtfToUniChar(string1, &ch); fullchar = ch; #if TCL_UTF_MAX <= 4 - if ((length2 == 1) && ((ch & 0xFC00) == 0xD800)) { + if ((ch >= 0xD800) && (length2 < 3)) { length2 += TclUtfToUniChar(string1 + length2, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -1935,7 +1935,7 @@ StringMapCmd( const char *string = TclGetStringFromObj(objv[1], &length2); if ((length2 > 1) && - strncmp(string, "-nocase", (size_t) length2) == 0) { + strncmp(string, "-nocase", length2) == 0) { nocase = 1; } else { Tcl_SetObjResult(interp, Tcl_ObjPrintf( @@ -2203,7 +2203,7 @@ StringMatchCmd( const char *string = TclGetStringFromObj(objv[1], &length); if ((length > 1) && - strncmp(string, "-nocase", (size_t) length) == 0) { + strncmp(string, "-nocase", length) == 0) { nocase = TCL_MATCH_NOCASE; } else { Tcl_SetObjResult(interp, Tcl_ObjPrintf( @@ -2605,10 +2605,10 @@ StringEqualCmd( for (i = 1; i < objc-2; i++) { string2 = TclGetStringFromObj(objv[i], &length); - if ((length > 1) && !strncmp(string2, "-nocase", (size_t)length)) { + if ((length > 1) && !strncmp(string2, "-nocase", length)) { nocase = 1; } else if ((length > 1) - && !strncmp(string2, "-length", (size_t)length)) { + && !strncmp(string2, "-length", length)) { if (i+1 >= objc-2) { goto str_cmp_args; } @@ -2703,10 +2703,10 @@ TclStringCmpOpts( for (i = 1; i < objc-2; i++) { string = TclGetStringFromObj(objv[i], &length); - if ((length > 1) && !strncmp(string, "-nocase", (size_t)length)) { + if ((length > 1) && !strncmp(string, "-nocase", length)) { *nocase = 1; } else if ((length > 1) - && !strncmp(string, "-length", (size_t)length)) { + && !strncmp(string, "-length", length)) { if (i+1 >= objc-2) { goto str_cmp_args; } diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index b5517bc..08a0de3 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2167,7 +2167,7 @@ BinaryProc( *srcReadPtr = srcLen; *dstWrotePtr = srcLen; *dstCharsPtr = srcLen; - memcpy(dst, src, (size_t) srcLen); + memcpy(dst, src, srcLen); return result; } @@ -2384,7 +2384,7 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(*chPtr, dst); #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((*chPtr & 0xFC00) == 0xD800)) { + if ((*chPtr >= 0xD800) && (len < 3)) { src += TclUtfToUniChar(src + len, chPtr); dst += Tcl_UniCharToUtf(*chPtr, dst); } @@ -3006,7 +3006,7 @@ Iso88591FromUtfProc( if (ch > 0xff #if TCL_UTF_MAX <= 4 - || ((len == 1) && ((ch & 0xFC00) == 0xD800)) + || ((ch >= 0xD800) && (len < 3)) #endif ) { if (flags & TCL_ENCODING_STOPONERROR) { @@ -3014,7 +3014,7 @@ Iso88591FromUtfProc( break; } #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((ch & 0xFC00) == 0xD800)) len = 4; + if ((ch >= 0xD800) && (len < 3)) len = 4; #endif /* * Plunge on, using '?' as a fallback character. @@ -3364,7 +3364,7 @@ EscapeFromUtfProc( *dstWrotePtr = 0; return TCL_CONVERT_NOSPACE; } - memcpy(dst, dataPtr->init, (size_t)dataPtr->initLen); + memcpy(dst, dataPtr->init, dataPtr->initLen); dst += dataPtr->initLen; } else { state = PTR2INT(*statePtr); @@ -3443,7 +3443,7 @@ EscapeFromUtfProc( break; } memcpy(dst, subTablePtr->sequence, - (size_t) subTablePtr->sequenceLen); + subTablePtr->sequenceLen); dst += subTablePtr->sequenceLen; } } @@ -3486,7 +3486,7 @@ EscapeFromUtfProc( memcpy(dst, dataPtr->subTables[0].sequence, len); dst += len; } - memcpy(dst, dataPtr->final, (size_t) dataPtr->finalLen); + memcpy(dst, dataPtr->final, dataPtr->finalLen); dst += dataPtr->finalLen; state &= ~TCL_ENCODING_END; } diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 17ad0bb..3ae5571 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5227,8 +5227,8 @@ TEBCresume( objResultPtr = Tcl_NewObj(); } else { length = Tcl_UniCharToUtf(ch, buf); - if (!length) { - length = Tcl_UniCharToUtf(-1, buf); + if ((ch >= 0xD800) && (length < 3)) { + length = Tcl_UniCharToUtf(-1, buf + length); } objResultPtr = Tcl_NewStringObj(buf, length); } diff --git a/generic/tclParse.c b/generic/tclParse.c index ccb648c..8d07f7f 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -939,9 +939,9 @@ TclParseBackslash( *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); - if (!count) { - /* Special case for handling upper surrogates. */ - count = Tcl_UniCharToUtf(-1, dst); + if ((result >= 0xD800) && (count < 3)) { + /* Special case for handling high surrogates. */ + count += Tcl_UniCharToUtf(-1, dst + count); } return count; } diff --git a/generic/tclScan.c b/generic/tclScan.c index 45035f1..21ad953 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -882,7 +882,7 @@ Tcl_ScanObjCmd( offset = TclUtfToUniChar(string, &sch); i = (int)sch; #if TCL_UTF_MAX == 4 - if ((offset == 1) && ((sch & 0xFC00) == 0xD800)) { + if (((sch & 0xFC00) == 0xD800) && (offset < 3)) { offset += TclUtfToUniChar(string+offset, &sch); i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); } diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 72ca7cd..1ef7b9a 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -2048,9 +2048,9 @@ Tcl_AppendFormatToObj( goto error; } length = Tcl_UniCharToUtf(code, buf); - if (!length) { - /* Special case for handling upper surrogates. */ - length = Tcl_UniCharToUtf(-1, buf); + if ((code >= 0xD800) && (length < 3)) { + /* Special case for handling high surrogates. */ + length += Tcl_UniCharToUtf(-1, buf + length); } segment = Tcl_NewStringObj(buf, length); Tcl_IncrRefCount(segment); @@ -4287,7 +4287,7 @@ ExtendStringRepWithUnicode( copyBytes: dst = objPtr->bytes + origLength; for (i = 0; i < numChars; i++) { - dst += Tcl_UniCharToUtf((int) unicode[i], dst); + dst += Tcl_UniCharToUtf(unicode[i], dst); } *dst = '\0'; objPtr->length = dst - objPtr->bytes; diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 8429a2f..66bb305 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -333,7 +333,7 @@ Tcl_WinTCharToUtf( wEnd = (wchar_t *)string + len; for (w = (wchar_t *)string; w < wEnd; ) { if (!blen && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling upper surrogates. */ + /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } blen = Tcl_UniCharToUtf(*w, p); @@ -341,7 +341,7 @@ Tcl_WinTCharToUtf( w++; } if (!blen) { - /* Special case for handling upper surrogates. */ + /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 2227d45..6b63ecb 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -147,17 +147,17 @@ Tcl_UniCharToUtf( /* Low surrogate */ if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xCF) == 0)) { - /* Previous Tcl_UniChar was a High surrogate, so combine */ + /* Previous Tcl_UniChar was a high surrogate, so combine */ buf[3] = (char) ((ch & 0x3F) | 0x80); buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); return 4; } - /* Previous Tcl_UniChar was not a High surrogate, so just output */ + /* Previous Tcl_UniChar was not a high surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; /* Fill buffer with specific 3-byte (invalid) byte combination, - so following Low surrogate can recognize it and combine */ + so following low surrogate can recognize it and combine */ buf[2] = (char) ((ch << 4) & 0x30); buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); @@ -232,15 +232,18 @@ Tcl_UniCharToUtfDString( wEnd = uniStr + uniLength; for (w = uniStr; w < wEnd; ) { if (!len && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling upper surrogates. */ + /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } len = Tcl_UniCharToUtf(*w, p); p += len; + if ((*w >= 0xD800) && (len < 3)) { + len = 0; /* Indication that high surrogate was found */ + } w++; } if (!len) { - /* Special case for handling upper surrogates. */ + /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); @@ -296,7 +299,7 @@ Tcl_UtfToUniChar( register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { - register int byte; + Tcl_UniChar byte; /* * Unroll 1 to 3 (or 4) byte UTF-8 sequences. @@ -326,10 +329,10 @@ Tcl_UtfToUniChar( return 3; } #endif - if ((unsigned)(byte-0x80) < (unsigned) 0x20) { - *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; + if (byte-0x80 < 0x20) { + *chPtr = cp1252[byte-0x80]; } else { - *chPtr = (Tcl_UniChar) byte; + *chPtr = byte; } return 1; } else if (byte < 0xE0) { @@ -338,7 +341,7 @@ Tcl_UtfToUniChar( * Two-byte-character lead-byte followed by a trail-byte. */ - *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { return 2; } @@ -354,7 +357,7 @@ Tcl_UtfToUniChar( * Three-byte-character lead byte followed by two trail bytes. */ - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + *chPtr = (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); if (*chPtr > 0x7FF) { return 3; @@ -374,7 +377,7 @@ Tcl_UtfToUniChar( #if TCL_UTF_MAX <= 4 byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; - if ((unsigned) byte >= 0x400) { + if (byte >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ } else { /* produce high surrogate, advance source pointer */ @@ -382,9 +385,9 @@ Tcl_UtfToUniChar( return 1; } #else - *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) + *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { + if ((*chPtr - 0x10000) <= 0xFFFFF) { return 4; } #endif @@ -396,7 +399,7 @@ Tcl_UtfToUniChar( */ } - *chPtr = (Tcl_UniChar) byte; + *chPtr = byte; return 1; } @@ -457,8 +460,8 @@ Tcl_UtfToUniCharDString( while (p < end) { if (Tcl_UtfCharComplete(p, end-p)) { p += TclUtfToUniChar(p, &ch); - } else if ((unsigned)((UCHAR(*p)-0x80)) < (unsigned) 0x20) { - ch = (Tcl_UniChar) cp1252[UCHAR(*p++)-0x80]; + } else if (((UCHAR(*p)-0x80)) < 0x20) { + ch = cp1252[UCHAR(*p++)-0x80]; } else { ch = UCHAR(*p++); } @@ -589,7 +592,7 @@ Tcl_UtfFindFirst( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + if ((ch >= 0xD800) && (len < 3)) { len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } @@ -637,7 +640,7 @@ Tcl_UtfFindLast( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + if ((ch >= 0xD800) && (len < 3)) { len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } @@ -680,8 +683,8 @@ Tcl_UtfNext( int len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { - len += TclUtfToUniChar(src + len, &ch); + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); } #endif return src + len; @@ -774,8 +777,8 @@ Tcl_UniCharAtIndex( } fullchar = ch; #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { - /* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */ + if ((ch >= 0xD800) && (len < 3)) { + /* If last Tcl_UniChar was an high surrogate, combine with low surrogate */ (void)TclUtfToUniChar(src + len, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -815,7 +818,7 @@ Tcl_UtfAtIndex( src += len; } #if TCL_UTF_MAX <= 4 - if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + if ((ch >= 0xD800) && (len < 3)) { /* Index points at character following High Surrogate */ src += TclUtfToUniChar(src, &ch); } @@ -901,7 +904,7 @@ Tcl_UtfToUpper( Tcl_UniChar ch = 0; int upChar; char *src, *dst; - int bytes; + int len; /* * Iterate over the string until we hit the terminating null. @@ -909,11 +912,11 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); upChar = ch; #if TCL_UTF_MAX <= 4 - if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { - bytes += TclUtfToUniChar(src + bytes, &ch); + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); /* Combine surrogates */ upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -926,13 +929,13 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if ((bytes < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if ((len < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(upChar, dst); } - src += bytes; + src += len; } *dst = '\0'; return (dst - str); @@ -963,7 +966,7 @@ Tcl_UtfToLower( Tcl_UniChar ch = 0; int lowChar; char *src, *dst; - int bytes; + int len; /* * Iterate over the string until we hit the terminating null. @@ -971,11 +974,11 @@ Tcl_UtfToLower( src = dst = str; while (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); lowChar = ch; #if TCL_UTF_MAX <= 4 - if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { - bytes += TclUtfToUniChar(src + bytes, &ch); + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); /* Combine surrogates */ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -988,13 +991,13 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } - src += bytes; + src += len; } *dst = '\0'; return (dst - str); @@ -1026,7 +1029,7 @@ Tcl_UtfToTitle( Tcl_UniChar ch = 0; int titleChar, lowChar; char *src, *dst; - int bytes; + int len; /* * Capitalize the first character and then lowercase the rest of the @@ -1036,31 +1039,31 @@ Tcl_UtfToTitle( src = dst = str; if (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); titleChar = ch; #if TCL_UTF_MAX <= 4 - if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { - bytes += TclUtfToUniChar(src + bytes, &ch); + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); /* Combine surrogates */ titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif titleChar = Tcl_UniCharToTitle(titleChar); - if ((bytes < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if ((len < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(titleChar, dst); } - src += bytes; + src += len; } while (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); lowChar = ch; #if TCL_UTF_MAX <= 4 - if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { - bytes += TclUtfToUniChar(src + bytes, &ch); + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); /* Combine surrogates */ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -1070,13 +1073,13 @@ Tcl_UtfToTitle( lowChar = Tcl_UniCharToLower(lowChar); } - if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } - src += bytes; + src += len; } *dst = '\0'; return (dst - str); diff --git a/generic/tclZipfs.c b/generic/tclZipfs.c index d02a2da..64a12a3 100644 --- a/generic/tclZipfs.c +++ b/generic/tclZipfs.c @@ -3155,21 +3155,6 @@ ZipFSListObjCmd( #ifdef _WIN32 #define LIBRARY_SIZE 64 - -static inline int -WCharToUtf( - const WCHAR *wSrc, - char *dst) -{ - char *start = dst; - - while (*wSrc != '\0') { - dst += Tcl_UniCharToUtf(*wSrc, dst); - wSrc++; - } - *dst = '\0'; - return (int) (dst - start); -} #endif /* _WIN32 */ Tcl_Obj * @@ -3213,11 +3198,8 @@ TclZipfs_TclLibrary(void) #if defined(_WIN32) hModule = TclWinGetTclInstance(); - if (GetModuleFileNameW(hModule, wName, MAX_PATH) == 0) { - GetModuleFileNameA(hModule, dllName, MAX_PATH); - } else { - WCharToUtf(wName, dllName); - } + GetModuleFileNameW(hModule, wName, MAX_PATH); + WideCharToMultiByte(CP_UTF8, 0, wName, -1, dllName, sizeof(dllName), NULL, NULL); if (ZipfsAppHookFindTclInit(dllName) == TCL_OK) { return Tcl_NewStringObj(zipfs_literal_tcl_library, -1); diff --git a/win/tclWin32Dll.c b/win/tclWin32Dll.c index ddfa0d6..c39d2c1 100644 --- a/win/tclWin32Dll.c +++ b/win/tclWin32Dll.c @@ -567,15 +567,19 @@ Tcl_WinTCharToUtf( wEnd = (TCHAR *)string + len; for (w = (TCHAR *)string; w < wEnd; ) { if (!blen && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling upper surrogates. */ + /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } blen = Tcl_UniCharToUtf(*w, p); p += blen; + if ((*w >= 0xD800) && (blen < 3)) { + /* Indication that high surrogate is handled */ + blen = 0; + } w++; } if (!blen) { - /* Special case for handling upper surrogates. */ + /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); -- cgit v0.12 From 57d9952ece8f81fc6802097bace965a196bb849b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 25 Feb 2019 21:10:33 +0000 Subject: Finish complete fix, all corner-cases correct now. Also spurious UTF-8 testcase failure (as seen on travis) fixed now. --- generic/tclBinary.c | 4 ++-- generic/tclCmdMZ.c | 4 ++-- generic/tclCompCmdsSZ.c | 2 +- generic/tclCompile.c | 4 ++-- generic/tclExecute.c | 4 ++-- generic/tclParse.c | 4 ++-- generic/tclScan.c | 2 +- generic/tclUtf.c | 38 ++++++++++++++++++++------------------ generic/tclUtil.c | 2 +- 9 files changed, 33 insertions(+), 31 deletions(-) diff --git a/generic/tclBinary.c b/generic/tclBinary.c index 677213e..3590af4 100644 --- a/generic/tclBinary.c +++ b/generic/tclBinary.c @@ -1354,7 +1354,7 @@ BinaryFormatCmd( badField: { Tcl_UniChar ch = 0; - char buf[TCL_UTF_MAX + 1]; + char buf[TCL_UTF_MAX + 1] = ""; TclUtfToUniChar(errorString, &ch); buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; @@ -1724,7 +1724,7 @@ BinaryScanCmd( badField: { Tcl_UniChar ch = 0; - char buf[TCL_UTF_MAX + 1]; + char buf[TCL_UTF_MAX + 1] = ""; TclUtfToUniChar(errorString, &ch); buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index a289a5c..38689fd 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1444,11 +1444,11 @@ StringIndexCmd( Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { - char buf[4]; + char buf[TCL_UTF_MAX] = ""; length = Tcl_UniCharToUtf(ch, buf); if ((ch >= 0xD800) && (length < 3)) { - length = Tcl_UniCharToUtf(-1, buf + length); + length += Tcl_UniCharToUtf(-1, buf + length); } Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length)); } diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c index daab0d5..b97121e 100644 --- a/generic/tclCompCmdsSZ.c +++ b/generic/tclCompCmdsSZ.c @@ -1502,7 +1502,7 @@ TclSubstCompile( for (endTokenPtr = tokenPtr + parse.numTokens; tokenPtr < endTokenPtr; tokenPtr = TokenAfter(tokenPtr)) { int length, literal, catchRange, breakJump; - char buf[TCL_UTF_MAX]; + char buf[TCL_UTF_MAX] = ""; JumpFixup startFixup, okFixup, returnFixup, breakFixup; JumpFixup continueFixup, otherFixup, endFixup; diff --git a/generic/tclCompile.c b/generic/tclCompile.c index f6e6b81..d940ff7 100644 --- a/generic/tclCompile.c +++ b/generic/tclCompile.c @@ -1744,7 +1744,7 @@ TclWordKnownAtCompileTime( case TCL_TOKEN_BS: if (tempPtr != NULL) { - char utfBuf[TCL_UTF_MAX]; + char utfBuf[TCL_UTF_MAX] = ""; int length = TclParseBackslash(tokenPtr->start, tokenPtr->size, NULL, utfBuf); @@ -2358,7 +2358,7 @@ TclCompileTokens( { Tcl_DString textBuffer; /* Holds concatenated chars from adjacent * TCL_TOKEN_TEXT, TCL_TOKEN_BS tokens. */ - char buffer[TCL_UTF_MAX]; + char buffer[TCL_UTF_MAX] = ""; int i, numObjsToConcat, length, adjust; unsigned char *entryCodeNext = envPtr->codeNext; #define NUM_STATIC_POS 20 diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 3ae5571..78012f0 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5215,7 +5215,7 @@ TEBCresume( objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { - char buf[4]; + char buf[TCL_UTF_MAX] = ""; int ch = Tcl_GetUniChar(valuePtr, index); /* @@ -5228,7 +5228,7 @@ TEBCresume( } else { length = Tcl_UniCharToUtf(ch, buf); if ((ch >= 0xD800) && (length < 3)) { - length = Tcl_UniCharToUtf(-1, buf + length); + length += Tcl_UniCharToUtf(-1, buf + length); } objResultPtr = Tcl_NewStringObj(buf, length); } diff --git a/generic/tclParse.c b/generic/tclParse.c index 8d07f7f..c791585 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -791,7 +791,7 @@ TclParseBackslash( Tcl_UniChar unichar = 0; int result; int count; - char buf[TCL_UTF_MAX]; + char buf[TCL_UTF_MAX] = ""; if (numBytes == 0) { if (readPtr != NULL) { @@ -2151,7 +2151,7 @@ TclSubstTokens( Tcl_Obj *appendObj = NULL; const char *append = NULL; int appendByteLength = 0; - char utfCharBytes[TCL_UTF_MAX]; + char utfCharBytes[TCL_UTF_MAX] = ""; switch (tokenPtr->type) { case TCL_TOKEN_TEXT: diff --git a/generic/tclScan.c b/generic/tclScan.c index 21ad953..acf1a58 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -261,7 +261,7 @@ ValidateFormat( Tcl_UniChar ch = 0; int objIndex, xpgSize, nspace = numVars; int *nassign = TclStackAlloc(interp, nspace * sizeof(int)); - char buf[TCL_UTF_MAX+1]; + char buf[TCL_UTF_MAX+1] = ""; Tcl_Obj *errorMsg; /* Place to build an error messages. Note that * these are messy operations because we do * not want to use the formatting engine; diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6b63ecb..67c0b08 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -145,12 +145,11 @@ Tcl_UniCharToUtf( if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) - && ((buf[2] & 0xCF) == 0)) { + if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { /* Previous Tcl_UniChar was a high surrogate, so combine */ - buf[3] = (char) ((ch & 0x3F) | 0x80); - buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); - return 4; + buf[2] = (char) ((ch & 0x3F) | 0x80); + buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 3; } /* Previous Tcl_UniChar was not a high surrogate, so just output */ } else { @@ -161,7 +160,7 @@ Tcl_UniCharToUtf( buf[2] = (char) ((ch << 4) & 0x30); buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); - return 0; + return 1; } } goto three; @@ -174,11 +173,14 @@ Tcl_UniCharToUtf( return 4; } } else if (ch == -1) { - if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) - && ((buf[2] & 0xCF) == 0)) { - ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2) - + ((buf[2] & 0x30) >> 4); - goto three; + if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0) + && ((buf[-1] & 0xF8) == 0xF0)) { + ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2) + + ((buf[1] & 0x30) >> 4); + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[-1] = (char) ((ch >> 12) | 0xE0); + return 2; } } @@ -302,7 +304,7 @@ Tcl_UtfToUniChar( Tcl_UniChar byte; /* - * Unroll 1 to 3 (or 4) byte UTF-8 sequences. + * Unroll 1 to 4 byte UTF-8 sequences. */ byte = *((unsigned char *) src); @@ -375,13 +377,13 @@ Tcl_UtfToUniChar( * Four-byte-character lead byte followed by three trail bytes. */ #if TCL_UTF_MAX <= 4 - byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; - if (byte >= 0x400) { + if (high >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ } else { /* produce high surrogate, advance source pointer */ - *chPtr = 0xD800 + byte; + *chPtr = 0xD800 + high; return 1; } #else @@ -778,8 +780,8 @@ Tcl_UniCharAtIndex( fullchar = ch; #if TCL_UTF_MAX <= 4 if ((ch >= 0xD800) && (len < 3)) { - /* If last Tcl_UniChar was an high surrogate, combine with low surrogate */ - (void)TclUtfToUniChar(src + len, &ch); + /* If last Tcl_UniChar was a high surrogate, combine with low surrogate */ + (void)TclUtfToUniChar(src, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif @@ -819,7 +821,7 @@ Tcl_UtfAtIndex( } #if TCL_UTF_MAX <= 4 if ((ch >= 0xD800) && (len < 3)) { - /* Index points at character following High Surrogate */ + /* Index points at character following high Surrogate */ src += TclUtfToUniChar(src, &ch); } #endif diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 3d4298e..4590e8f 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1654,7 +1654,7 @@ Tcl_Backslash( int *readPtr) /* Fill in with number of characters read from * src, unless NULL. */ { - char buf[TCL_UTF_MAX]; + char buf[TCL_UTF_MAX] = ""; Tcl_UniChar ch = 0; Tcl_UtfBackslash(src, readPtr, buf); -- cgit v0.12