From 9501890b6c738830781eebe5d8bdcff2d6a0068c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 2 May 2020 22:48:20 +0000 Subject: Join test-cases utf-6.93.0 and utf-6.93.1, which MUST give the same answer always for whatever testConstraints. Fix one invalid use of TclUCS4Complete(), and let TclUtfToUCS4() handle (invalid) 4-byte sequences. Test-case cleanup (removal of unnecessary quoting) --- generic/tclEncoding.c | 2 +- generic/tclUtf.c | 2 +- tests/utf.test | 59 ++++++++++++++++++++++++--------------------------- 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 1584de0..5c7aab8 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2341,7 +2341,7 @@ UtfToUtfProc( *dst++ = 0; *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUniChar. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 9ffbfba..160e444 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2360,7 +2360,7 @@ TclUtfToUCS4( len = TclUtfToUniChar(src, &ch); fullchar = ch; -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* 4-byte UTF-8 is supported; decode surrogates */ if ((ch >= 0xD800) && len < 3) { diff --git a/tests/utf.test b/tests/utf.test index 0929801..2bfb9ea 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -496,10 +496,7 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { testutfnext \xA0\xA0\xA0 } 1 -test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} { - testutfnext \x80\x80\x80 -} 1 -test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} { +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { testutfnext \x80\x80\x80 } 1 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { @@ -977,37 +974,37 @@ test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { } a test utf-8.2 {Tcl_UniCharAtIndex: index = 0} { string index \u4E4E\u25A 0 -} "\u4E4E" +} \u4E4E test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { string index abcd 2 } c test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { string index \u4E4E\u25A\xFF\u543 2 -} "\uFF" +} \uFF test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 { string index \uD842 0 -} "\uD842" +} \uD842 test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 { string index \uD842 0 -} "\uD842" +} \uD842 test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 { string index \uD842 0 -} "\uD842" +} \uD842 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { string index \uDC42 0 -} "\uDC42" +} \uDC42 test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 0 -} "\uD83D" +} \uD83D test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 0 -} "\U1F600" +} \U1F600 test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 { string index \uD83D\uDE00G 0 -} "\U1F600" +} \U1F600 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 1 -} "\uDE00" +} \uDE00 test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 1 } G @@ -1025,13 +1022,13 @@ test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} tip389 { } G test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { string index \U1F600G 0 -} "\uFFFD" +} \uFFFD test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 0 -} "\U1F600" +} \U1F600 test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { string index \U1F600G 0 -} "\U1F600" +} \U1F600 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { string index \U1F600G 1 } G @@ -1056,22 +1053,22 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} { } abc test utf-9.2 {Tcl_UtfAtIndex: index > 0} { string range \u4E4E\u25A\xFF\u543klmnop 1 5 -} "\u25A\xFF\u543kl" +} \u25A\xFF\u543kl test utf-9.3.0 {Tcl_UtfAtIndex: index = 0, Emoji} ucs2 { string range \uD83D\uDE00G 0 0 -} "\uD83D" +} \uD83D test utf-9.3.1 {Tcl_UtfAtIndex: index = 0, Emoji} ucs4 { string range \uD83D\uDE00G 0 0 -} "\U1F600" +} \U1F600 test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} tip389 { string range \uD83D\uDE00G 0 0 -} "\U1F600" +} \U1F600 test utf-9.4.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { string range \uD83D\uDE00G 1 1 -} "\uDE00" +} \uDE00 test utf-9.4.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { string range \uD83D\uDE00G 1 1 -} "G" +} G test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 { string range \uD83D\uDE00G 1 1 } {} @@ -1086,19 +1083,19 @@ test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 { } G test utf-9.6.0 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs2} { string range \U1f600G 0 0 -} "\uFFFD" +} \uFFFD test utf-9.6.1 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs4} { string range \U1f600G 0 0 -} "\U1F600" +} \U1F600 test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc tip389} { string range \U1f600G 0 0 -} "\U1F600" +} \U1F600 test utf-9.7.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} { string range \U1f600G 1 1 } G test utf-9.7.1 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs4} { string range \U1f600G 1 1 -} "G" +} G test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc tip389} { string range \U1f600G 1 1 } {} @@ -1182,7 +1179,7 @@ bsCheck \uA 10 bsCheck \340 224 bsCheck \uA1 161 bsCheck \u4E21 20001 -bsCheck \741 225 pre388 ;# == \341 +bsCheck \741 225 pre388 ;# == \341 bsCheck \741 60 !pre388 ;# == \74 1 bsCheck \U 85 bsCheck \Uk 85 @@ -1344,7 +1341,7 @@ test utf-20.2 {[4c591fa487] TclUniCharNcmp/TclUtfNcmp} knownBug { set two [format %c 0x10000] set first [string compare $one $two] string range $one 0 0 - string range $two 0 0 + string range $two 0 0 set second [string compare $one $two] expr {($first == $second) ? "agree" : "disagree"} } agree @@ -1466,9 +1463,9 @@ UniCharCaseCmpTest < a b UniCharCaseCmpTest > b a UniCharCaseCmpTest > B a UniCharCaseCmpTest > aBcB abca -UniCharCaseCmpTest < \uFFFF [format %c 0x10000] ucs4 +UniCharCaseCmpTest < \uFFFF [format %c 0x10000] ucs4 UniCharCaseCmpTest < \uFFFF \U10000 {Uesc ucs4} -UniCharCaseCmpTest > [format %c 0x10000] \uFFFF ucs4 +UniCharCaseCmpTest > [format %c 0x10000] \uFFFF ucs4 UniCharCaseCmpTest > \U10000 \uFFFF {Uesc ucs4} -- cgit v0.12 From c17661c31e3f4fac5a70dd487b4c9b3372ee5e5b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 3 May 2020 22:16:21 +0000 Subject: Re-join utf-6.93.0 and utf-6.93.1 (please disregard comment in previous commit, it was not correct). Perfectionalize TclUtfToUCS4()/TclUCS4Complete() and new (internal) function TclUCS4ToUtf(). They can help preventing bugs regarding splitting/joining surrogates. Used them in a few more places. --- generic/tclBinary.c | 16 ++++++------- generic/tclCmdMZ.c | 9 ++------ generic/tclInt.h | 9 +++++++- generic/tclParse.c | 23 ++++++++----------- generic/tclUtf.c | 66 +++++++++++++++++++++++++++++++++++++++++++---------- tests/utf.test | 5 +++- 6 files changed, 86 insertions(+), 42 deletions(-) diff --git a/generic/tclBinary.c b/generic/tclBinary.c index 6306159..52ef457 100644 --- a/generic/tclBinary.c +++ b/generic/tclBinary.c @@ -1222,11 +1222,11 @@ BinaryFormatCmd( badField: { - Tcl_UniChar ch = 0; - char buf[TCL_UTF_MAX + 1] = ""; + int ch; + char buf[8] = ""; - TclUtfToUniChar(errorString, &ch); - buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; + TclUtfToUCS4(errorString, &ch); + buf[TclUCS4ToUtf(ch, buf)] = '\0'; Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad field specifier \"%s\"", buf)); return TCL_ERROR; @@ -1592,11 +1592,11 @@ BinaryScanCmd( badField: { - Tcl_UniChar ch = 0; - char buf[TCL_UTF_MAX + 1] = ""; + int ch; + char buf[8] = ""; - TclUtfToUniChar(errorString, &ch); - buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; + TclUtfToUCS4(errorString, &ch); + buf[TclUCS4ToUtf(ch, buf)] = '\0'; Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad field specifier \"%s\"", buf)); return TCL_ERROR; diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 162a5a6..011164b 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1413,14 +1413,9 @@ StringIndexCmd( Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { - char buf[TCL_UTF_MAX] = ""; + char buf[8] = ""; - length = Tcl_UniCharToUtf(ch, buf); -#if TCL_UTF_MAX > 3 - if ((ch >= 0xD800) && (length < 3)) { - length += Tcl_UniCharToUtf(-1, buf + length); - } -#endif + length = TclUCS4ToUtf(ch, buf); Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length)); } } diff --git a/generic/tclInt.h b/generic/tclInt.h index 593d878..6f024a6 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3184,8 +3184,15 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +MODULE_SCOPE int TclUCS4ToUtf(int, char *); + +/* + * Bytes F0-F4 are start-bytes for 4-byte sequences. + * Byte 0xED can be the start-byte of an upper surrogate. In that case, + * TclUtfToUCS4() might read the lower surrogate following it too. + */ # define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) + ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length))) MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr, diff --git a/generic/tclParse.c b/generic/tclParse.c index 7beaeea..23a07cf 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -843,7 +843,6 @@ TclParseBackslash( * written there. */ { register const char *p = src+1; - Tcl_UniChar unichar = 0; int result; int count; char buf[TCL_UTF_MAX] = ""; @@ -943,7 +942,7 @@ TclParseBackslash( * No hexdigits -> This is just "U". */ result = 'U'; - } else if ((result | 0x7FF) == 0xDFFF) { + } else if ((result & ~0x7FF) == 0xD800) { /* Upper or lower surrogate, not allowed in this syntax. */ result = 0xFFFD; } @@ -991,16 +990,15 @@ TclParseBackslash( * #217987] test subst-3.2 */ - if (Tcl_UtfCharComplete(p, numBytes - 1)) { - count = TclUtfToUniChar(p, &unichar) + 1; /* +1 for '\' */ + if (TclUCS4Complete(p, numBytes - 1)) { + count = TclUtfToUCS4(p, &result) + 1; /* +1 for '\' */ } else { - char utfBytes[TCL_UTF_MAX]; + char utfBytes[8]; - memcpy(utfBytes, p, (size_t) (numBytes - 1)); + memcpy(utfBytes, p, numBytes - 1); utfBytes[numBytes - 1] = '\0'; - count = TclUtfToUniChar(utfBytes, &unichar) + 1; + count = TclUtfToUCS4(utfBytes, &result) + 1; } - result = unichar; break; } @@ -1008,13 +1006,12 @@ TclParseBackslash( if (readPtr != NULL) { *readPtr = count; } - count = Tcl_UniCharToUtf(result, dst); -#if TCL_UTF_MAX > 3 - if ((result >= 0xD800) && (count < 3)) { - count += Tcl_UniCharToUtf(-1, dst + count); +#if TCL_UTF_MAX < 4 + if (result > 0xFFFF) { + result = 0xFFFD; } #endif - return count; + return TclUCS4ToUtf(result, dst); } /* diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 160e444..a2080dd 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2335,7 +2335,7 @@ TclUniCharMatch( * routine does not run off the end and dereference non-existent memory * looking for trail bytes. If the source buffer is known to be '\0' * terminated, this cannot happen. Otherwise, the caller should call - * Tcl_UtfCharComplete() before calling this routine to ensure that + * TclUCS4Complete() before calling this routine to ensure that * enough bytes remain in the string. * * Results: @@ -2354,26 +2354,68 @@ TclUtfToUCS4( int *ucs4Ptr) /* Filled with the UCS4 codepoint represented * by the UTF-8 string. */ { - int len, fullchar; Tcl_UniChar ch = 0; - - len = TclUtfToUniChar(src, &ch); - fullchar = ch; + int len = Tcl_UtfToUniChar(src, &ch); #if TCL_UTF_MAX <= 4 - /* 4-byte UTF-8 is supported; decode surrogates */ - - if ((ch >= 0xD800) && len < 3) { - len += Tcl_UtfToUniChar(src + len, &ch); - fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; + if ((ch & ~0x3FF) == 0xD800) { + Tcl_UniChar low = ch; + int len2 = Tcl_UtfToUniChar(src+len, &low); + if ((low & ~0x3FF) == 0xDC00) { + *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; + return len + len2; + } } #endif - - *ucs4Ptr = fullchar; + *ucs4Ptr = (int)ch; return len; } /* + *--------------------------------------------------------------------------- + * + * TclUCS4ToUtf -- + * + * Store the given Unicode character as a sequence of UTF-8 bytes in the + * provided buffer. Might output 6 bytes, if the code point > 0xFFFF. + * + * Results: + * The return values is the number of bytes in the buffer that were + * consumed. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +TclUCS4ToUtf( + int ch, /* Unicode character to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the Unicode character is stored. Buffer must be + * large enough to hold the UTF-8 character(s) + * (at most 6 bytes). */ +{ +#if TCL_UTF_MAX <= 4 + if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { + /* Spit out a 4-byte UTF-8 character or 2 x 3-byte UTF-8 characters, depending on Tcl + * version and/or TCL_UTF_MAX build value */ + int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf); + return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x7FF), buf + len); + } +#endif + if ((ch & ~0x7FF) == 0xD800) { + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + return Tcl_UniCharToUtf(ch, buf); +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 diff --git a/tests/utf.test b/tests/utf.test index 2bfb9ea..c0fed6f 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -496,7 +496,10 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { testutfnext \xA0\xA0\xA0 } 1 -test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { +test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} { + testutfnext \x80\x80\x80 +} 1 +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} { testutfnext \x80\x80\x80 } 1 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { -- cgit v0.12