diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-04-12 09:26:27 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-04-12 09:26:27 (GMT) |
commit | 7deb6f2082391e5fdb40740e2dd00a14b2e325bb (patch) | |
tree | a37981a71cf24a87295e9901a8793f281816c3a4 | |
parent | c318d73bd33bd73fbdf12d1fb7fb5a53a5675082 (diff) | |
parent | b1e4f0fa23663a2ced0099c6d2d0bdb4ff286523 (diff) | |
download | tcl-7deb6f2082391e5fdb40740e2dd00a14b2e325bb.zip tcl-7deb6f2082391e5fdb40740e2dd00a14b2e325bb.tar.gz tcl-7deb6f2082391e5fdb40740e2dd00a14b2e325bb.tar.bz2 |
Fix [7f1162a867]. Don't compile "string cat" any more, since there still is a bug in it, demonstrated by utf-1.18 testcase
-rw-r--r-- | generic/tclCmdMZ.c | 2 | ||||
-rw-r--r-- | generic/tclStringObj.c | 43 | ||||
-rw-r--r-- | tests/append.test | 29 | ||||
-rw-r--r-- | tests/format.test | 6 | ||||
-rw-r--r-- | tests/string.test | 12 | ||||
-rw-r--r-- | tests/utf.test | 57 |
6 files changed, 123 insertions, 26 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d020a93..7d84c86 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -3307,7 +3307,7 @@ TclInitStringCmd( { static const EnsembleImplMap stringImplMap[] = { {"bytelength", StringBytesCmd, TclCompileBasic1ArgCmd, NULL, NULL, 0}, - {"cat", StringCatCmd, TclCompileStringCatCmd, NULL, NULL, 0}, + {"cat", StringCatCmd, NULL/*TclCompileStringCatCmd*/, NULL, NULL, 0}, {"compare", StringCmpCmd, TclCompileStringCmpCmd, NULL, NULL, 0}, {"equal", StringEqualCmd, TclCompileStringEqualCmd, NULL, NULL, 0}, {"first", StringFirstCmd, TclCompileStringFirstCmd, NULL, NULL, 0}, diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index b557af0..78a47e3 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -70,6 +70,11 @@ static void SetUnicodeObj(Tcl_Obj *objPtr, static int UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); +#define ISCONTINUATION(bytes) ((bytes) \ + && ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \ + && (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80)))) + + /* * The structure below defines the string Tcl object type by means of * functions that can be invoked by generic object code. @@ -1218,6 +1223,11 @@ Tcl_AppendLimitedToObj( SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); + /* If appended string starts with a continuation byte or a lower surrogate, + * force objPtr to unicode representation. See [7f1162a867] */ + if (ISCONTINUATION(bytes)) { + Tcl_GetUnicode(objPtr); + } if (stringPtr->hasUnicode && stringPtr->numChars > 0) { AppendUtfToUnicodeRep(objPtr, bytes, toCopy); } else { @@ -1415,6 +1425,12 @@ Tcl_AppendObjToObj( SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); + /* If appended string starts with a continuation byte or a lower surrogate, + * force objPtr to unicode representation. See [7f1162a867] + * This fixes append-3.4, append-3.7 and utf-1.18 testcases. */ + if (ISCONTINUATION(appendObjPtr->bytes)) { + Tcl_GetUnicode(objPtr); + } /* * If objPtr has a valid Unicode rep, then get a Unicode string from * appendObjPtr and append it. @@ -3032,7 +3048,7 @@ TclStringCat( { Tcl_Obj *objResultPtr, * const *ov; int oc, length = 0, binary = 1; - int allowUniChar = 1, requestUniChar = 0; + int allowUniChar = 1, requestUniChar = 0, forceUniChar = 0; int first = objc - 1; /* Index of first value possibly not empty */ int last = 0; /* Index of last value possibly not empty */ int inPlace = flags & TCL_STRING_IN_PLACE; @@ -3068,7 +3084,9 @@ TclStringCat( */ binary = 0; - if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) { + if (ov > objv+1 && ISCONTINUATION(objPtr->bytes)) { + forceUniChar = 1; + } else if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) { /* Prevent shimmer of non-string types. */ allowUniChar = 0; } @@ -3117,7 +3135,7 @@ TclStringCat( } } } while (--oc); - } else if (allowUniChar && requestUniChar) { + } else if ((allowUniChar && requestUniChar) || forceUniChar) { /* * Result will be pure Tcl_UniChar array. Pre-size it. */ @@ -3270,7 +3288,7 @@ TclStringCat( dst += more; } } - } else if (allowUniChar && requestUniChar) { + } else if ((allowUniChar && requestUniChar) || forceUniChar) { /* Efficiently produce a pure Tcl_UniChar array result */ Tcl_UniChar *dst; @@ -4145,9 +4163,22 @@ ExtendUnicodeRepWithString( } else { numAppendChars = 0; } - for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) { + dst = stringPtr->unicode + numOrigChars; + if (numAppendChars-- > 0) { bytes += TclUtfToUniChar(bytes, &unichar); - *dst = unichar; +#if TCL_UTF_MAX > 3 + /* join upper/lower surrogate */ + if (bytes && (stringPtr->unicode[numOrigChars - 1] | 0x3FF) == 0xDBFF && (unichar | 0x3FF) == 0xDFFF) { + stringPtr->numChars--; + unichar = ((stringPtr->unicode[numOrigChars - 1] & 0x3FF) << 10) + (unichar & 0x3FF) + 0x10000; + dst--; + } +#endif + *dst++ = unichar; + while (numAppendChars-- > 0) { + bytes += TclUtfToUniChar(bytes, &unichar); + *dst++ = unichar; + } } *dst = 0; } diff --git a/tests/append.test b/tests/append.test index a174615..7d895b7 100644 --- a/tests/append.test +++ b/tests/append.test @@ -53,6 +53,35 @@ test append-3.3 {append errors} -returnCodes error -body { unset -nocomplain x append x } -result {can't read "x": no such variable} +test append-3.4 {append surrogates} -body { + set x \uD83D + append x \uDE02 +} -result \uD83D\uDE02 +test append-3.5 {append surrogates} -body { + set x \uD83D + set x $x\uDE02 +} -result \uD83D\uDE02 +test append-3.6 {append surrogates} -body { + set x \uDE02 + set x \uD83D$x +} -result \uD83D\uDE02 +test append-3.7 {append \xC0 \x80} -body { + set x [testbytestring \xC0] + string length [append x [testbytestring \x80]] +} -result 2 +test append-3.8 {append \xC0 \x80} -body { + set x [testbytestring \xC0] + string length $x[testbytestring \x80] +} -result 2 +test append-3.9 {append \xC0 \x80} -body { + set x [testbytestring \x80] + string length [testbytestring \xC0]$x +} -result 2 +test append-3.10 {append surrogates} -body { + set x \uD83D + string range $x 0 end + append x \uDE02 +} -result [string range \uD83D\uDE02 0 end] test append-4.1 {lappend command} { unset -nocomplain x diff --git a/tests/format.test b/tests/format.test index c7a8a10..41918b2 100644 --- a/tests/format.test +++ b/tests/format.test @@ -143,6 +143,12 @@ test format-2.16 {string formatting, width and precision} { test format-2.17 {string formatting, width and precision} { format "a%5.7sa" foobarbaz } "afoobarba" +test format-2.18 {string formatting, surrogates} { + format "\uD83D%s" \uDE02 +} \uD83D\uDE02 +test format-2.19 {string formatting, surrogates} { + format "%s\uDE02" \uD83D +} \uD83D\uDE02 test format-3.1 {Tcl_FormatObjCmd: character formatting} { format "|%c|%0c|%-1c|%1c|%-6c|%6c|%*c|%*c|" 65 65 65 65 65 65 3 65 -4 65 diff --git a/tests/string.test b/tests/string.test index b55a497..c2d47d3 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1614,6 +1614,18 @@ test string-14.20.$noComp {string replace} { run {string replace [makeByteArray abcdefghijklmnop] end-10 end-2\ [makeByteArray NEW]} } {abcdeNEWop} +test string-14.21.$noComp {string replace (surrogates)} { + run {string replace \uD83D? 1 end \uDE02} +} \uD83D\uDE02 +test string-14.22.$noComp {string replace (surrogates)} { + run {string replace ?\uDE02 0 end-1 \uD83D} +} \uD83D\uDE02 +test string-14.23.$noComp {string replace \xC0 \x80} testbytestring { + run {string length [string replace [testbytestring \xC0]? 1 end [testbytestring \x80]]} +} 2 +test string-14.24.$noComp {string replace \xC0 \x80} testbytestring { + run {string length [string replace ?[testbytestring \x80] 0 end-1 [testbytestring \xC0]]} +} 2 test stringComp-14.21.$noComp {Bug 82e7f67325} { diff --git a/tests/utf.test b/tests/utf.test index 3262214..477216c 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -84,6 +84,25 @@ test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} { test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} { expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]} } 1 +test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} { + set lo \uDE02 + return \uD83D$lo +} \uD83D\uDE02 +test utf-1.15 {Tcl_UniCharToUtf: surrogate pairs from concat} { + set hi \uD83D + return $hi\uDE02 +} \uD83D\uDE02 +test utf-1.16 {Tcl_UniCharToUtf: \xC0 + \x80} testbytestring { + set lo [testbytestring \x80] + string length [testbytestring \xC0]$lo +} 2 +test utf-1.17 {Tcl_UniCharToUtf: \xC0 + \x80} testbytestring { + set hi [testbytestring \xC0] + string length $hi[testbytestring \x80] +} 2 +test utf-1.18 {Tcl_UniCharToUtf: surrogate pairs from concat} { + string cat \uD83D \uDE02 +} \uD83D\uDE02 test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" @@ -204,7 +223,7 @@ test utf-6.3 {Tcl_UtfNext} testutfnext { testutfnext AA } 1 test utf-6.4 {Tcl_UtfNext} {testutfnext testbytestring} { - testutfnext A[testbytestring \xA0] + testutfnext [testbytestring A\xA0] } 1 test utf-6.5 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext A[testbytestring \xD0] @@ -390,7 +409,7 @@ test utf-6.62 {Tcl_UtfNext} testutfnext { testutfnext 蠠G } 3 test utf-6.63 {Tcl_UtfNext} {testutfnext testbytestring} { - testutfnext 蠠[testbytestring \xA0] + testutfnext [testbytestring \xE8\xA0\xA0\xA0] } 3 test utf-6.64 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext 蠠[testbytestring \xD0] @@ -565,7 +584,7 @@ test utf-7.6 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xE8] } 1 test utf-7.6.1 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A蠠[testbytestring \xA0] 2 + testutfprev A[testbytestring \xE8\xA0\xA0\xA0] 2 } 1 test utf-7.6.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xE8\xF8\xA0\xA0] 2 @@ -580,13 +599,13 @@ test utf-7.7.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xF8\xA0\xA0] 2 } 1 test utf-7.8 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0] + testutfprev [testbytestring A\xA0] } 1 test utf-7.8.1 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 2 + testutfprev [testbytestring A\xA0\xA0\xA0\xA0] 2 } 1 test utf-7.8.2 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xF8\xA0\xA0] 2 + testutfprev [testbytestring A\xA0\xF8\xA0\xA0] 2 } 1 test utf-7.9 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xF8\xA0] @@ -619,7 +638,7 @@ test utf-7.11 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xE8\xA0] } 1 test utf-7.11.1 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A蠠[testbytestring \xA0] 3 + testutfprev A[testbytestring \xE8\xA0\xA0\xA0] 3 } 1 test utf-7.11.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xE8\xA0\xF8\xA0] 3 @@ -637,13 +656,13 @@ test utf-7.12.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xA0\xF8\xA0] 3 } 1 test utf-7.13 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0] + testutfprev [testbytestring A\xA0\xA0] } 2 test utf-7.13.1 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 3 + testutfprev [testbytestring A\xA0\xA0\xA0\xA0] 3 } 2 test utf-7.13.2 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0\xF8\xA0] 3 + testutfprev [testbytestring A\xA0\xA0\xF8\xA0] 3 } 2 test utf-7.14 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xF8\xA0\xA0] @@ -676,7 +695,7 @@ test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A蠠 } 1 test utf-7.16.1 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A蠠[testbytestring \xA0] 4 + testutfprev A[testbytestring \xE8\xA0\xA0\xA0] 4 } 1 test utf-7.16.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A蠠[testbytestring \xF8] 4 @@ -691,31 +710,31 @@ test utf-7.17.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xA0\xA0\xF8] 4 } 3 test utf-7.18.0 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0\xA0] + testutfprev [testbytestring A\xA0\xA0\xA0] } 3 test utf-7.18.1 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 4 + testutfprev [testbytestring A\xA0\xA0\xA0\xA0] 4 } 3 test utf-7.18.2 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0\xA0\xF8] 4 + testutfprev [testbytestring A\xA0\xA0\xA0\xF8] 4 } 3 test utf-7.19 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xF8\xA0\xA0\xA0] + testutfprev [testbytestring A\xF8\xA0\xA0\xA0] } 4 test utf-7.20.0 {Tcl_UtfPrev} {testutfprev testbytestring ucs2} { - testutfprev A[testbytestring \xF2\xA0\xA0\xA0] + testutfprev [testbytestring A\xF2\xA0\xA0\xA0] } 4 test utf-7.20.1 {Tcl_UtfPrev} {testutfprev testbytestring fullutf} { - testutfprev A[testbytestring \xF2\xA0\xA0\xA0] + testutfprev [testbytestring A\xF2\xA0\xA0\xA0] } 1 test utf-7.21 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A蠠[testbytestring \xA0] + testutfprev A[testbytestring \xE8\xA0\xA0\xA0] } 4 test utf-7.22 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xA0\xA0\xA0] } 4 test utf-7.23 {Tcl_UtfPrev} {testutfprev testbytestring} { - testutfprev A[testbytestring \xA0\xA0\xA0\xA0] + testutfprev [testbytestring A\xA0\xA0\xA0\xA0] } 4 test utf-7.24 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xC0\x81] |