summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2021-04-12 09:26:27 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2021-04-12 09:26:27 (GMT)
commit7deb6f2082391e5fdb40740e2dd00a14b2e325bb (patch)
treea37981a71cf24a87295e9901a8793f281816c3a4
parentc318d73bd33bd73fbdf12d1fb7fb5a53a5675082 (diff)
parentb1e4f0fa23663a2ced0099c6d2d0bdb4ff286523 (diff)
downloadtcl-7deb6f2082391e5fdb40740e2dd00a14b2e325bb.zip
tcl-7deb6f2082391e5fdb40740e2dd00a14b2e325bb.tar.gz
tcl-7deb6f2082391e5fdb40740e2dd00a14b2e325bb.tar.bz2
Fix [7f1162a867]. Don't compile "string cat" any more, since there still is a bug in it, demonstrated by utf-1.18 testcase
-rw-r--r--generic/tclCmdMZ.c2
-rw-r--r--generic/tclStringObj.c43
-rw-r--r--tests/append.test29
-rw-r--r--tests/format.test6
-rw-r--r--tests/string.test12
-rw-r--r--tests/utf.test57
6 files changed, 123 insertions, 26 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index d020a93..7d84c86 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -3307,7 +3307,7 @@ TclInitStringCmd(
{
static const EnsembleImplMap stringImplMap[] = {
{"bytelength", StringBytesCmd, TclCompileBasic1ArgCmd, NULL, NULL, 0},
- {"cat", StringCatCmd, TclCompileStringCatCmd, NULL, NULL, 0},
+ {"cat", StringCatCmd, NULL/*TclCompileStringCatCmd*/, NULL, NULL, 0},
{"compare", StringCmpCmd, TclCompileStringCmpCmd, NULL, NULL, 0},
{"equal", StringEqualCmd, TclCompileStringEqualCmd, NULL, NULL, 0},
{"first", StringFirstCmd, TclCompileStringFirstCmd, NULL, NULL, 0},
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index b557af0..78a47e3 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -70,6 +70,11 @@ static void SetUnicodeObj(Tcl_Obj *objPtr,
static int UnicodeLength(const Tcl_UniChar *unicode);
static void UpdateStringOfString(Tcl_Obj *objPtr);
+#define ISCONTINUATION(bytes) ((bytes) \
+ && ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \
+ && (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80))))
+
+
/*
* The structure below defines the string Tcl object type by means of
* functions that can be invoked by generic object code.
@@ -1218,6 +1223,11 @@ Tcl_AppendLimitedToObj(
SetStringFromAny(NULL, objPtr);
stringPtr = GET_STRING(objPtr);
+ /* If appended string starts with a continuation byte or a lower surrogate,
+ * force objPtr to unicode representation. See [7f1162a867] */
+ if (ISCONTINUATION(bytes)) {
+ Tcl_GetUnicode(objPtr);
+ }
if (stringPtr->hasUnicode && stringPtr->numChars > 0) {
AppendUtfToUnicodeRep(objPtr, bytes, toCopy);
} else {
@@ -1415,6 +1425,12 @@ Tcl_AppendObjToObj(
SetStringFromAny(NULL, objPtr);
stringPtr = GET_STRING(objPtr);
+ /* If appended string starts with a continuation byte or a lower surrogate,
+ * force objPtr to unicode representation. See [7f1162a867]
+ * This fixes append-3.4, append-3.7 and utf-1.18 testcases. */
+ if (ISCONTINUATION(appendObjPtr->bytes)) {
+ Tcl_GetUnicode(objPtr);
+ }
/*
* If objPtr has a valid Unicode rep, then get a Unicode string from
* appendObjPtr and append it.
@@ -3032,7 +3048,7 @@ TclStringCat(
{
Tcl_Obj *objResultPtr, * const *ov;
int oc, length = 0, binary = 1;
- int allowUniChar = 1, requestUniChar = 0;
+ int allowUniChar = 1, requestUniChar = 0, forceUniChar = 0;
int first = objc - 1; /* Index of first value possibly not empty */
int last = 0; /* Index of last value possibly not empty */
int inPlace = flags & TCL_STRING_IN_PLACE;
@@ -3068,7 +3084,9 @@ TclStringCat(
*/
binary = 0;
- if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) {
+ if (ov > objv+1 && ISCONTINUATION(objPtr->bytes)) {
+ forceUniChar = 1;
+ } else if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) {
/* Prevent shimmer of non-string types. */
allowUniChar = 0;
}
@@ -3117,7 +3135,7 @@ TclStringCat(
}
}
} while (--oc);
- } else if (allowUniChar && requestUniChar) {
+ } else if ((allowUniChar && requestUniChar) || forceUniChar) {
/*
* Result will be pure Tcl_UniChar array. Pre-size it.
*/
@@ -3270,7 +3288,7 @@ TclStringCat(
dst += more;
}
}
- } else if (allowUniChar && requestUniChar) {
+ } else if ((allowUniChar && requestUniChar) || forceUniChar) {
/* Efficiently produce a pure Tcl_UniChar array result */
Tcl_UniChar *dst;
@@ -4145,9 +4163,22 @@ ExtendUnicodeRepWithString(
} else {
numAppendChars = 0;
}
- for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) {
+ dst = stringPtr->unicode + numOrigChars;
+ if (numAppendChars-- > 0) {
bytes += TclUtfToUniChar(bytes, &unichar);
- *dst = unichar;
+#if TCL_UTF_MAX > 3
+ /* join upper/lower surrogate */
+ if (bytes && (stringPtr->unicode[numOrigChars - 1] | 0x3FF) == 0xDBFF && (unichar | 0x3FF) == 0xDFFF) {
+ stringPtr->numChars--;
+ unichar = ((stringPtr->unicode[numOrigChars - 1] & 0x3FF) << 10) + (unichar & 0x3FF) + 0x10000;
+ dst--;
+ }
+#endif
+ *dst++ = unichar;
+ while (numAppendChars-- > 0) {
+ bytes += TclUtfToUniChar(bytes, &unichar);
+ *dst++ = unichar;
+ }
}
*dst = 0;
}
diff --git a/tests/append.test b/tests/append.test
index a174615..7d895b7 100644
--- a/tests/append.test
+++ b/tests/append.test
@@ -53,6 +53,35 @@ test append-3.3 {append errors} -returnCodes error -body {
unset -nocomplain x
append x
} -result {can't read "x": no such variable}
+test append-3.4 {append surrogates} -body {
+ set x \uD83D
+ append x \uDE02
+} -result \uD83D\uDE02
+test append-3.5 {append surrogates} -body {
+ set x \uD83D
+ set x $x\uDE02
+} -result \uD83D\uDE02
+test append-3.6 {append surrogates} -body {
+ set x \uDE02
+ set x \uD83D$x
+} -result \uD83D\uDE02
+test append-3.7 {append \xC0 \x80} -body {
+ set x [testbytestring \xC0]
+ string length [append x [testbytestring \x80]]
+} -result 2
+test append-3.8 {append \xC0 \x80} -body {
+ set x [testbytestring \xC0]
+ string length $x[testbytestring \x80]
+} -result 2
+test append-3.9 {append \xC0 \x80} -body {
+ set x [testbytestring \x80]
+ string length [testbytestring \xC0]$x
+} -result 2
+test append-3.10 {append surrogates} -body {
+ set x \uD83D
+ string range $x 0 end
+ append x \uDE02
+} -result [string range \uD83D\uDE02 0 end]
test append-4.1 {lappend command} {
unset -nocomplain x
diff --git a/tests/format.test b/tests/format.test
index c7a8a10..41918b2 100644
--- a/tests/format.test
+++ b/tests/format.test
@@ -143,6 +143,12 @@ test format-2.16 {string formatting, width and precision} {
test format-2.17 {string formatting, width and precision} {
format "a%5.7sa" foobarbaz
} "afoobarba"
+test format-2.18 {string formatting, surrogates} {
+ format "\uD83D%s" \uDE02
+} \uD83D\uDE02
+test format-2.19 {string formatting, surrogates} {
+ format "%s\uDE02" \uD83D
+} \uD83D\uDE02
test format-3.1 {Tcl_FormatObjCmd: character formatting} {
format "|%c|%0c|%-1c|%1c|%-6c|%6c|%*c|%*c|" 65 65 65 65 65 65 3 65 -4 65
diff --git a/tests/string.test b/tests/string.test
index b55a497..c2d47d3 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -1614,6 +1614,18 @@ test string-14.20.$noComp {string replace} {
run {string replace [makeByteArray abcdefghijklmnop] end-10 end-2\
[makeByteArray NEW]}
} {abcdeNEWop}
+test string-14.21.$noComp {string replace (surrogates)} {
+ run {string replace \uD83D? 1 end \uDE02}
+} \uD83D\uDE02
+test string-14.22.$noComp {string replace (surrogates)} {
+ run {string replace ?\uDE02 0 end-1 \uD83D}
+} \uD83D\uDE02
+test string-14.23.$noComp {string replace \xC0 \x80} testbytestring {
+ run {string length [string replace [testbytestring \xC0]? 1 end [testbytestring \x80]]}
+} 2
+test string-14.24.$noComp {string replace \xC0 \x80} testbytestring {
+ run {string length [string replace ?[testbytestring \x80] 0 end-1 [testbytestring \xC0]]}
+} 2
test stringComp-14.21.$noComp {Bug 82e7f67325} {
diff --git a/tests/utf.test b/tests/utf.test
index 3262214..477216c 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -84,6 +84,25 @@ test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} {
test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} {
expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]}
} 1
+test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} {
+ set lo \uDE02
+ return \uD83D$lo
+} \uD83D\uDE02
+test utf-1.15 {Tcl_UniCharToUtf: surrogate pairs from concat} {
+ set hi \uD83D
+ return $hi\uDE02
+} \uD83D\uDE02
+test utf-1.16 {Tcl_UniCharToUtf: \xC0 + \x80} testbytestring {
+ set lo [testbytestring \x80]
+ string length [testbytestring \xC0]$lo
+} 2
+test utf-1.17 {Tcl_UniCharToUtf: \xC0 + \x80} testbytestring {
+ set hi [testbytestring \xC0]
+ string length $hi[testbytestring \x80]
+} 2
+test utf-1.18 {Tcl_UniCharToUtf: surrogate pairs from concat} {
+ string cat \uD83D \uDE02
+} \uD83D\uDE02
test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
string length "abc"
@@ -204,7 +223,7 @@ test utf-6.3 {Tcl_UtfNext} testutfnext {
testutfnext AA
} 1
test utf-6.4 {Tcl_UtfNext} {testutfnext testbytestring} {
- testutfnext A[testbytestring \xA0]
+ testutfnext [testbytestring A\xA0]
} 1
test utf-6.5 {Tcl_UtfNext} {testutfnext testbytestring} {
testutfnext A[testbytestring \xD0]
@@ -390,7 +409,7 @@ test utf-6.62 {Tcl_UtfNext} testutfnext {
testutfnext 蠠G
} 3
test utf-6.63 {Tcl_UtfNext} {testutfnext testbytestring} {
- testutfnext 蠠[testbytestring \xA0]
+ testutfnext [testbytestring \xE8\xA0\xA0\xA0]
} 3
test utf-6.64 {Tcl_UtfNext} {testutfnext testbytestring} {
testutfnext 蠠[testbytestring \xD0]
@@ -565,7 +584,7 @@ test utf-7.6 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xE8]
} 1
test utf-7.6.1 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A蠠[testbytestring \xA0] 2
+ testutfprev A[testbytestring \xE8\xA0\xA0\xA0] 2
} 1
test utf-7.6.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xE8\xF8\xA0\xA0] 2
@@ -580,13 +599,13 @@ test utf-7.7.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xD0\xF8\xA0\xA0] 2
} 1
test utf-7.8 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0]
+ testutfprev [testbytestring A\xA0]
} 1
test utf-7.8.1 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 2
+ testutfprev [testbytestring A\xA0\xA0\xA0\xA0] 2
} 1
test utf-7.8.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xF8\xA0\xA0] 2
+ testutfprev [testbytestring A\xA0\xF8\xA0\xA0] 2
} 1
test utf-7.9 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xF8\xA0]
@@ -619,7 +638,7 @@ test utf-7.11 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xE8\xA0]
} 1
test utf-7.11.1 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A蠠[testbytestring \xA0] 3
+ testutfprev A[testbytestring \xE8\xA0\xA0\xA0] 3
} 1
test utf-7.11.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xE8\xA0\xF8\xA0] 3
@@ -637,13 +656,13 @@ test utf-7.12.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xD0\xA0\xF8\xA0] 3
} 1
test utf-7.13 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0]
+ testutfprev [testbytestring A\xA0\xA0]
} 2
test utf-7.13.1 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 3
+ testutfprev [testbytestring A\xA0\xA0\xA0\xA0] 3
} 2
test utf-7.13.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0\xF8\xA0] 3
+ testutfprev [testbytestring A\xA0\xA0\xF8\xA0] 3
} 2
test utf-7.14 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xF8\xA0\xA0]
@@ -676,7 +695,7 @@ test utf-7.16 {Tcl_UtfPrev} testutfprev {
testutfprev A蠠
} 1
test utf-7.16.1 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A蠠[testbytestring \xA0] 4
+ testutfprev A[testbytestring \xE8\xA0\xA0\xA0] 4
} 1
test utf-7.16.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A蠠[testbytestring \xF8] 4
@@ -691,31 +710,31 @@ test utf-7.17.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xD0\xA0\xA0\xF8] 4
} 3
test utf-7.18.0 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0\xA0]
+ testutfprev [testbytestring A\xA0\xA0\xA0]
} 3
test utf-7.18.1 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 4
+ testutfprev [testbytestring A\xA0\xA0\xA0\xA0] 4
} 3
test utf-7.18.2 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0\xA0\xF8] 4
+ testutfprev [testbytestring A\xA0\xA0\xA0\xF8] 4
} 3
test utf-7.19 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xF8\xA0\xA0\xA0]
+ testutfprev [testbytestring A\xF8\xA0\xA0\xA0]
} 4
test utf-7.20.0 {Tcl_UtfPrev} {testutfprev testbytestring ucs2} {
- testutfprev A[testbytestring \xF2\xA0\xA0\xA0]
+ testutfprev [testbytestring A\xF2\xA0\xA0\xA0]
} 4
test utf-7.20.1 {Tcl_UtfPrev} {testutfprev testbytestring fullutf} {
- testutfprev A[testbytestring \xF2\xA0\xA0\xA0]
+ testutfprev [testbytestring A\xF2\xA0\xA0\xA0]
} 1
test utf-7.21 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A蠠[testbytestring \xA0]
+ testutfprev A[testbytestring \xE8\xA0\xA0\xA0]
} 4
test utf-7.22 {Tcl_UtfPrev} {testutfprev testbytestring} {
testutfprev A[testbytestring \xD0\xA0\xA0\xA0]
} 4
test utf-7.23 {Tcl_UtfPrev} {testutfprev testbytestring} {
- testutfprev A[testbytestring \xA0\xA0\xA0\xA0]
+ testutfprev [testbytestring A\xA0\xA0\xA0\xA0]
} 4
test utf-7.24 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} {
testutfprev A[testbytestring \xC0\x81]