summaryrefslogtreecommitdiffstats
path: root/generic/tclStringObj.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclStringObj.c')
-rw-r--r--generic/tclStringObj.c43
1 files changed, 37 insertions, 6 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 78eafa7..3908ae9 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -70,6 +70,11 @@ static void SetUnicodeObj(Tcl_Obj *objPtr,
static size_t UnicodeLength(const Tcl_UniChar *unicode);
static void UpdateStringOfString(Tcl_Obj *objPtr);
+#define ISCONTINUATION(bytes) ((bytes) \
+ && ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \
+ && (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80))))
+
+
/*
* The structure below defines the string Tcl object type by means of
* functions that can be invoked by generic object code.
@@ -1146,6 +1151,11 @@ Tcl_AppendLimitedToObj(
SetStringFromAny(NULL, objPtr);
stringPtr = GET_STRING(objPtr);
+ /* If appended string starts with a continuation byte or a lower surrogate,
+ * force objPtr to unicode representation. See [7f1162a867] */
+ if (ISCONTINUATION(bytes)) {
+ Tcl_GetUnicode(objPtr);
+ }
if (stringPtr->hasUnicode && (stringPtr->numChars+1) > 1) {
AppendUtfToUnicodeRep(objPtr, bytes, toCopy);
} else {
@@ -1344,6 +1354,12 @@ Tcl_AppendObjToObj(
SetStringFromAny(NULL, objPtr);
stringPtr = GET_STRING(objPtr);
+ /* If appended string starts with a continuation byte or a lower surrogate,
+ * force objPtr to unicode representation. See [7f1162a867]
+ * This fixes append-3.4, append-3.7 and utf-1.18 testcases. */
+ if (ISCONTINUATION(appendObjPtr->bytes)) {
+ Tcl_GetUnicode(objPtr);
+ }
/*
* If objPtr has a valid Unicode rep, then get a Unicode string from
* appendObjPtr and append it.
@@ -2926,7 +2942,7 @@ TclStringCat(
Tcl_Obj *objResultPtr, * const *ov;
int oc, binary = 1;
size_t length = 0;
- int allowUniChar = 1, requestUniChar = 0;
+ int allowUniChar = 1, requestUniChar = 0, forceUniChar = 0;
int first = objc - 1; /* Index of first value possibly not empty */
int last = 0; /* Index of last value possibly not empty */
int inPlace = flags & TCL_STRING_IN_PLACE;
@@ -2962,7 +2978,9 @@ TclStringCat(
*/
binary = 0;
- if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) {
+ if (ov > objv+1 && ISCONTINUATION(objPtr->bytes)) {
+ forceUniChar = 1;
+ } else if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) {
/* Prevent shimmer of non-string types. */
allowUniChar = 0;
}
@@ -3009,7 +3027,7 @@ TclStringCat(
}
}
} while (--oc);
- } else if (allowUniChar && requestUniChar) {
+ } else if ((allowUniChar && requestUniChar) || forceUniChar) {
/*
* Result will be pure Tcl_UniChar array. Pre-size it.
*/
@@ -3161,7 +3179,7 @@ TclStringCat(
dst += more;
}
}
- } else if (allowUniChar && requestUniChar) {
+ } else if ((allowUniChar && requestUniChar) || forceUniChar) {
/* Efficiently produce a pure Tcl_UniChar array result */
Tcl_UniChar *dst;
@@ -4032,9 +4050,22 @@ ExtendUnicodeRepWithString(
} else {
numAppendChars = 0;
}
- for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) {
+ dst = stringPtr->unicode + numOrigChars;
+ if (numAppendChars-- > 0) {
bytes += TclUtfToUniChar(bytes, &unichar);
- *dst = unichar;
+#if TCL_UTF_MAX > 3
+ /* join upper/lower surrogate */
+ if (bytes && (stringPtr->unicode[numOrigChars - 1] | 0x3FF) == 0xDBFF && (unichar | 0x3FF) == 0xDFFF) {
+ stringPtr->numChars--;
+ unichar = ((stringPtr->unicode[numOrigChars - 1] & 0x3FF) << 10) + (unichar & 0x3FF) + 0x10000;
+ dst--;
+ }
+#endif
+ *dst++ = unichar;
+ while (numAppendChars-- > 0) {
+ bytes += TclUtfToUniChar(bytes, &unichar);
+ *dst++ = unichar;
+ }
}
*dst = 0;
}