diff options
Diffstat (limited to 'generic/tclStringObj.c')
-rw-r--r-- | generic/tclStringObj.c | 43 |
1 files changed, 37 insertions, 6 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 78eafa7..3908ae9 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -70,6 +70,11 @@ static void SetUnicodeObj(Tcl_Obj *objPtr, static size_t UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); +#define ISCONTINUATION(bytes) ((bytes) \ + && ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \ + && (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80)))) + + /* * The structure below defines the string Tcl object type by means of * functions that can be invoked by generic object code. @@ -1146,6 +1151,11 @@ Tcl_AppendLimitedToObj( SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); + /* If appended string starts with a continuation byte or a lower surrogate, + * force objPtr to unicode representation. See [7f1162a867] */ + if (ISCONTINUATION(bytes)) { + Tcl_GetUnicode(objPtr); + } if (stringPtr->hasUnicode && (stringPtr->numChars+1) > 1) { AppendUtfToUnicodeRep(objPtr, bytes, toCopy); } else { @@ -1344,6 +1354,12 @@ Tcl_AppendObjToObj( SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); + /* If appended string starts with a continuation byte or a lower surrogate, + * force objPtr to unicode representation. See [7f1162a867] + * This fixes append-3.4, append-3.7 and utf-1.18 testcases. */ + if (ISCONTINUATION(appendObjPtr->bytes)) { + Tcl_GetUnicode(objPtr); + } /* * If objPtr has a valid Unicode rep, then get a Unicode string from * appendObjPtr and append it. @@ -2926,7 +2942,7 @@ TclStringCat( Tcl_Obj *objResultPtr, * const *ov; int oc, binary = 1; size_t length = 0; - int allowUniChar = 1, requestUniChar = 0; + int allowUniChar = 1, requestUniChar = 0, forceUniChar = 0; int first = objc - 1; /* Index of first value possibly not empty */ int last = 0; /* Index of last value possibly not empty */ int inPlace = flags & TCL_STRING_IN_PLACE; @@ -2962,7 +2978,9 @@ TclStringCat( */ binary = 0; - if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) { + if (ov > objv+1 && ISCONTINUATION(objPtr->bytes)) { + forceUniChar = 1; + } else if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) { /* Prevent shimmer of non-string types. */ allowUniChar = 0; } @@ -3009,7 +3027,7 @@ TclStringCat( } } } while (--oc); - } else if (allowUniChar && requestUniChar) { + } else if ((allowUniChar && requestUniChar) || forceUniChar) { /* * Result will be pure Tcl_UniChar array. Pre-size it. */ @@ -3161,7 +3179,7 @@ TclStringCat( dst += more; } } - } else if (allowUniChar && requestUniChar) { + } else if ((allowUniChar && requestUniChar) || forceUniChar) { /* Efficiently produce a pure Tcl_UniChar array result */ Tcl_UniChar *dst; @@ -4032,9 +4050,22 @@ ExtendUnicodeRepWithString( } else { numAppendChars = 0; } - for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) { + dst = stringPtr->unicode + numOrigChars; + if (numAppendChars-- > 0) { bytes += TclUtfToUniChar(bytes, &unichar); - *dst = unichar; +#if TCL_UTF_MAX > 3 + /* join upper/lower surrogate */ + if (bytes && (stringPtr->unicode[numOrigChars - 1] | 0x3FF) == 0xDBFF && (unichar | 0x3FF) == 0xDFFF) { + stringPtr->numChars--; + unichar = ((stringPtr->unicode[numOrigChars - 1] & 0x3FF) << 10) + (unichar & 0x3FF) + 0x10000; + dst--; + } +#endif + *dst++ = unichar; + while (numAppendChars-- > 0) { + bytes += TclUtfToUniChar(bytes, &unichar); + *dst++ = unichar; + } } *dst = 0; } |