summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--generic/tclExecute.c25
-rw-r--r--generic/tclStringObj.c87
-rw-r--r--tests/encoding.test10
-rw-r--r--tests/string.test8
-rw-r--r--tests/utf.test38
5 files changed, 100 insertions, 68 deletions
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index dbbea34..5c701a2 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -2628,34 +2628,11 @@ TEBCresume(
opnd = TclGetUInt1AtPtr(pc+1);
- objv = &OBJ_AT_DEPTH(opnd-1);
- /* minor optimization in simplest cases */
- switch (opnd) {
- case 1: /* only one object */
- objResultPtr = *objv;
- goto endINST_STR_CONCAT1;
- case 2: /* two objects - check empty */
- if (objv[0]->bytes == &tclEmptyString) {
- objResultPtr = objv[1];
- goto endINST_STR_CONCAT1;
- }
- else
- if (objv[1]->bytes == &tclEmptyString) {
- objResultPtr = objv[0];
- goto endINST_STR_CONCAT1;
- }
- break;
- case 0: /* no objects - use new empty */
- TclNewObj(objResultPtr);
- goto endINST_STR_CONCAT1;
- }
- /* do concat */
if (TCL_OK != TclStringCatObjv(interp, /* inPlace */ 1,
- opnd, objv, &objResultPtr)) {
+ opnd, &OBJ_AT_DEPTH(opnd-1), &objResultPtr)) {
TRACE_ERROR(interp);
goto gotError;
}
- endINST_STR_CONCAT1:
TRACE_WITH_OBJ(("%u => ", opnd), objResultPtr);
NEXT_INST_V(2, opnd, 1);
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 33ecff7..6e73068 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -2849,10 +2849,18 @@ TclStringCatObjv(
Tcl_Obj **objPtrPtr)
{
Tcl_Obj *objPtr, *objResultPtr, * const *ov;
- int oc, length = 0, binary = 1, first = 0;
+ int oc, length = 0, binary = 1, first = 0, last = 0;
int allowUniChar = 1, requestUniChar = 0;
- /* assert (objc >= 2) */
+ /* assert ( objc >= 0 ) */
+
+ if (objc <= 1) {
+ /* Only one or no objects; return first or empty */
+ *objPtrPtr = objc ? objv[0] : Tcl_NewObj();
+ return TCL_OK;
+ }
+
+ /* assert ( objc >= 2 ) */
/*
* Analyze to determine what representation result should be.
@@ -2862,7 +2870,7 @@ TclStringCatObjv(
*/
ov = objv, oc = objc;
- while (oc-- && (binary || allowUniChar)) {
+ do {
objPtr = *ov++;
if (objPtr->bytes) {
@@ -2893,72 +2901,79 @@ TclStringCatObjv(
}
}
}
- }
+ } while (--oc && (binary || allowUniChar));
if (binary) {
/* Result will be pure byte array. Pre-size it */
ov = objv; oc = objc;
- while (oc-- && (length >= 0)) {
+ do {
objPtr = *ov++;
if (objPtr->bytes == NULL) {
int numBytes;
Tcl_GetByteArrayFromObj(objPtr, &numBytes); /* PANIC? */
- if (length == 0) {
- first = objc - oc - 1;
+ if (numBytes) {
+ last = objc - oc;
+ if (length == 0) {
+ first = last;
+ }
+ if ((length += numBytes) < 0) {
+ goto overflow;
+ }
}
- length += numBytes;
}
- }
+ } while (--oc);
} else if (allowUniChar && requestUniChar) {
/* Result will be pure Tcl_UniChar array. Pre-size it. */
ov = objv; oc = objc;
- while (oc-- && (length >= 0)) {
+ do {
objPtr = *ov++;
if ((objPtr->bytes == NULL) || (objPtr->length)) {
int numChars;
Tcl_GetUnicodeFromObj(objPtr, &numChars); /* PANIC? */
- if (length == 0) {
- first = objc - oc - 1;
+ if (numChars) {
+ last = objc - oc;
+ if (length == 0) {
+ first = last;
+ }
+ if ((length += numChars) < 0) {
+ goto overflow;
+ }
}
- length += numChars;
}
- }
+ } while (--oc);
} else {
/* Result will be concat of string reps. Pre-size it. */
ov = objv; oc = objc;
- while (oc-- && (length >= 0)) {
+ do {
int numBytes;
objPtr = *ov++;
Tcl_GetStringFromObj(objPtr, &numBytes); /* PANIC? */
- if ((length == 0) && numBytes) {
- first = objc - oc - 1;
+ if (numBytes) {
+ last = objc - oc;
+ if (length == 0) {
+ first = last;
+ }
+ if ((length += numBytes) < 0) {
+ goto overflow;
+ }
}
- length += numBytes;
- }
+ } while (--oc);
}
- if (length < 0) {
- if (interp) {
- Tcl_SetObjResult(interp, Tcl_ObjPrintf(
- "max size for a Tcl value (%d bytes) exceeded", INT_MAX));
- Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL);
- }
- return TCL_ERROR;
- }
-
- if (length == 0) {
- /* Total length of zero means every value has length zero */
- *objPtrPtr = objv[0];
+ if (last == first /*|| length == 0 */) {
+ /* Only one non-empty value or zero length; return first */
+ /* NOTE: (length == 0) implies (last == first) */
+ *objPtrPtr = objv[first];
return TCL_OK;
}
- objv += first; objc -= first;
+ objv += first; objc = (last - first + 1);
if (binary) {
/* Efficiently produce a pure byte array result */
@@ -3088,6 +3103,14 @@ TclStringCatObjv(
}
*objPtrPtr = objResultPtr;
return TCL_OK;
+
+ overflow:
+ if (interp) {
+ Tcl_SetObjResult(interp, Tcl_ObjPrintf(
+ "max size for a Tcl value (%d bytes) exceeded", INT_MAX));
+ Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL);
+ }
+ return TCL_ERROR;
}
/*
diff --git a/tests/encoding.test b/tests/encoding.test
index 49555b6..be1f4d5 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -34,6 +34,7 @@ proc runtests {} {
# Some tests require the testencoding command
testConstraint testencoding [llength [info commands testencoding]]
+testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}]
testConstraint exec [llength [info commands exec]]
testConstraint testgetencpath [llength [info commands testgetencpath]]
@@ -332,9 +333,14 @@ test encoding-16.1 {UnicodeToUtfProc} {
set val [encoding convertfrom unicode NN]
list $val [format %x [scan $val %c]]
} "\u4e4e 4e4e"
+test encoding-16.2 {UnicodeToUtfProc} -constraints fullutf -body {
+ set val [encoding convertfrom unicode "\xd8\xd8\xdc\xdc"]
+ list $val [format %x [scan $val %c]]
+} -result "\U460dc 460dc"
-test encoding-17.1 {UtfToUnicodeProc} {
-} {}
+test encoding-17.1 {UtfToUnicodeProc} -constraints fullutf -body {
+ encoding convertto unicode "\U460dc"
+} -result "\xd8\xd8\xdc\xdc"
test encoding-18.1 {TableToUtfProc} {
} {}
diff --git a/tests/string.test b/tests/string.test
index eff0525..10c1e5d 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -576,12 +576,12 @@ test string-6.85 {string is control} {
} 0
test string-6.86 {string is graph} {
## graph is any print char, except space
- list [string is gra -fail var "0123abc!@#\$\u0100 "] $var
-} {0 12}
+ list [string is gra -fail var "0123abc!@#\$\u0100\UE0100\UE01EF "] $var
+} {0 14}
test string-6.87 {string is print} {
## basically any printable char
- list [string is print -fail var "0123abc!@#\$\u0100 \u0010"] $var
-} {0 13}
+ list [string is print -fail var "0123abc!@#\$\u0100 \UE0100\UE01EF\u0010"] $var
+} {0 15}
test string-6.88 {string is punct} {
## any graph char that isn't alnum
list [string is punct -fail var "_!@#\u00beq0"] $var
diff --git a/tests/utf.test b/tests/utf.test
index a03dd6c..28981d6 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -20,6 +20,9 @@ testConstraint testbytestring [llength [info commands testbytestring]]
catch {unset x}
+# Some tests require support for 4-byte UTF-8 sequences
+testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}]
+
test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring {
expr {"\x01" eq [testbytestring "\x01"]}
} 1
@@ -38,6 +41,9 @@ test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring {
test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring {
expr {[format %c -1] eq [testbytestring "\xef\xbf\xbd"]}
} 1
+test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints {fullutf testbytestring} -body {
+ expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]}
+} -result 1
test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
string length "abc"
@@ -60,9 +66,21 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin
test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring {
string length [testbytestring "\xE4\xb9\x8e"]
} {1}
-test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring {
- string length [testbytestring "\xF4\xA2\xA2\xA2"]
+test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body {
+ string length [testbytestring "\xF0\x90\x80\x80"]
+} -result {1}
+test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body {
+ string length [testbytestring "\xF4\x8F\xBF\xBF"]
+} -result {1}
+test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring {
+ string length [testbytestring "\xF0\x8F\xBF\xBF"]
+} {4}
+test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} testbytestring {
+ string length [testbytestring "\xF4\x90\x80\x80"]
} {4}
+test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring {
+ string length [testbytestring "\xF8\xA2\xA2\xA2\xA2"]
+} {5}
test utf-3.1 {Tcl_UtfCharComplete} {
} {}
@@ -195,8 +213,16 @@ bsCheck \Ua1 161
bsCheck \U4e21 20001
bsCheck \U004e21 20001
bsCheck \U00004e21 20001
-bsCheck \U00110000 65533
-bsCheck \Uffffffff 65533
+bsCheck \U0000004e21 78
+if {[testConstraint fullutf]} {
+ bsCheck \U00110000 69632
+ bsCheck \U01100000 69632
+ bsCheck \U11000000 69632
+ bsCheck \U0010FFFF 1114111
+ bsCheck \U010FFFF0 1114111
+ bsCheck \U10FFFF00 1114111
+ bsCheck \UFFFFFFFF 1048575
+}
test utf-11.1 {Tcl_UtfToUpper} {
string toupper {}
@@ -264,8 +290,8 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} {
string tolower aA
} aa
test utf-16.2 {Tcl_UniCharToLower, positive delta} {
- string tolower \u0178\u00ff\uA78D\u01c5
-} \u00ff\u00ff\u0265\u01c6
+ string tolower \u0178\u00ff\uA78D\u01c5\U10400
+} \u00ff\u00ff\u0265\u01c6\U10428
test utf-17.1 {Tcl_UniCharToLower, no delta} {
string tolower !