summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- generic/tclInt.h 8
-rw-r--r-- generic/tclUtf.c 50
-rw-r--r-- tests/utf.test 92
3 files changed, 99 insertions, 51 deletions
diff --git a/generic/tclInt.h b/generic/tclInt.h
index b6d6a88..1a286a0 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3685,17 +3685,17 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, CONST char *file,
*/
#define TclUtfToUniChar(str, chPtr) \
- ((((unsigned char) *(str)) < 0xC0) ? \
- ((*(chPtr) = (unsigned char) *(str)), 1) \
+ (((UCHAR(*(str))) < 0x80) ? \
+ ((*(chPtr) = UCHAR(*(str))), 1) \
: Tcl_UtfToUniChar(str, chPtr))
#define TclUtfPrev(src, start) \
(((src) < (start)+2) ? (start) : \
- ((unsigned char) *(src - 1)) < 0x80 ? (src)-1 : \
+ (UCHAR(*((src) - 1))) < 0x80 ? (src)-1 : \
Tcl_UtfPrev(src, start))
#define TclUtfNext(src) \
- ((((unsigned char) *(src)) < 0xC0) ? src + 1 : Tcl_UtfNext(src))
+ (((UCHAR(*(src))) < 0x80) ? src + 1 : Tcl_UtfNext(src))
/*
*----------------------------------------------------------------
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 7309208..03d0f3a 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -167,14 +167,13 @@ Invalid(
unsigned char byte = UCHAR(*src);
int index;
- if ((byte & 0xC3) != 0xC0) {
+ if ((byte & 0xC3) == 0xC0) {
/* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */
- return 0;
- }
- index = (byte - 0xC0) >> 1;
- if (UCHAR(src[1]) < bounds[index] || UCHAR(src[1]) > bounds[index+1]) {
- /* Out of bounds - report invalid. */
- return 1;
+ index = (byte - 0xC0) >> 1;
+ if (UCHAR(src[1]) < bounds[index] || UCHAR(src[1]) > bounds[index+1]) {
+ /* Out of bounds - report invalid. */
+ return 1;
+ }
}
return 0;
}
@@ -425,10 +424,10 @@ Tcl_UtfToUniCharDString(
Tcl_UniChar *w, *wString;
const char *p;
int oldLength;
- /* Pointer to the end of string. Never read endPtr[0] */
- const char *endPtr = src + length;
- /* Pointer to breakpoint in scan where optimization is lost */
- const char *optPtr = endPtr - TCL_UTF_MAX;
+ /* Pointer to the end of string. Never read endPtr[0] */
+ const char *endPtr;
+ /* Pointer to last byte where optimization still can be used */
+ const char *optPtr;
if (length < 0) {
length = strlen(src);
@@ -448,14 +447,15 @@ Tcl_UtfToUniCharDString(
w = wString;
p = src;
endPtr = src + length;
- optPtr = endPtr - TCL_UTF_MAX;
+ optPtr = endPtr - ((TCL_UTF_MAX > 3) ? 4 : 3) ;
while (p <= optPtr) {
p += TclUtfToUniChar(p, w);
w++;
}
while (p < endPtr) {
if (Tcl_UtfCharComplete(p, endPtr-p)) {
- p += TclUtfToUniChar(p, w++);
+ p += TclUtfToUniChar(p, w);
+ w++;
} else {
*w++ = UCHAR(*p++);
}
@@ -534,7 +534,7 @@ Tcl_NumUtfChars(
/* Pointer to the end of string. Never read endPtr[0] */
const char *endPtr = src + length;
/* Pointer to last byte where optimization still can be used */
- const char *optPtr = endPtr - TCL_UTF_MAX;
+ const char *optPtr = endPtr - ((TCL_UTF_MAX > 3) ? 4 : 3);
/*
* Optimize away the call in this loop. Justified because...
@@ -554,7 +554,7 @@ Tcl_NumUtfChars(
src += TclUtfToUniChar(src, &ch);
} else {
/*
- * src points to incomplete UTF-8 sequence
+ * src points to incomplete UTF-8 sequence
* Treat first byte as character and count it
*/
src++;
@@ -570,7 +570,7 @@ Tcl_NumUtfChars(
*
* Tcl_UtfFindFirst --
*
- * Returns a pointer to the first occurance of the given Unicode character
+ * Returns a pointer to the first occurrence of the given Unicode character
* in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
*
@@ -671,6 +671,19 @@ Tcl_UtfNext(
int left;
const char *next;
+#if TCL_UTF_MAX > 3
+ if (((*src) & 0xC0) == 0x80) {
+ /* Continuation byte, so we start 'inside' a (possibly valid) UTF-8
+ * sequence. Since we are not allowed to access src[-1], we cannot
+ * check if the sequence is actually valid, the best we can do is
+ * just assume it is valid and locate the end. */
+ if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
+ ++src;
+ }
+ return src;
+ }
+#endif
+
left = totalBytes[UCHAR(*src)];
next = src + 1;
while (--left) {
@@ -800,14 +813,13 @@ Tcl_UtfPrev(
/* Continue the search backwards... */
look--;
- } while (trailBytesSeen < TCL_UTF_MAX);
+ } while (trailBytesSeen < (TCL_UTF_MAX < 4 ? 3 : 4));
/*
- * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
+ * We've seen 3 trail bytes (4 when TCL_UTF_MAX > 3), so we know there will not be a
* properly formed byte sequence to find, and we can stop looking,
* accepting the fallback.
*/
-
return fallback;
}
diff --git a/tests/utf.test b/tests/utf.test
index e65f352..06ac329 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -3,7 +3,7 @@
# errors. No output means no errors were found.
#
# Copyright (c) 1997 Sun Microsystems, Inc.
-# Copyright (c) 1998-1999 by Scriptics Corporation.
+# Copyright (c) 1998-1999 Scriptics Corporation.
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
@@ -78,7 +78,10 @@ test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring
test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} {
expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]}
} 1
-test utf-1.13 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc testbytestring} {
+test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} {
+ expr {"\UD842" eq "\uD842"}
+} 1
+test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc testbytestring fullutf} {
expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]}
} 1
@@ -106,13 +109,19 @@ test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestrin
test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs2} {
string length [testbytestring \xF0\x90\x80\x80]
} 4
-test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs4} {
- string length [testbytestring \xF0\x90\x80\x80]
+test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {Uesc utf16} {
+ string length \U010000
+} 2
+test utf-2.8.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {Uesc ucs4} {
+ string length \U010000
} 1
test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs2} {
string length [testbytestring \xF4\x8F\xBF\xBF]
} 4
-test utf-2.9.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {Uesc ucs4} {
+test utf-2.9.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {Uesc utf16} {
+ string length \U10FFFF
+} 2
+test utf-2.9.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {Uesc ucs4} {
string length \U10FFFF
} 1
test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring {
@@ -209,15 +218,18 @@ test utf-6.7 {Tcl_UtfNext} {testutfnext testbytestring} {
test utf-6.8 {Tcl_UtfNext} {testutfnext testbytestring} {
testutfnext A[testbytestring \xF8]
} 1
-test utf-6.9 {Tcl_UtfNext} {testutfnext testbytestring} {
+test utf-6.9 {Tcl_UtfNext} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0]
} 1
test utf-6.10 {Tcl_UtfNext} {testutfnext testbytestring} {
testutfnext [testbytestring \xA0]G
} 1
-test utf-6.11 {Tcl_UtfNext} {testutfnext testbytestring} {
+test utf-6.11.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\x00]
} 1
+test utf-6.11.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\x00]
+} 2
test utf-6.12 {Tcl_UtfNext} {testutfnext testbytestring} {
testutfnext [testbytestring \xA0\xD0]
} 1
@@ -476,12 +488,18 @@ test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring u
test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring fullutf} {
testutfnext [testbytestring \xF0\x90\x80\x80]
} 4
-test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring} {
+test utf-6.88.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\x00]
} 1
-test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\x00]
+} 2
+test utf-6.89.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \x80\x80\x00]
} 1
+test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \x80\x80\x00]
+} 2
test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xF4\x8F\xBF\xBF]
} 1
@@ -491,18 +509,30 @@ test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbyte
test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring} {
testutfnext [testbytestring \xF4\x90\x80\x80]
} 1
-test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring} {
+test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0]
} 1
-test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\xA0]
+} 3
+test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \x80\x80\x80]
} 1
-test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \x80\x80\x80]
+} 3
+test utf-6.94.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0\xA0]
} 1
-test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\xA0\xA0]
+} 3
+test utf-6.95.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \x80\x80\x80\x80]
} 1
+test utf-6.95.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \x80\x80\x80\x80]
+} 3
test utf-6.96 {Tcl_UtfNext, read limits} testutfnext {
testutfnext G 0
} 0
@@ -554,10 +584,7 @@ test utf-6.111 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
test utf-6.112.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xF2\xA0\xA0\xA0]G 3
} 1
-test utf-6.112.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring utf16} {
- testutfnext [testbytestring \xF2\xA0\xA0\xA0]G 3
-} 4
-test utf-6.112.3 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs4} {
+test utf-6.112.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
testutfnext [testbytestring \xF2\xA0\xA0\xA0]G 3
} 0
test utf-6.113.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
@@ -575,10 +602,7 @@ test utf-6.115 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
test utf-6.116.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] 3
} 1
-test utf-6.116.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring utf16} {
- testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] 3
-} 4
-test utf-6.116.2 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs4} {
+test utf-6.116.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] 3
} 0
test utf-6.117.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
@@ -593,27 +617,39 @@ test utf-6.118 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
testutfnext [testbytestring \xA0]G 1
} 1
-test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0] 1
} 1
-test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0]G 2
} 1
-test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0] 2
} 1
-test utf-6.123 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.123.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0]G 3
} 1
-test utf-6.124 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.123.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\xA0]G 3
+} 3
+test utf-6.124.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0\xA0] 3
} 1
-test utf-6.125 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.124.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\xA0\xA0] 3
+} 3
+test utf-6.125.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0\xA0]G 4
} 1
-test utf-6.126 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.125.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\xA0\xA0]G 4
+} 3
+test utf-6.126.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0\xA0\xA0] 4
} 1
+test utf-6.126.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+ testutfnext [testbytestring \xA0\xA0\xA0\xA0\xA0] 4
+} 3
test utf-7.1 {Tcl_UtfPrev} testutfprev {
testutfprev {}