summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- generic/tclEncoding.c 2
-rw-r--r-- generic/tclInt.h 7
-rw-r--r-- generic/tclUtf.c 58
-rw-r--r-- tests/utf.test 45
4 files changed, 71 insertions, 41 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 1584de0..5c7aab8 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2341,7 +2341,7 @@ UtfToUtfProc(
*dst++ = 0;
*chPtr = 0; /* reset surrogate handling */
src += 2;
- } else if (!TclUCS4Complete(src, srcEnd - src)) {
+ } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
* Always check before using TclUtfToUniChar. Not doing so can
* cause it to run beyond the end of the buffer! If we happen on such an
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 593d878..5c46470 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3184,8 +3184,13 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes,
const char *trim, int numTrim);
MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+/*
+ * Bytes F0-F4 are start-bytes for 4-byte sequences.
+ * Byte 0xED can be the start-byte of an upper surrogate. In that case,
+ * TclUtfToUCS4() might read the lower surrogate following it too.
+ */
# define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \
- ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length)))
+ ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length)))
MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr);
MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 9ffbfba..9375a01 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -81,6 +81,28 @@ static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1
};
+static const unsigned char complete[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#if TCL_UTF_MAX > 4
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#endif
+ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#if TCL_UTF_MAX > 3
+ 4,4,4,4,4,
+#else
+ 1,1,1,1,1,
+#endif
+ 1,1,1,1,1,1,1,1,1,1,1
+};
+
/*
* Functions used only in this module.
*/
@@ -359,8 +381,8 @@ Tcl_UniCharToUtfDString(
int
Tcl_UtfToUniChar(
- register const char *src, /* The UTF-8 string. */
- register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+ const char *src, /* The UTF-8 string. */
+ Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
* the UTF-8 string. */
{
Tcl_UniChar byte;
@@ -557,7 +579,7 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- return length >= totalBytes[UCHAR(*src)];
+ return length >= complete[UCHAR(*src)];
}
/*
@@ -580,12 +602,12 @@ Tcl_UtfCharComplete(
int
Tcl_NumUtfChars(
- register const char *src, /* The UTF-8 string to measure. */
+ const char *src, /* The UTF-8 string to measure. */
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
Tcl_UniChar ch = 0;
- register int i = 0;
+ int i = 0;
/*
* The separate implementations are faster.
@@ -601,27 +623,29 @@ Tcl_NumUtfChars(
}
if (i < 0) i = INT_MAX; /* Bug [2738427] */
} else {
- register const char *endPtr = src + length - TCL_UTF_MAX;
+ const char *endPtr = src + length - TCL_UTF_MAX;
while (src < endPtr) {
+#if TCL_UTF_MAX < 4
if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
/* treat F0 - F4 as single character */
ch = 0;
src++;
- } else {
- src += TclUtfToUniChar(src, &ch);
- }
+ } else
+#endif
+ src += TclUtfToUniChar(src, &ch);
i++;
}
endPtr += TCL_UTF_MAX;
while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
+#if TCL_UTF_MAX < 4
if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
/* treat F0 - F4 as single character */
ch = 0;
src++;
- } else {
- src += TclUtfToUniChar(src, &ch);
- }
+ } else
+#endif
+ src += TclUtfToUniChar(src, &ch);
i++;
}
if (src < endPtr) {
@@ -890,8 +914,8 @@ Tcl_UtfPrev(
Tcl_UniChar
Tcl_UniCharAtIndex(
- register const char *src, /* The UTF-8 string to dereference. */
- register int index) /* The position of the desired character. */
+ const char *src, /* The UTF-8 string to dereference. */
+ int index) /* The position of the desired character. */
{
Tcl_UniChar ch = 0;
@@ -918,8 +942,8 @@ Tcl_UniCharAtIndex(
const char *
Tcl_UtfAtIndex(
- register const char *src, /* The UTF-8 string. */
- register int index) /* The position of the desired character. */
+ const char *src, /* The UTF-8 string. */
+ int index) /* The position of the desired character. */
{
Tcl_UniChar ch = 0;
int len = 0;
@@ -1191,7 +1215,7 @@ TclpUtfNcmp2(
* fine in the strcmp manner.
*/
- register int result = 0;
+ int result = 0;
for ( ; numBytes != 0; numBytes--, cs++, ct++) {
if (*cs != *ct) {
diff --git a/tests/utf.test b/tests/utf.test
index 0929801..50351cb 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -29,6 +29,7 @@ testConstraint pre388 [eq \x741 A]
testConstraint pairsTo4bytes [expr {[llength [info commands teststringbytes]]
&& [string length [teststringbytes \uD83D\uDCA9]] == 4}]
+testConstraint teststringbytes [llength [info commands teststringbytes]]
testConstraint testbytestring [llength [info commands testbytestring]]
testConstraint testfindfirst [llength [info commands testfindfirst]]
testConstraint testfindlast [llength [info commands testfindlast]]
@@ -501,7 +502,7 @@ test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {
} 1
test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} {
testutfnext \x80\x80\x80
-} 1
+} 3
test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
testutfnext \xA0\xA0\xA0\xA0
} 1
@@ -601,18 +602,18 @@ test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
test utf-6.118 {Tcl_UtfNext, read limits} testutfnext {
testutfnext \xA0G 0
} 0
-test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+test utf-6.119 {Tcl_UtfNext, read limits} testutfnext {
testutfnext \xA0G 1
-} 1
-test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.120 {Tcl_UtfNext, read limits} testutfnext {
testutfnext \xA0\xA0 1
-} 1
-test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.121 {Tcl_UtfNext, read limits} testutfnext {
testutfnext \xA0\xA0G 2
-} 1
-test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.122 {Tcl_UtfNext, read limits} testutfnext {
testutfnext \xA0\xA0\xA0 2
-} 1
+} 0
test utf-6.123 {Tcl_UtfNext, read limits} testutfnext {
testutfnext \xA0\xA0\xA0G 3
} 1
@@ -990,9 +991,9 @@ test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 {
test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 {
string index \uD842 0
} "\uD842"
-test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 {
- string index \uD842 0
-} "\uD842"
+test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes tip389} {
+ teststringbytes [string index \uD842 0]
+} \xF0
test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} {
string index \uDC42 0
} "\uDC42"
@@ -1002,18 +1003,18 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
string index \uD83D\uDE00G 0
} "\U1F600"
-test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
- string index \uD83D\uDE00G 0
-} "\U1F600"
+test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} {
+ teststringbytes [string index \uD83D\uDE00G 0]
+} \xF0
test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
string index \uD83D\uDE00G 1
} "\uDE00"
test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
string index \uD83D\uDE00G 1
} G
-test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
- string index \uD83D\uDE00G 1
-} {}
+test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} {
+ teststringbytes [string index \uD83D\uDE00G 1]
+} \xED\xB8\x80
test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
string index \uD83D\uDE00G 2
} G
@@ -1029,9 +1030,9 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
string index \U1F600G 0
} "\U1F600"
-test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
- string index \U1F600G 0
-} "\U1F600"
+test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc teststringbytes tip389} {
+ teststringbytes [string index \U1F600G 0]
+} \xF0
test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
string index \U1F600G 1
} G
@@ -1040,7 +1041,7 @@ test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
} G
test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
string index \U1F600G 1
-} {}
+} \uDE00
test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
string index \U1F600G 2
} {}