summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- generic/tclUtf.c 18
-rw-r--r-- tests/utf.test 34
2 files changed, 37 insertions, 15 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index c018472..6908985 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -860,9 +860,11 @@ Tcl_UtfFindLast(
*
* Tcl_UtfNext --
*
- * Given a pointer to some current location in a UTF-8 string, move
- * forward one character. The caller must ensure that they are not asking
- * for the next character after the last character in the string.
+ * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
+ * returns a pointer to the next UTF-8 character in the string.
+ * The caller must not ask for the next character after the last
+ * character in the string if the string is not terminated by a null
+ * character.
*
* Results:
* The return value is the pointer to the next character in the UTF-8
@@ -879,7 +881,15 @@ Tcl_UtfNext(
const char *src) /* The current location in the string. */
{
Tcl_UniChar ch = 0;
- int len = TclUtfToUniChar(src, &ch);
+ int len;
+
+ if (((*src) & 0xC0) == 0x80) {
+ if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
+ ++src;
+ }
+ return src;
+ }
+ len = TclUtfToUniChar(src, &ch);
#if TCL_UTF_MAX <= 3
if ((ch >= 0xD800) && (len < 3)) {
diff --git a/tests/utf.test b/tests/utf.test
index a12cc73..9b319f3 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -140,10 +140,10 @@ test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars
testnumutfchars [testbytestring "\x00"] end+1
} {2}
test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} {
- testnumutfchars [testbytestring \xf0\x9f\x92\xa9] end-1
+ testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end-1
} {3}
test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring tip389} {
- testnumutfchars [testbytestring \xf0\x9f\x92\xa9] end
+ testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end
} {2}
test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} {
@@ -187,7 +187,7 @@ test utf-6.10 {Tcl_UtfNext} testutfnext {
} 1
test utf-6.11 {Tcl_UtfNext} testutfnext {
testutfnext \xA0\xA0
-} 1
+} 2
test utf-6.12 {Tcl_UtfNext} testutfnext {
testutfnext \xA0\xD0
} 1
@@ -246,7 +246,7 @@ test utf-6.30 {Tcl_UtfNext} testutfnext {
testutfnext \xF2
} 1
test utf-6.31 {Tcl_UtfNext} testutfnext {
- testutfnext \xF2A
+ testutfnext \xF2G
} 1
test utf-6.32 {Tcl_UtfNext} testutfnext {
testutfnext \xF2\xA0
@@ -369,7 +369,7 @@ test utf-6.71 {Tcl_UtfNext} testutfnext {
testutfnext \xF2\xA0\xA0\xE8
} 1
test utf-6.71 {Tcl_UtfNext} testutfnext {
- testutfnext \xF2\xA0\xA0\xF4
+ testutfnext \xF2\xA0\xA0\xF2
} 1
test utf-6.73 {Tcl_UtfNext} testutfnext {
testutfnext \xF2\xA0\xA0\xF8
@@ -418,22 +418,34 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext {
} 4
test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} testutfnext {
testutfnext \xA0\xA0
-} 1
+} 2
test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} testutfnext {
testutfnext \xE8\xA0\xA0 1
-} 2
+} 3
test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} testutfnext {
testutfnext \x80\x80
-} 1
+} 2
test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} testutfnext {
testutfnext \xF0\x80\x80 1
-} 2
+} 3
test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext {
testutfnext \xF4\x8F\xBF\xBF
} 4
test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext {
testutfnext \xF4\x90\x80\x80
} 1
+test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext {
+ testutfnext \xA0\xA0\xA0
+} 3
+test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext {
+ testutfnext \xF2\xA0\xA0\xA0 1
+} 4
+test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext {
+ testutfnext \x80\x80\x80
+} 3
+test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext {
+ testutfnext \xF0\x80\x80\x80 1
+} 4
test utf-7.1 {Tcl_UtfPrev} testutfprev {
testutfprev {}
@@ -861,8 +873,8 @@ test utf-12.3 {Tcl_UtfToLower} {
string tolower \xC3GH
} \xE3gh
test utf-12.4 {Tcl_UtfToLower} {
- string tolower \u01E2AB
-} \u01E3ab
+ string tolower \u01E2GH
+} \u01E3gh
test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} {
string tolower \u10D0\u1C90
} \u10D0\u10D0