diff options
| -rw-r--r-- | generic/tclUtf.c | 18 | ||||
| -rw-r--r-- | tests/utf.test | 34 |
2 files changed, 37 insertions, 15 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c018472..6908985 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -860,9 +860,11 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * Given a pointer to some current location in a UTF-8 string, move - * forward one character. The caller must ensure that they are not asking - * for the next character after the last character in the string. + * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext + * returns a pointer to the next UTF-8 character in the string. + * The caller must not ask for the next character after the last + * character in the string if the string is not terminated by a null + * character. * * Results: * The return value is the pointer to the next character in the UTF-8 @@ -879,7 +881,15 @@ Tcl_UtfNext( const char *src) /* The current location in the string. */ { Tcl_UniChar ch = 0; - int len = TclUtfToUniChar(src, &ch); + int len; + + if (((*src) & 0xC0) == 0x80) { + if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) { + ++src; + } + return src; + } + len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX <= 3 if ((ch >= 0xD800) && (len < 3)) { diff --git a/tests/utf.test b/tests/utf.test index a12cc73..9b319f3 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -140,10 +140,10 @@ test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testnumutfchars [testbytestring "\x00"] end+1 } {2} test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xf0\x9f\x92\xa9] end-1 + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end-1 } {3} test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring tip389} { - testnumutfchars [testbytestring \xf0\x9f\x92\xa9] end + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } {2} test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { @@ -187,7 +187,7 @@ test utf-6.10 {Tcl_UtfNext} testutfnext { } 1 test utf-6.11 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xA0 -} 1 +} 2 test utf-6.12 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xD0 } 1 @@ -246,7 +246,7 @@ test utf-6.30 {Tcl_UtfNext} testutfnext { testutfnext \xF2 } 1 test utf-6.31 {Tcl_UtfNext} testutfnext { - testutfnext \xF2A + testutfnext \xF2G } 1 test utf-6.32 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0 @@ -369,7 +369,7 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xE8 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xF4 + testutfnext \xF2\xA0\xA0\xF2 } 1 test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xF8 @@ -418,22 +418,34 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} testutfnext { testutfnext \xA0\xA0 -} 1 +} 2 test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} testutfnext { testutfnext \xE8\xA0\xA0 1 -} 2 +} 3 test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} testutfnext { testutfnext \x80\x80 -} 1 +} 2 test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} testutfnext { testutfnext \xF0\x80\x80 1 -} 2 +} 3 test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x8F\xBF\xBF } 4 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x90\x80\x80 } 1 +test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { + testutfnext \xA0\xA0\xA0 +} 3 +test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { + testutfnext \xF2\xA0\xA0\xA0 1 +} 4 +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext \x80\x80\x80 +} 3 +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext \xF0\x80\x80\x80 1 +} 4 test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} @@ -861,8 +873,8 @@ test utf-12.3 {Tcl_UtfToLower} { string tolower \xC3GH } \xE3gh test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01E2AB -} \u01E3ab + string tolower \u01E2GH +} \u01E3gh test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} { string tolower \u10D0\u1C90 } \u10D0\u10D0 |
