diff options
-rw-r--r-- | generic/tclUtf.c | 38 |
-rw-r--r-- | tests/utf.test | 22 |
2 files changed, 29 insertions, 31 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1ba474e..aa949ca 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -716,31 +716,11 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * The aim of this routine is to provide a way to iterate forward - * through a UTF-8 string. The caller is expected to pass a non-NULL - * pointer argument /src/ which points to a location within a string. - * (*src) will be read, so /src/ must not point to an unreadable - * location past the end of the string. If /src/ points to the - * beginning of a complete, well-formed and valid UTF_8 byte sequence - * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer - * just past the end of that sequence. In any other circumstance, - * Tcl_UtfNext returns /src/+1. - * - * Because this routine always returns a value > /src/, it is useful - * as a forward iterator that will always make progress. If the string - * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating - * NUL character. If it is not NUL-terminated, the caller must make - * use of the companion routine Tcl_UtfCharComplete to test whether - * there is risk that Tcl_UtfNext will read beyond the end of the string. - * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. - * - * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfNext calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext + * returns a pointer to the next UTF-8 character in the string. + * The caller must not ask for the next character after the last + * character in the string if the string is not terminated by a null + * character. 
* * Results: * A pointer to the start of the next character in the string (or to @@ -760,13 +740,19 @@ Tcl_UtfNext( int left = totalBytes[byte]; const char *next = src + 1; + if (((*src) & 0xC0) == 0x80) { + if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) { + ++src; + } + return src; + } + while (--left) { byte = *((unsigned char *) next); if ((byte & 0xC0) != 0x80) { /* * src points to non-trail byte; We ran out of trail bytes * before the needs of the lead byte were satisfied. - * Let the (malformed) lead byte alone be a character */ return src + 1; } diff --git a/tests/utf.test b/tests/utf.test index 0ba2b85..f56fabc 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -189,7 +189,7 @@ test utf-6.10 {Tcl_UtfNext} testutfnext { } 1 test utf-6.11 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xA0 -} 1 +} 2 test utf-6.12 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xD0 } 1 @@ -420,18 +420,30 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext} { } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xA0\xA0 -} 1 +} 2 test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xE8\xA0\xA0 1 -} 2 +} 3 test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \x80\x80 -} 1 +} 2 test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 -} 2 +} 3 testConstraint testutfprev [llength [info commands testutfprev]] +test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { + testutfnext \xA0\xA0\xA0 +} 3 +test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { + testutfnext \xF2\xA0\xA0\xA0 1 +} 4 +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext \x80\x80\x80 +} 3 +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 
4-byte invalid sequence} testutfnext { + testutfnext \xF0\x80\x80\x80 1 +} 4 test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} |