diff options
| -rw-r--r-- | generic/tclCompExpr.c | 5 | ||||
| -rw-r--r-- | generic/tclUtf.c | 38 | ||||
| -rw-r--r-- | tests/utf.test | 6 |
3 files changed, 16 insertions, 33 deletions
diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index ed4e958..4390282 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1885,6 +1885,7 @@ ParseLexeme( { const char *end; int scanned; + Tcl_UniChar ch; Tcl_Obj *literal = NULL; unsigned char byte; @@ -2063,13 +2064,13 @@ ParseLexeme( if (!TclIsBareword(*start) || *start == '_') { if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = TclUtfNext(start) - start; + scanned = Tcl_UtfToUniChar(start, &ch); } else { char utfBytes[TCL_UTF_MAX]; memcpy(utfBytes, start, (size_t) numBytes); utfBytes[numBytes] = '\0'; - scanned = TclUtfNext(utfBytes) - utfBytes; + scanned = Tcl_UtfToUniChar(utfBytes, &ch); } *lexemePtr = INVALID; Tcl_DecrRefCount(literal); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8d6d86b..53d51e5 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -579,7 +579,7 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { - const char *next; + Tcl_UniChar ch; register int i = 0; /* @@ -591,35 +591,20 @@ Tcl_NumUtfChars( if (length < 0) { while ((*src != '\0') && (i < INT_MAX)) { - next = TclUtfNext(src); -#if TCL_UTF_MAX > 4 + src += TclUtfToUniChar(src, &ch); i++; -#else - i += 1 + ((next - src) > 3); -#endif - src = next; } } else { register const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { - next = TclUtfNext(src); -#if TCL_UTF_MAX > 4 + src += TclUtfToUniChar(src, &ch); i++; -#else - i += 1 + ((next - src) > 3); -#endif - src = next; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - next = TclUtfNext(src); -#if TCL_UTF_MAX > 4 + src += TclUtfToUniChar(src, &ch); i++; -#else - i += 1 + ((next - src) > 3); -#endif - src = next; } if (src < endPtr) { i += endPtr - src; @@ -946,19 +931,10 @@ Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { - while (index-- > 0) { - const char *next = TclUtfNext(src); + Tcl_UniChar ch; -#if TCL_UTF_MAX <= 4 - /* - * 4-byte sequences generate two UCS-2 code units in the - * UTF-16 representation, so in the current indexing scheme - * we need to account for an extra index (total of two). - */ - index -= ((next - src) > 3); -#endif - - src = next; + while (index-- > 0) { + src += TclUtfToUniChar(src, &ch); } return src; } diff --git a/tests/utf.test b/tests/utf.test index 84f3f38..faa9ee9 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -470,6 +470,12 @@ test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testu test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { testutfnext -bytestring \x80\x80\x80 } 1 +test utf-6.125 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { + testutfnext \xA0\xA0\xA0\xA0 +} 1 +test utf-6.126 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { + testutfnext \x80\x80\x80\x80 +} 1 test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} |
