From 882fcc12b24d44674254eabaacfe15be718f3b73 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 03:54:50 +0000 Subject: Fix the bad tests utf-2.11 and utf-6.88 that expected the wrong results. Also reconcile the merge from 8.5 to the new decoupling of byte-sequence counts from indexed code unit counts. Docs still need an update. --- generic/tclUtf.c | 50 ++++++++++++++++++++------------------------------ tests/utf.test | 4 ++-- 2 files changed, 22 insertions(+), 32 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index d6ba15c..24fd418 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -589,6 +589,7 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { + const char *next; register int i = 0; /* @@ -600,20 +601,23 @@ Tcl_NumUtfChars( if (length < 0) { while ((*src != '\0') && (i < INT_MAX)) { - src = TclUtfNext(src); - i++; + next = TclUtfNext(src); + i += 1 + ((next - src) > 3); + src = next; } } else { register const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { - src = TclUtfNext(src); - i++; + next = TclUtfNext(src); + i += 1 + ((next - src) > 3); + src = next; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src = TclUtfNext(src); - i++; + next = TclUtfNext(src); + i += 1 + ((next - src) > 3); + src = next; } if (src < endPtr) { i += endPtr - src; @@ -958,33 +962,19 @@ Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. 
*/ { -#if 0 -/* The Tcl 8.6 implementation */ - Tcl_UniChar ch = 0; - int len = 0; - while (index-- > 0) { - len = TclUtfToUniChar(src, &ch); - src += len; - } -#if TCL_UTF_MAX == 4 - if ((ch >= 0xD800) && (len < 3)) { - /* Index points at character following high Surrogate */ - src = TclUtfToUniChar(src, &ch); - } -#endif - return src; -#else -/* The Tcl 8.5 implementation */ - while (index > 0) { - index--; - src = TclUtfNext(src); /* NOTE: counts each valid byte sequence - * as one character, maybe including those - * that will get stored as two UCS-2 units - * in the UTF-16 encoding. */ + const char *next = TclUtfNext(src); + + /* + * 4-byte sequences generate two UCS-2 code units in the + * UTF-16 representation, so in the current indexing scheme + * we need to account for an extra index (total of two). + */ + index -= ((next - src) > 3); + + src = next; } return src; -#endif } /* diff --git a/tests/utf.test b/tests/utf.test index 76cf3fe..dd94c54 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -96,7 +96,7 @@ test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} t } {4} test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} testbytestring { string length [testbytestring "\xF4\x90\x80\x80"] -} {4} +} {2} test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring { string length [testbytestring "\xF8\xA2\xA2\xA2\xA2"] } {5} @@ -420,7 +420,7 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { } 1 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xE8\xA0\xA0 1 -} 3 +} 2 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12