summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordgp <dgp@users.sourceforge.net>2020-04-17 03:54:50 (GMT)
committerdgp <dgp@users.sourceforge.net>2020-04-17 03:54:50 (GMT)
commit882fcc12b24d44674254eabaacfe15be718f3b73 (patch)
tree5f3dff164a381a0808c147760d4952550c159e2d
parentd168b44ff69105f80110a01cda37279becc68dc5 (diff)
downloadtcl-882fcc12b24d44674254eabaacfe15be718f3b73.zip
tcl-882fcc12b24d44674254eabaacfe15be718f3b73.tar.gz
tcl-882fcc12b24d44674254eabaacfe15be718f3b73.tar.bz2
Fix the bad tests utf-2.11 and utf-6.88 that expected the wrong results.
Also reconcile the merge from 8.5 with the new decoupling of byte-sequence counts from indexed code unit counts. Docs still need an update.
-rw-r--r--generic/tclUtf.c50
-rw-r--r--tests/utf.test4
2 files changed, 22 insertions, 32 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index d6ba15c..24fd418 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -589,6 +589,7 @@ Tcl_NumUtfChars(
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
+ const char *next;
register int i = 0;
/*
@@ -600,20 +601,23 @@ Tcl_NumUtfChars(
if (length < 0) {
while ((*src != '\0') && (i < INT_MAX)) {
- src = TclUtfNext(src);
- i++;
+ next = TclUtfNext(src);
+ i += 1 + ((next - src) > 3);
+ src = next;
}
} else {
register const char *endPtr = src + length - TCL_UTF_MAX;
while (src < endPtr) {
- src = TclUtfNext(src);
- i++;
+ next = TclUtfNext(src);
+ i += 1 + ((next - src) > 3);
+ src = next;
}
endPtr += TCL_UTF_MAX;
while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
- src = TclUtfNext(src);
- i++;
+ next = TclUtfNext(src);
+ i += 1 + ((next - src) > 3);
+ src = next;
}
if (src < endPtr) {
i += endPtr - src;
@@ -958,33 +962,19 @@ Tcl_UtfAtIndex(
register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
-#if 0
-/* The Tcl 8.6 implementation */
- Tcl_UniChar ch = 0;
- int len = 0;
-
while (index-- > 0) {
- len = TclUtfToUniChar(src, &ch);
- src += len;
- }
-#if TCL_UTF_MAX == 4
- if ((ch >= 0xD800) && (len < 3)) {
- /* Index points at character following high Surrogate */
- src = TclUtfToUniChar(src, &ch);
- }
-#endif
- return src;
-#else
-/* The Tcl 8.5 implementation */
- while (index > 0) {
- index--;
- src = TclUtfNext(src); /* NOTE: counts each valid byte sequence
- * as one character, maybe including those
- * that will get stored as two UCS-2 units
- * in the UTF-16 encoding. */
+ const char *next = TclUtfNext(src);
+
+ /*
+ * 4-byte sequences generate two UCS-2 code units in the
+ * UTF-16 representation, so in the current indexing scheme
+ * we need to account for an extra index (total of two).
+ */
+ index -= ((next - src) > 3);
+
+ src = next;
}
return src;
-#endif
}
/*
diff --git a/tests/utf.test b/tests/utf.test
index 76cf3fe..dd94c54 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -96,7 +96,7 @@ test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} t
} {4}
test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} testbytestring {
string length [testbytestring "\xF4\x90\x80\x80"]
-} {4}
+} {2}
test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring {
string length [testbytestring "\xF8\xA2\xA2\xA2\xA2"]
} {5}
@@ -420,7 +420,7 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext {
} 1
test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} {
testutfnext \xE8\xA0\xA0 1
-} 3
+} 2
testConstraint testutfprev [llength [info commands testutfprev]]