From 882fcc12b24d44674254eabaacfe15be718f3b73 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Fri, 17 Apr 2020 03:54:50 +0000
Subject: Fix the bad tests utf-2.11 and utf-6.88 that expected the wrong
 results. Also reconcile the merge from 8.5 to the new decoupling of
 byte-sequence counts from indexed code unit counts. Docs still need an
 update.

---
 generic/tclUtf.c | 50 ++++++++++++++++++++------------------------------
 tests/utf.test   |  4 ++--
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index d6ba15c..24fd418 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -589,6 +589,7 @@ Tcl_NumUtfChars(
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
+    const char *next;
     register int i = 0;
 
     /*
@@ -600,20 +601,23 @@ Tcl_NumUtfChars(
 
     if (length < 0) {
 	while ((*src != '\0') && (i < INT_MAX)) {
-	    src = TclUtfNext(src);
-	    i++;
+	    next = TclUtfNext(src);
+	    i += 1 + ((next - src) > 3);
+	    src = next;
 	}
     } else {
 	register const char *endPtr = src + length - TCL_UTF_MAX;
 
 	while (src < endPtr) {
-	    src = TclUtfNext(src);
-	    i++;
+	    next = TclUtfNext(src);
+	    i += 1 + ((next - src) > 3);
+	    src = next;
 	}
 	endPtr += TCL_UTF_MAX;
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
-	    src = TclUtfNext(src);
-	    i++;
+	    next = TclUtfNext(src);
+	    i += 1 + ((next - src) > 3);
+	    src = next;
 	}
 	if (src < endPtr) {
 	    i += endPtr - src;
@@ -958,33 +962,19 @@ Tcl_UtfAtIndex(
     register const char *src,	/* The UTF-8 string. */
     register int index)		/* The position of the desired character. */
 {
-#if 0
-/* The Tcl 8.6 implementation */
-    Tcl_UniChar ch = 0;
-    int len = 0;
-
     while (index-- > 0) {
-	len = TclUtfToUniChar(src, &ch);
-	src += len;
-    }
-#if TCL_UTF_MAX == 4
-    if ((ch >= 0xD800) && (len < 3)) {
-	/* Index points at character following high Surrogate */
-	src = TclUtfToUniChar(src, &ch);
-    }
-#endif
-    return src;
-#else
-/* The Tcl 8.5 implementation */
-    while (index > 0) {
-        index--;
-        src = TclUtfNext(src);	/* NOTE: counts each valid byte sequence
-				 * as one character, maybe including those
-				 * that will get stored as two UCS-2 units
-				 * in the UTF-16 encoding. */
+        const char *next = TclUtfNext(src);
+
+	/*
+	 * 4-byte sequences generate two UCS-2 code units in the
+	 * UTF-16 representation, so in the current indexing scheme
+	 * we need to account for an extra index (total of two).
+	 */
+	index -= ((next - src) > 3);
+
+	src = next;
     }
     return src;
-#endif
 }
 
 /*
diff --git a/tests/utf.test b/tests/utf.test
index 76cf3fe..dd94c54 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -96,7 +96,7 @@ test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} t
 } {4}
 test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} testbytestring {
     string length [testbytestring "\xF4\x90\x80\x80"]
-} {4}
+} {2}
 test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring {
     string length [testbytestring "\xF8\xA2\xA2\xA2\xA2"]
 } {5}
@@ -420,7 +420,7 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext {
 } 1
 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} {
     testutfnext \xE8\xA0\xA0 1
-} 3
+} 2
 
 testConstraint testutfprev [llength [info commands testutfprev]]
 
-- 
cgit v0.12