From 0187afa965d2276476598016ea28d8fcd96d48ea Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 7 May 2020 18:56:37 +0000
Subject: Test demonstrating bug in ticket [b2816a3afe].

---
 tests/utf.test | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/utf.test b/tests/utf.test
index 8aa3757..f48299d 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -175,6 +175,9 @@ test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars test
 test utf-4.12.2 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring utf16} {
     testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end
 } 2
+test utf-4.13 {Tcl_NumUtfChars: end of string} {testnumutfchars testbytestring} { 
+    testnumutfchars foobar[testbytestring \xF2\xC2\xA0] end
+} 8
 
 test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} {
     testfindfirst [testbytestring abcbc] 98
-- 
cgit v0.12


From 2f7461f649fe3b5d78645f8efea56d24693f1bef Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 7 May 2020 19:08:24 +0000
Subject: Fix.  Note that just because we get one positive detection of an
 incomplete character, we cannot conclude that the next byte also will be, or
 can be taken as a single byte.  At least we cannot when TCL_UTF_MAX > 3 so
 that we have room for valid two-byte sequences after incomplete sequence
 detection. No need for conditional code, just use an algorithm that always
 works.

---
 generic/tclUtf.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 4147bbc..80f3be8 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -548,17 +548,18 @@ Tcl_NumUtfChars(
 	    i++;
 	}
 	/* Loop over the remaining string where call must happen */
-	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
-	    src += TclUtfToUniChar(src, &ch);
+	while (src < endPtr) {
+	    if (Tcl_UtfCharComplete(src, endPtr - src)) {
+		src += TclUtfToUniChar(src, &ch);
+	    } else {
+		/*
+		 * src points to incomplete UTF-8 sequence 
+		 * Treat first byte as character and count it
+		 */
+		src++;
+	    }
 	    i++;
 	}
-	if (src < endPtr) {
-	    /*
-	     * String ends in an incomplete UTF-8 sequence.
-	     * Count every byte in it.
-	     */
-	    i += endPtr - src;
-	}
     }
     return i;
 }
-- 
cgit v0.12


From 9c8670e7341c883b6179637d0903cb87c5e90f26 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Thu, 7 May 2020 20:24:58 +0000
Subject: Simplify test-cases which don't need the "testbytestring" command so
 they run without it. E.g. [testbytestring \xC2\xA2] is the same as just \xA2.

---
 tests/utf.test | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/utf.test b/tests/utf.test
index f48299d..a25fde2 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -88,8 +88,8 @@ test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring {
 test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestring {
     string length [testbytestring \xC2]
 } 1
-test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring {
-    string length [testbytestring \xC2\xA2]
+test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} {
+    string length \xA2
 } 1
 test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring {
     string length [testbytestring \xE2]
@@ -135,32 +135,32 @@ test utf-3.1 {Tcl_UtfCharComplete} {
 test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars {
     testnumutfchars ""
 } 0
-test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} {
-    testnumutfchars [testbytestring \xC2\xA2]
+test utf-4.2 {Tcl_NumUtfChars: length 1} testnumutfchars {
+    testnumutfchars \xA2
 } 1
 test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} {
-    testnumutfchars [testbytestring abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E]
+    testnumutfchars abc\xA2[testbytestring \xE4\xB9\x8E\xA2\x4E]
 } 7
-test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} {
-    testnumutfchars [testbytestring \xC0\x80]
+test utf-4.4 {Tcl_NumUtfChars: #x00} testnumutfchars {
+    testnumutfchars \x00
 } 1
 test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars {
     testnumutfchars "" 0
 } 0
 test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} {
-    testnumutfchars [testbytestring \xC2\xA2] end
+    testnumutfchars \xA2 end
 } 1
 test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} {
-    testnumutfchars [testbytestring abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E] end
+    testnumutfchars abc\xA2[testbytestring \xE4\xB9\x8E\xA2\x4E] end
 } 7
-test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} {
-    testnumutfchars [testbytestring \xC0\x80] end
+test utf-4.8 {Tcl_NumUtfChars: #x00, calc len} testnumutfchars {
+    testnumutfchars \x00 end
 } 1
 # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check
 test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} {
     testnumutfchars [testbytestring \xE2\x82\xAC] end-1
 } 2
-test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} {
+test utf-4.10 {Tcl_NumUtfChars: #x00, calc len, overcomplete} {testnumutfchars testbytestring} {
     testnumutfchars [testbytestring \x00] end+1
 } 2
 test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} {
@@ -446,8 +446,8 @@ test utf-6.79.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} {
 test utf-6.79.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} {
     testutfnext [testbytestring \xF2\xA0\xA0\xA0G\xF8]
 } 4
-test utf-6.80 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} {
-    testutfnext [testbytestring \xC0\x80]
+test utf-6.80 {Tcl_UtfNext - overlong sequences} {testutfnext} {
+    testutfnext \x00
 } 2
 test utf-6.81 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} {
     testutfnext [testbytestring \xC0\x81]
@@ -873,8 +873,8 @@ test utf-7.31 {Tcl_UtfPrev -- overlong sequence}  {testutfprev testbytestring} {
 test utf-7.32 {Tcl_UtfPrev -- overlong sequence}  {testutfprev testbytestring} {
     testutfprev A[testbytestring \xF0\x80\x80\x80] 2
 } 1
-test utf-7.33 {Tcl_UtfPrev -- overlong sequence}  {testutfprev testbytestring} {
-    testutfprev A[testbytestring \xC0\x80]
+test utf-7.33 {Tcl_UtfPrev -- overlong sequence}  {testutfprev} {
+    testutfprev A\x00
 } 1
 test utf-7.34 {Tcl_UtfPrev -- overlong sequence}  {testutfprev testbytestring} {
     testutfprev A[testbytestring \xC1\x80]
-- 
cgit v0.12


From 3ec501f29bdd63cb4b75f5b369d2c24505b3a6ee Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 7 May 2020 20:31:48 +0000
Subject: Same trouble with Tcl_UtfToUniCharDString. Test and fix.

---
 generic/tclUtf.c |  9 +++++----
 tests/utf.test   | 11 ++++++++++-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 80f3be8..7309208 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -453,11 +453,12 @@ Tcl_UtfToUniCharDString(
 	p += TclUtfToUniChar(p, w);
 	w++;
     }
-    while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) {
-	p += TclUtfToUniChar(p, w++);
-    }
     while (p < endPtr) {
-	*w++ = UCHAR(*p++);
+	if (Tcl_UtfCharComplete(p, endPtr-p)) {
+	    p += TclUtfToUniChar(p, w++);
+	} else {
+	    *w++ = UCHAR(*p++);
+	}
     }
     *w = '\0';
     Tcl_DStringSetLength(dsPtr,
diff --git a/tests/utf.test b/tests/utf.test
index f48299d..5300328 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -1473,7 +1473,16 @@ UniCharCaseCmpTest > [format %c 0x10000] \uFFFF	ucs4
 UniCharCaseCmpTest > \U10000 \uFFFF		{Uesc ucs4}
 
 
-
+test utf-26.1 {Tcl_UniCharDString} -setup {
+    testobj freeallvars
+} -constraints {teststringobj} -cleanup {
+    testobj freeallvars
+} -body {
+    teststringobj set 1 foo
+    teststringobj getunicode 1
+    teststringobj append 1 [testbytestring barsoom\xF2\xC2\x80] 10
+    scan [string index [teststringobj get 1] 11] %c
+} -result 128
 
 
 unset count
-- 
cgit v0.12