From cc90266615cb98853ebc61301119d1f7a3718d7a Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Sat, 2 May 2020 10:15:29 +0000
Subject: More fixes for [ed29806baf]. Not working yet. WIP

---
 generic/tclEncoding.c |  2 +-
 generic/tclInt.h      |  7 ++++++-
 generic/tclUtf.c      | 58 ++++++++++++++++++++++++++++++++++++---------------
 tests/utf.test        | 45 ++++++++++++++++++++-------------------
 4 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 1584de0..5c7aab8 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2341,7 +2341,7 @@ UtfToUtfProc(
 	    *dst++ = 0;
 	    *chPtr = 0; /* reset surrogate handling */
 	    src += 2;
-	} else if (!TclUCS4Complete(src, srcEnd - src)) {
+	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
 	    /*
 	     * Always check before using TclUtfToUniChar. Not doing can so
 	     * cause it run beyond the end of the buffer! If we happen such an
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 593d878..5c46470 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3184,8 +3184,13 @@ MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
 MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
 MODULE_SCOPE int	TclUtfToUCS4(const char *src, int *ucs4Ptr);
+/*
+ * Bytes F0-F4 are start-bytes for 4-byte sequences.
+ * Byte 0xED can be the start-byte of an upper surrogate. In that case,
+ * TclUtfToUCS4() might read the lower surrogate following it too.
+ */
 #   define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \
-	    ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length)))
+	    ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length)))
 MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
 MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
 MODULE_SCOPE int	TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 9ffbfba..9375a01 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -81,6 +81,28 @@ static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1
 };
 
+static const unsigned char complete[256] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#if TCL_UTF_MAX > 4
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#endif
+    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#if TCL_UTF_MAX > 3
+    4,4,4,4,4,
+#else
+    1,1,1,1,1,
+#endif
+    1,1,1,1,1,1,1,1,1,1,1
+};
+
 /*
  * Functions used only in this module.
  */
@@ -359,8 +381,8 @@ Tcl_UniCharToUtfDString(
 
 int
 Tcl_UtfToUniChar(
-    register const char *src,	/* The UTF-8 string. */
-    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+    const char *src,	/* The UTF-8 string. */
+    Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
 				 * the UTF-8 string. */
 {
     Tcl_UniChar byte;
@@ -557,7 +579,7 @@ Tcl_UtfCharComplete(
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
-    return length >= totalBytes[UCHAR(*src)];
+    return length >= complete[UCHAR(*src)];
 }
 
 /*
@@ -580,12 +602,12 @@ Tcl_UtfCharComplete(
 
 int
 Tcl_NumUtfChars(
-    register const char *src,	/* The UTF-8 string to measure. */
+    const char *src,	/* The UTF-8 string to measure. */
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
     Tcl_UniChar ch = 0;
-    register int i = 0;
+    int i = 0;
 
     /*
      * The separate implementations are faster.
@@ -601,27 +623,29 @@ Tcl_NumUtfChars(
 	}
 	if (i < 0) i = INT_MAX; /* Bug [2738427] */
     } else {
-	register const char *endPtr = src + length - TCL_UTF_MAX;
+	const char *endPtr = src + length - TCL_UTF_MAX;
 
 	while (src < endPtr) {
+#if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
 		/* treat F0 - F4 as single character */
 		ch = 0;
 		src++;
-	    } else {
-		src += TclUtfToUniChar(src, &ch);
-	    }
+	    } else
+#endif
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
 	endPtr += TCL_UTF_MAX;
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
+#if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
 		/* treat F0 - F4 as single character */
 		ch = 0;
 		src++;
-	    } else {
-		src += TclUtfToUniChar(src, &ch);
-	    }
+	    } else
+#endif
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
 	if (src < endPtr) {
@@ -890,8 +914,8 @@ Tcl_UtfPrev(
 
 Tcl_UniChar
 Tcl_UniCharAtIndex(
-    register const char *src,	/* The UTF-8 string to dereference. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string to dereference. */
+    int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch = 0;
 
@@ -918,8 +942,8 @@ Tcl_UniCharAtIndex(
 
 const char *
 Tcl_UtfAtIndex(
-    register const char *src,	/* The UTF-8 string. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string. */
+    int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch = 0;
     int len = 0;
@@ -1191,7 +1215,7 @@ TclpUtfNcmp2(
      * fine in the strcmp manner.
      */
 
-    register int result = 0;
+    int result = 0;
 
     for ( ; numBytes != 0; numBytes--, cs++, ct++) {
 	if (*cs != *ct) {
diff --git a/tests/utf.test b/tests/utf.test
index 0929801..50351cb 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -29,6 +29,7 @@ testConstraint pre388 [eq \x741 A]
 testConstraint pairsTo4bytes [expr {[llength [info commands teststringbytes]]
 		&& [string length [teststringbytes \uD83D\uDCA9]] == 4}]
 
+testConstraint teststringbytes [llength [info commands teststringbytes]]
 testConstraint testbytestring [llength [info commands testbytestring]]
 testConstraint testfindfirst [llength [info commands testfindfirst]]
 testConstraint testfindlast [llength [info commands testfindlast]]
@@ -501,7 +502,7 @@ test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {
 } 1
 test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} {
     testutfnext \x80\x80\x80
-} 1
+} 3
 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0\xA0
 } 1
@@ -601,18 +602,18 @@ test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
 test utf-6.118 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0G 0
 } 0
-test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+test utf-6.119 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0G 1
-} 1
-test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.120 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0 1
-} 1
-test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.121 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0G 2
-} 1
-test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.122 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0 2
-} 1
+} 0
 test utf-6.123 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0G 3
 } 1
@@ -990,9 +991,9 @@ test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 {
 test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 {
     string index \uD842 0
 } "\uD842"
-test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 {
-    string index \uD842 0
-} "\uD842"
+test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes tip389} {
+    teststringbytes [string index \uD842 0]
+} \xF0
 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} {
     string index \uDC42 0
 } "\uDC42"
@@ -1002,18 +1003,18 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
 test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 0
 } "\U1F600"
-test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
-    string index \uD83D\uDE00G 0
-} "\U1F600"
+test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} {
+    teststringbytes [string index \uD83D\uDE00G 0]
+} \xF0
 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 1
 } "\uDE00"
 test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 1
 } G
-test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
-    string index \uD83D\uDE00G 1
-} {}
+test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} {
+     teststringbytes [string index \uD83D\uDE00G 1]
+} \xED\xB8\x80
 test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 2
 } G
@@ -1029,9 +1030,9 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
 test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
     string index \U1F600G 0
 } "\U1F600"
-test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
-    string index \U1F600G 0
-} "\U1F600"
+test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc teststringbytes tip389} {
+    teststringbytes [string index \U1F600G 0]
+} \xF0
 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
     string index \U1F600G 1
 } G
@@ -1040,7 +1041,7 @@ test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
 } G
 test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
     string index \U1F600G 1
-} {}
+} \uDE00
 test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
     string index \U1F600G 2
 } {}
-- 
cgit v0.12


From 797bc84ed2bc1f24a8adc1e42a91ef90d2c0c91f Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Sat, 2 May 2020 21:54:14 +0000
Subject: Seems almost correct. Still a problem with "string index" for
 TCL_UTF_MAX>3

---
 generic/tclUtf.c |  25 +++++-------
 tests/utf.test   | 115 +++++++++++++++++++++++++++----------------------------
 2 files changed, 66 insertions(+), 74 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 9375a01..03a7ca9 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -64,13 +64,10 @@ static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-#if TCL_UTF_MAX != 4
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
+/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#endif
+/* End of "continuation byte section" */
     2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
 #if TCL_UTF_MAX > 3
@@ -80,7 +77,7 @@ static const unsigned char totalBytes[256] = {
 #endif
     1,1,1,1,1,1,1,1,1,1,1
 };
-
+
 static const unsigned char complete[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -95,7 +92,7 @@ static const unsigned char complete[256] = {
 #endif
     2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
     4,4,4,4,4,
 #else
     1,1,1,1,1,
@@ -626,26 +623,24 @@ Tcl_NumUtfChars(
 	const char *endPtr = src + length - TCL_UTF_MAX;
 
 	while (src < endPtr) {
-#if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
 		/* treat F0 - F4 as single character */
 		ch = 0;
 		src++;
-	    } else
-#endif
-	    src += TclUtfToUniChar(src, &ch);
+	    } else {
+		src += TclUtfToUniChar(src, &ch);
+	    }
 	    i++;
 	}
 	endPtr += TCL_UTF_MAX;
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
-#if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
 		/* treat F0 - F4 as single character */
 		ch = 0;
 		src++;
-	    } else
-#endif
-	    src += TclUtfToUniChar(src, &ch);
+	    } else {
+		src += TclUtfToUniChar(src, &ch);
+	    }
 	    i++;
 	}
 	if (src < endPtr) {
diff --git a/tests/utf.test b/tests/utf.test
index 50351cb..71b4978 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -398,7 +398,7 @@ test utf-6.68 {Tcl_UtfNext} testutfnext {
 test utf-6.69.0 {Tcl_UtfNext} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0
 } 1
-test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} {
+test utf-6.69.1 {Tcl_UtfNext} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0
 } 4
 test utf-6.70 {Tcl_UtfNext} testutfnext {
@@ -416,37 +416,37 @@ test utf-6.73 {Tcl_UtfNext} testutfnext {
 test utf-6.74.0 {Tcl_UtfNext} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0G
 } 1
-test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} {
+test utf-6.74.1 {Tcl_UtfNext} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0G
 } 4
 test utf-6.75.0 {Tcl_UtfNext} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xA0
 } 1
-test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} {
+test utf-6.75.1 {Tcl_UtfNext} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xA0
 } 4
 test utf-6.76.0 {Tcl_UtfNext} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xD0
 } 1
-test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} {
+test utf-6.76.1 {Tcl_UtfNext} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xD0
 } 4
 test utf-6.77.0 {Tcl_UtfNext} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xE8
 } 1
-test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} {
+test utf-6.77.1 {Tcl_UtfNext} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xE8
 } 4
 test utf-6.78.0 {Tcl_UtfNext} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xF2
 } 1
-test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} {
+test utf-6.78.1 {Tcl_UtfNext} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xF2
 } 4
 test utf-6.79.0 {Tcl_UtfNext} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0G\xF8
 } 1
-test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} {
+test utf-6.79.1 {Tcl_UtfNext} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0G\xF8
 } 4
 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext {
@@ -473,7 +473,7 @@ test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext {
 test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext ucs2} {
     testutfnext \xF0\x90\x80\x80
 } 1
-test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} {
+test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext ucs4} {
     testutfnext \xF0\x90\x80\x80
 } 4
 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} testutfnext {
@@ -485,7 +485,7 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} tes
 test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} {
     testutfnext \xF4\x8F\xBF\xBF
 } 1
-test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} {
+test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs4} {
     testutfnext \xF4\x8F\xBF\xBF
 } 4
 test utf-6.91.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} {
@@ -497,12 +497,9 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf}
 test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0
 } 1
-test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} {
+test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext {
     testutfnext \x80\x80\x80
 } 1
-test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} {
-    testutfnext \x80\x80\x80
-} 3
 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0\xA0
 } 1
@@ -554,64 +551,64 @@ test utf-6.109 {Tcl_UtfNext, read limits} testutfnext {
 test utf-6.110.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0G 1
 } 1
-test utf-6.110.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.110.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0G 1
 } 0
 test utf-6.111.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0G 2
 } 1
-test utf-6.111.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.111.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0G 2
 } 0
 test utf-6.112.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0G 3
 } 1
-test utf-6.112.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.112.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0G 3
 } 0
 test utf-6.113.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0G 4
 } 1
-test utf-6.113.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.113.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0G 4
 } 4
 test utf-6.114.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 1
 } 1
-test utf-6.114.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.114.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 1
 } 0
 test utf-6.115.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 2
 } 1
-test utf-6.115.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.115.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 2
 } 0
 test utf-6.116.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 3
 } 1
-test utf-6.116.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.116.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 3
 } 0
 test utf-6.117.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 4
 } 1
-test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
+test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} {
     testutfnext \xF2\xA0\xA0\xA0\xA0 4
 } 4
 test utf-6.118 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0G 0
 } 0
-test utf-6.119 {Tcl_UtfNext, read limits} testutfnext {
+test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xA0G 1
 } 0
-test utf-6.120 {Tcl_UtfNext, read limits} testutfnext {
+test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xA0\xA0 1
 } 0
-test utf-6.121 {Tcl_UtfNext, read limits} testutfnext {
+test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xA0\xA0G 2
 } 0
-test utf-6.122 {Tcl_UtfNext, read limits} testutfnext {
+test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xA0\xA0\xA0 2
 } 0
 test utf-6.123 {Tcl_UtfNext, read limits} testutfnext {
@@ -693,19 +690,19 @@ test utf-7.9.2 {Tcl_UtfPrev} testutfprev {
 test utf-7.10.0 {Tcl_UtfPrev} {testutfprev ucs2} {
     testutfprev A\xF2\xA0
 } 2
-test utf-7.10.1 {Tcl_UtfPrev} {testutfprev fullutf} {
+test utf-7.10.1 {Tcl_UtfPrev} {testutfprev ucs4} {
     testutfprev A\xF2\xA0
 } 1
 test utf-7.10.2 {Tcl_UtfPrev} {testutfprev ucs2} {
     testutfprev A\xF2\xA0\xA0\xA0 3
 } 2
-test utf-7.10.3 {Tcl_UtfPrev} {testutfprev fullutf} {
+test utf-7.10.3 {Tcl_UtfPrev} {testutfprev ucs4} {
     testutfprev A\xF2\xA0\xA0\xA0 3
 } 1
 test utf-7.10.4 {Tcl_UtfPrev} {testutfprev ucs2} {
     testutfprev A\xF2\xA0\xF8\xA0 3
 } 2
-test utf-7.10.5 {Tcl_UtfPrev} {testutfprev fullutf} {
+test utf-7.10.5 {Tcl_UtfPrev} {testutfprev ucs4} {
     testutfprev A\xF2\xA0\xF8\xA0 3
 } 1
 test utf-7.11 {Tcl_UtfPrev} testutfprev {
@@ -750,19 +747,19 @@ test utf-7.14.2 {Tcl_UtfPrev} testutfprev {
 test utf-7.15.0 {Tcl_UtfPrev} {testutfprev ucs2} {
     testutfprev A\xF2\xA0\xA0
 } 3
-test utf-7.15.1 {Tcl_UtfPrev} {testutfprev fullutf} {
+test utf-7.15.1 {Tcl_UtfPrev} {testutfprev ucs4} {
     testutfprev A\xF2\xA0\xA0
 } 1
 test utf-7.15.1.0 {Tcl_UtfPrev} {testutfprev ucs2} {
     testutfprev A\xF2\xA0\xA0\xA0 4
 } 3
-test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev fullutf} {
+test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev ucs4} {
     testutfprev A\xF2\xA0\xA0\xA0 4
 } 1
 test utf-7.15.2.0 {Tcl_UtfPrev} {testutfprev ucs2} {
     testutfprev A\xF2\xA0\xA0\xF8 4
 } 3
-test utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev fullutf} {
+test utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev ucs4} {
     testutfprev A\xF2\xA0\xA0\xF8 4
 } 1
 test utf-7.16 {Tcl_UtfPrev} testutfprev {
@@ -888,19 +885,19 @@ test utf-7.38 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
 test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence}  {testutfprev ucs2} {
     testutfprev A\xF0\x90\x80\x80
 } 2
-test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence}  {testutfprev fullutf} {
+test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence}  {testutfprev ucs4} {
     testutfprev A\xF0\x90\x80\x80
 } 1
 test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence}  {testutfprev ucs2} {
     testutfprev A\xF0\x90\x80\x80 4
 } 3
-test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence}  {testutfprev fullutf} {
+test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence}  {testutfprev ucs4} {
     testutfprev A\xF0\x90\x80\x80 4
 } 1
 test utf-7.41.0 {Tcl_UtfPrev -- overlong sequence}  {testutfprev ucs2} {
     testutfprev A\xF0\x90\x80\x80 3
 } 2
-test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence}  {testutfprev fullutf} {
+test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence}  {testutfprev ucs4} {
     testutfprev A\xF0\x90\x80\x80 3
 } 1
 test utf-7.42 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
@@ -933,19 +930,19 @@ test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} t
 test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} {
     testutfprev A\xF4\x8F\xBF\xBF
 } 2
-test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} {
+test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs4} {
     testutfprev A\xF4\x8F\xBF\xBF
 } 1
 test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} {
     testutfprev A\xF4\x8F\xBF\xBF 4
 } 3
-test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} {
+test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs4} {
     testutfprev A\xF4\x8F\xBF\xBF 4
 } 1
 test utf-7.48.4 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} {
     testutfprev A\xF4\x8F\xBF\xBF 3
 } 2
-test utf-7.48.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} {
+test utf-7.48.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs4} {
     testutfprev A\xF4\x8F\xBF\xBF 3
 } 1
 test utf-7.48.6 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
@@ -978,37 +975,37 @@ test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
 } a
 test utf-8.2 {Tcl_UniCharAtIndex: index = 0} {
     string index \u4E4E\u25A 0
-} "\u4E4E"
+} \u4E4E
 test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
     string index abcd 2
 } c
 test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
     string index \u4E4E\u25A\xFF\u543 2
-} "\uFF"
+} \uFF
 test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 {
     string index \uD842 0
-} "\uD842"
-test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 {
-    string index \uD842 0
-} "\uD842"
+} \uD842
+test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes ucs4} {
+    teststringbytes [string index \uD842 0]
+} \xF0
 test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes tip389} {
     teststringbytes [string index \uD842 0]
 } \xF0
 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} {
     string index \uDC42 0
-} "\uDC42"
+} \uDC42
 test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 0
-} "\uD83D"
+} \uD83D
 test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 0
-} "\U1F600"
+} \U1F600
 test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} {
     teststringbytes [string index \uD83D\uDE00G 0]
 } \xF0
 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 1
-} "\uDE00"
+} \uDE00
 test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 1
 } G
@@ -1026,10 +1023,10 @@ test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
 } G
 test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
     string index \U1F600G 0
-} "\uFFFD"
+} \uFFFD
 test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
     string index \U1F600G 0
-} "\U1F600"
+} \U1F600
 test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc teststringbytes tip389} {
     teststringbytes [string index \U1F600G 0]
 } \xF0
@@ -1057,22 +1054,22 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
 } abc
 test utf-9.2 {Tcl_UtfAtIndex: index > 0} {
     string range \u4E4E\u25A\xFF\u543klmnop 1 5
-} "\u25A\xFF\u543kl"
+} \u25A\xFF\u543kl
 test utf-9.3.0 {Tcl_UtfAtIndex: index = 0, Emoji} ucs2 {
     string range \uD83D\uDE00G 0 0
-} "\uD83D"
+} \uD83D
 test utf-9.3.1 {Tcl_UtfAtIndex: index = 0, Emoji} ucs4 {
     string range \uD83D\uDE00G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} tip389 {
     string range \uD83D\uDE00G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.4.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 {
     string range \uD83D\uDE00G 1 1
-} "\uDE00"
+} \uDE00
 test utf-9.4.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 {
     string range \uD83D\uDE00G 1 1
-} "G"
+} G
 test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 {
     string range \uD83D\uDE00G 1 1
 } {}
@@ -1087,19 +1084,19 @@ test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 {
 } G
 test utf-9.6.0 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs2} {
     string range \U1f600G 0 0
-} "\uFFFD"
+} \uFFFD
 test utf-9.6.1 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs4} {
     string range \U1f600G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc tip389} {
     string range \U1f600G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.7.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} {
     string range \U1f600G 1 1
 } G
 test utf-9.7.1 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs4} {
     string range \U1f600G 1 1
-} "G"
+} G
 test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc tip389} {
     string range \U1f600G 1 1
 } {}
-- 
cgit v0.12


From 413ea81a284c691dc5ed4ad48217370ce83f65f7 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Mon, 4 May 2020 14:17:23 +0000
Subject: More progress/simplification

---
 generic/tclUtf.c | 23 ++---------------------
 tests/utf.test   | 33 ++++++++++++---------------------
 2 files changed, 14 insertions(+), 42 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index c4b5305..b964b7e 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -70,25 +70,6 @@ static const unsigned char totalBytes[256] = {
 /* End of "continuation byte section" */
     2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#if TCL_UTF_MAX > 3
-    4,4,4,4,4,
-#else
-    1,1,1,1,1,
-#endif
-    1,1,1,1,1,1,1,1,1,1,1
-};
-
-static const unsigned char complete[256] = {
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-/* End of "continuation byte section" */
-    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
 #if TCL_UTF_MAX > 4
     4,4,4,4,4,
 #else
@@ -183,7 +164,7 @@ Invalid(
     unsigned char byte = *src;
     int index;
 
-    if (byte % 0x04) {
+    if ((byte & 0xC3) != 0xC0) {
 	/* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */
 	return 0;
     }
@@ -573,7 +554,7 @@ Tcl_UtfCharComplete(
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
-    return length >= complete[UCHAR(*src)];
+    return length >= totalBytes[UCHAR(*src)];
 }
 
 /*
diff --git a/tests/utf.test b/tests/utf.test
index c455078..3f74f6f 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -493,25 +493,16 @@ test utf-6.91.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} {
 test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} {
     testutfnext \xF4\x90\x80\x80
 } 1
-test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext ucs2} {
+test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0
-} 1
-test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext fullutf knownBug} {
-    testutfnext \xA0\xA0\xA0
-} 3
-test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} {
-    testutfnext \x80\x80\x80
 } 3
-test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf knownBug} {
+test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext {
     testutfnext \x80\x80\x80
 } 3
-test utf-6.94.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} {
-    testutfnext \xA0\xA0\xA0\xA0
-} 1
-test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext fullutf knownBug} {
+test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0\xA0
 } 3
-test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} {
+test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
     testutfnext \x80\x80\x80\x80
 } 3
 test utf-6.96 {Tcl_UtfNext, read limits} testutfnext {
@@ -619,18 +610,18 @@ test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
 test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
     testutfnext \xA0\xA0\xA0 2
 } 0
-test utf-6.123 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+test utf-6.123 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0G 3
-} 1
-test utf-6.124 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 3
+test utf-6.124 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0\xA0 3
-} 1
-test utf-6.125 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 3
+test utf-6.125 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0\xA0G 4
-} 1
-test utf-6.126 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 3
+test utf-6.126 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0\xA0\xA0 4
-} 1
+} 3
 
 test utf-7.1 {Tcl_UtfPrev} testutfprev {
     testutfprev {}
-- 
cgit v0.12


From cc86f6e11e85d0a01675ec31e3677ad2e63cddc4 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Tue, 5 May 2020 13:23:17 +0000
Subject: Add 4 test-cases that could fool Tcl_UtfPrev (but ... actually they
 don't). Make sure that Tcl_UtfPrev() never reads more than 3 trail bytes (or
 4 when TCL_UTF_MAX > 3). Those are the same limits as for Tcl_UtfNext() and
 Tcl_UtfToUniChar()

---
 generic/tclUtf.c |  2 +-
 tests/utf.test   | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ac87978..2439a54 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -855,7 +855,7 @@ Tcl_UtfPrev(
 
 	/* Continue the search backwards... */
 	look--;
-    } while (trailBytesSeen < TCL_UTF_MAX);
+    } while (trailBytesSeen < ((TCL_UTF_MAX > 3) ? 4 : 3));
 
     /*
      * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
diff --git a/tests/utf.test b/tests/utf.test
index e8fa603..ffe7896 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -968,6 +968,18 @@ test utf-7.49.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf}
 test utf-7.49.6 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
     testutfprev A\xF4\x90\x80\x80 2
 } 1
+test utf-7.50.0 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev ucs2} {
+    testutfprev \xF2\xA0\xA0\xA0\xA0
+} 2
+test utf-7.50.1 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev fullutf} {
+    testutfprev \xF2\xA0\xA0\xA0\xA0
+} 4
+test utf-7.51.0 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev ucs2} {
+    testutfprev \xF2\x80\x80\x80\x80
+} 2
+test utf-7.51.1 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev fullutf} {
+    testutfprev \xF2\x80\x80\x80\x80
+} 4
 
 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
     string index abcd 0
-- 
cgit v0.12


From 8fc378853391cde228bf25c1491e9ba02ebf0f2c Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 7 May 2020 18:23:36 +0000
Subject: New approach to fixing the regression reported in [31aa44375d] builds
 on recent reforms.  Older efforts aborted.

---
 generic/tclUtf.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 12eb637..8ae4b15 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -381,7 +381,7 @@ Tcl_UtfToUniChar(
 	 * characters representing themselves.
 	 */
 
-#if TCL_UTF_MAX <= 4
+#if TCL_UTF_MAX == 4
 	/* If *chPtr contains a high surrogate (produced by a previous
 	 * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
 	 * bytes, then we must produce a follow-up low surrogate. We only
@@ -437,7 +437,7 @@ Tcl_UtfToUniChar(
 	     * Four-byte-character lead byte followed by at least two trail bytes.
 	     * We don't test the validity of 3th trail byte, see [ed29806ba]
 	     */
-#if TCL_UTF_MAX <= 4
+#if TCL_UTF_MAX == 4
 	    Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
 		    | ((src[2] & 0x3F) >> 4)) - 0x40;
 	    if (high < 0x400) {
@@ -446,7 +446,7 @@ Tcl_UtfToUniChar(
 		return 1;
 	    }
 	    /* out of range, < 0x10000 or > 0x10FFFF */
-#else
+#elif TCL_UTF_MAX > 4
 	    if ((src[3] & 0xC0) == 0x80) {
 		*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
 			| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
@@ -617,25 +617,11 @@ Tcl_NumUtfChars(
 	 */
 	while (src <= optPtr
 		/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
-#if TCL_UTF_MAX < 4
-	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
-		/* treat F0 - F4 as single character */
-		ch = 0;
-		src++;
-	    } else
-#endif
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
 	/* Loop over the remaining string where call must happen */
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
-#if TCL_UTF_MAX < 4
-	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
-		/* treat F0 - F4 as single character */
-		ch = 0;
-		src++;
-	    } else
-#endif
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
-- 
cgit v0.12


From 6d36267ee03ebbf37b1843d0602220bbc299f8e9 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 7 May 2020 18:36:25 +0000
Subject: split and constrain the failing test.

---
 tests/encoding.test | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/encoding.test b/tests/encoding.test
index 552c97f..84f9ae1 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -335,7 +335,12 @@ test encoding-15.4 {UtfToUtfProc emoji character input} -body {
     set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82]
     list [string length $x] $y
 } -result "6 \uD83D\uDE02"
-test encoding-15.5 {UtfToUtfProc emoji character input} {
+test encoding-15.5.0 {UtfToUtfProc emoji character input} ucs2 {
+    set x \xF0\x9F\x98\x82
+    set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
+    list [string length $x] $y
+} "4 \xF0\x9F\x98\x82"
+test encoding-15.5.1 {UtfToUtfProc emoji character input} fullutf {
     set x \xF0\x9F\x98\x82
     set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
     list [string length $x] $y
-- 
cgit v0.12