4 files changed, 71 insertions, 41 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 1584de0..5c7aab8 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2341,7 +2341,7 @@ UtfToUtfProc(
 	    *dst++ = 0;
 	    *chPtr = 0; /* reset surrogate handling */
 	    src += 2;
-	} else if (!TclUCS4Complete(src, srcEnd - src)) {
+	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
 	    /*
 	     * Always check before using TclUtfToUniChar. Not doing can so
 	     * cause it run beyond the end of the buffer! If we happen such an
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 593d878..5c46470 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3184,8 +3184,13 @@ MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
 MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
 MODULE_SCOPE int	TclUtfToUCS4(const char *src, int *ucs4Ptr);
+/*
+ * Bytes F0-F4 are start-bytes for 4-byte sequences: 4 bytes are required.
+ * Byte 0xED can be the start-byte of a high surrogate, in which case
+ * TclUtfToUCS4() might read the low surrogate following it too: 6 bytes.
+ */
 #   define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \
-	    ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length)))
+	    ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length)))
 MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
 MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
 MODULE_SCOPE int	TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 9ffbfba..9375a01 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -81,6 +81,28 @@ static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1
 };
 
+static const unsigned char complete[256] = { /* bytes needed, by first byte */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#if TCL_UTF_MAX > 4 /* entries for first bytes 0x80-0xBF follow */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#else /* src might point to the 2nd byte of a valid 4-byte sequence, so need 3 */
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#endif
+    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* first bytes 0xE0-0xEF */
+#if TCL_UTF_MAX > 3
+    4,4,4,4,4, /* first bytes 0xF0-0xF4 */
+#else
+    1,1,1,1,1, /* first bytes 0xF0-0xF4 */
+#endif
+    1,1,1,1,1,1,1,1,1,1,1 /* first bytes 0xF5-0xFF */
+};
+
 /*
  * Functions used only in this module.
  */
@@ -359,8 +381,8 @@ Tcl_UniCharToUtfDString(
 
 int
 Tcl_UtfToUniChar(
-    register const char *src,	/* The UTF-8 string. */
-    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+    const char *src,	/* The UTF-8 string. */
+    Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
 				 * the UTF-8 string. */
 {
     Tcl_UniChar byte;
@@ -557,7 +579,7 @@ Tcl_UtfCharComplete(
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
-    return length >= totalBytes[UCHAR(*src)];
+    return length >= complete[UCHAR(*src)];
 }
 
 /*
@@ -580,12 +602,12 @@ Tcl_UtfCharComplete(
 
 int
 Tcl_NumUtfChars(
-    register const char *src,	/* The UTF-8 string to measure. */
+    const char *src,	/* The UTF-8 string to measure. */
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
     Tcl_UniChar ch = 0;
-    register int i = 0;
+    int i = 0;
 
     /*
      * The separate implementations are faster.
@@ -601,27 +623,29 @@ Tcl_NumUtfChars(
 	}
 	if (i < 0) i = INT_MAX; /* Bug [2738427] */
     } else {
-	register const char *endPtr = src + length - TCL_UTF_MAX;
+	const char *endPtr = src + length - TCL_UTF_MAX;
 
 	while (src < endPtr) {
+#if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
 		/* treat F0 - F4 as single character */
 		ch = 0;
 		src++;
-	    } else {
-		src += TclUtfToUniChar(src, &ch);
-	    }
+	    } else
+#endif
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
 	endPtr += TCL_UTF_MAX;
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
+#if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
 		/* treat F0 - F4 as single character */
 		ch = 0;
 		src++;
-	    } else {
-		src += TclUtfToUniChar(src, &ch);
-	    }
+	    } else
+#endif
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
 	if (src < endPtr) {
@@ -890,8 +914,8 @@ Tcl_UtfPrev(
 
 Tcl_UniChar
 Tcl_UniCharAtIndex(
-    register const char *src,	/* The UTF-8 string to dereference. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string to dereference. */
+    int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch = 0;
 
@@ -918,8 +942,8 @@ Tcl_UniCharAtIndex(
 
 const char *
 Tcl_UtfAtIndex(
-    register const char *src,	/* The UTF-8 string. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string. */
+    int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch = 0;
     int len = 0;
@@ -1191,7 +1215,7 @@ TclpUtfNcmp2(
      * fine in the strcmp manner.
      */
 
-    register int result = 0;
+    int result = 0;
 
     for ( ; numBytes != 0; numBytes--, cs++, ct++) {
 	if (*cs != *ct) {
diff --git a/tests/utf.test b/tests/utf.test
index 0929801..50351cb 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -29,6 +29,7 @@ testConstraint pre388 [eq \x741 A]
 testConstraint pairsTo4bytes [expr {[llength [info commands teststringbytes]]
 		&& [string length [teststringbytes \uD83D\uDCA9]] == 4}]
 
+testConstraint teststringbytes [llength [info commands teststringbytes]]
 testConstraint testbytestring [llength [info commands testbytestring]]
 testConstraint testfindfirst [llength [info commands testfindfirst]]
 testConstraint testfindlast [llength [info commands testfindlast]]
@@ -501,7 +502,7 @@ test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {
 } 1
 test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} {
     testutfnext \x80\x80\x80
-} 1
+} 3
 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0\xA0
 } 1
@@ -601,18 +602,18 @@ test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} {
 test utf-6.118 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0G 0
 } 0
-test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+test utf-6.119 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0G 1
-} 1
-test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.120 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0 1
-} 1
-test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.121 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0G 2
-} 1
-test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} {
+} 0
+test utf-6.122 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0 2
-} 1
+} 0
 test utf-6.123 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext \xA0\xA0\xA0G 3
 } 1
@@ -990,9 +991,9 @@ test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 {
 test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 {
     string index \uD842 0
 } "\uD842"
-test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 {
-    string index \uD842 0
-} "\uD842"
+test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes tip389} {
+    teststringbytes [string index \uD842 0]
+} \xF0
 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} {
     string index \uDC42 0
 } "\uDC42"
@@ -1002,18 +1003,18 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
 test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 0
 } "\U1F600"
-test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
-    string index \uD83D\uDE00G 0
-} "\U1F600"
+test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} {
+    teststringbytes [string index \uD83D\uDE00G 0]
+} \xF0
 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 1
 } "\uDE00"
 test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 1
 } G
-test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
-    string index \uD83D\uDE00G 1
-} {}
+test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} {
+    teststringbytes [string index \uD83D\uDE00G 1]
+} \xED\xB8\x80
 test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 2
 } G
@@ -1029,9 +1030,9 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
 test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
     string index \U1F600G 0
 } "\U1F600"
-test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
-    string index \U1F600G 0
-} "\U1F600"
+test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc teststringbytes tip389} {
+    teststringbytes [string index \U1F600G 0]
+} \xF0
 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
     string index \U1F600G 1
 } G
@@ -1040,7 +1041,7 @@ test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
 } G
 test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
     string index \U1F600G 1
-} {}
+} \uDE00
 test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
     string index \U1F600G 2
 } {}