From 9501890b6c738830781eebe5d8bdcff2d6a0068c Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Sat, 2 May 2020 22:48:20 +0000
Subject: Join test-cases utf-6.93.0 and utf-6.93.1, which MUST give the same
 answer always for whatever testConstraints. Fix one invalid use of
 TclUCS4Complete(), and let TclUtfToUCS4() handle (invalid) 4-byte sequences.
 Test-case cleanup (removal of unnecessary quoting)

---
 generic/tclEncoding.c |  2 +-
 generic/tclUtf.c      |  2 +-
 tests/utf.test        | 59 ++++++++++++++++++++++++---------------------------
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 1584de0..5c7aab8 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2341,7 +2341,7 @@ UtfToUtfProc(
 	    *dst++ = 0;
 	    *chPtr = 0; /* reset surrogate handling */
 	    src += 2;
-	} else if (!TclUCS4Complete(src, srcEnd - src)) {
+	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
 	    /*
 	     * Always check before using TclUtfToUniChar. Not doing can so
 	     * cause it run beyond the end of the buffer! If we happen such an
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 9ffbfba..160e444 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2360,7 +2360,7 @@ TclUtfToUCS4(
     len = TclUtfToUniChar(src, &ch);
     fullchar = ch;
 
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
     /* 4-byte UTF-8 is supported; decode surrogates */
 
     if ((ch >= 0xD800) && len < 3) {
diff --git a/tests/utf.test b/tests/utf.test
index 0929801..2bfb9ea 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -496,10 +496,7 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf}
 test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0
 } 1
-test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} {
-    testutfnext \x80\x80\x80
-} 1
-test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} {
+test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext {
     testutfnext \x80\x80\x80
 } 1
 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
@@ -977,37 +974,37 @@ test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
 } a
 test utf-8.2 {Tcl_UniCharAtIndex: index = 0} {
     string index \u4E4E\u25A 0
-} "\u4E4E"
+} \u4E4E
 test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
     string index abcd 2
 } c
 test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
     string index \u4E4E\u25A\xFF\u543 2
-} "\uFF"
+} \uFF
 test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 {
     string index \uD842 0
-} "\uD842"
+} \uD842
 test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 {
     string index \uD842 0
-} "\uD842"
+} \uD842
 test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 {
     string index \uD842 0
-} "\uD842"
+} \uD842
 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} {
     string index \uDC42 0
-} "\uDC42"
+} \uDC42
 test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 0
-} "\uD83D"
+} \uD83D
 test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 0
-} "\U1F600"
+} \U1F600
 test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
     string index \uD83D\uDE00G 0
-} "\U1F600"
+} \U1F600
 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
     string index \uD83D\uDE00G 1
-} "\uDE00"
+} \uDE00
 test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 1
 } G
@@ -1025,13 +1022,13 @@ test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} tip389 {
 } G
 test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
     string index \U1F600G 0
-} "\uFFFD"
+} \uFFFD
 test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
     string index \U1F600G 0
-} "\U1F600"
+} \U1F600
 test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
     string index \U1F600G 0
-} "\U1F600"
+} \U1F600
 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
     string index \U1F600G 1
 } G
@@ -1056,22 +1053,22 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
 } abc
 test utf-9.2 {Tcl_UtfAtIndex: index > 0} {
     string range \u4E4E\u25A\xFF\u543klmnop 1 5
-} "\u25A\xFF\u543kl"
+} \u25A\xFF\u543kl
 test utf-9.3.0 {Tcl_UtfAtIndex: index = 0, Emoji} ucs2 {
     string range \uD83D\uDE00G 0 0
-} "\uD83D"
+} \uD83D
 test utf-9.3.1 {Tcl_UtfAtIndex: index = 0, Emoji} ucs4 {
     string range \uD83D\uDE00G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} tip389 {
     string range \uD83D\uDE00G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.4.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 {
     string range \uD83D\uDE00G 1 1
-} "\uDE00"
+} \uDE00
 test utf-9.4.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 {
     string range \uD83D\uDE00G 1 1
-} "G"
+} G
 test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 {
     string range \uD83D\uDE00G 1 1
 } {}
@@ -1086,19 +1083,19 @@ test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 {
 } G
 test utf-9.6.0 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs2} {
     string range \U1f600G 0 0
-} "\uFFFD"
+} \uFFFD
 test utf-9.6.1 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs4} {
     string range \U1f600G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc tip389} {
     string range \U1f600G 0 0
-} "\U1F600"
+} \U1F600
 test utf-9.7.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} {
     string range \U1f600G 1 1
 } G
 test utf-9.7.1 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs4} {
     string range \U1f600G 1 1
-} "G"
+} G
 test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc tip389} {
     string range \U1f600G 1 1
 } {}
@@ -1182,7 +1179,7 @@ bsCheck \uA	10
 bsCheck \340	224
 bsCheck \uA1	161
 bsCheck \u4E21	20001
-bsCheck \741    225	pre388	;# == \341 
+bsCheck \741    225	pre388	;# == \341
 bsCheck \741    60	!pre388	;# == \74 1
 bsCheck \U      85
 bsCheck \Uk     85
@@ -1344,7 +1341,7 @@ test utf-20.2 {[4c591fa487] TclUniCharNcmp/TclUtfNcmp} knownBug {
     set two [format %c 0x10000]
     set first [string compare $one $two]
     string range $one 0 0
-    string range $two 0 0 
+    string range $two 0 0
     set second [string compare $one $two]
     expr {($first == $second) ? "agree" : "disagree"}
 } agree
@@ -1466,9 +1463,9 @@ UniCharCaseCmpTest < a b
 UniCharCaseCmpTest > b a
 UniCharCaseCmpTest > B a
 UniCharCaseCmpTest > aBcB abca
-UniCharCaseCmpTest < \uFFFF [format %c 0x10000] ucs4	
+UniCharCaseCmpTest < \uFFFF [format %c 0x10000] ucs4
 UniCharCaseCmpTest < \uFFFF \U10000		{Uesc ucs4}
-UniCharCaseCmpTest > [format %c 0x10000] \uFFFF	ucs4	
+UniCharCaseCmpTest > [format %c 0x10000] \uFFFF	ucs4
 UniCharCaseCmpTest > \U10000 \uFFFF		{Uesc ucs4}
 
 
-- 
cgit v0.12


From c17661c31e3f4fac5a70dd487b4c9b3372ee5e5b Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Sun, 3 May 2020 22:16:21 +0000
Subject: Re-split utf-6.93 into utf-6.93.0 and utf-6.93.1 (please disregard
 the comment in the previous commit, it was not correct). Refine
 TclUtfToUCS4()/TclUCS4Complete() and add a new (internal) function
 TclUCS4ToUtf(). They can help prevent bugs regarding splitting/joining
 surrogates. Used them in a few more places.

---
 generic/tclBinary.c | 16 ++++++-------
 generic/tclCmdMZ.c  |  9 ++------
 generic/tclInt.h    |  9 +++++++-
 generic/tclParse.c  | 23 ++++++++-----------
 generic/tclUtf.c    | 66 +++++++++++++++++++++++++++++++++++++++++++----------
 tests/utf.test      |  5 +++-
 6 files changed, 86 insertions(+), 42 deletions(-)

diff --git a/generic/tclBinary.c b/generic/tclBinary.c
index 6306159..52ef457 100644
--- a/generic/tclBinary.c
+++ b/generic/tclBinary.c
@@ -1222,11 +1222,11 @@ BinaryFormatCmd(
 
  badField:
     {
-	Tcl_UniChar ch = 0;
-	char buf[TCL_UTF_MAX + 1] = "";
+	int ch;
+	char buf[8] = "";
 
-	TclUtfToUniChar(errorString, &ch);
-	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
+	TclUtfToUCS4(errorString, &ch);
+	buf[TclUCS4ToUtf(ch, buf)] = '\0';
 	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
 		"bad field specifier \"%s\"", buf));
 	return TCL_ERROR;
@@ -1592,11 +1592,11 @@ BinaryScanCmd(
 
  badField:
     {
-	Tcl_UniChar ch = 0;
-	char buf[TCL_UTF_MAX + 1] = "";
+	int ch;
+	char buf[8] = "";
 
-	TclUtfToUniChar(errorString, &ch);
-	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
+	TclUtfToUCS4(errorString, &ch);
+	buf[TclUCS4ToUtf(ch, buf)] = '\0';
 	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
 		"bad field specifier \"%s\"", buf));
 	return TCL_ERROR;
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 162a5a6..011164b 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1413,14 +1413,9 @@ StringIndexCmd(
 
 	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
 	} else {
-	    char buf[TCL_UTF_MAX] = "";
+	    char buf[8] = "";
 
-	    length = Tcl_UniCharToUtf(ch, buf);
-#if TCL_UTF_MAX > 3
-	    if ((ch >= 0xD800) && (length < 3)) {
-		length += Tcl_UniCharToUtf(-1, buf + length);
-	    }
-#endif
+	    length = TclUCS4ToUtf(ch, buf);
 	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
 	}
     }
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 593d878..6f024a6 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3184,8 +3184,15 @@ MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
 MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
 MODULE_SCOPE int	TclUtfToUCS4(const char *src, int *ucs4Ptr);
+MODULE_SCOPE int	TclUCS4ToUtf(int, char *);
+
+/*
+ * Bytes F0-F4 are start-bytes for 4-byte sequences.
+ * Byte 0xED can be the start-byte of an upper surrogate. In that case,
+ * TclUtfToUCS4() might read the lower surrogate following it too.
+ */
 #   define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \
-	    ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length)))
+	    ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length)))
 MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
 MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
 MODULE_SCOPE int	TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 7beaeea..23a07cf 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -843,7 +843,6 @@ TclParseBackslash(
 				 * written there. */
 {
     register const char *p = src+1;
-    Tcl_UniChar unichar = 0;
     int result;
     int count;
     char buf[TCL_UTF_MAX] = "";
@@ -943,7 +942,7 @@ TclParseBackslash(
 	     * No hexdigits -> This is just "U".
 	     */
 	    result = 'U';
-	} else if ((result | 0x7FF) == 0xDFFF) {
+	} else if ((result & ~0x7FF) == 0xD800) {
 	    /* Upper or lower surrogate, not allowed in this syntax. */
 	    result = 0xFFFD;
 	}
@@ -991,16 +990,15 @@ TclParseBackslash(
 	 * #217987] test subst-3.2
 	 */
 
-	if (Tcl_UtfCharComplete(p, numBytes - 1)) {
-	    count = TclUtfToUniChar(p, &unichar) + 1;	/* +1 for '\' */
+	if (TclUCS4Complete(p, numBytes - 1)) {
+	    count = TclUtfToUCS4(p, &result) + 1;	/* +1 for '\' */
 	} else {
-	    char utfBytes[TCL_UTF_MAX];
+	    char utfBytes[8];
 
-	    memcpy(utfBytes, p, (size_t) (numBytes - 1));
+	    memcpy(utfBytes, p, numBytes - 1);
 	    utfBytes[numBytes - 1] = '\0';
-	    count = TclUtfToUniChar(utfBytes, &unichar) + 1;
+	    count = TclUtfToUCS4(utfBytes, &result) + 1;
 	}
-	result = unichar;
 	break;
     }
 
@@ -1008,13 +1006,12 @@ TclParseBackslash(
     if (readPtr != NULL) {
 	*readPtr = count;
     }
-    count = Tcl_UniCharToUtf(result, dst);
-#if TCL_UTF_MAX > 3
-     if ((result >= 0xD800) && (count < 3)) {
-	count += Tcl_UniCharToUtf(-1, dst + count);
+#if TCL_UTF_MAX < 4
+    if (result > 0xFFFF) {
+    	result = 0xFFFD;
     }
 #endif
-    return count;
+    return TclUCS4ToUtf(result, dst);
 }
 
 /*
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 160e444..a2080dd 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2335,7 +2335,7 @@ TclUniCharMatch(
  *	routine does not run off the end and dereference non-existent memory
  *	looking for trail bytes. If the source buffer is known to be '\0'
  *	terminated, this cannot happen. Otherwise, the caller should call
- *	Tcl_UtfCharComplete() before calling this routine to ensure that
+ *	TclUCS4Complete() before calling this routine to ensure that
  *	enough bytes remain in the string.
  *
  * Results:
@@ -2354,26 +2354,68 @@ TclUtfToUCS4(
     int *ucs4Ptr)	/* Filled with the UCS4 codepoint represented
 			 * by the UTF-8 string. */
 {
-    int len, fullchar;
     Tcl_UniChar ch = 0;
-
-    len = TclUtfToUniChar(src, &ch);
-    fullchar = ch;
+    int len = Tcl_UtfToUniChar(src, &ch);
 
 #if TCL_UTF_MAX <= 4
-    /* 4-byte UTF-8 is supported; decode surrogates */
-
-    if ((ch >= 0xD800) && len < 3) {
-	len += Tcl_UtfToUniChar(src + len, &ch);
-	fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
+    if ((ch & ~0x3FF) == 0xD800) {
+	Tcl_UniChar low = ch;
+	int len2 = Tcl_UtfToUniChar(src+len, &low);
+	if ((low & ~0x3FF) == 0xDC00) {
+	    *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
+	    return len + len2;
+	}
     }
 #endif
-
-    *ucs4Ptr = fullchar;
+    *ucs4Ptr = (int)ch;
     return len;
 }
 
 /*
+ *---------------------------------------------------------------------------
+ *
+ * TclUCS4ToUtf --
+ *
+ *	Store the given Unicode character as a sequence of UTF-8 bytes in the
+ *	provided buffer. Might output 6 bytes, if the code point > 0xFFFF.
+ *
+ * Results:
+ *	The return value is the number of bytes in the buffer that were
+ *	consumed.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUCS4ToUtf(
+    int ch,			/* Unicode code point to be stored in the
+				 * buffer. */
+    char *buf)			/* Buffer receiving the UTF-8 representation
+				 * of ch. The buffer must be large enough to
+				 * hold the UTF-8 character(s) written here
+				 * (at most 6 bytes). */
+{
+#if TCL_UTF_MAX <= 4
+    if (((unsigned)ch - 0x10000U) <= 0xFFFFF) {	/* unsigned math: no signed-overflow UB */
+	/* U+10000..U+10FFFF: emit a high/low surrogate pair (10 payload bits each). Yields a
+	 * 4-byte character or 2 x 3-byte characters, depending on Tcl version/TCL_UTF_MAX. */
+	int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf);
+	return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x3FF), buf + len);
+    }
+#endif
+    if ((ch & ~0x7FF) == 0xD800) {	/* lone surrogate U+D800..U+DFFF: hand-encode 3 bytes */
+	buf[2] = (char) ((ch | 0x80) & 0xBF);		/* trail byte: low 6 bits */
+	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);	/* trail byte: middle 6 bits */
+	buf[0] = (char) ((ch >> 12) | 0xE0);		/* lead byte of a 3-byte sequence */
+	return 3;
+    }
+    return Tcl_UniCharToUtf(ch, buf);	/* everything else: defer to the public encoder */
+}
+
+/*
  * Local Variables:
  * mode: c
  * c-basic-offset: 4
diff --git a/tests/utf.test b/tests/utf.test
index 2bfb9ea..c0fed6f 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -496,7 +496,10 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf}
 test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext {
     testutfnext \xA0\xA0\xA0
 } 1
-test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext {
+test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} {
+    testutfnext \x80\x80\x80
+} 1
+test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} {
     testutfnext \x80\x80\x80
 } 1
 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
-- 
cgit v0.12