From fbf2a3255bd9e59c1a84415305da87b202330606 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 2 Apr 2020 20:18:52 +0000
Subject: New utility routine TclUtfToUCS4() to contain some complexity. Two
 callers adapted.

---
 generic/tclCmdMZ.c | 34 +++++++--------------------------
 generic/tclInt.h   |  1 +
 generic/tclUtf.c   | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index d344678..23370a8 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1081,23 +1081,10 @@ Tcl_SplitObjCmd(
 	Tcl_InitHashTable(&charReuseTable, TCL_ONE_WORD_KEYS);
 
 	for ( ; stringPtr < end; stringPtr += len) {
-	    int fullchar;
-	    len = TclUtfToUniChar(stringPtr, &ch);
-	    fullchar = ch;
-
-#if TCL_UTF_MAX == 4
-	    if ((ch >= 0xD800) && (len < 3)) {
-		len += TclUtfToUniChar(stringPtr + len, &ch);
-		fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
-	    }
-#endif
+	    int ucs4;
 
-	    /*
-	     * Assume Tcl_UniChar is an integral type...
-	     */
-
-	    hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(fullchar),
-		    &isNew);
+	    len = TclUtfToUCS4(stringPtr, &ucs4);
+	    hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ucs4), &isNew);
 	    if (isNew) {
 		TclNewStringObj(objPtr, stringPtr, len);
 
@@ -1466,7 +1453,6 @@ StringIsCmd(
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
     const char *string1, *end, *stop;
-    Tcl_UniChar ch = 0;
     int (*chcomp)(int) = NULL;	/* The UniChar comparison function. */
     int i, failat = 0, result = 1, strict = 0, index, length1, length2;
     Tcl_Obj *objPtr, *failVarObj = NULL;
@@ -1797,16 +1783,10 @@ StringIsCmd(
 	}
 	end = string1 + length1;
 	for (; string1 < end; string1 += length2, failat++) {
-	    int fullchar;
-	    length2 = TclUtfToUniChar(string1, &ch);
-	    fullchar = ch;
-#if TCL_UTF_MAX == 4
-	    if ((ch >= 0xD800) && (length2 < 3)) {
-	    	length2 += TclUtfToUniChar(string1 + length2, &ch);
-	    	fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
-	    }
-#endif
-	    if (!chcomp(fullchar)) {
+	    int ucs4;
+
+	    length2 = TclUtfToUCS4(string1, &ucs4);
+	    if (!chcomp(ucs4)) {
 		result = 0;
 		break;
 	    }
diff --git a/generic/tclInt.h b/generic/tclInt.h
index c30a257..74b2cc9 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3186,6 +3186,7 @@ MODULE_SCOPE int	TclTrimLeft(const char *bytes, int numBytes,
 MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
 MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
+MODULE_SCOPE int	TclUtfToUCS4(const char *src, int *ucs4Ptr);
 MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
 MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
 MODULE_SCOPE int	TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index c58f5a9..0db06bd 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -278,8 +278,8 @@ Tcl_UniCharToUtfDString(
  *	If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done:
  *	For any UTF-8 string containing a character outside of the BMP, the
  *	first call to this function will fill *chPtr with the high surrogate
- *	and generate a return value of 0. Calling Tcl_UtfToUniChar again
- *	will produce the low surrogate and a return value of 4. Because *chPtr
+ *	and generate a return value of 1. Calling Tcl_UtfToUniChar again
+ *	will produce the low surrogate and a return value of 3. Because *chPtr
  *	is used to remember whether the high surrogate is already produced, it
  *	is recommended to initialize the variable it points to as 0 before
  *	the first call to Tcl_UtfToUniChar is done.
@@ -2156,6 +2156,57 @@ TclUniCharMatch(
 }
 
 /*
+ *---------------------------------------------------------------------------
+ *
+ * TclUtfToUCS4 --
+ *
+ *	Extract the 4-byte codepoint from the leading bytes of the
+ *	Modified UTF-8 string "src".  This is a utility routine to
+ *	contain the surrogate gymnastics in one place.
+ *
+ *	The caller must ensure that the source buffer is long enough that this
+ *	routine does not run off the end and dereference non-existent memory
+ *	looking for trail bytes. If the source buffer is known to be '\0'
+ *	terminated, this cannot happen. Otherwise, the caller should call
+ *	Tcl_UtfCharComplete() before calling this routine to ensure that
+ *	enough bytes remain in the string.
+ *
+ * Results:
+ *	*usc4Ptr is filled with the UCS4 code point, and the return value is
+ *	the number of bytes from the UTF-8 string that were consumed.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUtfToUCS4(
+    const char *src,	/* The UTF-8 string. */
+    int *ucs4Ptr)	/* Filled with the UCS4 codepoint represented
+			 * by the UTF-8 string. */
+{
+    int len, fullchar;
+    Tcl_UniChar ch = 0;
+
+    len = TclUtfToUniChar(src, &ch);
+    fullchar = ch;
+
+#if TCL_UTF_MAX == 4
+    /* 4-byte UTF-8 is supported; decode surrogates */
+
+    if ((ch >= 0xD800) && len < 3)
+	len += Tcl_UtfToUniChar(src + len, &ch);
+	fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
+    }
+#endif
+
+    *ucs4Ptr = fullchar;
+    return len;
+}
+
+/*
  * Local Variables:
  * mode: c
  * c-basic-offset: 4
-- 
cgit v0.12


From d429ddf40d173212a0fadb4794b91096c58201a1 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 2 Apr 2020 20:36:04 +0000
Subject: More callers.

---
 generic/tclUtf.c | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 0db06bd..8f02790 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -578,19 +578,11 @@ Tcl_UtfFindFirst(
     const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Unicode character to search for. */
 {
-    int len, fullchar;
-    Tcl_UniChar find = 0;
-
+fprintf(stdout, "COVER\n"); fflush(stdout);
     while (1) {
-	len = TclUtfToUniChar(src, &find);
-	fullchar = find;
-#if TCL_UTF_MAX <= 4
-	if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) {
-	    len += TclUtfToUniChar(src + len, &find);
-	    fullchar = (((fullchar & 0x3FF) << 10) | (find & 0x3FF)) + 0x10000;
-	}
-#endif
-	if (fullchar == ch) {
+	int ucs4, len = TclUtfToUCS4(src, &ucs4);
+
+	if (ucs4 == ch) {
 	    return src;
 	}
 	if (*src == '\0') {
@@ -624,21 +616,12 @@ Tcl_UtfFindLast(
     const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Unicode character to search for. */
 {
-    int len, fullchar;
-    Tcl_UniChar find = 0;
-    const char *last;
+    const char *last = NULL;
 
-    last = NULL;
     while (1) {
-	len = TclUtfToUniChar(src, &find);
-	fullchar = find;
-#if TCL_UTF_MAX <= 4
-	if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) {
-	    len += TclUtfToUniChar(src + len, &find);
-	    fullchar = (((fullchar & 0x3FF) << 10) | (find & 0x3FF)) + 0x10000;
-	}
-#endif
-	if (fullchar == ch) {
+	int ucs4, len = TclUtfToUCS4(src, &ucs4);
+
+	if (ucs4 == ch) {
 	    last = src;
 	}
 	if (*src == '\0') {
-- 
cgit v0.12


From c583e934033ce158b9b2b03320ecd598e908efcc Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 2 Apr 2020 20:50:32 +0000
Subject: Adapt another caller. This one had a bug when (TCL_UTF_MAX == 4)
 because it was never adapted when TclUtfToUniChar change from returning 0 to
 returning 1 for a high surrogate.

---
 generic/tclDisassemble.c | 30 ++++++++----------------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/generic/tclDisassemble.c b/generic/tclDisassemble.c
index b60edca..94679fe 100644
--- a/generic/tclDisassemble.c
+++ b/generic/tclDisassemble.c
@@ -855,7 +855,6 @@ PrintSourceToObj(
 {
     register const char *p;
     register int i = 0, len;
-    Tcl_UniChar ch = 0;
 
     if (stringPtr == NULL) {
 	Tcl_AppendToObj(appendObj, "\"\"", -1);
@@ -865,9 +864,10 @@ PrintSourceToObj(
     Tcl_AppendToObj(appendObj, "\"", -1);
     p = stringPtr;
     for (;  (*p != '\0') && (i < maxChars);  p+=len) {
+	int ucs4;
 
-	len = TclUtfToUniChar(p, &ch);
-	switch (ch) {
+	len = TclUtfToUCS4(p, &ucs4);
+	switch (ucs4) {
 	case '"':
 	    Tcl_AppendToObj(appendObj, "\\\"", -1);
 	    i += 2;
@@ -893,28 +893,14 @@ PrintSourceToObj(
 	    i += 2;
 	    continue;
 	default:
-#if TCL_UTF_MAX > 4
-	    if (ch > 0xFFFF) {
-		Tcl_AppendPrintfToObj(appendObj, "\\U%08x", ch);
+	    if (ucs4 > 0xFFFF) {
+		Tcl_AppendPrintfToObj(appendObj, "\\U%08x", ucs4);
 		i += 10;
-	    } else
-#elif TCL_UTF_MAX > 3
-	    /* If len == 0, this means we have a char > 0xFFFF, resulting in
-	     * TclUtfToUniChar producing a surrogate pair. We want to output
-	     * this pair as a single Unicode character.
-	     */
-	    if (len == 0) {
-		int upper = ((ch & 0x3FF) + 1) << 10;
-		len = TclUtfToUniChar(p, &ch);
-		Tcl_AppendPrintfToObj(appendObj, "\\U%08x", upper + (ch & 0x3FF));
-		i += 10;
-	    } else
-#endif
-	    if (ch < 0x20 || ch >= 0x7F) {
-		Tcl_AppendPrintfToObj(appendObj, "\\u%04x", ch);
+	    } else if (ucs4 < 0x20 || ucs4 >= 0x7F) {
+		Tcl_AppendPrintfToObj(appendObj, "\\u%04x", ucs4);
 		i += 6;
 	    } else {
-		Tcl_AppendPrintfToObj(appendObj, "%c", ch);
+		Tcl_AppendPrintfToObj(appendObj, "%c", ucs4);
 		i++;
 	    }
 	    continue;
-- 
cgit v0.12


From 2f92315df0eafedcd9c06b98df46e3ce71701cb2 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 2 Apr 2020 21:29:26 +0000
Subject: One more caller conversion that simplifies. There may be other
 callers of TclUtfToUniChar that would be made more correct or consistent with
 a similar conversion.

---
 generic/tclScan.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/generic/tclScan.c b/generic/tclScan.c
index 4b9298d..c599797 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -881,14 +881,7 @@ Tcl_ScanObjCmd(
 	     * Scan a single Unicode character.
 	     */
 
-	    offset = TclUtfToUniChar(string, &sch);
-	    i = (int)sch;
-#if TCL_UTF_MAX == 4
-	    if ((sch >= 0xD800) && (offset < 3)) {
-		offset += TclUtfToUniChar(string+offset, &sch);
-		i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
-	    }
-#endif
+	    offset = TclUtfToUCS4(string, &i);
 	    string += offset;
 	    if (!(flags & SCAN_SUPPRESS)) {
 		objPtr = Tcl_NewIntObj(i);
-- 
cgit v0.12


From 6d656d18330f1b128b4587401e2bd4be7afbb0df Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 2 Apr 2020 22:05:52 +0000
Subject: typo

---
 generic/tclUtf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 8f02790..c5dc30a 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2179,7 +2179,7 @@ TclUtfToUCS4(
 #if TCL_UTF_MAX == 4
     /* 4-byte UTF-8 is supported; decode surrogates */
 
-    if ((ch >= 0xD800) && len < 3)
+    if ((ch >= 0xD800) && len < 3) {
 	len += Tcl_UtfToUniChar(src + len, &ch);
 	fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
     }
-- 
cgit v0.12


From 381c52e801647c8e1af5eb08a6f298a29cdbf72a Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 2 Apr 2020 22:18:06 +0000
Subject: Remove stray debug

---
 generic/tclUtf.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index c5dc30a..6782df9 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -578,7 +578,6 @@ Tcl_UtfFindFirst(
     const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Unicode character to search for. */
 {
-fprintf(stdout, "COVER\n"); fflush(stdout);
     while (1) {
 	int ucs4, len = TclUtfToUCS4(src, &ucs4);
 
-- 
cgit v0.12