From 7759441602f14b3772186ea0d87f61ffdafc8404 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Mon, 4 May 2020 12:31:51 +0000
Subject: New internal function TclGetUCS4() only available when TCL_UTF_MAX=4.
 This fixes all "knownBug" testcases related to tip389.

---
 generic/tclCmdMZ.c     |  2 +-
 generic/tclExecute.c   | 12 ++------
 generic/tclInt.h       |  7 ++++-
 generic/tclStringObj.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++--
 generic/tclUtf.c       |  6 +++-
 tests/utf.test         | 14 ++++-----
 6 files changed, 98 insertions(+), 21 deletions(-)

diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 011164b..7516208 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1401,7 +1401,7 @@ StringIndexCmd(
     }
 
     if ((index >= 0) && (index < length)) {
-	Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index);
+	int ch = TclGetUCS4(objv[1], index);
 
 	/*
 	 * If we have a ByteArray object, we're careful to generate a new
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index f38e752..eeb69de 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -5571,16 +5571,10 @@ TEBCresume(
 	    objResultPtr = Tcl_NewStringObj((const char *)
 		    valuePtr->bytes+index, 1);
 	} else {
-	    char buf[4] = "";
-	    Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index);
+	    char buf[8] = "";
+	    int ch = TclGetUCS4(valuePtr, index);
 
-	    /*
-	     * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1)
-	     * but creating the object as a string seems to be faster in
-	     * practical use.
-	     */
-
-	    length = Tcl_UniCharToUtf(ch, buf);
+	    length = TclUCS4ToUtf(ch, buf);
 	    objResultPtr = Tcl_NewStringObj(buf, length);
 	}
 
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 6f024a6..8983659 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3183,8 +3183,13 @@ MODULE_SCOPE int	TclTrimLeft(const char *bytes, int numBytes,
 MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
 MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
-MODULE_SCOPE int	TclUtfToUCS4(const char *src, int *ucs4Ptr);
+MODULE_SCOPE int	TclUtfToUCS4(const char *, int *);
 MODULE_SCOPE int	TclUCS4ToUtf(int, char *);
+#if TCL_UTF_MAX == 4
+    MODULE_SCOPE int	TclGetUCS4(Tcl_Obj *, int);
+#else
+    #define TclGetUCS4 Tcl_GetUniChar
+#endif
 
 /*
  * Bytes F0-F4 are start-bytes for 4-byte sequences.
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 3ce8281..656d6ce 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -519,10 +519,10 @@ TclCheckEmptyString (
 /*
  *----------------------------------------------------------------------
  *
- * Tcl_GetUniChar --
+ * Tcl_GetUniChar/TclGetUCS4 --
  *
  *	Get the index'th Unicode character from the String object. If index
- *	is out of range, the result = 0xFFFD;
+ *	is out of range, the result = 0xFFFD (Tcl_GetUniChar) resp. -1 (TclGetUCS4)
  *
  * Results:
  *	Returns the index'th Unicode character in the Object.
@@ -587,6 +587,80 @@ Tcl_GetUniChar(
     }
     return stringPtr->unicode[index];
 }
+
+#if TCL_UTF_MAX == 4
+int
+TclGetUCS4(
+    Tcl_Obj *objPtr,		/* The object to get the Unicode charater
+				 * from. */
+    int index)			/* Get the index'th Unicode character. */
+{
+    String *stringPtr;
+    int ch, length;
+
+    if (index < 0) {
+	return -1;
+    }
+
+    /*
+     * Optimize the case where we're really dealing with a bytearray object
+     * we don't need to convert to a string to perform the indexing operation.
+     */
+
+    if (TclIsPureByteArray(objPtr)) {
+	unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length);
+	if (index >= length) {
+		return -1;
+	}
+
+	return (int) bytes[index];
+    }
+
+    /*
+     * OK, need to work with the object as a string.
+     */
+
+    SetStringFromAny(NULL, objPtr);
+    stringPtr = GET_STRING(objPtr);
+
+    if (stringPtr->hasUnicode == 0) {
+	/*
+	 * If numChars is unknown, compute it.
+	 */
+
+	if (stringPtr->numChars == -1) {
+	    TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length);
+	}
+	if (stringPtr->numChars == objPtr->length) {
+	    return (Tcl_UniChar) objPtr->bytes[index];
+	}
+	FillUnicodeRep(objPtr);
+	stringPtr = GET_STRING(objPtr);
+    }
+
+    if (index >= stringPtr->numChars) {
+	return -1;
+    }
+    ch = stringPtr->unicode[index];
+#if TCL_UTF_MAX <= 4
+    /* See: bug [11ae2be95dac9417] */
+    if ((ch & 0xF800) == 0xD800) {
+	if (ch & 0x400) {
+	    if ((index > 0)
+		    && ((stringPtr->unicode[index-1] & 0xFC00) == 0xD800)) {
+		ch = -1; /* low surrogate preceded by high surrogate */
+	    }
+	} else if ((++index < stringPtr->numChars)
+		&& ((stringPtr->unicode[index] & 0xFC00) == 0xDC00)) {
+	    /* high surrogate followed by low surrogate */
+	    ch = (((ch & 0x3FF) << 10) |
+			(stringPtr->unicode[index] & 0x3FF)) + 0x10000;
+	}
+    }
+#endif
+    return ch;
+}
+#endif
 
 /*
  *----------------------------------------------------------------------
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ab3c577..9714204 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2383,7 +2383,8 @@ TclUtfToUCS4(
  *
  * Results:
  *	The return values is the number of bytes in the buffer that were
- *	consumed.
+ *	consumed. If ch == -1, this function outputs 0 bytes (empty string),
+ *	since TclGetUCS4 returns -1 for out-of-range indices.
  *
  * Side effects:
  *	None.
@@ -2414,6 +2415,9 @@ TclUCS4ToUtf(
 	buf[0] = (char) ((ch >> 12) | 0xE0);
 	return 3;
     }
+    if (ch == -1) {
+	return 0;
+    }
     return Tcl_UniCharToUtf(ch, buf);
 }
 
diff --git a/tests/utf.test b/tests/utf.test
index a3c049d..fd8231d 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -983,14 +983,14 @@ test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
 } c
 test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
     string index \u4E4E\u25A\xFF\u543 2
-} \uFF
+} \xFF
 test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 {
     string index \uD842 0
 } \uD842
-test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} {ucs4 knownBug} {
+test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} {ucs4} {
     string index \uD842 0
 } \uD842
-test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {tip389 knownBug} {
+test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {tip389} {
     string index \uD842 0
 } \uD842
 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} {
@@ -1002,7 +1002,7 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
 test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 0
 } \U1F600
-test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {tip389 knownBug} {
+test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {tip389} {
     string index \uD83D\uDE00G 0
 } \U1F600
 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
@@ -1011,7 +1011,7 @@ test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
 test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 {
     string index \uD83D\uDE00G 1
 } G
-test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {tip389 knownBug} {
+test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {tip389} {
     string index \uD83D\uDE00G 1
 } {}
 test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 {
@@ -1029,7 +1029,7 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
 test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
     string index \U1F600G 0
 } \U1F600
-test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389 knownBug} {
+test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
     string index \U1F600G 0
 } \U1F600
 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
@@ -1038,7 +1038,7 @@ test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
 test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} {
     string index \U1F600G 1
 } G
-test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389 knownBug} {
+test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} {
     string index \U1F600G 1
 } {}
 test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} {
-- 
cgit v0.12