From b488f3edf6ee202281aca13745c0d4212310f654 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Thu, 3 Mar 2022 13:05:38 +0000
Subject: TIP #619 implementation. tests not working yet

---
 generic/tcl.h         |  7 +++++++
 generic/tclCmdMZ.c    |  2 ++
 generic/tclDecls.h    |  8 ++++++++
 generic/tclEncoding.c | 21 ++++++++++++---------
 generic/tclParse.c    |  7 ++++---
 generic/tclUtf.c      | 18 +++++++++++++++---
 tests/utf.test        |  6 +++---
 7 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/generic/tcl.h b/generic/tcl.h
index 6b69929..8778203 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -835,6 +835,13 @@ typedef struct Tcl_DString {
 #define TCL_INDEX_NULL_OK	4
 
 /*
+ * Flags that may be passed to Tcl_UniCharToUtf.
+ * TCL_COMBINE Combine surrogates
+ */
+
+#define TCL_COMBINE		0x200000
+
+/*
  *----------------------------------------------------------------------------
  * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv.
  * WARNING: these bit choices must not conflict with the bit choices for
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 85174ec..b50eacb 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1432,9 +1432,11 @@ StringIndexCmd(
 	    char buf[4] = "";
 
 	    end = Tcl_UniCharToUtf(ch, buf);
+#if TCL_UTF_MAX < 4
 	    if ((ch >= 0xD800) && (end < 3)) {
 		end += Tcl_UniCharToUtf(-1, buf + end);
 	    }
+#endif
 	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end));
 	}
     }
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 9205401..d073edd 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -3919,6 +3919,14 @@ extern const TclStubs *tclStubsPtr;
 #   define Tcl_UtfToUniChar Tcl_UtfToChar16
 #   undef Tcl_UniCharLen
 #   define Tcl_UniCharLen Tcl_Char16Len
+#   undef Tcl_UniCharToUtf
+#   if defined(USE_TCL_STUBS)
+#	define Tcl_UniCharToUtf(c, p) \
+		(tclStubsPtr->tcl_UniCharToUtf((c)|TCL_COMBINE, (p)))
+#   else
+#	define Tcl_UniCharToUtf(c, p) \
+		((Tcl_UniCharToUtf)((c)|TCL_COMBINE, (p)))
+#   endif
 #endif
 #if defined(USE_TCL_STUBS)
 #   define Tcl_WCharToUtfDString (sizeof(wchar_t) != sizeof(short) \
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 3a6385f..765f98b 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2228,7 +2228,6 @@ UtfToUtfProc(
 	    }
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
-	    int low;
 	    const char *saveSrc = src;
 	    size_t len = TclUtfToUCS4(src, &ch);
 	    if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)
@@ -2246,13 +2245,20 @@ UtfToUtfProc(
 		    *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
 		    ch = (ch & 0x0CFF) | 0xDC00;
 		}
-		goto cesu8;
+#if TCL_UTF_MAX < 4
+	    cesu8:
+#endif
+		*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
+		*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
+		*dst++ = (char) ((ch | 0x80) & 0xBF);
+		continue;
+#if TCL_UTF_MAX < 4
 	    } else if ((ch | 0x7FF) == 0xDFFF) {
 		/*
 		 * A surrogate character is detected, handle especially.
 		 */
 
-		low = ch;
+		int low = ch;
 		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
 
 		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
@@ -2261,15 +2267,12 @@ UtfToUtfProc(
 			src = saveSrc;
 			break;
 		    }
-		cesu8:
-		    *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
-		    *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
-		    *dst++ = (char) ((ch | 0x80) & 0xBF);
-		    continue;
+		    goto cesu8;
 		}
 		src += len;
 		dst += Tcl_UniCharToUtf(ch, dst);
 		ch = low;
+#endif
 	    } else if (!Tcl_UniCharIsUnicode(ch)) {
 		if (flags & TCL_ENCODING_STOPONERROR) {
 		    result = TCL_CONVERT_UNKNOWN;
@@ -2578,7 +2581,7 @@ Utf16ToUtfProc(
 	if (ch && ch < 0x80) {
 	    *dst++ = (ch & 0xFF);
 	} else {
-	    dst += Tcl_UniCharToUtf(ch, dst);
+	    dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
 	}
 	src += sizeof(unsigned short);
     }
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 614401f..fdd1478 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -869,6 +869,7 @@ TclParseBackslash(
 	     * No hexdigits -> This is just "u".
 	     */
 	    result = 'u';
+#if TCL_UTF_MAX < 4
 	} else if (((result & 0xFC00) == 0xD800) && (count == 6)
 		    && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) {
 	    /* If high surrogate is immediately followed by a low surrogate
@@ -879,6 +880,7 @@ TclParseBackslash(
 		result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000;
 		count += count2 + 2;
 	    }
+#endif
 	}
 	break;
     case 'U':
@@ -888,9 +890,6 @@ TclParseBackslash(
 	     * No hexdigits -> This is just "U".
 	     */
 	    result = 'U';
-	} else if ((result | 0x7FF) == 0xDFFF) {
-	    /* Upper or lower surrogate, not allowed in this syntax. */
-	    result = 0xFFFD;
 	}
 	break;
     case '\n':
@@ -954,10 +953,12 @@ TclParseBackslash(
 	*readPtr = count;
     }
     count = Tcl_UniCharToUtf(result, dst);
+#if TCL_UTF_MAX < 4
     if ((result >= 0xD800) && (count < 3)) {
 	/* Special case for handling high surrogates. */
 	count += Tcl_UniCharToUtf(-1, dst + count);
     }
+#endif
     return count;
 }
 
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e353b7f..a04e41c 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -208,15 +208,23 @@ Invalid(
  *---------------------------------------------------------------------------
  */
 
+#undef Tcl_UniCharToUtf
 int
 Tcl_UniCharToUtf(
     int ch,			/* The Tcl_UniChar to be stored in the
-				 * buffer. */
+				 * buffer. Can be or'ed with flag TCL_COMBINE */
     char *buf)			/* Buffer in which the UTF-8 representation of
 				 * the Tcl_UniChar is stored. Buffer must be
 				 * large enough to hold the UTF-8 character
 				 * (at most 4 bytes). */
 {
+#if TCL_UTF_MAX > 3
+    int flags = ch;
+#endif
+
+    if (ch >= TCL_COMBINE) {
+	ch &= (TCL_COMBINE - 1);
+    }
     if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
 	buf[0] = (char) ch;
 	return 1;
@@ -228,7 +236,11 @@ Tcl_UniCharToUtf(
 	    return 2;
 	}
 	if (ch <= 0xFFFF) {
-	    if ((ch & 0xF800) == 0xD800) {
+	    if (
+#if TCL_UTF_MAX > 3
+		    (flags & TCL_COMBINE) &&
+#endif
+		    ((ch & 0xF800) == 0xD800)) {
 		if (ch & 0x0400) {
 		    /* Low surrogate */
 		    if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {
@@ -377,7 +389,7 @@ Tcl_Char16ToUtfDString(
 	    /* Special case for handling high surrogates. */
 	    p += Tcl_UniCharToUtf(-1, p);
 	}
-	len = Tcl_UniCharToUtf(*w, p);
+	len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p);
 	p += len;
 	if ((*w >= 0xD800) && (len < 3)) {
 	    len = 0; /* Indication that high surrogate was found */
diff --git a/tests/utf.test b/tests/utf.test
index 09599b6..f094a23 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -78,11 +78,11 @@ test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring
 test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} {
     expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]}
 } 1
-test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} {
+test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc} {
     expr {"\UD842" eq "\uD842"}
 } 1
-test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} {
-    expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]}
+test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {testbytestring} {
+    expr {"\UD842" eq [testbytestring \xED\xA1\x82]}
 } 1
 test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} {
     set lo \uDE02
-- 
cgit v0.12