5 files changed, 77 insertions, 41 deletions
diff --git a/generic/tclBinary.c b/generic/tclBinary.c
index 6306159..52ef457 100644
--- a/generic/tclBinary.c
+++ b/generic/tclBinary.c
@@ -1222,11 +1222,11 @@ BinaryFormatCmd(
 
  badField:
     {
-	Tcl_UniChar ch = 0;
-	char buf[TCL_UTF_MAX + 1] = "";
+	int ch;
+	char buf[8] = "";
 
-	TclUtfToUniChar(errorString, &ch);
-	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
+	TclUtfToUCS4(errorString, &ch);
+	buf[TclUCS4ToUtf(ch, buf)] = '\0';
 	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
 		"bad field specifier \"%s\"", buf));
 	return TCL_ERROR;
@@ -1592,11 +1592,11 @@ BinaryScanCmd(
 
  badField:
     {
-	Tcl_UniChar ch = 0;
-	char buf[TCL_UTF_MAX + 1] = "";
+	int ch;
+	char buf[8] = "";
 
-	TclUtfToUniChar(errorString, &ch);
-	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
+	TclUtfToUCS4(errorString, &ch);
+	buf[TclUCS4ToUtf(ch, buf)] = '\0';
 	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
 		"bad field specifier \"%s\"", buf));
 	return TCL_ERROR;
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 162a5a6..011164b 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1413,14 +1413,9 @@ StringIndexCmd(
 
 	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
 	} else {
-	    char buf[TCL_UTF_MAX] = "";
+	    char buf[8] = "";
 
-	    length = Tcl_UniCharToUtf(ch, buf);
-#if TCL_UTF_MAX > 3
-	    if ((ch >= 0xD800) && (length < 3)) {
-		length += Tcl_UniCharToUtf(-1, buf + length);
-	    }
-#endif
+	    length = TclUCS4ToUtf(ch, buf);
 	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
 	}
     }
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 5c46470..6f024a6 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3184,6 +3184,8 @@ MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
 MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
 MODULE_SCOPE int	TclUtfToUCS4(const char *src, int *ucs4Ptr);
+MODULE_SCOPE int	TclUCS4ToUtf(int ch, char *buf);
+
 /*
  * Bytes F0-F4 are start-bytes for 4-byte sequences.
  * Byte 0xED can be the start-byte of an upper surrogate. In that case,
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 7beaeea..23a07cf 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -843,7 +843,6 @@ TclParseBackslash(
 				 * written there. */
 {
     register const char *p = src+1;
-    Tcl_UniChar unichar = 0;
     int result;
     int count;
     char buf[TCL_UTF_MAX] = "";
@@ -943,7 +942,7 @@ TclParseBackslash(
 	     * No hexdigits -> This is just "U".
 	     */
 	    result = 'U';
-	} else if ((result | 0x7FF) == 0xDFFF) {
+	} else if ((result & ~0x7FF) == 0xD800) {
 	    /* Upper or lower surrogate, not allowed in this syntax. */
 	    result = 0xFFFD;
 	}
@@ -991,16 +990,15 @@ TclParseBackslash(
 	 * #217987] test subst-3.2
 	 */
 
-	if (Tcl_UtfCharComplete(p, numBytes - 1)) {
-	    count = TclUtfToUniChar(p, &unichar) + 1;	/* +1 for '\' */
+	if (TclUCS4Complete(p, numBytes - 1)) {
+	    count = TclUtfToUCS4(p, &result) + 1;	/* +1 for '\' */
 	} else {
-	    char utfBytes[TCL_UTF_MAX];
+	    char utfBytes[8];
 
-	    memcpy(utfBytes, p, (size_t) (numBytes - 1));
+	    memcpy(utfBytes, p, numBytes - 1);
 	    utfBytes[numBytes - 1] = '\0';
-	    count = TclUtfToUniChar(utfBytes, &unichar) + 1;
+	    count = TclUtfToUCS4(utfBytes, &result) + 1;
 	}
-	result = unichar;
 	break;
     }
 
@@ -1008,13 +1006,12 @@ TclParseBackslash(
     if (readPtr != NULL) {
 	*readPtr = count;
     }
-    count = Tcl_UniCharToUtf(result, dst);
-#if TCL_UTF_MAX > 3
-     if ((result >= 0xD800) && (count < 3)) {
-	count += Tcl_UniCharToUtf(-1, dst + count);
+#if TCL_UTF_MAX < 4
+    if (result > 0xFFFF) {
+    	result = 0xFFFD;
     }
 #endif
-    return count;
+    return TclUCS4ToUtf(result, dst);
 }
 
 /*
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 03a7ca9..a14ce71 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2354,7 +2354,7 @@ TclUniCharMatch(
  *	routine does not run off the end and dereference non-existent memory
  *	looking for trail bytes. If the source buffer is known to be '\0'
  *	terminated, this cannot happen. Otherwise, the caller should call
- *	Tcl_UtfCharComplete() before calling this routine to ensure that
+ *	TclUCS4Complete() before calling this routine to ensure that
  *	enough bytes remain in the string.
  *
  * Results:
@@ -2373,26 +2373,68 @@ TclUtfToUCS4(
     int *ucs4Ptr)	/* Filled with the UCS4 codepoint represented
 			 * by the UTF-8 string. */
 {
-    int len, fullchar;
     Tcl_UniChar ch = 0;
+    int len = Tcl_UtfToUniChar(src, &ch);
 
-    len = TclUtfToUniChar(src, &ch);
-    fullchar = ch;
-
-#if TCL_UTF_MAX == 4
-    /* 4-byte UTF-8 is supported; decode surrogates */
-
-    if ((ch >= 0xD800) && len < 3) {
-	len += Tcl_UtfToUniChar(src + len, &ch);
-	fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
+#if TCL_UTF_MAX <= 4
+    if ((ch & ~0x3FF) == 0xD800) {
+	Tcl_UniChar low = ch;
+	int len2 = Tcl_UtfToUniChar(src+len, &low);
+	if ((low & ~0x3FF) == 0xDC00) {
+	    *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
+	    return len + len2;
+	}
     }
 #endif
-
-    *ucs4Ptr = fullchar;
+    *ucs4Ptr = (int)ch;
     return len;
 }
 
 /*
+ *---------------------------------------------------------------------------
+ *
+ * TclUCS4ToUtf --
+ *
+ *	Store the given Unicode code point as a sequence of UTF-8 bytes in the
+ *	provided buffer. Might output 6 bytes, if the code point > 0xFFFF.
+ *
+ * Results:
+ *	The return value is the number of bytes that were written to the
+ *	buffer.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUCS4ToUtf(
+    int ch,			/* Unicode character to be stored in the
+				 * buffer. */
+    char *buf)			/* Buffer in which the UTF-8 representation of
+				 * the Unicode character is stored. Buffer must be
+				 * large enough to hold the UTF-8 character(s)
+				 * (at most 6 bytes). */
+{
+#if TCL_UTF_MAX <= 4
+    if (((unsigned)ch - 0x10000U) <= 0xFFFFFU) {
+	/* Code point in 0x10000..0x10FFFF: emit it as a high/low surrogate pair through
+	 * Tcl_UniCharToUtf. The range test subtracts in unsigned so no signed overflow can occur. */
+	int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf);
+	return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x3FF), buf + len);
+    }
+#endif
+    if ((ch & ~0x7FF) == 0xD800) {	/* Lone surrogate: write its 3-byte form directly. */
+	buf[2] = (char) ((ch | 0x80) & 0xBF);
+	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 12) | 0xE0);
+	return 3;
+    }
+    return Tcl_UniCharToUtf(ch, buf);
+}
+
+/*
  * Local Variables:
  * mode: c
  * c-basic-offset: 4