1 files changed, 312 insertions, 76 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 80f3be8..4103eff 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -64,11 +64,16 @@ static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#if TCL_UTF_MAX != 4
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#endif
     2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
     4,4,4,4,4,
 #else
     1,1,1,1,1,
@@ -82,6 +87,8 @@ static const unsigned char totalBytes[256] = {
 
 static int		UtfCount(int ch);
 static int		Invalid(const char *src);
+static int		UCS4ToUpper(int ch);
+static int		UCS4ToTitle(int ch);
 
 /*
  *---------------------------------------------------------------------------
@@ -99,7 +106,7 @@ static int		Invalid(const char *src);
  *---------------------------------------------------------------------------
  */
 
-static INLINE int
+static inline int
 UtfCount(
     int ch)			/* The Unicode character whose size is returned. */
 {
@@ -160,7 +167,7 @@ static const unsigned char bounds[28] = {
 #endif
 };
 
-static INLINE int
+static int
 Invalid(
     const char *src)	/* Points to lead byte of a UTF-8 byte sequence */
 {
@@ -197,7 +204,7 @@ Invalid(
  *---------------------------------------------------------------------------
  */
 
-INLINE int
+int
 Tcl_UniCharToUtf(
     int ch,			/* The Tcl_UniChar to be stored in the
 				 * buffer. */
@@ -217,6 +224,29 @@ Tcl_UniCharToUtf(
 	    return 2;
 	}
 	if (ch <= 0xFFFF) {
+#if TCL_UTF_MAX > 3
+	    if ((ch & 0xF800) == 0xD800) {
+		if (ch & 0x0400) {
+		    /* Low surrogate */
+		    if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {
+			/* Previous Tcl_UniChar was a high surrogate, so combine */
+			buf[2] = (char) ((ch & 0x3F) | 0x80);
+			buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80);
+			return 3;
+		    }
+		    /* Previous Tcl_UniChar was not a high surrogate, so just output */
+		} else {
+		    /* High surrogate */
+		    ch += 0x40;
+		    /* Fill buffer with specific 3-byte (invalid) byte combination,
+		       so following low surrogate can recognize it and combine */
+		    buf[2] = (char) ((ch << 4) & 0x30);
+		    buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
+		    buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
+		    return 1;
+		}
+	    }
+#endif
 	    goto three;
 	}
 
@@ -228,6 +258,16 @@ Tcl_UniCharToUtf(
 	    buf[0] = (char) ((ch >> 18) | 0xF0);
 	    return 4;
 	}
+    } else if (ch == -1) {
+	if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)
+		&& ((buf[-1] & 0xF8) == 0xF0)) {
+	    ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2)
+		    + ((buf[1] & 0x30) >> 4);
+	    buf[1] = (char) ((ch | 0x80) & 0xBF);
+	    buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	    buf[-1] = (char) ((ch >> 12) | 0xE0);
+	    return 2;
+	}
 #endif
     }
 
@@ -305,6 +345,15 @@ Tcl_UniCharToUtfDString(
  *	Tcl_UtfCharComplete() before calling this routine to ensure that
  *	enough bytes remain in the string.
  *
+ *	If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done:
+ *	For any UTF-8 string containing a character outside of the BMP, the
+ *	first call to this function will fill *chPtr with the high surrogate
+ *	and generate a return value of 1. Calling Tcl_UtfToUniChar again
+ *	will produce the low surrogate and a return value of 3. Because *chPtr
+ *	is used to remember whether the high surrogate is already produced, it
+ *	is recommended to initialize the variable it points to as 0 before
+ *	the first call to Tcl_UtfToUniChar is done.
+ *
  * Results:
  *	*chPtr is filled with the Tcl_UniChar, and the return value is the
  *	number of bytes from the UTF-8 string that were consumed.
@@ -335,6 +384,20 @@ Tcl_UtfToUniChar(
 	 * characters representing themselves.
 	 */
 
+#if TCL_UTF_MAX <= 4
+	/* If *chPtr contains a high surrogate (produced by a previous
+	 * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
+	 * bytes, then we must produce a follow-up low surrogate. We only
+	 * do that if the high surrogate matches the bits we encounter.
+	 */
+	if (((byte & 0xC0) == 0x80)
+		&& ((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)
+		&& (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
+		&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))) {
+	    *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
+	    return 3;
+	}
+#endif
 	*chPtr = byte;
 	return 1;
     } else if (byte < 0xE0) {
@@ -371,17 +434,30 @@ Tcl_UtfToUniChar(
 	 * represents itself.
 	 */
     }
-#if TCL_UTF_MAX > 3
     else if (byte < 0xF5) {
-	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
 	    /*
-	     * Four-byte-character lead byte followed by three trail bytes.
+	     * Four-byte-character lead byte followed by at least two trail bytes.
+	     * We don't test the validity of 3th trail byte, see [ed29806ba]
 	     */
-	    *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
-		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
-	    if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
-		return 4;
+#if TCL_UTF_MAX <= 4
+	    Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+		    | ((src[2] & 0x3F) >> 4)) - 0x40;
+	    if (high < 0x400) {
+		/* produce high surrogate, advance source pointer */
+		*chPtr = 0xD800 + high;
+		return 1;
 	    }
+	    /* out of range, < 0x10000 or > 0x10FFFF */
+#else
+	    if ((src[3] & 0xC0) == 0x80) {
+		*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+			| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+		if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+		    return 4;
+		}
+	    }
+#endif
 	}
 
 	/*
@@ -389,7 +465,6 @@ Tcl_UtfToUniChar(
 	 * represents itself.
 	 */
     }
-#endif
 
     *chPtr = byte;
     return 1;
@@ -422,13 +497,13 @@ Tcl_UtfToUniCharDString(
 				 * appended to this previously initialized
 				 * DString. */
 {
-    Tcl_UniChar *w, *wString;
+    Tcl_UniChar ch = 0, *w, *wString;
     const char *p;
     int oldLength;
-	/* Pointer to the end of string. Never read endPtr[0] */
-	const char *endPtr = src + length;
-	/* Pointer to breakpoint in scan where optimization is lost */
-	const char *optPtr = endPtr - TCL_UTF_MAX;
+    /* Pointer to the end of string. Never read endPtr[0] */
+    const char *endPtr = src + length;
+    /* Pointer to last byte where optimization still can be used */
+    const char *optPtr = endPtr - TCL_UTF_MAX;
 
     if (length < 0) {
 	length = strlen(src);
@@ -450,11 +525,12 @@ Tcl_UtfToUniCharDString(
     endPtr = src + length;
     optPtr = endPtr - TCL_UTF_MAX;
     while (p <= optPtr) {
-	p += TclUtfToUniChar(p, w);
-	w++;
+	p += TclUtfToUniChar(p, &ch);
+	*w++ = ch;
     }
     while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) {
-	p += TclUtfToUniChar(p, w++);
+	p += TclUtfToUniChar(p, &ch);
+	*w++ = ch;
     }
     while (p < endPtr) {
 	*w++ = UCHAR(*p++);
@@ -518,7 +594,7 @@ Tcl_NumUtfChars(
     int length)		/* The length of the string in bytes, or -1
 			 * for strlen(string). */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch = 0;
     int i = 0;
 
     if (length < 0) {
@@ -544,12 +620,26 @@ Tcl_NumUtfChars(
 	 */
 	while (src <= optPtr
 		/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
+#if TCL_UTF_MAX < 4
+	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
+		/* treat F0 - F4 as single character */
+		ch = 0;
+		src++;
+	    } else
+#endif
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
 	/* Loop over the remaining string where call must happen */
 	while (src < endPtr) {
 	    if (Tcl_UtfCharComplete(src, endPtr - src)) {
+#if TCL_UTF_MAX < 4
+		if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
+		    /* treat F0 - F4 as single character */
+		    ch = 0;
+		    src++;
+		} else
+#endif
 		src += TclUtfToUniChar(src, &ch);
 	    } else {
 		/*
@@ -586,11 +676,11 @@ Tcl_NumUtfChars(
 const char *
 Tcl_UtfFindFirst(
     const char *src,		/* The UTF-8 string to be searched. */
-    int ch)			/* The Tcl_UniChar to search for. */
+    int ch)			/* The Unicode character to search for. */
 {
     while (1) {
-	Tcl_UniChar find;
-	int len = TclUtfToUniChar(src, &find);
+	int find, len = TclUtfToUCS4(src, &find);
+
 	if (find == ch) {
 	    return src;
 	}
@@ -628,8 +718,7 @@ Tcl_UtfFindLast(
     const char *last = NULL;
 
     while (1) {
-	Tcl_UniChar find;
-	int len = TclUtfToUniChar(src, &find);
+	int find, len = TclUtfToUCS4(src, &find);
 
 	if (find == ch) {
 	    last = src;
@@ -799,15 +888,19 @@ Tcl_UtfPrev(
 
 	/* Continue the search backwards... */
 	look--;
-    } while (trailBytesSeen < TCL_UTF_MAX);
+    } while (trailBytesSeen < ((TCL_UTF_MAX > 4) ? 4 : 3));
 
     /*
-     * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
+     * We've seen 3 (or 4) trail bytes, so we know there will not be a
      * properly formed byte sequence to find, and we can stop looking,
-     * accepting the fallback.
+     * accepting the fallback (for TCL_UTF_MAX > 4) or just go back as
+     * far as we can.
      */
-
+#if TCL_UTF_MAX > 4
     return fallback;
+#else
+    return src - 3;
+#endif
 }
 
 /*
@@ -832,7 +925,7 @@ Tcl_UniCharAtIndex(
     const char *src,	/* The UTF-8 string to dereference. */
     int index)		/* The position of the desired character. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch = 0;
 
     TclUtfToUniChar(Tcl_UtfAtIndex(src, index), &ch);
     return ch;
@@ -860,11 +953,19 @@ Tcl_UtfAtIndex(
     const char *src,	/* The UTF-8 string. */
     int index)		/* The position of the desired character. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch = 0;
+    int len = 0;
 
     while (index-- > 0) {
+	len = TclUtfToUniChar(src, &ch);
+	src += len;
+    }
+#if TCL_UTF_MAX == 4
+    if ((ch >= 0xD800) && (len < 3)) {
+	/* Index points at character following high Surrogate */
 	src += TclUtfToUniChar(src, &ch);
     }
+#endif
     return src;
 }
 
@@ -943,7 +1044,7 @@ int
 Tcl_UtfToUpper(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch, upChar;
+    int ch, upChar;
     char *src, *dst;
     int len;
 
@@ -953,8 +1054,8 @@ Tcl_UtfToUpper(
 
     src = dst = str;
     while (*src) {
-	len = TclUtfToUniChar(src, &ch);
-	upChar = Tcl_UniCharToUpper(ch);
+	len = TclUtfToUCS4(src, &ch);
+	upChar = UCS4ToUpper(ch);
 
 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -962,7 +1063,7 @@ Tcl_UtfToUpper(
 	 * char to dst if its size is <= the original char.
 	 */
 
-	if (len < UtfCount(upChar)) {
+	if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
@@ -996,7 +1097,7 @@ int
 Tcl_UtfToLower(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch, lowChar;
+    int ch, lowChar;
     char *src, *dst;
     int len;
 
@@ -1006,8 +1107,8 @@ Tcl_UtfToLower(
 
     src = dst = str;
     while (*src) {
-	len = TclUtfToUniChar(src, &ch);
-	lowChar = Tcl_UniCharToLower(ch);
+	len = TclUtfToUCS4(src, &ch);
+	lowChar = TclUCS4ToLower(ch);
 
 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -1015,7 +1116,7 @@ Tcl_UtfToLower(
 	 * char to dst if its size is <= the original char.
 	 */
 
-	if (len < UtfCount(lowChar)) {
+	if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
@@ -1050,7 +1151,7 @@ int
 Tcl_UtfToTitle(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch, titleChar, lowChar;
+    int ch, titleChar, lowChar;
     char *src, *dst;
     int len;
 
@@ -1062,10 +1163,10 @@ Tcl_UtfToTitle(
     src = dst = str;
 
     if (*src) {
-	len = TclUtfToUniChar(src, &ch);
-	titleChar = Tcl_UniCharToTitle(ch);
+	len = TclUtfToUCS4(src, &ch);
+	titleChar = UCS4ToTitle(ch);
 
-	if (len < UtfCount(titleChar)) {
+	if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
@@ -1074,14 +1175,14 @@ Tcl_UtfToTitle(
 	src += len;
     }
     while (*src) {
-	len = TclUtfToUniChar(src, &ch);
+	len = TclUtfToUCS4(src, &ch);
 	lowChar = ch;
 	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
 	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
-	    lowChar = Tcl_UniCharToLower(lowChar);
+	    lowChar = TclUCS4ToLower(lowChar);
 	}
 
-	if (len < UtfCount(lowChar)) {
+	if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
@@ -1163,7 +1264,7 @@ Tcl_UtfNcmp(
     const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
-    Tcl_UniChar ch1, ch2;
+    Tcl_UniChar ch1 = 0, ch2 = 0;
 
     /*
      * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
@@ -1181,6 +1282,16 @@ Tcl_UtfNcmp(
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
 	if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+	    /* Surrogates always report higher than non-surrogates */
+	    if (((ch1 & ~0x3FF) == 0xD800)) {
+	    if ((ch2 & ~0x3FF) != 0xD800) {
+		return ch1;
+	    }
+	    } else if ((ch2 & ~0x3FF) == 0xD800) {
+		return -ch2;
+	    }
+#endif
 	    return (ch1 - ch2);
 	}
     }
@@ -1211,7 +1322,8 @@ Tcl_UtfNcasecmp(
     const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
-    Tcl_UniChar ch1, ch2;
+    Tcl_UniChar ch1 = 0, ch2 = 0;
+
     while (numChars-- > 0) {
 	/*
 	 * n must be interpreted as chars, not bytes.
@@ -1221,6 +1333,16 @@ Tcl_UtfNcasecmp(
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
 	if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+	    /* Surrogates always report higher than non-surrogates */
+	    if (((ch1 & 0xFC00) == 0xD800)) {
+	    if ((ch2 & 0xFC00) != 0xD800) {
+		return ch1;
+	    }
+	    } else if ((ch2 & 0xFC00) == 0xD800) {
+		return -ch2;
+	    }
+#endif
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
 	    if (ch1 != ch2) {
@@ -1254,12 +1376,22 @@ TclUtfCasecmp(
     const char *cs,		/* UTF string to compare to ct. */
     const char *ct)		/* UTF string cs is compared to. */
 {
-    Tcl_UniChar ch1, ch2;
+    Tcl_UniChar ch1 = 0, ch2 = 0;
 
     while (*cs && *ct) {
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
 	if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+	    /* Surrogates always report higher than non-surrogates */
+	    if (((ch1 & 0xFC00) == 0xD800)) {
+	    if ((ch2 & 0xFC00) != 0xD800) {
+		return ch1;
+	    }
+	    } else if ((ch2 & 0xFC00) == 0xD800) {
+		return -ch2;
+	    }
+#endif
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
 	    if (ch1 != ch2) {
@@ -1287,24 +1419,26 @@ TclUtfCasecmp(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
-Tcl_UniCharToUpper(
+static int
+UCS4ToUpper(
     int ch)			/* Unicode character to convert. */
 {
-#if TCL_UTF_MAX > 3
     if (!UNICODE_OUT_OF_RANGE(ch)) {
-#endif
 	int info = GetUniCharInfo(ch);
 
 	if (GetCaseType(info) & 0x04) {
 	    ch -= GetDelta(info);
 	}
-#if TCL_UTF_MAX > 3
     }
     /* Clear away extension bits, if any */
-    ch &= 0x1FFFFF;
-#endif
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
+}
+
+Tcl_UniChar
+Tcl_UniCharToUpper(
+    int ch)			/* Unicode character to convert. */
+{
+    return (Tcl_UniChar) UCS4ToUpper(ch);
 }
 
 /*
@@ -1323,25 +1457,27 @@ Tcl_UniCharToUpper(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
-Tcl_UniCharToLower(
+int
+TclUCS4ToLower(
     int ch)			/* Unicode character to convert. */
 {
-#if TCL_UTF_MAX > 3
     if (!UNICODE_OUT_OF_RANGE(ch)) {
-#endif
 	int info = GetUniCharInfo(ch);
 	int mode = GetCaseType(info);
 
 	if ((mode & 0x02) && (mode != 0x7)) {
 	    ch += GetDelta(info);
 	}
-#if TCL_UTF_MAX > 3
     }
     /* Clear away extension bits, if any */
-    ch &= 0x1FFFFF;
-#endif
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
+}
+
+Tcl_UniChar
+Tcl_UniCharToLower(
+    int ch)			/* Unicode character to convert. */
+{
+    return (Tcl_UniChar) TclUCS4ToLower(ch);
 }
 
 /*
@@ -1360,13 +1496,11 @@ Tcl_UniCharToLower(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
-Tcl_UniCharToTitle(
+static int
+UCS4ToTitle(
     int ch)			/* Unicode character to convert. */
 {
-#if TCL_UTF_MAX > 3
     if (!UNICODE_OUT_OF_RANGE(ch)) {
-#endif
 	int info = GetUniCharInfo(ch);
 	int mode = GetCaseType(info);
 
@@ -1381,12 +1515,16 @@ Tcl_UniCharToTitle(
 	} else if (mode == 0x4) {
 	    ch -= GetDelta(info);
 	}
-#if TCL_UTF_MAX > 3
     }
     /* Clear away extension bits, if any */
-    ch &= 0x1FFFFF;
-#endif
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
+}
+
+Tcl_UniChar
+Tcl_UniCharToTitle(
+    int ch)			/* Unicode character to convert. */
+{
+    return (Tcl_UniChar) UCS4ToTitle(ch);
 }
 
 /*
@@ -1771,7 +1909,8 @@ Tcl_UniCharIsSpace(
     } else if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
 #endif
-    } else if (ch == 0x180E || ch == 0x202F) {
+    } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
+	    || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
 	return 1;
     } else {
 	return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1865,7 +2004,7 @@ Tcl_UniCharCaseMatch(
 				 * characters. */
     int nocase)			/* 0 for case sensitive, 1 for insensitive */
 {
-    Tcl_UniChar ch1, p;
+    Tcl_UniChar ch1 = 0, p;
 
     while (1) {
 	p = *uniPattern;
@@ -2216,6 +2355,103 @@ TclUniCharMatch(
 }
 
 /*
+ *---------------------------------------------------------------------------
+ *
+ * TclUtfToUCS4 --
+ *
+ *	Extract the 4-byte codepoint from the leading bytes of the
+ *	Modified UTF-8 string "src".  This is a utility routine to
+ *	contain the surrogate gymnastics in one place.
+ *
+ *	The caller must ensure that the source buffer is long enough that this
+ *	routine does not run off the end and dereference non-existent memory
+ *	looking for trail bytes. If the source buffer is known to be '\0'
+ *	terminated, this cannot happen. Otherwise, the caller should call
+ *	TclUCS4Complete() before calling this routine to ensure that
+ *	enough bytes remain in the string.
+ *
+ * Results:
+ *	*usc4Ptr is filled with the UCS4 code point, and the return value is
+ *	the number of bytes from the UTF-8 string that were consumed.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUtfToUCS4(
+    const char *src,	/* The UTF-8 string. */
+    int *ucs4Ptr)	/* Filled with the UCS4 codepoint represented
+			 * by the UTF-8 string. */
+{
+    Tcl_UniChar ch = 0;
+    int len = Tcl_UtfToUniChar(src, &ch);
+
+#if TCL_UTF_MAX <= 4
+    if ((ch & ~0x3FF) == 0xD800) {
+	Tcl_UniChar low = ch;
+	int len2 = Tcl_UtfToUniChar(src+len, &low);
+	if ((low & ~0x3FF) == 0xDC00) {
+	    *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
+	    return len + len2;
+	}
+    }
+#endif
+    *ucs4Ptr = (int)ch;
+    return len;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * TclUCS4ToUtf --
+ *
+ *	Store the given Unicode character as a sequence of UTF-8 bytes in the
+ *	provided buffer. Might output 6 bytes, if the code point > 0xFFFF.
+ *
+ * Results:
+ *	The return values is the number of bytes in the buffer that were
+ *	consumed. If ch == -1, this function outputs 0 bytes (empty string),
+ *	since TclGetUCS4 returns -1 for out-of-range indices.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUCS4ToUtf(
+    int ch,			/* Unicode character to be stored in the
+				 * buffer. */
+    char *buf)			/* Buffer in which the UTF-8 representation of
+				 * the Unicode character is stored. Buffer must be
+				 * large enough to hold the UTF-8 character(s)
+				 * (at most 6 bytes). */
+{
+#if TCL_UTF_MAX <= 4
+    if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
+	/* Spit out a 4-byte UTF-8 character or 2 x 3-byte UTF-8 characters, depending on Tcl
+	 * version and/or TCL_UTF_MAX build value */
+	int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf);
+	return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x7FF), buf + len);
+    }
+#endif
+    if ((ch & ~0x7FF) == 0xD800) {
+	buf[2] = (char) ((ch | 0x80) & 0xBF);
+	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 12) | 0xE0);
+	return 3;
+    }
+    if (ch == -1) {
+	return 0;
+    }
+    return Tcl_UniCharToUtf(ch, buf);
+}
+
+/*
  * Local Variables:
  * mode: c
  * c-basic-offset: 4