1 files changed, 159 insertions, 115 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e5497a4..68119a4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -59,7 +59,7 @@
  * UTF-8.
  */
 
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -73,16 +73,7 @@ static CONST unsigned char totalBytes[256] = {
 #else
     1,1,1,1,1,1,1,1,
 #endif
-#if TCL_UTF_MAX > 4
-    5,5,5,5,
-#else
-    1,1,1,1,
-#endif
-#if TCL_UTF_MAX > 5
-    6,6,6,6
-#else
-    1,1,1,1
-#endif
+    1,1,1,1,1,1,1,1
 };
 
 /*
@@ -111,25 +102,16 @@ INLINE static int
 UtfCount(
     int ch)			/* The Tcl_UniChar whose size is returned. */
 {
-    if ((ch > 0) && (ch < UNICODE_SELF)) {
+    if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
 	return 1;
     }
     if (ch <= 0x7FF) {
 	return 2;
     }
-    if (ch <= 0xFFFF) {
-	return 3;
-    }
 #if TCL_UTF_MAX > 3
-    if (ch <= 0x1FFFFF) {
+    if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
 	return 4;
     }
-    if (ch <= 0x3FFFFFF) {
-	return 5;
-    }
-    if (ch <= 0x7FFFFFFF) {
-	return 6;
-    }
 #endif
     return 3;
 }
@@ -161,7 +143,7 @@ Tcl_UniCharToUtf(
 				 * large enough to hold the UTF-8 character
 				 * (at most TCL_UTF_MAX bytes). */
 {
-    if ((ch > 0) && (ch < UNICODE_SELF)) {
+    if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
 	buf[0] = (char) ch;
 	return 1;
     }
@@ -172,43 +154,43 @@ Tcl_UniCharToUtf(
 	    return 2;
 	}
 	if (ch <= 0xFFFF) {
-	three:
-	    buf[2] = (char) ((ch | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 12) | 0xE0);
-	    return 3;
+#if TCL_UTF_MAX == 4
+	    if ((ch & 0xF800) == 0xD800) {
+		if (ch & 0x0400) {
+		    /* Low surrogate */
+		    buf[3] = (char) ((ch | 0x80) & 0xBF);
+		    buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
+		    return 4;
+		} else {
+		    /* High surrogate */
+		    ch += 0x40;
+		    buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
+		    buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
+		    buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+		    return 0;
+		}
+	    }
+#endif
+	    goto three;
 	}
 
 #if TCL_UTF_MAX > 3
-	if (ch <= 0x1FFFFF) {
+	if (ch <= 0x10FFFF) {
 	    buf[3] = (char) ((ch | 0x80) & 0xBF);
 	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
 	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
 	    buf[0] = (char) ((ch >> 18) | 0xF0);
 	    return 4;
 	}
-	if (ch <= 0x3FFFFFF) {
-	    buf[4] = (char) ((ch | 0x80) & 0xBF);
-	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 24) | 0xF8);
-	    return 5;
-	}
-	if (ch <= 0x7FFFFFFF) {
-	    buf[5] = (char) ((ch | 0x80) & 0xBF);
-	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
-	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 30) | 0xFC);
-	    return 6;
-	}
 #endif
     }
 
     ch = 0xFFFD;
-    goto three;
+three:
+    buf[2] = (char) ((ch | 0x80) & 0xBF);
+    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+    buf[0] = (char) ((ch >> 12) | 0xE0);
+    return 3;
 }
 
 /*
@@ -231,13 +213,13 @@ Tcl_UniCharToUtf(
 
 char *
 Tcl_UniCharToUtfDString(
-    CONST Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
+    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
     int uniLength,		/* Length of Unicode string in Tcl_UniChars
 				 * (must be >= 0). */
     Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
 				 * to this previously initialized DString. */
 {
-    CONST Tcl_UniChar *w, *wEnd;
+    const Tcl_UniChar *w, *wEnd;
     char *p, *string;
     int oldLength;
 
@@ -289,7 +271,7 @@ Tcl_UniCharToUtfDString(
 
 int
 Tcl_UtfToUniChar(
-    register CONST char *src,	/* The UTF-8 string. */
+    register const char *src,	/* The UTF-8 string. */
     register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
 				 * the UTF-8 string. */
 {
@@ -323,9 +305,6 @@ Tcl_UtfToUniChar(
 	 * A two-byte-character lead-byte not followed by trail-byte
 	 * represents itself.
 	 */
-
-	*chPtr = (Tcl_UniChar) byte;
-	return 1;
     } else if (byte < 0xF0) {
 	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
 	    /*
@@ -341,31 +320,23 @@ Tcl_UtfToUniChar(
 	 * A three-byte-character lead-byte not followed by two trail-bytes
 	 * represents itself.
 	 */
-
-	*chPtr = (Tcl_UniChar) byte;
-	return 1;
     }
 #if TCL_UTF_MAX > 3
-    {
-	int ch, total, trail;
-
-	total = totalBytes[byte];
-	trail = total - 1;
-	if (trail > 0) {
-	    ch = byte & (0x3F >> trail);
-	    do {
-		src++;
-		if ((*src & 0xC0) != 0x80) {
-		    *chPtr = byte;
-		    return 1;
-		}
-		ch <<= 6;
-		ch |= (*src & 0x3F);
-		trail--;
-	    } while (trail > 0);
-	    *chPtr = ch;
-	    return total;
+    else if (byte < 0xF8) {
+	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+	    /*
+	     * Four-byte-character lead byte followed by three trail bytes.
+	     */
+
+	    *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
+		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+	    return 4;
 	}
+
+	/*
+	 * A four-byte-character lead-byte not followed by three trail-bytes
+	 * represents itself.
+	 */
     }
 #endif
 
@@ -393,7 +364,7 @@ Tcl_UtfToUniChar(
 
 Tcl_UniChar *
 Tcl_UtfToUniCharDString(
-    CONST char *src,		/* UTF-8 string to convert to Unicode. */
+    const char *src,		/* UTF-8 string to convert to Unicode. */
     int length,			/* Length of UTF-8 string in bytes, or -1 for
 				 * strlen(). */
     Tcl_DString *dsPtr)		/* Unicode representation of string is
@@ -401,7 +372,7 @@ Tcl_UtfToUniCharDString(
 				 * DString. */
 {
     Tcl_UniChar *w, *wString;
-    CONST char *p, *end;
+    const char *p, *end;
     int oldLength;
 
     if (length < 0) {
@@ -414,6 +385,7 @@ Tcl_UtfToUniCharDString(
      */
 
     oldLength = Tcl_DStringLength(dsPtr);
+/* TODO: fix overreach! */
     Tcl_DStringSetLength(dsPtr,
 	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
@@ -452,7 +424,7 @@ Tcl_UtfToUniCharDString(
 
 int
 Tcl_UtfCharComplete(
-    CONST char *src,		/* String to check if first few bytes contain
+    const char *src,		/* String to check if first few bytes contain
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
@@ -482,7 +454,7 @@ Tcl_UtfCharComplete(
 
 int
 Tcl_NumUtfChars(
-    register CONST char *src,	/* The UTF-8 string to measure. */
+    register const char *src,	/* The UTF-8 string to measure. */
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
@@ -540,9 +512,9 @@ Tcl_NumUtfChars(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfFindFirst(
-    CONST char *src,		/* The UTF-8 string to be searched. */
+    const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
@@ -579,14 +551,14 @@ Tcl_UtfFindFirst(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfFindLast(
-    CONST char *src,		/* The UTF-8 string to be searched. */
+    const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
     Tcl_UniChar find;
-    CONST char *last;
+    const char *last;
 
     last = NULL;
     while (1) {
@@ -621,9 +593,9 @@ Tcl_UtfFindLast(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfNext(
-    CONST char *src)		/* The current location in the string. */
+    const char *src)		/* The current location in the string. */
 {
     Tcl_UniChar ch;
 
@@ -651,13 +623,13 @@ Tcl_UtfNext(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfPrev(
-    CONST char *src,		/* The current location in the string. */
-    CONST char *start)		/* Pointer to the beginning of the string, to
+    const char *src,		/* The current location in the string. */
+    const char *start)		/* Pointer to the beginning of the string, to
 				 * avoid going backwards too far. */
 {
-    CONST char *look;
+    const char *look;
     int i, byte;
 
     src--;
@@ -700,10 +672,10 @@ Tcl_UtfPrev(
 
 Tcl_UniChar
 Tcl_UniCharAtIndex(
-    register CONST char *src,	/* The UTF-8 string to dereference. */
+    register const char *src,	/* The UTF-8 string to dereference. */
     register int index)		/* The position of the desired character. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch = 0;
 
     while (index >= 0) {
 	index--;
@@ -729,9 +701,9 @@ Tcl_UniCharAtIndex(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfAtIndex(
-    register CONST char *src,	/* The UTF-8 string. */
+    register const char *src,	/* The UTF-8 string. */
     register int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch;
@@ -771,7 +743,7 @@ Tcl_UtfAtIndex(
 
 int
 Tcl_UtfBackslash(
-    CONST char *src,		/* Points to the backslash character of a
+    const char *src,		/* Points to the backslash character of a
 				 * backslash sequence. */
     int *readPtr,		/* Fill in with number of characters read from
 				 * src, unless NULL. */
@@ -983,8 +955,8 @@ Tcl_UtfToTitle(
 
 int
 TclpUtfNcmp2(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numBytes)	/* Number of *bytes* to compare. */
 {
     /*
@@ -1030,8 +1002,8 @@ TclpUtfNcmp2(
 
 int
 Tcl_UtfNcmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
@@ -1078,8 +1050,8 @@ Tcl_UtfNcmp(
 
 int
 Tcl_UtfNcasecmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
@@ -1122,8 +1094,8 @@ Tcl_UtfNcasecmp(
 
 int
 TclUtfCasecmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct)		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct)		/* UTF string cs is compared to. */
 {
     while (*cs && *ct) {
 	Tcl_UniChar ch1, ch2;
@@ -1252,7 +1224,7 @@ Tcl_UniCharToTitle(
 
 int
 Tcl_UniCharLen(
-    CONST Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
+    const Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
 {
     int len = 0;
 
@@ -1282,8 +1254,8 @@ Tcl_UniCharLen(
 
 int
 Tcl_UniCharNcmp(
-    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
     unsigned long numChars)	/* Number of unichars to compare. */
 {
 #ifdef WORDS_BIGENDIAN
@@ -1327,8 +1299,8 @@ Tcl_UniCharNcmp(
 
 int
 Tcl_UniCharNcasecmp(
-    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
     unsigned long numChars)	/* Number of unichars to compare. */
 {
     for ( ; numChars != 0; numChars--, ucs++, uct++) {
@@ -1364,6 +1336,11 @@ int
 Tcl_UniCharIsAlnum(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+    }
+#endif
     return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
 }
 
@@ -1387,6 +1364,11 @@ int
 Tcl_UniCharIsAlpha(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+    }
+#endif
     return ((ALPHA_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1410,6 +1392,18 @@ int
 Tcl_UniCharIsControl(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	ch &= 0x1fffff;
+	if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
+	    return 1;
+	}
+	if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
+	    return 1;
+	}
+	return 0;
+    }
+#endif
     return ((CONTROL_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1433,6 +1427,11 @@ int
 Tcl_UniCharIsDigit(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+    }
+#endif
     return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
 }
 
@@ -1456,6 +1455,12 @@ int
 Tcl_UniCharIsGraph(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	ch &= 0x1fffff;
+	return (ch >= 0xe0100) && (ch <= 0xe01ef);
+    }
+#endif
     return ((GRAPH_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1479,6 +1484,11 @@ int
 Tcl_UniCharIsLower(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+    }
+#endif
     return (GetCategory(ch) == LOWERCASE_LETTER);
 }
 
@@ -1502,6 +1512,12 @@ int
 Tcl_UniCharIsPrint(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	ch &= 0x1fffff;
+	return (ch >= 0xe0100) && (ch <= 0xe01ef);
+    }
+#endif
     return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
 }
 
@@ -1525,6 +1541,11 @@ int
 Tcl_UniCharIsPunct(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+    }
+#endif
     return ((PUNCT_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1548,14 +1569,27 @@ int
 Tcl_UniCharIsSpace(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    /* Ignore upper 11 bits. */
+    ch &= 0x1fffff;
+#else
+    /* Ignore upper 16 bits. */
+    ch &= 0xffff;
+#endif
+
     /*
      * If the character is within the first 127 characters, just use the
      * standard C function, otherwise consult the Unicode table.
      */
 
-    if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+    if (ch < 0x80) {
 	return TclIsSpaceProc((char) ch);
-    } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
+#if TCL_UTF_MAX > 3
+    } else if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+#endif
+    } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
+	    || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
 	return 1;
     } else {
 	return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1582,6 +1616,11 @@ int
 Tcl_UniCharIsUpper(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+    }
+#endif
     return (GetCategory(ch) == UPPERCASE_LETTER);
 }
 
@@ -1605,6 +1644,11 @@ int
 Tcl_UniCharIsWordChar(
     int ch)			/* Unicode character to test. */
 {
+#if TCL_UTF_MAX > 3
+    if (UNICODE_OUT_OF_RANGE(ch)) {
+	return 0;
+    }
+#endif
     return ((WORD_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1633,8 +1677,8 @@ Tcl_UniCharIsWordChar(
 
 int
 Tcl_UniCharCaseMatch(
-    CONST Tcl_UniChar *uniStr,	/* Unicode String. */
-    CONST Tcl_UniChar *uniPattern,
+    const Tcl_UniChar *uniStr,	/* Unicode String. */
+    const Tcl_UniChar *uniPattern,
 				/* Pattern, which may contain special
 				 * characters. */
     int nocase)			/* 0 for case sensitive, 1 for insensitive */
@@ -1821,14 +1865,14 @@ Tcl_UniCharCaseMatch(
 
 int
 TclUniCharMatch(
-    CONST Tcl_UniChar *string,	/* Unicode String. */
+    const Tcl_UniChar *string,	/* Unicode String. */
     int strLen,			/* Length of String */
-    CONST Tcl_UniChar *pattern,	/* Pattern, which may contain special
+    const Tcl_UniChar *pattern,	/* Pattern, which may contain special
 				 * characters. */
     int ptnLen,			/* Length of Pattern */
     int nocase)			/* 0 for case sensitive, 1 for insensitive */
 {
-    CONST Tcl_UniChar *stringEnd, *patternEnd;
+    const Tcl_UniChar *stringEnd, *patternEnd;
     Tcl_UniChar p;
 
     stringEnd = string + strLen;