1 files changed, 252 insertions, 220 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 992a55f..15529c7 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -7,8 +7,6 @@
  *
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
- *
- * RCS: @(#) $Id: tclUtf.c,v 1.36 2005/09/07 15:31:10 dgp Exp $
  */
 
 #include "tclInt.h"
@@ -23,46 +21,45 @@
  * The following macros are used for fast character category tests. The x_BITS
  * values are shifted right by the category value to determine whether the
  * given category is included in the set.
- */ 
+ */
 
 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
 	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
 
+#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
+
 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
 
 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
 	| (1 << PARAGRAPH_SEPARATOR))
 
-#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
-
-#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
-	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
-	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
-	(1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
-	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
-	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
-	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
-	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
-	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
 
 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
 	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
 	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
 	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
 
+#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
+	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
+	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
+	(1 << OTHER_NUMBER) | \
+	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
+	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+
 /*
- * Unicode characters less than this value are represented by themselves 
- * in UTF-8 strings. 
+ * Unicode characters less than this value are represented by themselves in
+ * UTF-8 strings.
  */
 
 #define UNICODE_SELF	0x80
 
 /*
- * The following structures are used when mapping between Unicode (UCS-2)
- * and UTF-8.
+ * The following structures are used when mapping between Unicode (UCS-2) and
+ * UTF-8.
  */
 
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -89,10 +86,10 @@ static CONST unsigned char totalBytes[256] = {
 };
 
 /*
- * Procedures used only in this module.
+ * Functions used only in this module.
  */
 
-static int UtfCount _ANSI_ARGS_((int ch));
+static int		UtfCount(int ch);
 
 /*
  *---------------------------------------------------------------------------
@@ -109,10 +106,10 @@ static int UtfCount _ANSI_ARGS_((int ch));
  *
  *---------------------------------------------------------------------------
  */
- 
+
 INLINE static int
-UtfCount(ch)
-    int ch;			/* The Tcl_UniChar whose size is returned. */
+UtfCount(
+    int ch)			/* The Tcl_UniChar whose size is returned. */
 {
     if ((ch > 0) && (ch < UNICODE_SELF)) {
 	return 1;
@@ -154,12 +151,12 @@ UtfCount(ch)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 INLINE int
-Tcl_UniCharToUtf(ch, buf)
-    int ch;			/* The Tcl_UniChar to be stored in the
+Tcl_UniCharToUtf(
+    int ch,			/* The Tcl_UniChar to be stored in the
 				 * buffer. */
-    char *buf;			/* Buffer in which the UTF-8 representation of
+    char *buf)			/* Buffer in which the UTF-8 representation of
 				 * the Tcl_UniChar is stored. Buffer must be
 				 * large enough to hold the UTF-8 character
 				 * (at most TCL_UTF_MAX bytes). */
@@ -231,16 +228,16 @@ Tcl_UniCharToUtf(ch, buf)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 char *
-Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr)
-    CONST Tcl_UniChar *uniStr;	/* Unicode string to convert to UTF-8. */
-    int uniLength;		/* Length of Unicode string in Tcl_UniChars
+Tcl_UniCharToUtfDString(
+    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
+    int uniLength,		/* Length of Unicode string in Tcl_UniChars
 				 * (must be >= 0). */
-    Tcl_DString *dsPtr;		/* UTF-8 representation of string is appended
+    Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
 				 * to this previously initialized DString. */
 {
-    CONST Tcl_UniChar *w, *wEnd;
+    const Tcl_UniChar *w, *wEnd;
     char *p, *string;
     int oldLength;
 
@@ -289,15 +286,15 @@ Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 int
-Tcl_UtfToUniChar(src, chPtr)
-    register CONST char *src;	 /* The UTF-8 string. */
-    register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented by
-				  * the UTF-8 string. */
+Tcl_UtfToUniChar(
+    register const char *src,	/* The UTF-8 string. */
+    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+				 * the UTF-8 string. */
 {
     register int byte;
-    
+
     /*
      * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
      */
@@ -335,7 +332,7 @@ Tcl_UtfToUniChar(src, chPtr)
 	     * Three-byte-character lead byte followed by two trail bytes.
 	     */
 
-	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 
+	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
 		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
 	    return 3;
 	}
@@ -349,7 +346,7 @@ Tcl_UtfToUniChar(src, chPtr)
 	return 1;
     }
 #if TCL_UTF_MAX > 3
-    else {
+    {
 	int ch, total, trail;
 
 	total = totalBytes[byte];
@@ -395,16 +392,16 @@ Tcl_UtfToUniChar(src, chPtr)
  */
 
 Tcl_UniChar *
-Tcl_UtfToUniCharDString(src, length, dsPtr)
-    CONST char *src;		/* UTF-8 string to convert to Unicode. */
-    int length;			/* Length of UTF-8 string in bytes, or -1 for
+Tcl_UtfToUniCharDString(
+    const char *src,		/* UTF-8 string to convert to Unicode. */
+    int length,			/* Length of UTF-8 string in bytes, or -1 for
 				 * strlen(). */
-    Tcl_DString *dsPtr;		/* Unicode representation of string is
+    Tcl_DString *dsPtr)		/* Unicode representation of string is
 				 * appended to this previously initialized
 				 * DString. */
 {
     Tcl_UniChar *w, *wString;
-    CONST char *p, *end;
+    const char *p, *end;
     int oldLength;
 
     if (length < 0) {
@@ -417,6 +414,7 @@ Tcl_UtfToUniCharDString(src, length, dsPtr)
      */
 
     oldLength = Tcl_DStringLength(dsPtr);
+/* TODO: fix overreach! oldLength (bytes) also gets scaled by sizeof below. */
     Tcl_DStringSetLength(dsPtr,
 	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
@@ -454,10 +452,10 @@ Tcl_UtfToUniCharDString(src, length, dsPtr)
  */
 
 int
-Tcl_UtfCharComplete(src, length)
-    CONST char *src;		/* String to check if first few bytes contain
+Tcl_UtfCharComplete(
+    const char *src,		/* String to check if first few bytes contain
 				 * a complete UTF-8 character. */
-    int length;			/* Length of above string in bytes. */
+    int length)			/* Length of above string in bytes. */
 {
     int ch;
 
@@ -475,18 +473,18 @@ Tcl_UtfCharComplete(src, length)
  *	utflen() and utfnlen().
  *
  * Results:
- *	As above.  
+ *	As above.
  *
  * Side effects:
  *	None.
  *
  *---------------------------------------------------------------------------
  */
- 
-int 
-Tcl_NumUtfChars(src, length)
-    register CONST char *src;	/* The UTF-8 string to measure. */
-    int length;			/* The length of the string in bytes, or -1
+
+int
+Tcl_NumUtfChars(
+    register const char *src,	/* The UTF-8 string to measure. */
+    int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
     Tcl_UniChar ch;
@@ -542,14 +540,15 @@ Tcl_NumUtfChars(src, length)
  *
  *---------------------------------------------------------------------------
  */
-CONST char *
-Tcl_UtfFindFirst(src, ch)
-    CONST char *src;		/* The UTF-8 string to be searched. */
-    int ch;			/* The Tcl_UniChar to search for. */
+
+const char *
+Tcl_UtfFindFirst(
+    const char *src,		/* The UTF-8 string to be searched. */
+    int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
     Tcl_UniChar find;
-    
+
     while (1) {
 	len = TclUtfToUniChar(src, &find);
 	if (find == ch) {
@@ -581,15 +580,15 @@ Tcl_UtfFindFirst(src, ch)
  *---------------------------------------------------------------------------
  */
 
-CONST char *
-Tcl_UtfFindLast(src, ch)
-    CONST char *src;		/* The UTF-8 string to be searched. */
-    int ch;			/* The Tcl_UniChar to search for. */
+const char *
+Tcl_UtfFindLast(
+    const char *src,		/* The UTF-8 string to be searched. */
+    int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
     Tcl_UniChar find;
-    CONST char *last;
-	
+    const char *last;
+
     last = NULL;
     while (1) {
 	len = TclUtfToUniChar(src, &find);
@@ -622,10 +621,10 @@ Tcl_UtfFindLast(src, ch)
  *
  *---------------------------------------------------------------------------
  */
- 
-CONST char *
-Tcl_UtfNext(src) 
-    CONST char *src;		    /* The current location in the string. */
+
+const char *
+Tcl_UtfNext(
+    const char *src)		/* The current location in the string. */
 {
     Tcl_UniChar ch;
 
@@ -653,15 +652,15 @@ Tcl_UtfNext(src)
  *---------------------------------------------------------------------------
  */
 
-CONST char *
-Tcl_UtfPrev(src, start)
-    CONST char *src;		/* The current location in the string. */
-    CONST char *start;		/* Pointer to the beginning of the string, to
+const char *
+Tcl_UtfPrev(
+    const char *src,		/* The current location in the string. */
+    const char *start)		/* Pointer to the beginning of the string, to
 				 * avoid going backwards too far. */
 {
-    CONST char *look;
+    const char *look;
     int i, byte;
-    
+
     src--;
     look = src;
     for (i = 0; i < TCL_UTF_MAX; i++) {
@@ -682,7 +681,7 @@ Tcl_UtfPrev(src, start)
     }
     return src;
 }
-	
+
 /*
  *---------------------------------------------------------------------------
  *
@@ -699,13 +698,13 @@ Tcl_UtfPrev(src, start)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 Tcl_UniChar
-Tcl_UniCharAtIndex(src, index)
-    register CONST char *src;	/* The UTF-8 string to dereference. */
-    register int index;		/* The position of the desired character. */
+Tcl_UniCharAtIndex(
+    register const char *src,	/* The UTF-8 string to dereference. */
+    register int index)		/* The position of the desired character. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch = 0;
 
     while (index >= 0) {
 	index--;
@@ -731,13 +730,13 @@ Tcl_UniCharAtIndex(src, index)
  *---------------------------------------------------------------------------
  */
 
-CONST char *
-Tcl_UtfAtIndex(src, index)
-    register CONST char *src;	/* The UTF-8 string. */
-    register int index;		/* The position of the desired character. */
+const char *
+Tcl_UtfAtIndex(
+    register const char *src,	/* The UTF-8 string. */
+    register int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch;
-    
+
     while (index > 0) {
 	index--;
 	src += TclUtfToUniChar(src, &ch);
@@ -772,12 +771,12 @@ Tcl_UtfAtIndex(src, index)
  */
 
 int
-Tcl_UtfBackslash(src, readPtr, dst)
-    CONST char *src;		/* Points to the backslash character of a
+Tcl_UtfBackslash(
+    const char *src,		/* Points to the backslash character of a
 				 * backslash sequence. */
-    int *readPtr;		/* Fill in with number of characters read from
+    int *readPtr,		/* Fill in with number of characters read from
 				 * src, unless NULL. */
-    char *dst;			/* Filled with the bytes represented by the
+    char *dst)			/* Filled with the bytes represented by the
 				 * backslash sequence. */
 {
 #define LINE_LENGTH 128
@@ -786,7 +785,10 @@ Tcl_UtfBackslash(src, readPtr, dst)
 
     result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
     if (numRead == LINE_LENGTH) {
-	/* We ate a whole line.  Pay the price of a strlen() */
+	/*
+	 * We ate a whole line. Pay the price of a strlen()
+	 */
+
 	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
     }
     if (readPtr != NULL) {
@@ -814,8 +816,8 @@ Tcl_UtfBackslash(src, readPtr, dst)
  */
 
 int
-Tcl_UtfToUpper(str)
-    char *str;			/* String to convert in place. */
+Tcl_UtfToUpper(
+    char *str)			/* String to convert in place. */
 {
     Tcl_UniChar ch, upChar;
     char *src, *dst;
@@ -835,7 +837,7 @@ Tcl_UtfToUpper(str)
 	 * conversion (thereby causing a segfault), only copy the upper case
 	 * char to dst if its size is <= the original char.
 	 */
-	
+
 	if (bytes < UtfCount(upChar)) {
 	    memcpy(dst, src, (size_t) bytes);
 	    dst += bytes;
@@ -867,13 +869,13 @@ Tcl_UtfToUpper(str)
  */
 
 int
-Tcl_UtfToLower(str)
-    char *str;			/* String to convert in place. */
+Tcl_UtfToLower(
+    char *str)			/* String to convert in place. */
 {
     Tcl_UniChar ch, lowChar;
     char *src, *dst;
     int bytes;
-    
+
     /*
      * Iterate over the string until we hit the terminating null.
      */
@@ -888,7 +890,7 @@ Tcl_UtfToLower(str)
 	 * conversion (thereby causing a segfault), only copy the lower case
 	 * char to dst if its size is <= the original char.
 	 */
-	
+
 	if (bytes < UtfCount(lowChar)) {
 	    memcpy(dst, src, (size_t) bytes);
 	    dst += bytes;
@@ -921,13 +923,13 @@ Tcl_UtfToLower(str)
  */
 
 int
-Tcl_UtfToTitle(str)
-    char *str;			/* String to convert in place. */
+Tcl_UtfToTitle(
+    char *str)			/* String to convert in place. */
 {
     Tcl_UniChar ch, titleChar, lowChar;
     char *src, *dst;
     int bytes;
-    
+
     /*
      * Capitalize the first character and then lowercase the rest of the
      * characters until we get to a null.
@@ -968,8 +970,8 @@ Tcl_UtfToTitle(str)
  *
  * TclpUtfNcmp2 --
  *
- *	Compare at most n bytes of utf-8 strings cs and ct. Both cs and ct are
- *	assumed to be at least n bytes long.
+ *	Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
+ *	ct are assumed to be at least numBytes bytes long.
  *
  * Results:
  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -981,26 +983,26 @@ Tcl_UtfToTitle(str)
  */
 
 int
-TclpUtfNcmp2(cs, ct, n)
-    CONST char *cs;		/* UTF string to compare to ct. */
-    CONST char *ct;		/* UTF string cs is compared to. */
-    unsigned long n;		/* Number of *bytes* to compare. */
+TclpUtfNcmp2(
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
+    unsigned long numBytes)	/* Number of *bytes* to compare. */
 {
     /*
-     * We can't simply call 'memcmp(cs, ct, n);' because we need to check for
-     * Tcl's \xC0\x80 non-utf-8 null encoding.  Otherwise utf-8 lexes fine in
-     * the strcmp manner.
+     * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
+     * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
+     * fine in the strcmp manner.
      */
 
     register int result = 0;
 
-    for ( ; n != 0; n--, cs++, ct++) {
+    for ( ; numBytes != 0; numBytes--, cs++, ct++) {
 	if (*cs != *ct) {
 	    result = UCHAR(*cs) - UCHAR(*ct);
 	    break;
 	}
     }
-    if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
+    if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
 	unsigned char c1, c2;
 
 	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
@@ -1028,10 +1030,10 @@ TclpUtfNcmp2(cs, ct, n)
  */
 
 int
-Tcl_UtfNcmp(cs, ct, numChars)
-    CONST char *cs;		/* UTF string to compare to ct. */
-    CONST char *ct;		/* UTF string cs is compared to. */
-    unsigned long numChars;	/* Number of UTF chars to compare. */
+Tcl_UtfNcmp(
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
+    unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
 
@@ -1063,8 +1065,8 @@ Tcl_UtfNcmp(cs, ct, numChars)
  * Tcl_UtfNcasecmp --
  *
  *	Compare at most numChars UTF chars of string cs to string ct case
- *	insensitive.  Both cs and ct are assumed to be at least numChars
- *	UTF chars long.
+ *	insensitive. Both cs and ct are assumed to be at least numChars UTF
+ *	chars long.
  *
  * Results:
  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -1076,10 +1078,10 @@ Tcl_UtfNcmp(cs, ct, numChars)
  */
 
 int
-Tcl_UtfNcasecmp(cs, ct, numChars)
-    CONST char *cs;		/* UTF string to compare to ct. */
-    CONST char *ct;		/* UTF string cs is compared to. */
-    unsigned long numChars;	/* Number of UTF chars to compare. */
+Tcl_UtfNcasecmp(
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
+    unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
     while (numChars-- > 0) {
@@ -1104,6 +1106,46 @@ Tcl_UtfNcasecmp(cs, ct, numChars)
 /*
  *----------------------------------------------------------------------
  *
+ * TclUtfCasecmp --
+ *
+ *	Compare UTF chars of string cs to string ct case insensitively.
+ *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should
+ *	be handled.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCasecmp(
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct)		/* UTF string cs is compared to. */
+{
+    while (*cs && *ct) {
+	Tcl_UniChar ch1, ch2;
+
+	cs += TclUtfToUniChar(cs, &ch1);
+	ct += TclUtfToUniChar(ct, &ch2);
+	if (ch1 != ch2) {
+	    ch1 = Tcl_UniCharToLower(ch1);
+	    ch2 = Tcl_UniCharToLower(ch2);
+	    if (ch1 != ch2) {
+		return ch1 - ch2;
+	    }
+	}
+    }
+    return UCHAR(*cs) - UCHAR(*ct);
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
  * Tcl_UniCharToUpper --
  *
  *	Compute the uppercase equivalent of the given Unicode character.
@@ -1118,16 +1160,15 @@ Tcl_UtfNcasecmp(cs, ct, numChars)
  */
 
 Tcl_UniChar
-Tcl_UniCharToUpper(ch)
-    int ch;			/* Unicode character to convert. */
+Tcl_UniCharToUpper(
+    int ch)			/* Unicode character to convert. */
 {
     int info = GetUniCharInfo(ch);
 
     if (GetCaseType(info) & 0x04) {
-	return (Tcl_UniChar) (ch - GetDelta(info));
-    } else {
-	return ch;
+	ch -= GetDelta(info);
     }
+    return (Tcl_UniChar) ch;
 }
 
 /*
@@ -1147,16 +1188,15 @@ Tcl_UniCharToUpper(ch)
  */
 
 Tcl_UniChar
-Tcl_UniCharToLower(ch)
-    int ch;			/* Unicode character to convert. */
+Tcl_UniCharToLower(
+    int ch)			/* Unicode character to convert. */
 {
     int info = GetUniCharInfo(ch);
 
     if (GetCaseType(info) & 0x02) {
-	return (Tcl_UniChar) (ch + GetDelta(info));
-    } else {
-	return ch;
+	ch += GetDelta(info);
     }
+    return (Tcl_UniChar) ch;
 }
 
 /*
@@ -1176,8 +1216,8 @@ Tcl_UniCharToLower(ch)
  */
 
 Tcl_UniChar
-Tcl_UniCharToTitle(ch)
-    int ch;			/* Unicode character to convert. */
+Tcl_UniCharToTitle(
+    int ch)			/* Unicode character to convert. */
 {
     int info = GetUniCharInfo(ch);
     int mode = GetCaseType(info);
@@ -1187,12 +1227,11 @@ Tcl_UniCharToTitle(ch)
 	 * Subtract or add one depending on the original case.
 	 */
 
-	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
+	ch += ((mode & 0x4) ? -1 : 1);
     } else if (mode == 0x4) {
-	return (Tcl_UniChar) (ch - GetDelta(info));
-    } else {
-	return ch;
+	ch -= GetDelta(info);
     }
+    return (Tcl_UniChar) ch;
 }
 
 /*
@@ -1200,7 +1239,7 @@ Tcl_UniCharToTitle(ch)
  *
  * Tcl_UniCharLen --
  *
- *	Find the length of a UniChar string.  The str input must be null
+ *	Find the length of a UniChar string. The str input must be null
  *	terminated.
  *
  * Results:
@@ -1213,11 +1252,11 @@ Tcl_UniCharToTitle(ch)
  */
 
 int
-Tcl_UniCharLen(uniStr)
-    CONST Tcl_UniChar *uniStr;	/* Unicode string to find length of. */
+Tcl_UniCharLen(
+    const Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
 {
     int len = 0;
-    
+
     while (*uniStr != '\0') {
 	len++;
 	uniStr++;
@@ -1243,10 +1282,10 @@ Tcl_UniCharLen(uniStr)
  */
 
 int
-Tcl_UniCharNcmp(ucs, uct, numChars)
-    CONST Tcl_UniChar *ucs;		/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct;		/* Unicode string ucs is compared to. */
-    unsigned long numChars;		/* Number of unichars to compare. */
+Tcl_UniCharNcmp(
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    unsigned long numChars)	/* Number of unichars to compare. */
 {
 #ifdef WORDS_BIGENDIAN
     /*
@@ -1275,7 +1314,7 @@ Tcl_UniCharNcmp(ucs, uct, numChars)
  * Tcl_UniCharNcasecmp --
  *
  *	Compare at most numChars unichars of string ucs to string uct case
- *	insensitive.  Both ucs and uct are assumed to be at least numChars
+ *	insensitive. Both ucs and uct are assumed to be at least numChars
  *	unichars long.
  *
  * Results:
@@ -1288,15 +1327,16 @@ Tcl_UniCharNcmp(ucs, uct, numChars)
  */
 
 int
-Tcl_UniCharNcasecmp(ucs, uct, numChars)
-    CONST Tcl_UniChar *ucs;		/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct;		/* Unicode string ucs is compared to. */
-    unsigned long numChars;		/* Number of unichars to compare. */
+Tcl_UniCharNcasecmp(
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    unsigned long numChars)	/* Number of unichars to compare. */
 {
     for ( ; numChars != 0; numChars--, ucs++, uct++) {
 	if (*ucs != *uct) {
 	    Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
 	    Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
+
 	    if (lcs != lct) {
 		return (lcs - lct);
 	    }
@@ -1322,12 +1362,10 @@ Tcl_UniCharNcasecmp(ucs, uct, numChars)
  */
 
 int
-Tcl_UniCharIsAlnum(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsAlnum(
+    int ch)			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
-    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
+    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1347,11 +1385,10 @@ Tcl_UniCharIsAlnum(ch)
  */
 
 int
-Tcl_UniCharIsAlpha(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsAlpha(
+    int ch)			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return ((ALPHA_BITS >> category) & 1);
+    return ((ALPHA_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1371,10 +1408,10 @@ Tcl_UniCharIsAlpha(ch)
  */
 
 int
-Tcl_UniCharIsControl(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsControl(
+    int ch)			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
+    return ((CONTROL_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1394,11 +1431,10 @@ Tcl_UniCharIsControl(ch)
  */
 
 int
-Tcl_UniCharIsDigit(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsDigit(
+    int ch)			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
-	    == DECIMAL_DIGIT_NUMBER);
+    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
 }
 
 /*
@@ -1418,11 +1454,10 @@ Tcl_UniCharIsDigit(ch)
  */
 
 int
-Tcl_UniCharIsGraph(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsGraph(
+    int ch)			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
+    return ((GRAPH_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1442,10 +1477,10 @@ Tcl_UniCharIsGraph(ch)
  */
 
 int
-Tcl_UniCharIsLower(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsLower(
+    int ch)			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
+    return (GetCategory(ch) == LOWERCASE_LETTER);
 }
 
 /*
@@ -1465,11 +1500,10 @@ Tcl_UniCharIsLower(ch)
  */
 
 int
-Tcl_UniCharIsPrint(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsPrint(
+    int ch)			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return ((PRINT_BITS >> category) & 1);
+    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1489,11 +1523,10 @@ Tcl_UniCharIsPrint(ch)
  */
 
 int
-Tcl_UniCharIsPunct(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsPunct(
+    int ch)			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return ((PUNCT_BITS >> category) & 1);
+    return ((PUNCT_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1513,21 +1546,22 @@ Tcl_UniCharIsPunct(ch)
  */
 
 int
-Tcl_UniCharIsSpace(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsSpace(
+    int ch)			/* Unicode character to test. */
 {
-    register int category;
-
     /*
      * If the character is within the first 127 characters, just use the
      * standard C function, otherwise consult the Unicode table.
      */
 
-    if (ch < 0x80) {
-	return isspace(UCHAR(ch)); /* INTL: ISO space */
+    if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+	return TclIsSpaceProc((char) ch);
+    } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e
+	    || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060
+	    || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) {
+	return 1;
     } else {
-	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-	return ((SPACE_BITS >> category) & 1);
+	return ((SPACE_BITS >> GetCategory(ch)) & 1);
     }
 }
 
@@ -1548,10 +1582,10 @@ Tcl_UniCharIsSpace(ch)
  */
 
 int
-Tcl_UniCharIsUpper(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsUpper(
+    int ch)			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+    return (GetCategory(ch) == UPPERCASE_LETTER);
 }
 
 /*
@@ -1571,12 +1605,10 @@ Tcl_UniCharIsUpper(ch)
  */
 
 int
-Tcl_UniCharIsWordChar(ch)
-    int ch;			/* Unicode character to test. */
+Tcl_UniCharIsWordChar(
+    int ch)			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
-    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
+    return ((WORD_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1603,24 +1635,24 @@ Tcl_UniCharIsWordChar(ch)
  */
 
 int
-Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
-    CONST Tcl_UniChar *uniStr;	/* Unicode String. */
-    CONST Tcl_UniChar *uniPattern;
+Tcl_UniCharCaseMatch(
+    const Tcl_UniChar *uniStr,	/* Unicode String. */
+    const Tcl_UniChar *uniPattern,
 				/* Pattern, which may contain special
 				 * characters. */
-    int nocase;			/* 0 for case sensitive, 1 for insensitive */
+    int nocase)			/* 0 for case sensitive, 1 for insensitive */
 {
     Tcl_UniChar ch1, p;
-    
+
     while (1) {
 	p = *uniPattern;
-	
+
 	/*
 	 * See if we're at the end of both the pattern and the string. If so,
 	 * we succeeded. If we're at the end of the pattern but not at the end
 	 * of the string, we failed.
 	 */
-	
+
 	if (p == 0) {
 	    return (*uniStr == 0);
 	}
@@ -1635,7 +1667,7 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
 	 * recursively for each postfix of string, until either we match or we
 	 * reach the end of the string.
 	 */
-	
+
 	if (p == '*') {
 	    /*
 	     * Skip all successive *'s in the pattern
@@ -1696,7 +1728,7 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
 	 * list of characters that are acceptable, or by a range (two
 	 * characters separated by "-").
 	 */
-	
+
 	if (p == '[') {
 	    Tcl_UniChar startChar, endChar;
 
@@ -1752,8 +1784,8 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
 	}
 
 	/*
-	 * There's no special character.  Just make sure that the next bytes
-	 * of each string match.
+	 * There's no special character. Just make sure that the next bytes of
+	 * each string match.
 	 */
 
 	if (nocase) {
@@ -1791,15 +1823,15 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
  */
 
 int
-TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
-    CONST Tcl_UniChar *string;	/* Unicode String. */
-    int strLen;			/* length of String */
-    CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
+TclUniCharMatch(
+    const Tcl_UniChar *string,	/* Unicode String. */
+    int strLen,			/* Length of String */
+    const Tcl_UniChar *pattern,	/* Pattern, which may contain special
 				 * characters. */
-    int ptnLen;			/* length of Pattern */
-    int nocase;			/* 0 for case sensitive, 1 for insensitive */
+    int ptnLen,			/* Length of Pattern */
+    int nocase)			/* 0 for case sensitive, 1 for insensitive */
 {
-    CONST Tcl_UniChar *stringEnd, *patternEnd;
+    const Tcl_UniChar *stringEnd, *patternEnd;
     Tcl_UniChar p;
 
     stringEnd = string + strLen;
@@ -1827,7 +1859,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
 	 * recursively for each postfix of string, until either we match or we
 	 * reach the end of the string.
 	 */
-	
+
 	if (p == '*') {
 	    /*
 	     * Skip all successive *'s in the pattern.
@@ -1889,7 +1921,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
 	 * list of characters that are acceptable, or by a range (two
 	 * characters separated by "-").
 	 */
-	
+
 	if (p == '[') {
 	    Tcl_UniChar ch1, startChar, endChar;