1 files changed, 87 insertions, 111 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e5497a4..16acab2 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -7,6 +7,8 @@
  *
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
+ *
+ * RCS: @(#) $Id: tclUtf.c,v 1.39 2009/02/11 15:28:59 dgp Exp $
  */
 
 #include "tclInt.h"
@@ -26,27 +28,28 @@
 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
 	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
 
-#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
-
 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
 
 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
 	| (1 << PARAGRAPH_SEPARATOR))
 
-#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
-
-#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
-	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
-	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
-	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
 
-#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
+#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
 	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
 	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
-	(1 << OTHER_NUMBER) | \
+	(1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
+	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
+	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
+	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
 	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
 	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
 
+#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
+	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
+	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
+	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+
 /*
  * Unicode characters less than this value are represented by themselves in
  * UTF-8 strings.
@@ -59,7 +62,7 @@
  * UTF-8.
  */
 
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -231,13 +234,13 @@ Tcl_UniCharToUtf(
 
 char *
 Tcl_UniCharToUtfDString(
-    CONST Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
+    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
     int uniLength,		/* Length of Unicode string in Tcl_UniChars
 				 * (must be >= 0). */
     Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
 				 * to this previously initialized DString. */
 {
-    CONST Tcl_UniChar *w, *wEnd;
+    const Tcl_UniChar *w, *wEnd;
     char *p, *string;
     int oldLength;
 
@@ -289,7 +292,7 @@ Tcl_UniCharToUtfDString(
 
 int
 Tcl_UtfToUniChar(
-    register CONST char *src,	/* The UTF-8 string. */
+    register const char *src,	/* The UTF-8 string. */
     register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
 				 * the UTF-8 string. */
 {
@@ -393,7 +396,7 @@ Tcl_UtfToUniChar(
 
 Tcl_UniChar *
 Tcl_UtfToUniCharDString(
-    CONST char *src,		/* UTF-8 string to convert to Unicode. */
+    const char *src,		/* UTF-8 string to convert to Unicode. */
     int length,			/* Length of UTF-8 string in bytes, or -1 for
 				 * strlen(). */
     Tcl_DString *dsPtr)		/* Unicode representation of string is
@@ -401,7 +404,7 @@ Tcl_UtfToUniCharDString(
 				 * DString. */
 {
     Tcl_UniChar *w, *wString;
-    CONST char *p, *end;
+    const char *p, *end;
     int oldLength;
 
     if (length < 0) {
@@ -414,6 +417,7 @@ Tcl_UtfToUniCharDString(
      */
 
     oldLength = Tcl_DStringLength(dsPtr);
+/* TODO: fix overreach! */
     Tcl_DStringSetLength(dsPtr,
 	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
@@ -452,7 +456,7 @@ Tcl_UtfToUniCharDString(
 
 int
 Tcl_UtfCharComplete(
-    CONST char *src,		/* String to check if first few bytes contain
+    const char *src,		/* String to check if first few bytes contain
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
@@ -482,7 +486,7 @@ Tcl_UtfCharComplete(
 
 int
 Tcl_NumUtfChars(
-    register CONST char *src,	/* The UTF-8 string to measure. */
+    register const char *src,	/* The UTF-8 string to measure. */
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
@@ -540,9 +544,9 @@ Tcl_NumUtfChars(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfFindFirst(
-    CONST char *src,		/* The UTF-8 string to be searched. */
+    const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
@@ -579,14 +583,14 @@ Tcl_UtfFindFirst(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfFindLast(
-    CONST char *src,		/* The UTF-8 string to be searched. */
+    const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
     Tcl_UniChar find;
-    CONST char *last;
+    const char *last;
 
     last = NULL;
     while (1) {
@@ -621,9 +625,9 @@ Tcl_UtfFindLast(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfNext(
-    CONST char *src)		/* The current location in the string. */
+    const char *src)		/* The current location in the string. */
 {
     Tcl_UniChar ch;
 
@@ -651,13 +655,13 @@ Tcl_UtfNext(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfPrev(
-    CONST char *src,		/* The current location in the string. */
-    CONST char *start)		/* Pointer to the beginning of the string, to
+    const char *src,		/* The current location in the string. */
+    const char *start)		/* Pointer to the beginning of the string, to
 				 * avoid going backwards too far. */
 {
-    CONST char *look;
+    const char *look;
     int i, byte;
 
     src--;
@@ -700,7 +704,7 @@ Tcl_UtfPrev(
 
 Tcl_UniChar
 Tcl_UniCharAtIndex(
-    register CONST char *src,	/* The UTF-8 string to dereference. */
+    register const char *src,	/* The UTF-8 string to dereference. */
     register int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch;
@@ -729,9 +733,9 @@ Tcl_UniCharAtIndex(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfAtIndex(
-    register CONST char *src,	/* The UTF-8 string. */
+    register const char *src,	/* The UTF-8 string. */
     register int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch;
@@ -771,7 +775,7 @@ Tcl_UtfAtIndex(
 
 int
 Tcl_UtfBackslash(
-    CONST char *src,		/* Points to the backslash character of a
+    const char *src,		/* Points to the backslash character of a
 				 * backslash sequence. */
     int *readPtr,		/* Fill in with number of characters read from
 				 * src, unless NULL. */
@@ -983,8 +987,8 @@ Tcl_UtfToTitle(
 
 int
 TclpUtfNcmp2(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numBytes)	/* Number of *bytes* to compare. */
 {
     /*
@@ -1030,8 +1034,8 @@ TclpUtfNcmp2(
 
 int
 Tcl_UtfNcmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
@@ -1078,8 +1082,8 @@ Tcl_UtfNcmp(
 
 int
 Tcl_UtfNcasecmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
@@ -1105,46 +1109,6 @@ Tcl_UtfNcasecmp(
 /*
  *----------------------------------------------------------------------
  *
- * Tcl_UtfNcasecmp --
- *
- *	Compare UTF chars of string cs to string ct case insensitively.
- *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should
- *	be handled.
- *
- * Results:
- *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
- *
- * Side effects:
- *	None.
- *
- *----------------------------------------------------------------------
- */
-
-int
-TclUtfCasecmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct)		/* UTF string cs is compared to. */
-{
-    while (*cs && *ct) {
-	Tcl_UniChar ch1, ch2;
-
-	cs += TclUtfToUniChar(cs, &ch1);
-	ct += TclUtfToUniChar(ct, &ch2);
-	if (ch1 != ch2) {
-	    ch1 = Tcl_UniCharToLower(ch1);
-	    ch2 = Tcl_UniCharToLower(ch2);
-	    if (ch1 != ch2) {
-		return ch1 - ch2;
-	    }
-	}
-    }
-    return UCHAR(*cs) - UCHAR(*ct);
-}
-
-
-/*
- *----------------------------------------------------------------------
- *
  * Tcl_UniCharToUpper --
  *
  *	Compute the uppercase equivalent of the given Unicode character.
@@ -1165,9 +1129,10 @@ Tcl_UniCharToUpper(
     int info = GetUniCharInfo(ch);
 
     if (GetCaseType(info) & 0x04) {
-	ch -= GetDelta(info);
+	return (Tcl_UniChar) (ch - GetDelta(info));
+    } else {
+	return ch;
     }
-    return (Tcl_UniChar) ch;
 }
 
 /*
@@ -1193,9 +1158,10 @@ Tcl_UniCharToLower(
     int info = GetUniCharInfo(ch);
 
     if (GetCaseType(info) & 0x02) {
-	ch += GetDelta(info);
+	return (Tcl_UniChar) (ch + GetDelta(info));
+    } else {
+	return ch;
     }
-    return (Tcl_UniChar) ch;
 }
 
 /*
@@ -1226,11 +1192,12 @@ Tcl_UniCharToTitle(
 	 * Subtract or add one depending on the original case.
 	 */
 
-	ch += ((mode & 0x4) ? -1 : 1);
+	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
     } else if (mode == 0x4) {
-	ch -= GetDelta(info);
+	return (Tcl_UniChar) (ch - GetDelta(info));
+    } else {
+	return ch;
     }
-    return (Tcl_UniChar) ch;
 }
 
 /*
@@ -1252,7 +1219,7 @@ Tcl_UniCharToTitle(
 
 int
 Tcl_UniCharLen(
-    CONST Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
+    const Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
 {
     int len = 0;
 
@@ -1282,8 +1249,8 @@ Tcl_UniCharLen(
 
 int
 Tcl_UniCharNcmp(
-    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
     unsigned long numChars)	/* Number of unichars to compare. */
 {
 #ifdef WORDS_BIGENDIAN
@@ -1327,8 +1294,8 @@ Tcl_UniCharNcmp(
 
 int
 Tcl_UniCharNcasecmp(
-    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
     unsigned long numChars)	/* Number of unichars to compare. */
 {
     for ( ; numChars != 0; numChars--, ucs++, uct++) {
@@ -1364,7 +1331,9 @@ int
 Tcl_UniCharIsAlnum(
     int ch)			/* Unicode character to test. */
 {
-    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
 }
 
 /*
@@ -1387,7 +1356,8 @@ int
 Tcl_UniCharIsAlpha(
     int ch)			/* Unicode character to test. */
 {
-    return ((ALPHA_BITS >> GetCategory(ch)) & 1);
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return ((ALPHA_BITS >> category) & 1);
 }
 
 /*
@@ -1410,7 +1380,7 @@ int
 Tcl_UniCharIsControl(
     int ch)			/* Unicode character to test. */
 {
-    return ((CONTROL_BITS >> GetCategory(ch)) & 1);
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
 }
 
 /*
@@ -1433,7 +1403,7 @@ int
 Tcl_UniCharIsDigit(
     int ch)			/* Unicode character to test. */
 {
-    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
+    return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
 }
 
 /*
@@ -1456,7 +1426,8 @@ int
 Tcl_UniCharIsGraph(
     int ch)			/* Unicode character to test. */
 {
-    return ((GRAPH_BITS >> GetCategory(ch)) & 1);
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
 }
 
 /*
@@ -1479,7 +1450,7 @@ int
 Tcl_UniCharIsLower(
     int ch)			/* Unicode character to test. */
 {
-    return (GetCategory(ch) == LOWERCASE_LETTER);
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
 }
 
 /*
@@ -1502,7 +1473,8 @@ int
 Tcl_UniCharIsPrint(
     int ch)			/* Unicode character to test. */
 {
-    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return ((PRINT_BITS >> category) & 1);
 }
 
 /*
@@ -1525,7 +1497,8 @@ int
 Tcl_UniCharIsPunct(
     int ch)			/* Unicode character to test. */
 {
-    return ((PUNCT_BITS >> GetCategory(ch)) & 1);
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return ((PUNCT_BITS >> category) & 1);
 }
 
 /*
@@ -1548,17 +1521,18 @@ int
 Tcl_UniCharIsSpace(
     int ch)			/* Unicode character to test. */
 {
+    register int category;
+
     /*
      * If the character is within the first 127 characters, just use the
      * standard C function, otherwise consult the Unicode table.
      */
 
-    if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
-	return TclIsSpaceProc((char) ch);
-    } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
-	return 1;
+    if (ch < 0x80) {
+	return isspace(UCHAR(ch)); /* INTL: ISO space */
     } else {
-	return ((SPACE_BITS >> GetCategory(ch)) & 1);
+	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+	return ((SPACE_BITS >> category) & 1);
     }
 }
 
@@ -1582,7 +1556,7 @@ int
 Tcl_UniCharIsUpper(
     int ch)			/* Unicode character to test. */
 {
-    return (GetCategory(ch) == UPPERCASE_LETTER);
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
 }
 
 /*
@@ -1605,7 +1579,9 @@ int
 Tcl_UniCharIsWordChar(
     int ch)			/* Unicode character to test. */
 {
-    return ((WORD_BITS >> GetCategory(ch)) & 1);
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
 }
 
 /*
@@ -1633,8 +1609,8 @@ Tcl_UniCharIsWordChar(
 
 int
 Tcl_UniCharCaseMatch(
-    CONST Tcl_UniChar *uniStr,	/* Unicode String. */
-    CONST Tcl_UniChar *uniPattern,
+    const Tcl_UniChar *uniStr,	/* Unicode String. */
+    const Tcl_UniChar *uniPattern,
 				/* Pattern, which may contain special
 				 * characters. */
     int nocase)			/* 0 for case sensitive, 1 for insensitive */
@@ -1821,14 +1797,14 @@ Tcl_UniCharCaseMatch(
 
 int
 TclUniCharMatch(
-    CONST Tcl_UniChar *string,	/* Unicode String. */
+    const Tcl_UniChar *string,	/* Unicode String. */
     int strLen,			/* Length of String */
-    CONST Tcl_UniChar *pattern,	/* Pattern, which may contain special
+    const Tcl_UniChar *pattern,	/* Pattern, which may contain special
 				 * characters. */
     int ptnLen,			/* Length of Pattern */
     int nocase)			/* 0 for case sensitive, 1 for insensitive */
 {
-    CONST Tcl_UniChar *stringEnd, *patternEnd;
+    const Tcl_UniChar *stringEnd, *patternEnd;
     Tcl_UniChar p;
 
     stringEnd = string + strLen;