1 files changed, 102 insertions, 78 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ee9f59a..9027921 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,10 +8,27 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclUtf.c,v 1.1.2.2 1998/10/03 01:56:42 stanton Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.1.2.3 1998/10/16 01:16:57 stanton Exp $
  */
 
 #include "tclInt.h"
+#include "TclUtf.h"
+
+/*
+ * The following macros are used for fast character category tests.  The
+ * x_BITS values are shifted right by the category value to determine whether
+ * the given category is included in the set.
+ */ 
+
+#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
+    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
+
+#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
+
+#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
+    | (1 << PARAGRAPH_SEPARATOR))
+
+#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
 
 /*
  * Unicode characters less than this value are represented by themselves 
@@ -813,10 +830,6 @@ Tcl_UtfBackslash(src, readPtr, dst)
  *	Convert lowercase characters to uppercase characters in a UTF
  *	string in place.  The conversion may shrink the UTF string.
  *
- *	INTL: This implementation only handles iso8859-1 characters
- *	in the current locale.  This should be changed to use the
- *	Unicode character tables.
- *
  * Results:
  *	Returns the number of bytes in the resulting string
  *	excluding the trailing null.
@@ -841,16 +854,7 @@ Tcl_UtfToUpper(str)
     src = dst = str;
     while (*src) {
 	src += Tcl_UtfToUniChar(src, &ch);
-
-	/*
-	 * INTL: This conversion should be replaced with a table lookup for the
-	 * full Unicode translation.
-	 */
-
-	if ((ch < 0x100) && islower(ch)) { /* INTL: ISO only */
-	    ch = (Tcl_UniChar) UCHAR(toupper(ch)); /* INTL: ISO only */
-	}
-	dst += Tcl_UniCharToUtf(ch, dst);
+	dst += Tcl_UniCharToUtf(Tcl_UniCharToUpper(ch), dst);
     }
     *dst = '\0';
     return (dst - str);
@@ -864,10 +868,6 @@ Tcl_UtfToUpper(str)
  *	Convert uppercase characters to lowercase characters in a UTF
  *	string in place.  The conversion may shrink the UTF string.
  *
- *	INTL: This implementation only handles iso8859-1 characters
- *	in the current locale.  This should be changed to use the
- *	Unicode character tables.
- *
  * Results:
  *	Returns the number of bytes in the resulting string
  *	excluding the trailing null.
@@ -892,16 +892,7 @@ Tcl_UtfToLower(str)
     src = dst = str;
     while (*src) {
 	src += Tcl_UtfToUniChar(src, &ch);
-
-	/*
-	 * INTL: This conversion should be replaced with a table lookup for the
-	 * full Unicode translation.
-	 */
-
-	if ((ch < 0x100) && isupper(ch)) { /* INTL: ISO only */
-	    ch = (Tcl_UniChar) UCHAR(tolower(ch)); /* INTL: ISO only */
-	}
-	dst += Tcl_UniCharToUtf(ch, dst);
+	dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst);
     }
     *dst = '\0';
     return (dst - str);
@@ -916,10 +907,6 @@ Tcl_UtfToLower(str)
  *	uppercase and the rest of the string to lowercase.  The
  *	conversion happens in place and may shrink the UTF string.
  *
- *	INTL: This implementation only handles iso8859-1 characters
- *	in the current locale.  This should be changed to use the
- *	Unicode character tables.
- *
  * Results:
  *	Returns the number of bytes in the resulting string
  *	excluding the trailing null.
@@ -946,17 +933,11 @@ Tcl_UtfToTitle(str)
 
     if (*src) {
 	src += Tcl_UtfToUniChar(src, &ch);
-	if ((ch < 0x100) && islower(ch)) { /* INTL: ISO only */
-	    ch = (Tcl_UniChar) UCHAR(toupper(ch)); /* INTL: ISO only */
-	}
-	dst += Tcl_UniCharToUtf(ch, dst);
+	dst += Tcl_UniCharToUtf(Tcl_UniCharToTitle(ch), dst);
     }
     while (*src) {
 	src += Tcl_UtfToUniChar(src, &ch);
-	if ((ch < 0x100) && isupper(ch)) { /* INTL: ISO only */
-	    ch = (Tcl_UniChar) UCHAR(tolower(ch)); /* INTL: ISO only */
-	}
-	dst += Tcl_UniCharToUtf(ch, dst);
+	dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst);
     }
     *dst = '\0';
     return (dst - str);
@@ -969,8 +950,6 @@ Tcl_UtfToTitle(str)
  *
  *	Compute the uppercase equivalent of the given Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
  *	Returns the uppercase Unicode character.
  *
@@ -984,9 +963,13 @@ Tcl_UniChar
 Tcl_UniCharToUpper(ch)
     int ch;			/* Unicode character to convert. */
 {
-    return (Tcl_UniChar) ((ch < 0x100)
-	    ? UCHAR(toupper(ch))	/* INTL: ISO only */
-	    : ch);
+    int info = GetUniCharInfo(ch);
+
+    if (GetCaseType(info) & 0x04) {
+	return (Tcl_UniChar) (ch - GetDelta(info));
+    } else {
+	return ch;
+    }
 }
 
 /*
@@ -996,8 +979,6 @@ Tcl_UniCharToUpper(ch)
  *
  *	Compute the lowercase equivalent of the given Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
  *	Returns the lowercase Unicode character.
  *
@@ -1011,9 +992,13 @@ Tcl_UniChar
 Tcl_UniCharToLower(ch)
     int ch;			/* Unicode character to convert. */
 {
-    return (Tcl_UniChar) ((ch < 0x100)
-	    ? UCHAR(tolower(ch))	/* INTL: ISO only */
-	    : ch);
+    int info = GetUniCharInfo(ch);
+
+    if (GetCaseType(info) & 0x02) {
+	return (Tcl_UniChar) (ch + GetDelta(info));
+    } else {
+	return ch;
+    }
 }
 
 /*
@@ -1023,8 +1008,6 @@ Tcl_UniCharToLower(ch)
  *
  *	Compute the titlecase equivalent of the given Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
  *	Returns the titlecase Unicode character.
  *
@@ -1038,9 +1021,20 @@ Tcl_UniChar
 Tcl_UniCharToTitle(ch)
     int ch;			/* Unicode character to convert. */
 {
-    return (Tcl_UniChar) ((ch < 0x100)
-	    ? UCHAR(toupper(ch))	/* INTL: ISO only */
-	    : ch);
+    int info = GetUniCharInfo(ch);
+    int mode = GetCaseType(info);
+
+    if (mode & 0x1) {
+	/*
+	 * Subtract or add one depending on the original case.
+	 */
+
+	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
+    } else if (mode == 0x4) {
+	return (Tcl_UniChar) (ch - GetDelta(info));
+    } else {
+	return ch;
+    }
 }
 
 /*
@@ -1114,8 +1108,6 @@ TclUniCharNcmp(cs, ct, n)
  *
  *	Test if a character is an alphanumeric Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
  *	Returns 1 if character is alphanumeric.
  *
@@ -1129,7 +1121,9 @@ int
 TclUniCharIsAlnum(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((ch < 0x100) ? isalnum(ch) : 0); /* INTL: ISO only */
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
 }
 
 /*
@@ -1139,8 +1133,6 @@ TclUniCharIsAlnum(ch)
  *
  *	Test if a character is an alphabetic Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
  *	Returns 1 if character is alphabetic.
  *
@@ -1154,7 +1146,8 @@ int
 TclUniCharIsAlpha(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((ch < 0x100) ? isalpha(ch) : 0); /* INTL: ISO only */
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return ((ALPHA_BITS >> category) & 1);
 }
 
 /*
@@ -1164,10 +1157,8 @@ TclUniCharIsAlpha(ch)
  *
  *	Test if a character is a numeric Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
- *	Returns 1 if character is a digit.
+ *	Returns non-zero if character is a digit.
  *
  * Side effects:
  *	None.
@@ -1179,7 +1170,8 @@ int
 TclUniCharIsDigit(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((ch < 0x100) ? isdigit(ch) : 0); /* INTL: ISO only */
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
+	    == DECIMAL_DIGIT_NUMBER);
 }
 
 /*
@@ -1189,10 +1181,8 @@ TclUniCharIsDigit(ch)
  *
  *	Test if a character is a lowercase Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
- *	Returns 1 if character is lowercase.
+ *	Returns non-zero if character is lowercase.
  *
  * Side effects:
  *	None.
@@ -1204,7 +1194,7 @@ int
 TclUniCharIsLower(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((ch < 0x100) ? islower(ch) : 0); /* INTL: ISO only */
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
 }
 
 /*
@@ -1214,10 +1204,8 @@ TclUniCharIsLower(ch)
  *
  *	Test if a character is a whitespace Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
- *	Returns 1 if character is a space.
+ *	Returns non-zero if character is a space.
  *
  * Side effects:
  *	None.
@@ -1229,7 +1217,19 @@ int
 TclUniCharIsSpace(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((ch < 0x100) ? isspace(ch) : 0); /* INTL: ISO only */
+    register int category;
+
+    /*
+     * If the character is within the first 127 characters, just use the
+     * standard C function, otherwise consult the Unicode table.
+     */
+
+    if (ch < 0x80) {
+	return isspace(UCHAR(ch)); /* INTL: ISO space */
+    } else {
+	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+	return ((SPACE_BITS >> category) & 1);
+    }
 }
 
 /*
@@ -1239,10 +1239,8 @@ TclUniCharIsSpace(ch)
  *
  *	Test if a character is a uppercase Unicode character.
  *
- *	INTL: this implementation only works on ISO characters.
- *
  * Results:
- *	Returns 1 if character is uppercase.
+ *	Returns non-zero if character is uppercase.
  *
  * Side effects:
  *	None.
@@ -1254,5 +1252,31 @@ int
 TclUniCharIsUpper(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((ch < 0x100) ? isupper(ch) : 0); /* INTL: ISO only */
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * TclUniCharIsWordChar --
+ *
+ *	Test if a character is alphanumeric or a connector punctuation
+ *	mark.
+ *
+ * Results:
+ *	Returns 1 if character is a word character.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUniCharIsWordChar(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
 }