summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c180
1 files changed, 102 insertions, 78 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ee9f59a..9027921 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,10 +8,27 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclUtf.c,v 1.1.2.2 1998/10/03 01:56:42 stanton Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.1.2.3 1998/10/16 01:16:57 stanton Exp $
*/
#include "tclInt.h"
+#include "TclUtf.h"
+
+/*
+ * The following macros are used for fast character category tests. The
+ * x_BITS values are shifted right by the category value to determine whether
+ * the given category is included in the set.
+ */
+
+#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
+ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
+
+#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
+
+#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
+ | (1 << PARAGRAPH_SEPARATOR))
+
+#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
/*
* Unicode characters less than this value are represented by themselves
@@ -813,10 +830,6 @@ Tcl_UtfBackslash(src, readPtr, dst)
* Convert lowercase characters to uppercase characters in a UTF
* string in place. The conversion may shrink the UTF string.
*
- * INTL: This implementation only handles iso8859-1 characters
- * in the current locale. This should be changed to use the
- * Unicode character tables.
- *
* Results:
* Returns the number of bytes in the resulting string
* excluding the trailing null.
@@ -841,16 +854,7 @@ Tcl_UtfToUpper(str)
src = dst = str;
while (*src) {
src += Tcl_UtfToUniChar(src, &ch);
-
- /*
- * INTL: This conversion should be replaced with a table lookup for the
- * full Unicode translation.
- */
-
- if ((ch < 0x100) && islower(ch)) { /* INTL: ISO only */
- ch = (Tcl_UniChar) UCHAR(toupper(ch)); /* INTL: ISO only */
- }
- dst += Tcl_UniCharToUtf(ch, dst);
+ dst += Tcl_UniCharToUtf(Tcl_UniCharToUpper(ch), dst);
}
*dst = '\0';
return (dst - str);
@@ -864,10 +868,6 @@ Tcl_UtfToUpper(str)
* Convert uppercase characters to lowercase characters in a UTF
* string in place. The conversion may shrink the UTF string.
*
- * INTL: This implementation only handles iso8859-1 characters
- * in the current locale. This should be changed to use the
- * Unicode character tables.
- *
* Results:
* Returns the number of bytes in the resulting string
* excluding the trailing null.
@@ -892,16 +892,7 @@ Tcl_UtfToLower(str)
src = dst = str;
while (*src) {
src += Tcl_UtfToUniChar(src, &ch);
-
- /*
- * INTL: This conversion should be replaced with a table lookup for the
- * full Unicode translation.
- */
-
- if ((ch < 0x100) && isupper(ch)) { /* INTL: ISO only */
- ch = (Tcl_UniChar) UCHAR(tolower(ch)); /* INTL: ISO only */
- }
- dst += Tcl_UniCharToUtf(ch, dst);
+ dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst);
}
*dst = '\0';
return (dst - str);
@@ -916,10 +907,6 @@ Tcl_UtfToLower(str)
* uppercase and the rest of the string to lowercase. The
* conversion happens in place and may shrink the UTF string.
*
- * INTL: This implementation only handles iso8859-1 characters
- * in the current locale. This should be changed to use the
- * Unicode character tables.
- *
* Results:
* Returns the number of bytes in the resulting string
* excluding the trailing null.
@@ -946,17 +933,11 @@ Tcl_UtfToTitle(str)
if (*src) {
src += Tcl_UtfToUniChar(src, &ch);
- if ((ch < 0x100) && islower(ch)) { /* INTL: ISO only */
- ch = (Tcl_UniChar) UCHAR(toupper(ch)); /* INTL: ISO only */
- }
- dst += Tcl_UniCharToUtf(ch, dst);
+ dst += Tcl_UniCharToUtf(Tcl_UniCharToTitle(ch), dst);
}
while (*src) {
src += Tcl_UtfToUniChar(src, &ch);
- if ((ch < 0x100) && isupper(ch)) { /* INTL: ISO only */
- ch = (Tcl_UniChar) UCHAR(tolower(ch)); /* INTL: ISO only */
- }
- dst += Tcl_UniCharToUtf(ch, dst);
+ dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst);
}
*dst = '\0';
return (dst - str);
@@ -969,8 +950,6 @@ Tcl_UtfToTitle(str)
*
* Compute the uppercase equivalent of the given Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
* Returns the uppercase Unicode character.
*
@@ -984,9 +963,13 @@ Tcl_UniChar
Tcl_UniCharToUpper(ch)
int ch; /* Unicode character to convert. */
{
- return (Tcl_UniChar) ((ch < 0x100)
- ? UCHAR(toupper(ch)) /* INTL: ISO only */
- : ch);
+ int info = GetUniCharInfo(ch);
+
+ if (GetCaseType(info) & 0x04) {
+ return (Tcl_UniChar) (ch - GetDelta(info));
+ } else {
+ return ch;
+ }
}
/*
@@ -996,8 +979,6 @@ Tcl_UniCharToUpper(ch)
*
* Compute the lowercase equivalent of the given Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
* Returns the lowercase Unicode character.
*
@@ -1011,9 +992,13 @@ Tcl_UniChar
Tcl_UniCharToLower(ch)
int ch; /* Unicode character to convert. */
{
- return (Tcl_UniChar) ((ch < 0x100)
- ? UCHAR(tolower(ch)) /* INTL: ISO only */
- : ch);
+ int info = GetUniCharInfo(ch);
+
+ if (GetCaseType(info) & 0x02) {
+ return (Tcl_UniChar) (ch + GetDelta(info));
+ } else {
+ return ch;
+ }
}
/*
@@ -1023,8 +1008,6 @@ Tcl_UniCharToLower(ch)
*
* Compute the titlecase equivalent of the given Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
* Returns the titlecase Unicode character.
*
@@ -1038,9 +1021,20 @@ Tcl_UniChar
Tcl_UniCharToTitle(ch)
int ch; /* Unicode character to convert. */
{
- return (Tcl_UniChar) ((ch < 0x100)
- ? UCHAR(toupper(ch)) /* INTL: ISO only */
- : ch);
+ int info = GetUniCharInfo(ch);
+ int mode = GetCaseType(info);
+
+ if (mode & 0x1) {
+ /*
+ * Subtract or add one depending on the original case.
+ */
+
+ return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
+ } else if (mode == 0x4) {
+ return (Tcl_UniChar) (ch - GetDelta(info));
+ } else {
+ return ch;
+ }
}
/*
@@ -1114,8 +1108,6 @@ TclUniCharNcmp(cs, ct, n)
*
* Test if a character is an alphanumeric Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
* Returns 1 if character is alphanumeric.
*
@@ -1129,7 +1121,9 @@ int
TclUniCharIsAlnum(ch)
int ch; /* Unicode character to test. */
{
- return ((ch < 0x100) ? isalnum(ch) : 0); /* INTL: ISO only */
+ register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+ return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
}
/*
@@ -1139,8 +1133,6 @@ TclUniCharIsAlnum(ch)
*
* Test if a character is an alphabetic Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
* Returns 1 if character is alphabetic.
*
@@ -1154,7 +1146,8 @@ int
TclUniCharIsAlpha(ch)
int ch; /* Unicode character to test. */
{
- return ((ch < 0x100) ? isalpha(ch) : 0); /* INTL: ISO only */
+ register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+ return ((ALPHA_BITS >> category) & 1);
}
/*
@@ -1164,10 +1157,8 @@ TclUniCharIsAlpha(ch)
*
* Test if a character is a numeric Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
- * Returns 1 if character is a digit.
+ * Returns non-zero if character is a digit.
*
* Side effects:
* None.
@@ -1179,7 +1170,8 @@ int
TclUniCharIsDigit(ch)
int ch; /* Unicode character to test. */
{
- return ((ch < 0x100) ? isdigit(ch) : 0); /* INTL: ISO only */
+ return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
+ == DECIMAL_DIGIT_NUMBER);
}
/*
@@ -1189,10 +1181,8 @@ TclUniCharIsDigit(ch)
*
* Test if a character is a lowercase Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
- * Returns 1 if character is lowercase.
+ * Returns non-zero if character is lowercase.
*
* Side effects:
* None.
@@ -1204,7 +1194,7 @@ int
TclUniCharIsLower(ch)
int ch; /* Unicode character to test. */
{
- return ((ch < 0x100) ? islower(ch) : 0); /* INTL: ISO only */
+ return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
}
/*
@@ -1214,10 +1204,8 @@ TclUniCharIsLower(ch)
*
* Test if a character is a whitespace Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
- * Returns 1 if character is a space.
+ * Returns non-zero if character is a space.
*
* Side effects:
* None.
@@ -1229,7 +1217,19 @@ int
TclUniCharIsSpace(ch)
int ch; /* Unicode character to test. */
{
- return ((ch < 0x100) ? isspace(ch) : 0); /* INTL: ISO only */
+ register int category;
+
+ /*
+ * If the character is within the first 127 characters, just use the
+ * standard C function, otherwise consult the Unicode table.
+ */
+
+ if (ch < 0x80) {
+ return isspace(UCHAR(ch)); /* INTL: ISO space */
+ } else {
+ category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+ return ((SPACE_BITS >> category) & 1);
+ }
}
/*
@@ -1239,10 +1239,8 @@ TclUniCharIsSpace(ch)
*
* Test if a character is a uppercase Unicode character.
*
- * INTL: this implementation only works on ISO characters.
- *
* Results:
- * Returns 1 if character is uppercase.
+ * Returns non-zero if character is uppercase.
*
* Side effects:
* None.
@@ -1254,5 +1252,31 @@ int
TclUniCharIsUpper(ch)
int ch; /* Unicode character to test. */
{
- return ((ch < 0x100) ? isupper(ch) : 0); /* INTL: ISO only */
+ return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * TclUniCharIsWordChar --
+ *
+ * Test if a character is alphanumeric or a connector punctuation
+ * mark.
+ *
+ * Results:
+ * Returns 1 if character is a word character.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUniCharIsWordChar(ch)
+ int ch; /* Unicode character to test. */
+{
+ register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+ return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
}