diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 121 |
1 files changed, 74 insertions, 47 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 31e52ba..15529c7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -7,8 +7,6 @@ * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclUtf.c,v 1.40 2009/09/07 07:28:38 das Exp $ */ #include "tclInt.h" @@ -28,28 +26,27 @@ #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) + #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ | (1 << PARAGRAPH_SEPARATOR)) -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) - -#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ - (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ - (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ - (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ - (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ - (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) +#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ + (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ + (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ + (1 << OTHER_NUMBER) | \ + (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ + (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) + /* * Unicode characters less than this value are represented by themselves in * UTF-8 strings. @@ -1109,6 +1106,46 @@ Tcl_UtfNcasecmp( /* *---------------------------------------------------------------------- * + * Tcl_UtfNcasecmp -- + * + * Compare UTF chars of string cs to string ct case insensitively. + * Replacement for strcasecmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCasecmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + while (*cs && *ct) { + Tcl_UniChar ch1, ch2; + + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { + ch1 = Tcl_UniCharToLower(ch1); + ch2 = Tcl_UniCharToLower(ch2); + if (ch1 != ch2) { + return ch1 - ch2; + } + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + + +/* + *---------------------------------------------------------------------- + * * Tcl_UniCharToUpper -- * * Compute the uppercase equivalent of the given Unicode character. @@ -1129,10 +1166,9 @@ Tcl_UniCharToUpper( int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x04) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; + ch -= GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1158,10 +1194,9 @@ Tcl_UniCharToLower( int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x02) { - return (Tcl_UniChar) (ch + GetDelta(info)); - } else { - return ch; + ch += GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1192,12 +1227,11 @@ Tcl_UniCharToTitle( * Subtract or add one depending on the original case. */ - return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); + ch += ((mode & 0x4) ? -1 : 1); } else if (mode == 0x4) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; + ch -= GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1331,9 +1365,7 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); + return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } /* @@ -1356,8 +1388,7 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((ALPHA_BITS >> category) & 1); + return ((ALPHA_BITS >> GetCategory(ch)) & 1); } /* @@ -1380,7 +1411,7 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); + return ((CONTROL_BITS >> GetCategory(ch)) & 1); } /* @@ -1403,7 +1434,7 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { - return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER; + return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } /* @@ -1426,8 +1457,7 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); + return ((GRAPH_BITS >> GetCategory(ch)) & 1); } /* @@ -1450,7 +1480,7 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); + return (GetCategory(ch) == LOWERCASE_LETTER); } /* @@ -1473,8 +1503,7 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PRINT_BITS >> category) & 1); + return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } /* @@ -1497,8 +1526,7 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PUNCT_BITS >> category) & 1); + return ((PUNCT_BITS >> GetCategory(ch)) & 1); } /* @@ -1521,18 +1549,19 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { - register int category; - /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (ch < 0x80) { - return isspace(UCHAR(ch)); /* INTL: ISO space */ + if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { + return TclIsSpaceProc((char) ch); + } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e + || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060 + || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) { + return 1; } else { - category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((SPACE_BITS >> category) & 1); + return ((SPACE_BITS >> GetCategory(ch)) & 1); } } @@ -1556,7 +1585,7 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); + return (GetCategory(ch) == UPPERCASE_LETTER); } /* @@ -1579,9 +1608,7 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); + return ((WORD_BITS >> GetCategory(ch)) & 1); } /* |