diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 472 |
1 files changed, 252 insertions, 220 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 992a55f..15529c7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -7,8 +7,6 @@ * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclUtf.c,v 1.36 2005/09/07 15:31:10 dgp Exp $ */ #include "tclInt.h" @@ -23,46 +21,45 @@ * The following macros are used for fast character category tests. The x_BITS * values are shifted right by the category value to determine whether the * given category is included in the set. - */ + */ #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) + #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ | (1 << PARAGRAPH_SEPARATOR)) -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) - -#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ - (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ - (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ - (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ - (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ - (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) +#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ + (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ + (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ + (1 
<< OTHER_NUMBER) | \ + (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ + (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) + /* - * Unicode characters less than this value are represented by themselves - * in UTF-8 strings. + * Unicode characters less than this value are represented by themselves in + * UTF-8 strings. */ #define UNICODE_SELF 0x80 /* - * The following structures are used when mapping between Unicode (UCS-2) - * and UTF-8. + * The following structures are used when mapping between Unicode (UCS-2) and + * UTF-8. */ -static CONST unsigned char totalBytes[256] = { +static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -89,10 +86,10 @@ static CONST unsigned char totalBytes[256] = { }; /* - * Procedures used only in this module. + * Functions used only in this module. */ -static int UtfCount _ANSI_ARGS_((int ch)); +static int UtfCount(int ch); /* *--------------------------------------------------------------------------- @@ -109,10 +106,10 @@ static int UtfCount _ANSI_ARGS_((int ch)); * *--------------------------------------------------------------------------- */ - + INLINE static int -UtfCount(ch) - int ch; /* The Tcl_UniChar whose size is returned. */ +UtfCount( + int ch) /* The Tcl_UniChar whose size is returned. */ { if ((ch > 0) && (ch < UNICODE_SELF)) { return 1; @@ -154,12 +151,12 @@ UtfCount(ch) * *--------------------------------------------------------------------------- */ - + INLINE int -Tcl_UniCharToUtf(ch, buf) - int ch; /* The Tcl_UniChar to be stored in the +Tcl_UniCharToUtf( + int ch, /* The Tcl_UniChar to be stored in the * buffer. */ - char *buf; /* Buffer in which the UTF-8 representation of + char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. 
Buffer must be * large enough to hold the UTF-8 character * (at most TCL_UTF_MAX bytes). */ @@ -231,16 +228,16 @@ Tcl_UniCharToUtf(ch, buf) * *--------------------------------------------------------------------------- */ - + char * -Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr) - CONST Tcl_UniChar *uniStr; /* Unicode string to convert to UTF-8. */ - int uniLength; /* Length of Unicode string in Tcl_UniChars +Tcl_UniCharToUtfDString( + const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string in Tcl_UniChars * (must be >= 0). */ - Tcl_DString *dsPtr; /* UTF-8 representation of string is appended + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - CONST Tcl_UniChar *w, *wEnd; + const Tcl_UniChar *w, *wEnd; char *p, *string; int oldLength; @@ -289,15 +286,15 @@ Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr) * *--------------------------------------------------------------------------- */ - + int -Tcl_UtfToUniChar(src, chPtr) - register CONST char *src; /* The UTF-8 string. */ - register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented by - * the UTF-8 string. */ +Tcl_UtfToUniChar( + register const char *src, /* The UTF-8 string. */ + register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by + * the UTF-8 string. */ { register int byte; - + /* * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. */ @@ -335,7 +332,7 @@ Tcl_UtfToUniChar(src, chPtr) * Three-byte-character lead byte followed by two trail bytes. 
*/ - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); return 3; } @@ -349,7 +346,7 @@ Tcl_UtfToUniChar(src, chPtr) return 1; } #if TCL_UTF_MAX > 3 - else { + { int ch, total, trail; total = totalBytes[byte]; @@ -395,16 +392,16 @@ Tcl_UtfToUniChar(src, chPtr) */ Tcl_UniChar * -Tcl_UtfToUniCharDString(src, length, dsPtr) - CONST char *src; /* UTF-8 string to convert to Unicode. */ - int length; /* Length of UTF-8 string in bytes, or -1 for +Tcl_UtfToUniCharDString( + const char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for * strlen(). */ - Tcl_DString *dsPtr; /* Unicode representation of string is + Tcl_DString *dsPtr) /* Unicode representation of string is * appended to this previously initialized * DString. */ { Tcl_UniChar *w, *wString; - CONST char *p, *end; + const char *p, *end; int oldLength; if (length < 0) { @@ -417,6 +414,7 @@ Tcl_UtfToUniCharDString(src, length, dsPtr) */ oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! */ Tcl_DStringSetLength(dsPtr, (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); @@ -454,10 +452,10 @@ Tcl_UtfToUniCharDString(src, length, dsPtr) */ int -Tcl_UtfCharComplete(src, length) - CONST char *src; /* String to check if first few bytes contain +Tcl_UtfCharComplete( + const char *src, /* String to check if first few bytes contain * a complete UTF-8 character. */ - int length; /* Length of above string in bytes. */ + int length) /* Length of above string in bytes. */ { int ch; @@ -475,18 +473,18 @@ Tcl_UtfCharComplete(src, length) * utflen() and utfnlen(). * * Results: - * As above. + * As above. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - -int -Tcl_NumUtfChars(src, length) - register CONST char *src; /* The UTF-8 string to measure. 
*/ - int length; /* The length of the string in bytes, or -1 + +int +Tcl_NumUtfChars( + register const char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { Tcl_UniChar ch; @@ -542,14 +540,15 @@ Tcl_NumUtfChars(src, length) * *--------------------------------------------------------------------------- */ -CONST char * -Tcl_UtfFindFirst(src, ch) - CONST char *src; /* The UTF-8 string to be searched. */ - int ch; /* The Tcl_UniChar to search for. */ + +const char * +Tcl_UtfFindFirst( + const char *src, /* The UTF-8 string to be searched. */ + int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - + while (1) { len = TclUtfToUniChar(src, &find); if (find == ch) { @@ -581,15 +580,15 @@ Tcl_UtfFindFirst(src, ch) *--------------------------------------------------------------------------- */ -CONST char * -Tcl_UtfFindLast(src, ch) - CONST char *src; /* The UTF-8 string to be searched. */ - int ch; /* The Tcl_UniChar to search for. */ +const char * +Tcl_UtfFindLast( + const char *src, /* The UTF-8 string to be searched. */ + int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - CONST char *last; - + const char *last; + last = NULL; while (1) { len = TclUtfToUniChar(src, &find); @@ -622,10 +621,10 @@ Tcl_UtfFindLast(src, ch) * *--------------------------------------------------------------------------- */ - -CONST char * -Tcl_UtfNext(src) - CONST char *src; /* The current location in the string. */ + +const char * +Tcl_UtfNext( + const char *src) /* The current location in the string. */ { Tcl_UniChar ch; @@ -653,15 +652,15 @@ Tcl_UtfNext(src) *--------------------------------------------------------------------------- */ -CONST char * -Tcl_UtfPrev(src, start) - CONST char *src; /* The current location in the string. 
*/ - CONST char *start; /* Pointer to the beginning of the string, to +const char * +Tcl_UtfPrev( + const char *src, /* The current location in the string. */ + const char *start) /* Pointer to the beginning of the string, to * avoid going backwards too far. */ { - CONST char *look; + const char *look; int i, byte; - + src--; look = src; for (i = 0; i < TCL_UTF_MAX; i++) { @@ -682,7 +681,7 @@ Tcl_UtfPrev(src, start) } return src; } - + /* *--------------------------------------------------------------------------- * @@ -699,13 +698,13 @@ Tcl_UtfPrev(src, start) * *--------------------------------------------------------------------------- */ - + Tcl_UniChar -Tcl_UniCharAtIndex(src, index) - register CONST char *src; /* The UTF-8 string to dereference. */ - register int index; /* The position of the desired character. */ +Tcl_UniCharAtIndex( + register const char *src, /* The UTF-8 string to dereference. */ + register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; while (index >= 0) { index--; @@ -731,13 +730,13 @@ Tcl_UniCharAtIndex(src, index) *--------------------------------------------------------------------------- */ -CONST char * -Tcl_UtfAtIndex(src, index) - register CONST char *src; /* The UTF-8 string. */ - register int index; /* The position of the desired character. */ +const char * +Tcl_UtfAtIndex( + register const char *src, /* The UTF-8 string. */ + register int index) /* The position of the desired character. */ { Tcl_UniChar ch; - + while (index > 0) { index--; src += TclUtfToUniChar(src, &ch); @@ -772,12 +771,12 @@ Tcl_UtfAtIndex(src, index) */ int -Tcl_UtfBackslash(src, readPtr, dst) - CONST char *src; /* Points to the backslash character of a +Tcl_UtfBackslash( + const char *src, /* Points to the backslash character of a * backslash sequence. */ - int *readPtr; /* Fill in with number of characters read from + int *readPtr, /* Fill in with number of characters read from * src, unless NULL. 
*/ - char *dst; /* Filled with the bytes represented by the + char *dst) /* Filled with the bytes represented by the * backslash sequence. */ { #define LINE_LENGTH 128 @@ -786,7 +785,10 @@ Tcl_UtfBackslash(src, readPtr, dst) result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); if (numRead == LINE_LENGTH) { - /* We ate a whole line. Pay the price of a strlen() */ + /* + * We ate a whole line. Pay the price of a strlen() + */ + result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); } if (readPtr != NULL) { @@ -814,8 +816,8 @@ Tcl_UtfBackslash(src, readPtr, dst) */ int -Tcl_UtfToUpper(str) - char *str; /* String to convert in place. */ +Tcl_UtfToUpper( + char *str) /* String to convert in place. */ { Tcl_UniChar ch, upChar; char *src, *dst; @@ -835,7 +837,7 @@ Tcl_UtfToUpper(str) * conversion (thereby causing a segfault), only copy the upper case * char to dst if its size is <= the original char. */ - + if (bytes < UtfCount(upChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -867,13 +869,13 @@ Tcl_UtfToUpper(str) */ int -Tcl_UtfToLower(str) - char *str; /* String to convert in place. */ +Tcl_UtfToLower( + char *str) /* String to convert in place. */ { Tcl_UniChar ch, lowChar; char *src, *dst; int bytes; - + /* * Iterate over the string until we hit the terminating null. */ @@ -888,7 +890,7 @@ Tcl_UtfToLower(str) * conversion (thereby causing a segfault), only copy the lower case * char to dst if its size is <= the original char. */ - + if (bytes < UtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -921,13 +923,13 @@ Tcl_UtfToLower(str) */ int -Tcl_UtfToTitle(str) - char *str; /* String to convert in place. */ +Tcl_UtfToTitle( + char *str) /* String to convert in place. */ { Tcl_UniChar ch, titleChar, lowChar; char *src, *dst; int bytes; - + /* * Capitalize the first character and then lowercase the rest of the * characters until we get to a null. 
@@ -968,8 +970,8 @@ Tcl_UtfToTitle(str) * * TclpUtfNcmp2 -- * - * Compare at most n bytes of utf-8 strings cs and ct. Both cs and ct are - * assumed to be at least n bytes long. + * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and + * ct are assumed to be at least numBytes bytes long. * * Results: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. @@ -981,26 +983,26 @@ Tcl_UtfToTitle(str) */ int -TclpUtfNcmp2(cs, ct, n) - CONST char *cs; /* UTF string to compare to ct. */ - CONST char *ct; /* UTF string cs is compared to. */ - unsigned long n; /* Number of *bytes* to compare. */ +TclpUtfNcmp2( + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ + unsigned long numBytes) /* Number of *bytes* to compare. */ { /* - * We can't simply call 'memcmp(cs, ct, n);' because we need to check for - * Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes fine in - * the strcmp manner. + * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to + * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes + * fine in the strcmp manner. */ register int result = 0; - for ( ; n != 0; n--, cs++, ct++) { + for ( ; numBytes != 0; numBytes--, cs++, ct++) { if (*cs != *ct) { result = UCHAR(*cs) - UCHAR(*ct); break; } } - if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { + if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { unsigned char c1, c2; c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); @@ -1028,10 +1030,10 @@ TclpUtfNcmp2(cs, ct, n) */ int -Tcl_UtfNcmp(cs, ct, numChars) - CONST char *cs; /* UTF string to compare to ct. */ - CONST char *ct; /* UTF string cs is compared to. */ - unsigned long numChars; /* Number of UTF chars to compare. */ +Tcl_UtfNcmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. 
*/ { Tcl_UniChar ch1, ch2; @@ -1063,8 +1065,8 @@ Tcl_UtfNcmp(cs, ct, numChars) * Tcl_UtfNcasecmp -- * * Compare at most numChars UTF chars of string cs to string ct case - * insensitive. Both cs and ct are assumed to be at least numChars - * UTF chars long. + * insensitive. Both cs and ct are assumed to be at least numChars UTF + * chars long. * * Results: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. @@ -1076,10 +1078,10 @@ Tcl_UtfNcmp(cs, ct, numChars) */ int -Tcl_UtfNcasecmp(cs, ct, numChars) - CONST char *cs; /* UTF string to compare to ct. */ - CONST char *ct; /* UTF string cs is compared to. */ - unsigned long numChars; /* Number of UTF chars to compare. */ +Tcl_UtfNcasecmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; while (numChars-- > 0) { @@ -1104,6 +1106,46 @@ Tcl_UtfNcasecmp(cs, ct, numChars) /* *---------------------------------------------------------------------- * + * TclUtfCasecmp -- + * + * Compare UTF chars of string cs to string ct case insensitively. + * Replacement for strcasecmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCasecmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. 
*/ +{ + while (*cs && *ct) { + Tcl_UniChar ch1, ch2; + + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { + ch1 = Tcl_UniCharToLower(ch1); + ch2 = Tcl_UniCharToLower(ch2); + if (ch1 != ch2) { + return ch1 - ch2; + } + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + + +/* + *---------------------------------------------------------------------- + * * Tcl_UniCharToUpper -- * * Compute the uppercase equivalent of the given Unicode character. @@ -1118,16 +1160,15 @@ Tcl_UtfNcasecmp(cs, ct, numChars) */ Tcl_UniChar -Tcl_UniCharToUpper(ch) - int ch; /* Unicode character to convert. */ +Tcl_UniCharToUpper( + int ch) /* Unicode character to convert. */ { int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x04) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; + ch -= GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1147,16 +1188,15 @@ Tcl_UniCharToUpper(ch) */ Tcl_UniChar -Tcl_UniCharToLower(ch) - int ch; /* Unicode character to convert. */ +Tcl_UniCharToLower( + int ch) /* Unicode character to convert. */ { int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x02) { - return (Tcl_UniChar) (ch + GetDelta(info)); - } else { - return ch; + ch += GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1176,8 +1216,8 @@ Tcl_UniCharToLower(ch) */ Tcl_UniChar -Tcl_UniCharToTitle(ch) - int ch; /* Unicode character to convert. */ +Tcl_UniCharToTitle( + int ch) /* Unicode character to convert. */ { int info = GetUniCharInfo(ch); int mode = GetCaseType(info); @@ -1187,12 +1227,11 @@ Tcl_UniCharToTitle(ch) * Subtract or add one depending on the original case. */ - return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); + ch += ((mode & 0x4) ? -1 : 1); } else if (mode == 0x4) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; + ch -= GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1200,7 +1239,7 @@ Tcl_UniCharToTitle(ch) * * Tcl_UniCharLen -- * - * Find the length of a UniChar string. 
The str input must be null + * Find the length of a UniChar string. The str input must be null * terminated. * * Results: @@ -1213,11 +1252,11 @@ Tcl_UniCharToTitle(ch) */ int -Tcl_UniCharLen(uniStr) - CONST Tcl_UniChar *uniStr; /* Unicode string to find length of. */ +Tcl_UniCharLen( + const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { int len = 0; - + while (*uniStr != '\0') { len++; uniStr++; @@ -1243,10 +1282,10 @@ Tcl_UniCharLen(uniStr) */ int -Tcl_UniCharNcmp(ucs, uct, numChars) - CONST Tcl_UniChar *ucs; /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct; /* Unicode string ucs is compared to. */ - unsigned long numChars; /* Number of unichars to compare. */ +Tcl_UniCharNcmp( + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ { #ifdef WORDS_BIGENDIAN /* @@ -1275,7 +1314,7 @@ Tcl_UniCharNcmp(ucs, uct, numChars) * Tcl_UniCharNcasecmp -- * * Compare at most numChars unichars of string ucs to string uct case - * insensitive. Both ucs and uct are assumed to be at least numChars + * insensitive. Both ucs and uct are assumed to be at least numChars * unichars long. * * Results: @@ -1288,15 +1327,16 @@ Tcl_UniCharNcmp(ucs, uct, numChars) */ int -Tcl_UniCharNcasecmp(ucs, uct, numChars) - CONST Tcl_UniChar *ucs; /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct; /* Unicode string ucs is compared to. */ - unsigned long numChars; /* Number of unichars to compare. */ +Tcl_UniCharNcasecmp( + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. 
*/ { for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); Tcl_UniChar lct = Tcl_UniCharToLower(*uct); + if (lcs != lct) { return (lcs - lct); } @@ -1322,12 +1362,10 @@ Tcl_UniCharNcasecmp(ucs, uct, numChars) */ int -Tcl_UniCharIsAlnum(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsAlnum( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); + return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } /* @@ -1347,11 +1385,10 @@ Tcl_UniCharIsAlnum(ch) */ int -Tcl_UniCharIsAlpha(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsAlpha( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((ALPHA_BITS >> category) & 1); + return ((ALPHA_BITS >> GetCategory(ch)) & 1); } /* @@ -1371,10 +1408,10 @@ Tcl_UniCharIsAlpha(ch) */ int -Tcl_UniCharIsControl(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsControl( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); + return ((CONTROL_BITS >> GetCategory(ch)) & 1); } /* @@ -1394,11 +1431,10 @@ Tcl_UniCharIsControl(ch) */ int -Tcl_UniCharIsDigit(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsDigit( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) - == DECIMAL_DIGIT_NUMBER); + return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } /* @@ -1418,11 +1454,10 @@ Tcl_UniCharIsDigit(ch) */ int -Tcl_UniCharIsGraph(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsGraph( + int ch) /* Unicode character to test. 
*/ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); + return ((GRAPH_BITS >> GetCategory(ch)) & 1); } /* @@ -1442,10 +1477,10 @@ Tcl_UniCharIsGraph(ch) */ int -Tcl_UniCharIsLower(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsLower( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); + return (GetCategory(ch) == LOWERCASE_LETTER); } /* @@ -1465,11 +1500,10 @@ Tcl_UniCharIsLower(ch) */ int -Tcl_UniCharIsPrint(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsPrint( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PRINT_BITS >> category) & 1); + return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } /* @@ -1489,11 +1523,10 @@ Tcl_UniCharIsPrint(ch) */ int -Tcl_UniCharIsPunct(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsPunct( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PUNCT_BITS >> category) & 1); + return ((PUNCT_BITS >> GetCategory(ch)) & 1); } /* @@ -1513,21 +1546,22 @@ Tcl_UniCharIsPunct(ch) */ int -Tcl_UniCharIsSpace(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsSpace( + int ch) /* Unicode character to test. */ { - register int category; - /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. 
*/ - if (ch < 0x80) { - return isspace(UCHAR(ch)); /* INTL: ISO space */ + if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { + return TclIsSpaceProc((char) ch); + } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e + || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060 + || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) { + return 1; } else { - category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((SPACE_BITS >> category) & 1); + return ((SPACE_BITS >> GetCategory(ch)) & 1); } } @@ -1548,10 +1582,10 @@ Tcl_UniCharIsSpace(ch) */ int -Tcl_UniCharIsUpper(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsUpper( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); + return (GetCategory(ch) == UPPERCASE_LETTER); } /* @@ -1571,12 +1605,10 @@ Tcl_UniCharIsUpper(ch) */ int -Tcl_UniCharIsWordChar(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsWordChar( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); + return ((WORD_BITS >> GetCategory(ch)) & 1); } /* @@ -1603,24 +1635,24 @@ Tcl_UniCharIsWordChar(ch) */ int -Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase) - CONST Tcl_UniChar *uniStr; /* Unicode String. */ - CONST Tcl_UniChar *uniPattern; +Tcl_UniCharCaseMatch( + const Tcl_UniChar *uniStr, /* Unicode String. */ + const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ - int nocase; /* 0 for case sensitive, 1 for insensitive */ + int nocase) /* 0 for case sensitive, 1 for insensitive */ { Tcl_UniChar ch1, p; - + while (1) { p = *uniPattern; - + /* * See if we're at the end of both the pattern and the string. If so, * we succeeded. If we're at the end of the pattern but not at the end * of the string, we failed. 
*/ - + if (p == 0) { return (*uniStr == 0); } @@ -1635,7 +1667,7 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase) * recursively for each postfix of string, until either we match or we * reach the end of the string. */ - + if (p == '*') { /* * Skip all successive *'s in the pattern @@ -1696,7 +1728,7 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase) * list of characters that are acceptable, or by a range (two * characters separated by "-"). */ - + if (p == '[') { Tcl_UniChar startChar, endChar; @@ -1752,8 +1784,8 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase) } /* - * There's no special character. Just make sure that the next bytes - * of each string match. + * There's no special character. Just make sure that the next bytes of + * each string match. */ if (nocase) { @@ -1791,15 +1823,15 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase) */ int -TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) - CONST Tcl_UniChar *string; /* Unicode String. */ - int strLen; /* length of String */ - CONST Tcl_UniChar *pattern; /* Pattern, which may contain special +TclUniCharMatch( + const Tcl_UniChar *string, /* Unicode String. */ + int strLen, /* Length of String */ + const Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ - int ptnLen; /* length of Pattern */ - int nocase; /* 0 for case sensitive, 1 for insensitive */ + int ptnLen, /* Length of Pattern */ + int nocase) /* 0 for case sensitive, 1 for insensitive */ { - CONST Tcl_UniChar *stringEnd, *patternEnd; + const Tcl_UniChar *stringEnd, *patternEnd; Tcl_UniChar p; stringEnd = string + strLen; @@ -1827,7 +1859,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) * recursively for each postfix of string, until either we match or we * reach the end of the string. */ - + if (p == '*') { /* * Skip all successive *'s in the pattern. 
@@ -1889,7 +1921,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) * list of characters that are acceptable, or by a range (two * characters separated by "-"). */ - + if (p == '[') { Tcl_UniChar ch1, startChar, endChar; |