diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 198 |
1 files changed, 87 insertions, 111 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e5497a4..16acab2 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -7,6 +7,8 @@ * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. + * + * RCS: @(#) $Id: tclUtf.c,v 1.39 2009/02/11 15:28:59 dgp Exp $ */ #include "tclInt.h" @@ -26,27 +28,28 @@ #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) -#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) - #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ | (1 << PARAGRAPH_SEPARATOR)) -#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) - -#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) +#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) -#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ +#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ - (1 << OTHER_NUMBER) | \ + (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ + (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ + (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ + (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ + (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ + (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ + (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) + /* * Unicode characters less than this value are represented by themselves in * UTF-8 strings. @@ -59,7 +62,7 @@ * UTF-8. */ -static CONST unsigned char totalBytes[256] = { +static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -231,13 +234,13 @@ Tcl_UniCharToUtf( char * Tcl_UniCharToUtfDString( - CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ int uniLength, /* Length of Unicode string in Tcl_UniChars * (must be >= 0). */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - CONST Tcl_UniChar *w, *wEnd; + const Tcl_UniChar *w, *wEnd; char *p, *string; int oldLength; @@ -289,7 +292,7 @@ Tcl_UniCharToUtfDString( int Tcl_UtfToUniChar( - register CONST char *src, /* The UTF-8 string. */ + register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { @@ -393,7 +396,7 @@ Tcl_UtfToUniChar( Tcl_UniChar * Tcl_UtfToUniCharDString( - CONST char *src, /* UTF-8 string to convert to Unicode. */ + const char *src, /* UTF-8 string to convert to Unicode. */ int length, /* Length of UTF-8 string in bytes, or -1 for * strlen(). */ Tcl_DString *dsPtr) /* Unicode representation of string is @@ -401,7 +404,7 @@ Tcl_UtfToUniCharDString( * DString. */ { Tcl_UniChar *w, *wString; - CONST char *p, *end; + const char *p, *end; int oldLength; if (length < 0) { @@ -414,6 +417,7 @@ Tcl_UtfToUniCharDString( */ oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! */ Tcl_DStringSetLength(dsPtr, (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); @@ -452,7 +456,7 @@ Tcl_UtfToUniCharDString( int Tcl_UtfCharComplete( - CONST char *src, /* String to check if first few bytes contain + const char *src, /* String to check if first few bytes contain * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { @@ -482,7 +486,7 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register CONST char *src, /* The UTF-8 string to measure. */ + register const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { @@ -540,9 +544,9 @@ Tcl_NumUtfChars( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfFindFirst( - CONST char *src, /* The UTF-8 string to be searched. */ + const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; @@ -579,14 +583,14 @@ Tcl_UtfFindFirst( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfFindLast( - CONST char *src, /* The UTF-8 string to be searched. */ + const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - CONST char *last; + const char *last; last = NULL; while (1) { @@ -621,9 +625,9 @@ Tcl_UtfFindLast( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfNext( - CONST char *src) /* The current location in the string. */ + const char *src) /* The current location in the string. */ { Tcl_UniChar ch; @@ -651,13 +655,13 @@ Tcl_UtfNext( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfPrev( - CONST char *src, /* The current location in the string. */ - CONST char *start) /* Pointer to the beginning of the string, to + const char *src, /* The current location in the string. */ + const char *start) /* Pointer to the beginning of the string, to * avoid going backwards too far. */ { - CONST char *look; + const char *look; int i, byte; src--; @@ -700,7 +704,7 @@ Tcl_UtfPrev( Tcl_UniChar Tcl_UniCharAtIndex( - register CONST char *src, /* The UTF-8 string to dereference. */ + register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch; @@ -729,9 +733,9 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfAtIndex( - register CONST char *src, /* The UTF-8 string. */ + register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch; @@ -771,7 +775,7 @@ Tcl_UtfAtIndex( int Tcl_UtfBackslash( - CONST char *src, /* Points to the backslash character of a + const char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr, /* Fill in with number of characters read from * src, unless NULL. */ @@ -983,8 +987,8 @@ Tcl_UtfToTitle( int TclpUtfNcmp2( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numBytes) /* Number of *bytes* to compare. */ { /* @@ -1030,8 +1034,8 @@ TclpUtfNcmp2( int Tcl_UtfNcmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1078,8 +1082,8 @@ Tcl_UtfNcmp( int Tcl_UtfNcasecmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1105,46 +1109,6 @@ Tcl_UtfNcasecmp( /* *---------------------------------------------------------------------- * - * Tcl_UtfNcasecmp -- - * - * Compare UTF chars of string cs to string ct case insensitively. - * Replacement for strcasecmp in Tcl core, in places where UTF-8 should - * be handled. - * - * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -TclUtfCasecmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct) /* UTF string cs is compared to. */ -{ - while (*cs && *ct) { - Tcl_UniChar ch1, ch2; - - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); - if (ch1 != ch2) { - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); - if (ch1 != ch2) { - return ch1 - ch2; - } - } - } - return UCHAR(*cs) - UCHAR(*ct); -} - - -/* - *---------------------------------------------------------------------- - * * Tcl_UniCharToUpper -- * * Compute the uppercase equivalent of the given Unicode character. @@ -1165,9 +1129,10 @@ Tcl_UniCharToUpper( int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); + return (Tcl_UniChar) (ch - GetDelta(info)); + } else { + return ch; } - return (Tcl_UniChar) ch; } /* @@ -1193,9 +1158,10 @@ Tcl_UniCharToLower( int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x02) { - ch += GetDelta(info); + return (Tcl_UniChar) (ch + GetDelta(info)); + } else { + return ch; } - return (Tcl_UniChar) ch; } /* @@ -1226,11 +1192,12 @@ Tcl_UniCharToTitle( * Subtract or add one depending on the original case. */ - ch += ((mode & 0x4) ? -1 : 1); + return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); } else if (mode == 0x4) { - ch -= GetDelta(info); + return (Tcl_UniChar) (ch - GetDelta(info)); + } else { + return ch; } - return (Tcl_UniChar) ch; } /* @@ -1252,7 +1219,7 @@ Tcl_UniCharToTitle( int Tcl_UniCharLen( - CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ + const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { int len = 0; @@ -1282,8 +1249,8 @@ Tcl_UniCharLen( int Tcl_UniCharNcmp( - CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { #ifdef WORDS_BIGENDIAN @@ -1327,8 +1294,8 @@ Tcl_UniCharNcmp( int Tcl_UniCharNcasecmp( - CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { @@ -1364,7 +1331,9 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { - return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + + return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); } /* @@ -1387,7 +1356,8 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { - return ((ALPHA_BITS >> GetCategory(ch)) & 1); + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((ALPHA_BITS >> category) & 1); } /* @@ -1410,7 +1380,7 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { - return ((CONTROL_BITS >> GetCategory(ch)) & 1); + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); } /* @@ -1433,7 +1403,7 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { - return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); + return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER; } /* @@ -1456,7 +1426,8 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { - return ((GRAPH_BITS >> GetCategory(ch)) & 1); + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); } /* @@ -1479,7 +1450,7 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { - return (GetCategory(ch) == LOWERCASE_LETTER); + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); } /* @@ -1502,7 +1473,8 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { - return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((PRINT_BITS >> category) & 1); } /* @@ -1525,7 +1497,8 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { - return ((PUNCT_BITS >> GetCategory(ch)) & 1); + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((PUNCT_BITS >> category) & 1); } /* @@ -1548,17 +1521,18 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { + register int category; + /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { - return TclIsSpaceProc((char) ch); - } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) { - return 1; + if (ch < 0x80) { + return isspace(UCHAR(ch)); /* INTL: ISO space */ } else { - return ((SPACE_BITS >> GetCategory(ch)) & 1); + category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((SPACE_BITS >> category) & 1); } } @@ -1582,7 +1556,7 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { - return (GetCategory(ch) == UPPERCASE_LETTER); + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); } /* @@ -1605,7 +1579,9 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { - return ((WORD_BITS >> GetCategory(ch)) & 1); + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + + return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); } /* @@ -1633,8 +1609,8 @@ Tcl_UniCharIsWordChar( int Tcl_UniCharCaseMatch( - CONST Tcl_UniChar *uniStr, /* Unicode String. */ - CONST Tcl_UniChar *uniPattern, + const Tcl_UniChar *uniStr, /* Unicode String. */ + const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ @@ -1821,14 +1797,14 @@ Tcl_UniCharCaseMatch( int TclUniCharMatch( - CONST Tcl_UniChar *string, /* Unicode String. */ + const Tcl_UniChar *string, /* Unicode String. */ int strLen, /* Length of String */ - CONST Tcl_UniChar *pattern, /* Pattern, which may contain special + const Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - CONST Tcl_UniChar *stringEnd, *patternEnd; + const Tcl_UniChar *stringEnd, *patternEnd; Tcl_UniChar p; stringEnd = string + strLen; |