diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 125 |
1 files changed, 56 insertions, 69 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 505dc91..6b5e2e8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -21,36 +21,35 @@ * The following macros are used for fast character category tests. The * x_BITS values are shifted right by the category value to determine whether * the given category is included in the set. - */ + */ #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER)) +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) + #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ | (1 << PARAGRAPH_SEPARATOR)) -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) - -#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ - (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ - (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ - (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ - (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ - (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) +#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ + (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ + (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ + (1 << OTHER_NUMBER) | \ + (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ + (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) + /* - * Unicode characters less than this value are represented by themselves - * in UTF-8 strings. + * Unicode characters less than this value are represented by themselves + * in UTF-8 strings. */ #define UNICODE_SELF 0x80 @@ -108,7 +107,7 @@ static int UtfCount _ANSI_ARGS_((int ch)); * *--------------------------------------------------------------------------- */ - + INLINE static int UtfCount(ch) int ch; /* The Tcl_UniChar whose size is returned. */ @@ -146,14 +145,14 @@ UtfCount(ch) * * Results: * The return values is the number of bytes in the buffer that - * were consumed. + * were consumed. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - + INLINE int Tcl_UniCharToUtf(ch, str) int ch; /* The Tcl_UniChar to be stored in the @@ -230,7 +229,7 @@ Tcl_UniCharToUtf(ch, str) * *--------------------------------------------------------------------------- */ - + char * Tcl_UniCharToUtfDString(wString, numChars, dsPtr) CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ @@ -289,7 +288,7 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr) * *--------------------------------------------------------------------------- */ - + int Tcl_UtfToUniChar(str, chPtr) register CONST char *str; /* The UTF-8 string. */ @@ -297,7 +296,7 @@ Tcl_UtfToUniChar(str, chPtr) * by the UTF-8 string. */ { register int byte; - + /* * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. */ @@ -334,7 +333,7 @@ Tcl_UtfToUniChar(str, chPtr) * Three-byte-character lead byte followed by two trail bytes. */ - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); return 3; } @@ -474,15 +473,15 @@ Tcl_UtfCharComplete(str, len) * Plan 9 utflen() and utfnlen(). * * Results: - * As above. + * As above. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - -int + +int Tcl_NumUtfChars(str, len) register CONST char *str; /* The UTF-8 string to measure. */ int len; /* The length of the string in bytes, or -1 @@ -549,7 +548,7 @@ Tcl_UtfFindFirst(string, ch) { int len; Tcl_UniChar find; - + while (1) { len = TclUtfToUniChar(string, &find); if (find == ch) { @@ -590,7 +589,7 @@ Tcl_UtfFindLast(string, ch) int len; Tcl_UniChar find; CONST char *last; - + last = NULL; while (1) { len = TclUtfToUniChar(string, &find); @@ -624,9 +623,9 @@ Tcl_UtfFindLast(string, ch) * *--------------------------------------------------------------------------- */ - + CONST char * -Tcl_UtfNext(str) +Tcl_UtfNext(str) CONST char *str; /* The current location in the string. */ { Tcl_UniChar ch; @@ -664,7 +663,7 @@ Tcl_UtfPrev(str, start) { CONST char *look; int i, byte; - + str--; look = str; for (i = 0; i < TCL_UTF_MAX; i++) { @@ -685,7 +684,7 @@ Tcl_UtfPrev(str, start) } return str; } - + /* *--------------------------------------------------------------------------- * @@ -702,7 +701,7 @@ Tcl_UtfPrev(str, start) * *--------------------------------------------------------------------------- */ - + Tcl_UniChar Tcl_UniCharAtIndex(src, index) register CONST char *src; /* The UTF-8 string to dereference. */ @@ -740,7 +739,7 @@ Tcl_UtfAtIndex(src, index) register int index; /* The position of the desired character. */ { Tcl_UniChar ch; - + while (index > 0) { index--; src += TclUtfToUniChar(src, &ch); @@ -760,7 +759,7 @@ Tcl_UtfAtIndex(src, index) * returns the number of bytes written to dst. At most TCL_UTF_MAX * bytes are written to dst; dst must have been large enough to accept * those bytes. If readPtr isn't NULL then it is filled in with a - * count of the number of bytes in the backslash sequence. + * count of the number of bytes in the backslash sequence. * * Side effects: * The maximum number of bytes it takes to represent a Unicode @@ -839,7 +838,7 @@ Tcl_UtfToUpper(str) * the conversion (thereby causing a segfault), only copy the * upper case char to dst if its size is <= the original char. */ - + if (bytes < UtfCount(upChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -877,7 +876,7 @@ Tcl_UtfToLower(str) Tcl_UniChar ch, lowChar; char *src, *dst; int bytes; - + /* * Iterate over the string until we hit the terminating null. */ @@ -892,7 +891,7 @@ Tcl_UtfToLower(str) * the conversion (thereby causing a segfault), only copy the * lower case char to dst if its size is <= the original char. */ - + if (bytes < UtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -931,7 +930,7 @@ Tcl_UtfToTitle(str) Tcl_UniChar ch, titleChar, lowChar; char *src, *dst; int bytes; - + /* * Capitalize the first character and then lowercase the rest of the * characters until we get to a null. @@ -1216,7 +1215,7 @@ Tcl_UniCharLen(str) CONST Tcl_UniChar *str; /* Unicode string to find length of. */ { int len = 0; - + while (*str != '\0') { len++; str++; @@ -1322,9 +1321,7 @@ int Tcl_UniCharIsAlnum(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); + return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } /* @@ -1347,8 +1344,7 @@ int Tcl_UniCharIsAlpha(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((ALPHA_BITS >> category) & 1); + return ((ALPHA_BITS >> GetCategory(ch)) & 1); } /* @@ -1371,7 +1367,7 @@ int Tcl_UniCharIsControl(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); + return ((CONTROL_BITS >> GetCategory(ch)) & 1); } /* @@ -1394,8 +1390,7 @@ int Tcl_UniCharIsDigit(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) - == DECIMAL_DIGIT_NUMBER); + return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } /* @@ -1418,8 +1413,7 @@ int Tcl_UniCharIsGraph(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return (((PRINT_BITS >> category) & 1) && (ch != ' ')); + return ((GRAPH_BITS >> GetCategory(ch)) & 1); } /* @@ -1442,7 +1436,7 @@ int Tcl_UniCharIsLower(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); + return (GetCategory(ch) == LOWERCASE_LETTER); } /* @@ -1465,8 +1459,7 @@ int Tcl_UniCharIsPrint(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PRINT_BITS >> category) & 1); + return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } /* @@ -1489,8 +1482,7 @@ int Tcl_UniCharIsPunct(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PUNCT_BITS >> category) & 1); + return ((PUNCT_BITS >> GetCategory(ch)) & 1); } /* @@ -1513,18 +1505,15 @@ int Tcl_UniCharIsSpace(ch) int ch; /* Unicode character to test. */ { - register int category; - /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (ch < 0x80) { + if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { return isspace(UCHAR(ch)); /* INTL: ISO space */ } else { - category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((SPACE_BITS >> category) & 1); + return ((SPACE_BITS >> GetCategory(ch)) & 1); } } @@ -1548,7 +1537,7 @@ int Tcl_UniCharIsUpper(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); + return (GetCategory(ch) == UPPERCASE_LETTER); } /* @@ -1572,9 +1561,7 @@ int Tcl_UniCharIsWordChar(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); + return ((WORD_BITS >> GetCategory(ch)) & 1); } /* @@ -1609,16 +1596,16 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) int nocase; /* 0 for case sensitive, 1 for insensitive */ { Tcl_UniChar ch1, p; - + while (1) { p = *pattern; - + /* * See if we're at the end of both the pattern and the string. If * so, we succeeded. If we're at the end of the pattern but not at * the end of the string, we failed. */ - + if (p == 0) { return (*string == 0); } @@ -1633,7 +1620,7 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) * recursively for each postfix of string, until either we match or we * reach the end of the string. */ - + if (p == '*') { /* * Skip all successive *'s in the pattern @@ -1688,7 +1675,7 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) * by a list of characters that are acceptable, or by a range * (two characters separated by "-"). */ - + if (p == '[') { Tcl_UniChar startChar, endChar; @@ -1818,7 +1805,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) * recursively for each postfix of string, until either we match or we * reach the end of the string. */ - + if (p == '*') { /* * Skip all successive *'s in the pattern @@ -1876,7 +1863,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) * by a list of characters that are acceptable, or by a range * (two characters separated by "-"). */ - + if (p == '[') { Tcl_UniChar ch1, startChar, endChar; |