diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 342 |
1 files changed, 320 insertions, 22 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 169f240..82adf65 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -799,7 +799,7 @@ Tcl_UtfCharComplete( */ int -Tcl_NumUtfChars( +TclNumUtfChars( const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ @@ -850,6 +850,61 @@ Tcl_NumUtfChars( return i; } +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#undef Tcl_NumUtfChars +int +Tcl_NumUtfChars( + const char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ +{ + unsigned short ch = 0; + int i = 0; + + if (length < 0) { + /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ + while ((*src != '\0') && (i < INT_MAX)) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + } else { + /* Will return value between 0 and length. No overflow checks. */ + + /* Pointer to the end of string. Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to last byte where optimization still can be used */ + const char *optPtr = endPtr - 4; + + /* + * Optimize away the call in this loop. Justified because... + * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr) + * By initialization above (endPtr - optPtr) = TCL_UTF_MAX + * So (endPtr - src) >= TCL_UTF_MAX, and passing that to + * Tcl_UtfCharComplete we know will cause return of 1. + */ + while (src <= optPtr + /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + /* Loop over the remaining string where call must happen */ + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += Tcl_UtfToChar16(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } + i++; + } + } + return i; +} +#endif + /* *--------------------------------------------------------------------------- * @@ -1127,22 +1182,20 @@ Tcl_UniCharAtIndex( const char *src, /* The UTF-8 string to dereference. */ int index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; + unsigned short ch = 0; int i = 0; if (index < 0) { return -1; } while (index-- > 0) { - i = TclUtfToUniChar(src, &ch); + i = Tcl_UtfToChar16(src, &ch); src += i; } -#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (i < 3)) { /* Index points at character following high Surrogate */ return -1; } -#endif TclUtfToUCS4(src, &i); return i; } @@ -1166,27 +1219,56 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ +#if TCL_UTF_MAX < 4 +# undef Tcl_UtfToUniChar +# define Tcl_UtfToUniChar Tcl_UtfToChar16 +#endif + const char * -Tcl_UtfAtIndex( +TclUtfAtIndex( const char *src, /* The UTF-8 string. */ int index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; + Tcl_UniChar ch = 0; int len = 0; while (index-- > 0) { - len = TclUtfToUniChar(src, &ch); + len = (Tcl_UtfToUniChar)(src, &ch); src += len; } #if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (len < 3)) { /* Index points at character following high Surrogate */ - src += TclUtfToUniChar(src, &ch); + src += (Tcl_UtfToUniChar)(src, &ch); } #endif return src; } +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#undef Tcl_UtfAtIndex +const char * +Tcl_UtfAtIndex( + const char *src, /* The UTF-8 string. */ + int index) /* The position of the desired character. */ +{ + unsigned short ch = 0; + int len = 0; + + while (index-- > 0) { + len = Tcl_UtfToChar16(src, &ch); + src += len; + } + if ((ch >= 0xD800) && (len < 3)) { + /* Index points at character following high Surrogate */ + src += Tcl_UtfToChar16(src, &ch); + } + return src; +} + + +#endif + /* *--------------------------------------------------------------------------- * @@ -1849,7 +1931,7 @@ Tcl_UniCharLen( */ int -Tcl_UniCharNcmp( +TclUniCharNcmp( const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ @@ -1868,21 +1950,47 @@ Tcl_UniCharNcmp( for ( ; numChars != 0; ucs++, uct++, numChars--) { if (*ucs != *uct) { -#if TCL_UTF_MAX < 4 + return (*ucs - *uct); + } + } + return 0; +#endif /* WORDS_BIGENDIAN */ +} + +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +int +Tcl_UniCharNcmp( + const unsigned short *ucs, /* Unicode string to compare to uct. */ + const unsigned short *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ +{ +#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) + /* + * We are definitely on a big-endian machine; memcmp() is safe + */ + + return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); + +#else /* !WORDS_BIGENDIAN */ + /* + * We can't simply call memcmp() because that is not lexically correct. + */ + + for ( ; numChars != 0; ucs++, uct++, numChars--) { + if (*ucs != *uct) { /* special case for handling upper surrogates */ if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) { return 1; } else if (((*uct & 0xFC00) == 0xD800)) { return -1; } -#endif return (*ucs - *uct); } } return 0; #endif /* WORDS_BIGENDIAN */ } - +#endif /* *---------------------------------------------------------------------- * @@ -1902,31 +2010,51 @@ Tcl_UniCharNcmp( */ int -Tcl_UniCharNcasecmp( +TclUniCharNcasecmp( const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { - Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); - Tcl_UniChar lct = Tcl_UniCharToLower(*uct); + int lcs = Tcl_UniCharToLower(*ucs); + int lct = Tcl_UniCharToLower(*uct); + + if (lcs != lct) { + return (lcs - lct); + } + } + } + return 0; +} + +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +int +Tcl_UniCharNcasecmp( + const unsigned short *ucs, /* Unicode string to compare to uct. */ + const unsigned short *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ +{ + for ( ; numChars != 0; numChars--, ucs++, uct++) { + if (*ucs != *uct) { + unsigned short lcs = Tcl_UniCharToLower(*ucs); + unsigned short lct = Tcl_UniCharToLower(*uct); if (lcs != lct) { -#if TCL_UTF_MAX < 4 /* special case for handling upper surrogates */ if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { return 1; } else if (((lct & 0xFC00) == 0xD800)) { return -1; } -#endif return (lcs - lct); } } } return 0; } +#endif + /* *---------------------------------------------------------------------- @@ -2290,14 +2418,182 @@ Tcl_UniCharIsWordChar( */ int -Tcl_UniCharCaseMatch( +TclUniCharCaseMatch( const Tcl_UniChar *uniStr, /* Unicode String. */ const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - Tcl_UniChar ch1 = 0, p; + int ch1 = 0, p; + + while (1) { + p = *uniPattern; + + /* + * See if we're at the end of both the pattern and the string. If so, + * we succeeded. If we're at the end of the pattern but not at the end + * of the string, we failed. + */ + + if (p == 0) { + return (*uniStr == 0); + } + if ((*uniStr == 0) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern + */ + + while (*(++uniPattern) == '*') { + /* empty body */ + } + p = *uniPattern; + if (p == 0) { + return 1; + } + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while (*uniStr && (p != *uniStr) + && (p != Tcl_UniCharToLower(*uniStr))) { + uniStr++; + } + } else { + while (*uniStr && (p != *uniStr)) { + uniStr++; + } + } + } + if (TclUniCharCaseMatch(uniStr, uniPattern, nocase)) { + return 1; + } + if (*uniStr == 0) { + return 0; + } + uniStr++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches any + * single character. + */ + + if (p == '?') { + uniPattern++; + uniStr++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed by a + * list of characters that are acceptable, or by a range (two + * characters separated by "-"). + */ + + if (p == '[') { + int startChar, endChar; + + uniPattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); + uniStr++; + while (1) { + if ((*uniPattern == ']') || (*uniPattern == 0)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (*uniPattern == '-') { + uniPattern++; + if (*uniPattern == 0) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*uniPattern != ']') { + if (*uniPattern == 0) { + uniPattern--; + break; + } + uniPattern++; + } + uniPattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' so we + * do exact matching on the character that follows. + */ + + if (p == '\\') { + if (*(++uniPattern) == '\0') { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next bytes of + * each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*uniStr) != + Tcl_UniCharToLower(*uniPattern)) { + return 0; + } + } else if (*uniStr != *uniPattern) { + return 0; + } + uniStr++; + uniPattern++; + } +} + +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +int +Tcl_UniCharCaseMatch( + const unsigned short *uniStr, /* Unicode String. */ + const unsigned short *uniPattern, + /* Pattern, which may contain special + * characters. */ + int nocase) /* 0 for case sensitive, 1 for insensitive */ +{ + unsigned short ch1 = 0, p; while (1) { p = *uniPattern; @@ -2385,7 +2681,7 @@ Tcl_UniCharCaseMatch( */ if (p == '[') { - Tcl_UniChar startChar, endChar; + unsigned short startChar, endChar; uniPattern++; ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); @@ -2455,7 +2751,9 @@ Tcl_UniCharCaseMatch( uniPattern++; } } +#endif + /* *---------------------------------------------------------------------- * @@ -2680,7 +2978,7 @@ TclUtfToUCS4( int *ucs4Ptr) /* Filled with the UCS4 codepoint represented * by the UTF-8 string. */ { - /* Make use of the #undef Tcl_UtfToUniChar above, which already handles UCS4. */ +# undef Tcl_UtfToUniChar return Tcl_UtfToUniChar(src, ucs4Ptr); } |