diff options
author | hobbs <hobbs> | 2000-05-08 21:59:58 (GMT) |
---|---|---|
committer | hobbs <hobbs> | 2000-05-08 21:59:58 (GMT) |
commit | 09f4c1de476f86324d54f2e8c31a66870ce1c8bc (patch) | |
tree | 025da577bdce141098365ffb242ca0ae0be52104 /generic/tclUtf.c | |
parent | 63adaf2eb6d8949c310ea3f93c699ed6dd1c8839 (diff) | |
download | tcl-09f4c1de476f86324d54f2e8c31a66870ce1c8bc.zip tcl-09f4c1de476f86324d54f2e8c31a66870ce1c8bc.tar.gz tcl-09f4c1de476f86324d54f2e8c31a66870ce1c8bc.tar.bz2 |
* doc/Utf.3:
* generic/tclStubInit.c:
* generic/tcl.decls:
* generic/tclDecls.h:
* generic/tclUtf.c: Added new functions Tcl_UniCharNcasecmp and
Tcl_UniCharCaseMatch (unicode parallel to Tcl_StringCaseMatch)
* generic/tclUtil.c: rewrote Tcl_StringCaseMatch algorithm for
optimization and made Tcl_StringMatch just call Tcl_StringCaseMatch
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 217 |
1 files changed, 215 insertions, 2 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 5fe3c41..b62a26c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtf.c,v 1.11 2000/01/11 22:09:00 hobbs Exp $ + * RCS: @(#) $Id: tclUtf.c,v 1.12 2000/05/08 21:59:58 hobbs Exp $ */ #include "tclInt.h" @@ -1301,7 +1301,43 @@ Tcl_UniCharNcmp(cs, ct, n) { for ( ; n != 0; n--, cs++, ct++) { if (*cs != *ct) { - return *cs - *ct; + return (*cs - *ct); + } + if (*cs == '\0') { + break; + } + } + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharNcasecmp -- + * + * Compare at most n unichars of string cs to string ct case + * insensitive. Both cs and ct are assumed to be at least n + * unichars long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharNcasecmp(cs, ct, n) + CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ + CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ + unsigned long n; /* Number of unichars to compare. */ +{ + for ( ; n != 0; n--, cs++, ct++) { + if ((*cs != *ct) && + (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) { + return (*cs - *ct); } if (*cs == '\0') { break; @@ -1584,3 +1620,180 @@ Tcl_UniCharIsWordChar(ch) return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharCaseMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. Thie is the Unicode equivalent of + * the char* Tcl_StringCaseMatch. + * + * Results: + * The return value is 1 if string matches pattern, and + * 0 otherwise. The matching operation permits the following + * special characters in the pattern: *?\[] (see the manual + * entry for details on what these mean). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharCaseMatch(string, pattern, nocase) + CONST Tcl_UniChar *string; /* Unicode String. */ + CONST Tcl_UniChar *pattern; /* Pattern, which may contain special + * characters. */ + int nocase; /* 0 for case sensitive, 1 for insensitive */ +{ + Tcl_UniChar ch1, p; + + while (1) { + p = *pattern; + + /* + * See if we're at the end of both the pattern and the string. If + * so, we succeeded. If we're at the end of the pattern but not at + * the end of the string, we failed. + */ + + if (p == 0) { + return (*string == 0); + } + if ((*string == 0) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + int pSpecial; + /* + * Skip all successive *'s in the pattern + */ + while (*(++pattern) == '*') {} + p = *pattern; + if (p == 0) { + return 1; + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while (*string && (p != *string) + && (p != Tcl_UniCharToLower(*string))) { + string++; + } + } else { + while (*string && (p != *string)) { string++; } + } + } + if (Tcl_UniCharCaseMatch(string, pattern, nocase)) { + return 1; + } + if (*string == 0) { + return 0; + } + string++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches + * any single character. + */ + + if (p == '?') { + pattern++; + string++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed + * by a list of characters that are acceptable, or by a range + * (two characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar startChar, endChar; + + pattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); + string++; + while (1) { + if ((*pattern == ']') || (*pattern == 0)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); + pattern++; + if (*pattern == '-') { + pattern++; + if (*pattern == 0) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*pattern) + : *pattern); + pattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*pattern != ']') { + if (*pattern == 0) { + pattern--; + break; + } + pattern++; + } + pattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' + * so we do exact matching on the character that follows. + */ + + if (p == '\\') { + if (*(++pattern) == '\0') { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next + * bytes of each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { + return 0; + } + } else if (*string != *pattern) { + return 0; + } + string++; + pattern++; + } +} |