diff options
author | hobbs <hobbs> | 2003-02-18 02:25:41 (GMT) |
---|---|---|
committer | hobbs <hobbs> | 2003-02-18 02:25:41 (GMT) |
commit | 4ab5c4158044ba81cf3aa93c71d446fddd9c7ca5 (patch) | |
tree | 4897dca0fb72c73e8c8bee4e5d6b292195b07662 /generic/tclUtf.c | |
parent | d86b1af8bec78fdbcc8bf65bc205fd287e19fd5d (diff) | |
download | tcl-4ab5c4158044ba81cf3aa93c71d446fddd9c7ca5.zip tcl-4ab5c4158044ba81cf3aa93c71d446fddd9c7ca5.tar.gz tcl-4ab5c4158044ba81cf3aa93c71d446fddd9c7ca5.tar.bz2 |
* generic/tclExecute.c (TclExecuteByteCode INST_STR_MATCH):
* generic/tclCmdMZ.c (Tcl_StringObjCmd STR_MATCH):
* generic/tclUtf.c (TclUniCharMatch):
* generic/tclInt.decls: add private TclUniCharMatch function that
* generic/tclIntDecls.h: does string match on counted unicode
* generic/tclStubInit.c: strings. Tcl_UniCharCaseMatch has the
* tests/string.test: failing that it can't handle strings or
* tests/stringComp.test: patterns with embedded NULLs. Added
tests that actually try strings/pats with NULLs. TclUniCharMatch
should be TIPed and made public in the next minor version rev.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 195 |
1 files changed, 193 insertions, 2 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 72a23ca..6c6835c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtf.c,v 1.29 2002/11/12 02:26:29 hobbs Exp $ + * RCS: @(#) $Id: tclUtf.c,v 1.30 2003/02/18 02:25:45 hobbs Exp $ */ #include "tclInt.h" @@ -1584,7 +1584,10 @@ Tcl_UniCharIsWordChar(ch) * * See if a particular Unicode string matches a particular pattern. * Allows case insensitivity. This is the Unicode equivalent of - * the char* Tcl_StringCaseMatch. + * the char* Tcl_StringCaseMatch. The UniChar strings must be + * NULL-terminated. This has no provision for counted UniChar + * strings, thus should not be used where NULLs are expected in the + * UniChar string. Use TclUniCharMatch where possible. * * Results: * The return value is 1 if string matches pattern, and @@ -1755,3 +1758,191 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) pattern++; } } + +/* + *---------------------------------------------------------------------- + * + * TclUniCharMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. This is the Unicode equivalent of the + * char* Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch + * uses counted Strings, so embedded NULLs are allowed. + * + * Results: + * The return value is 1 if string matches pattern, and + * 0 otherwise. The matching operation permits the following + * special characters in the pattern: *?\[] (see the manual + * entry for details on what these mean). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) + CONST Tcl_UniChar *string; /* Unicode String. */ + int strLen; /* length of String */ + CONST Tcl_UniChar *pattern; /* Pattern, which may contain special + * characters. */ + int ptnLen; /* length of Pattern */ + int nocase; /* 0 for case sensitive, 1 for insensitive */ +{ + CONST Tcl_UniChar *stringEnd, *patternEnd; + Tcl_UniChar p; + + stringEnd = string + strLen; + patternEnd = pattern + ptnLen; + + while (1) { + /* + * See if we're at the end of both the pattern and the string. If + * so, we succeeded. If we're at the end of the pattern but not at + * the end of the string, we failed. + */ + + if (pattern == patternEnd) { + return (string == stringEnd); + } + p = *pattern; + if ((string == stringEnd) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern + */ + while (*(++pattern) == '*') {} + if (pattern == patternEnd) { + return 1; + } + p = *pattern; + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while ((string < stringEnd) && (p != *string) + && (p != Tcl_UniCharToLower(*string))) { + string++; + } + } else { + while ((string < stringEnd) && (p != *string)) { + string++; + } + } + } + if (TclUniCharMatch(string, stringEnd - string, + pattern, patternEnd - pattern, nocase)) { + return 1; + } + if (string == stringEnd) { + return 0; + } + string++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches + * any single character. + */ + + if (p == '?') { + pattern++; + string++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed + * by a list of characters that are acceptable, or by a range + * (two characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar ch1, startChar, endChar; + + pattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); + string++; + while (1) { + if ((*pattern == ']') || (pattern == patternEnd)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); + pattern++; + if (*pattern == '-') { + pattern++; + if (pattern == patternEnd) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*pattern) + : *pattern); + pattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*pattern != ']') { + if (pattern == patternEnd) { + pattern--; + break; + } + pattern++; + } + pattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' + * so we do exact matching on the character that follows. + */ + + if (p == '\\') { + if (++pattern == patternEnd) { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next + * bytes of each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { + return 0; + } + } else if (*string != *pattern) { + return 0; + } + string++; + pattern++; + } +} |