diff options
author | aspect <aspect+tclcore@abstracted-spleen.org> | 2017-05-06 00:36:49 (GMT) |
---|---|---|
committer | aspect <aspect+tclcore@abstracted-spleen.org> | 2017-05-06 00:36:49 (GMT) |
commit | e12f29cce2b1255cb3a6e788b67148d415a13f06 (patch) | |
tree | fdbc52dc37e410842489414695a4245630a43133 | |
parent | a291109ed2c14d7a263b8b8132e2688266223c97 (diff) | |
download | tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.zip tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.tar.gz tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.tar.bz2 |
Streamline Tcl_StringCaseMatch with a switch statement and comments. BEHAVIOUR CHANGE: incomplete bracket groups (a `[` with no closing `]`) now match nothing.
-rw-r--r-- | generic/tclUtil.c | 278 |
1 files changed, 117 insertions, 161 deletions
diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 34d4be2..17a796e 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1920,206 +1920,162 @@ Tcl_StringCaseMatch( { int p, charLen; CONST char *pstart = pattern; - Tcl_UniChar ch1, ch2; + Tcl_UniChar pch, sch; + Tcl_UniChar startChar, endChar; while (1) { - p = *pattern; - - /* - * See if we're at the end of both the pattern and the string. If so, - * we succeeded. If we're at the end of the pattern but not at the end - * of the string, we failed. - */ + switch (*pattern) { + case '\0': + return (*str == '\0'); - if (p == '\0') { - return (*str == '\0'); - } - if ((*str == '\0') && (p != '*')) { - return 0; - } - - /* - * Check for a "*" as the next pattern character. It matches any - * substring. We handle this by calling ourselves recursively for each - * postfix of string, until either we match or we reach the end of the - * string. - */ - - if (p == '*') { - /* - * Skip all successive *'s in the pattern - */ + case '?': + if (*str == '\0') { + return 0; + } else { + str += TclUtfToUniChar(str, &sch); + } + ++pattern; + break; - while (*(++pattern) == '*') {} - p = *pattern; - if (p == '\0') { - return 1; - } + case '*': + /* skip runs of *** */ + while (*(++pattern) == '*') {} - /* - * This is a special case optimization for single-byte utf. - */ + /* if end of pattern, we have a match */ + if (*pattern == '\0') { + return 1; + } - if (UCHAR(*pattern) < 0x80) { - ch2 = (Tcl_UniChar) + /* peek at the next pattern char */ + if (UCHAR(*pattern) < 0x80) { + pch = (Tcl_UniChar) (nocase ? 
tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - } else { - Tcl_UtfToUniChar(pattern, &ch2); - if (nocase) { - ch2 = Tcl_UniCharToLower(ch2); + } else { + TclUtfToUniChar(pattern, &pch); + if (nocase) { + pch = Tcl_UniCharToLower(pch); + } } - } - while (1) { - /* - * Optimization for matching - cruise through the string - * quickly if the next char in the pattern isn't a special - * character - */ - - if ((p != '[') && (p != '?') && (p != '\\')) { - if (nocase) { + /* if the next char in pattern is a literal, zoom through str to the next match */ + switch (*pattern) { + case '[': case '?': case '\\': + break; + default: while (*str) { - charLen = TclUtfToUniChar(str, &ch1); - if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) { + charLen = TclUtfToUniChar(str, &sch); + if ( (pch == sch) + || (nocase && (pch == Tcl_UniCharToLower(sch)))) { break; } str += charLen; } - } else { - /* - * There's no point in trying to make this code - * shorter, as the number of bytes you want to compare - * each time is non-constant. - */ + } - while (*str) { - charLen = TclUtfToUniChar(str, &ch1); - if (ch2 == ch1) { - break; - } - str += charLen; - } + while (*str != '\0') { + /* recursion! */ + if (Tcl_StringCaseMatch(str, pattern, nocase)) { + return 1; } + str += TclUtfToUniChar(str, &sch); } - if (Tcl_StringCaseMatch(str, pattern, nocase)) { - return 1; - } - if (*str == '\0') { - return 0; - } - str += TclUtfToUniChar(str, &ch1); - } - } - - /* - * Check for a "?" as the next pattern character. It matches any - * single character. - */ - - if (p == '?') { - pattern++; - str += TclUtfToUniChar(str, &ch1); - continue; - } - - /* - * Check for a "[" as the next pattern character. It is followed by a - * list of characters that are acceptable, or by a range (two - * characters separated by "-"). 
- */ + break; - if (p == '[') { - Tcl_UniChar startChar, endChar; + case '[': + ++pattern; - pattern++; - if (UCHAR(*str) < 0x80) { - ch1 = (Tcl_UniChar) + if(UCHAR(*str) < 0x80) { + sch = (Tcl_UniChar) (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); - str++; - } else { - str += Tcl_UtfToUniChar(str, &ch1); - if (nocase) { - ch1 = Tcl_UniCharToLower(ch1); - } - } - while (1) { - if ((*pattern == ']') || (*pattern == '\0')) { - return 0; - } - if (UCHAR(*pattern) < 0x80) { - startChar = (Tcl_UniChar) (nocase - ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - pattern++; + ++str; } else { - pattern += Tcl_UtfToUniChar(pattern, &startChar); + str += TclUtfToUniChar(str, &sch); if (nocase) { - startChar = Tcl_UniCharToLower(startChar); + sch = Tcl_UniCharToLower(sch); } } - if (*pattern == '-') { - pattern++; + + while (1) { + if (*pattern == ']') { + /* end of range */ + return 0; + } if (*pattern == '\0') { + /* illegal pattern */ + // WAS: break; return 0; } if (UCHAR(*pattern) < 0x80) { - endChar = (Tcl_UniChar) (nocase - ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - pattern++; + pch = (Tcl_UniChar) + (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + ++pattern; } else { - pattern += Tcl_UtfToUniChar(pattern, &endChar); + pattern += TclUtfToUniChar(pattern, &pch); if (nocase) { - endChar = Tcl_UniCharToLower(endChar); + pch = Tcl_UniCharToLower(pch); } } - if (((startChar <= ch1) && (ch1 <= endChar)) - || ((endChar <= ch1) && (ch1 <= startChar))) { - /* - * Matches ranges of form [a-z] or [z-a]. - */ - + startChar = pch; + if (*pattern == '-') { + ++pattern; + if (*pattern == '\0') { + /* illegal pattern */ + return 0; + } + if (UCHAR(*pattern) < 0x80) { + pch = (Tcl_UniChar) + (nocase ? 
tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + ++pattern; + } else { + pattern += TclUtfToUniChar(pattern, &pch); + if (nocase) { + pch = Tcl_UniCharToLower(pch); + } + } + endChar = pch; + if (((startChar <= sch) && (sch <= endChar)) + || ((endChar <= sch) && (sch <= startChar))) { + /* matches ranges of form [a-z] or [z-a] */ + break; + } + /* otherwise, process the rest of the [] */ + continue; + } else if (startChar == sch) { break; } - } else if (startChar == ch1) { - break; } - } - while (*pattern != ']') { - if (*pattern == '\0') { - pattern = Tcl_UtfPrev(pattern, pstart); - break; + /* if we've matched in [], *pattern is still inside the brackets */ + while (*pattern != ']') { + if (*pattern == '\0') { + /* illegal pattern */ + // WAS: --pattern; break; + return 0; + } + pattern++; } pattern++; - } - pattern++; - continue; - } - - /* - * If the next pattern character is '\', just strip off the '\' so we - * do exact matching on the character that follows. - */ - - if (p == '\\') { - pattern++; - if (*pattern == '\0') { - return 0; - } - } + break; - /* - * There's no special character. Just make sure that the next bytes of - * each string match. - */ + case '\\': + ++pattern; + if (*pattern == '\0') { + /* illegal pattern */ + return 0; + } + // fall through to literal match - str += TclUtfToUniChar(str, &ch1); - pattern += TclUtfToUniChar(pattern, &ch2); - if (nocase) { - if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) { - return 0; - } - } else if (ch1 != ch2) { - return 0; + default: + /* literal match */ + str += TclUtfToUniChar(str, &sch); + pattern += TclUtfToUniChar(pattern, &pch); + if (nocase) { + if (Tcl_UniCharToLower(sch) != Tcl_UniCharToLower(pch)) { + return 0; + } + } else if (sch != pch) { + return 0; + } + break; } } } |