Streamline Tcl_StringCaseMatch with a switch statement and comments. BEHAVIOUR CHANGE: incomplete (unterminated) bracket groups now match nothing

author: aspect <aspect+tclcore@abstracted-spleen.org> 2017-05-06 00:36:49 (GMT)
committer: aspect <aspect+tclcore@abstracted-spleen.org> 2017-05-06 00:36:49 (GMT)
commit: e12f29cce2b1255cb3a6e788b67148d415a13f06 (patch)
tree: fdbc52dc37e410842489414695a4245630a43133
parent: a291109ed2c14d7a263b8b8132e2688266223c97 (diff)
download: tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.zip
tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.tar.gz
tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.tar.bz2
1 files changed, 117 insertions, 161 deletions
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 34d4be2..17a796e 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -1920,206 +1920,162 @@ Tcl_StringCaseMatch(
 {
     int p, charLen;
     CONST char *pstart = pattern;
-    Tcl_UniChar ch1, ch2;
+    Tcl_UniChar pch, sch;
+    Tcl_UniChar startChar, endChar;
 
     while (1) {
-	p = *pattern;
-
-	/*
-	 * See if we're at the end of both the pattern and the string. If so,
-	 * we succeeded. If we're at the end of the pattern but not at the end
-	 * of the string, we failed.
-	 */
+	switch (*pattern) {
+	    case '\0':
+		return (*str == '\0');
 
-	if (p == '\0') {
-	    return (*str == '\0');
-	}
-	if ((*str == '\0') && (p != '*')) {
-	    return 0;
-	}
-
-	/*
-	 * Check for a "*" as the next pattern character. It matches any
-	 * substring. We handle this by calling ourselves recursively for each
-	 * postfix of string, until either we match or we reach the end of the
-	 * string.
-	 */
-
-	if (p == '*') {
-	    /*
-	     * Skip all successive *'s in the pattern
-	     */
+	    case '?':
+		if (*str == '\0') {
+		    return 0;
+		} else {
+		    str += TclUtfToUniChar(str, &sch);
+		}
+		++pattern;
+		break;
 
-	    while (*(++pattern) == '*') {}
-	    p = *pattern;
-	    if (p == '\0') {
-		return 1;
-	    }
+	    case '*':
+		/* skip runs of *** */
+		while (*(++pattern) == '*') {}
 
-	    /*
-	     * This is a special case optimization for single-byte utf.
-	     */
+		/* if end of pattern, we have a match */
+		if (*pattern == '\0') {
+		    return 1;
+		}
 
-	    if (UCHAR(*pattern) < 0x80) {
-		ch2 = (Tcl_UniChar)
+		/* peek at the next pattern char */
+		if (UCHAR(*pattern) < 0x80) {
+		    pch = (Tcl_UniChar)
 			(nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
-	    } else {
-		Tcl_UtfToUniChar(pattern, &ch2);
-		if (nocase) {
-		    ch2 = Tcl_UniCharToLower(ch2);
+		} else {
+		    TclUtfToUniChar(pattern, &pch);
+		    if (nocase) {
+			pch = Tcl_UniCharToLower(pch);
+		    }
 		}
-	    }
 
-	    while (1) {
-		/*
-		 * Optimization for matching - cruise through the string
-		 * quickly if the next char in the pattern isn't a special
-		 * character
-		 */
-
-		if ((p != '[') && (p != '?') && (p != '\\')) {
-		    if (nocase) {
+		/* if the next char in pattern is a literal, zoom through str to the next match */
+		switch (*pattern) {
+		    case '[': case '?': case '\\':
+			break;
+		    default:
 			while (*str) {
-			    charLen = TclUtfToUniChar(str, &ch1);
-			    if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) {
+			    charLen = TclUtfToUniChar(str, &sch);
+			    if ( (pch == sch)
+				    || (nocase && (pch == Tcl_UniCharToLower(sch)))) {
 				break;
 			    }
 			    str += charLen;
 			}
-		    } else {
-			/*
-			 * There's no point in trying to make this code
-			 * shorter, as the number of bytes you want to compare
-			 * each time is non-constant.
-			 */
+		}
 
-			while (*str) {
-			    charLen = TclUtfToUniChar(str, &ch1);
-			    if (ch2 == ch1) {
-				break;
-			    }
-			    str += charLen;
-			}
+		while (*str != '\0') {
+		    /* recursion! */
+		    if (Tcl_StringCaseMatch(str, pattern, nocase)) {
+			return 1;
 		    }
+		    str += TclUtfToUniChar(str, &sch);
 		}
-		if (Tcl_StringCaseMatch(str, pattern, nocase)) {
-		    return 1;
-		}
-		if (*str == '\0') {
-		    return 0;
-		}
-		str += TclUtfToUniChar(str, &ch1);
-	    }
-	}
-
-	/*
-	 * Check for a "?" as the next pattern character. It matches any
-	 * single character.
-	 */
-
-	if (p == '?') {
-	    pattern++;
-	    str += TclUtfToUniChar(str, &ch1);
-	    continue;
-	}
-
-	/*
-	 * Check for a "[" as the next pattern character. It is followed by a
-	 * list of characters that are acceptable, or by a range (two
-	 * characters separated by "-").
-	 */
+		break;
 
-	if (p == '[') {
-	    Tcl_UniChar startChar, endChar;
+	    case '[':
+		++pattern;
 
-	    pattern++;
-	    if (UCHAR(*str) < 0x80) {
-		ch1 = (Tcl_UniChar)
+		if(UCHAR(*str) < 0x80) {
+		    sch = (Tcl_UniChar)
 			(nocase ? tolower(UCHAR(*str)) : UCHAR(*str));
-		str++;
-	    } else {
-		str += Tcl_UtfToUniChar(str, &ch1);
-		if (nocase) {
-		    ch1 = Tcl_UniCharToLower(ch1);
-		}
-	    }
-	    while (1) {
-		if ((*pattern == ']') || (*pattern == '\0')) {
-		    return 0;
-		}
-		if (UCHAR(*pattern) < 0x80) {
-		    startChar = (Tcl_UniChar) (nocase
-			    ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
-		    pattern++;
+		    ++str;
 		} else {
-		    pattern += Tcl_UtfToUniChar(pattern, &startChar);
+		    str += TclUtfToUniChar(str, &sch);
 		    if (nocase) {
-			startChar = Tcl_UniCharToLower(startChar);
+			sch = Tcl_UniCharToLower(sch);
 		    }
 		}
-		if (*pattern == '-') {
-		    pattern++;
+
+		while (1) {
+		    if (*pattern == ']') {
+			/* end of range */
+			return 0;
+		    }
 		    if (*pattern == '\0') {
+			/* illegal pattern */
+			// WAS: break;
 			return 0;
 		    }
 		    if (UCHAR(*pattern) < 0x80) {
-			endChar = (Tcl_UniChar) (nocase
-				? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
-			pattern++;
+			pch = (Tcl_UniChar)
+			    (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
+			++pattern;
 		    } else {
-			pattern += Tcl_UtfToUniChar(pattern, &endChar);
+			pattern += TclUtfToUniChar(pattern, &pch);
 			if (nocase) {
-			    endChar = Tcl_UniCharToLower(endChar);
+			    pch = Tcl_UniCharToLower(pch);
 			}
 		    }
-		    if (((startChar <= ch1) && (ch1 <= endChar))
-			    || ((endChar <= ch1) && (ch1 <= startChar))) {
-			/*
-			 * Matches ranges of form [a-z] or [z-a].
-			 */
-
+		    startChar = pch;
+		    if (*pattern == '-') {
+			++pattern;
+			if (*pattern == '\0') {
+			    /* illegal pattern */
+			    return 0;
+			}
+			if (UCHAR(*pattern) < 0x80) {
+			    pch = (Tcl_UniChar)
+				(nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
+			    ++pattern;
+			} else {
+			    pattern += TclUtfToUniChar(pattern, &pch);
+			    if (nocase) {
+				pch = Tcl_UniCharToLower(pch);
+			    }
+			}
+			endChar = pch;
+			if (((startChar <= sch) && (sch <= endChar))
+				|| ((endChar <= sch) && (sch <= startChar))) {
+			    /* matches ranges of form [a-z] or [z-a] */
+			    break;
+			}
+			/* otherwise, process the rest of the [] */
+			continue;
+		    } else if (startChar == sch) {
 			break;
 		    }
-		} else if (startChar == ch1) {
-		    break;
 		}
-	    }
-	    while (*pattern != ']') {
-		if (*pattern == '\0') {
-		    pattern = Tcl_UtfPrev(pattern, pstart);
-		    break;
+		/* if we've matched in [], *pattern is still inside the brackets */
+		while (*pattern != ']') {
+		    if (*pattern == '\0') {
+			/* illegal pattern */
+			// WAS: --pattern; break;
+			return 0;
+		    }
+		    pattern++;
 		}
 		pattern++;
-	    }
-	    pattern++;
-	    continue;
-	}
-
-	/*
-	 * If the next pattern character is '\', just strip off the '\' so we
-	 * do exact matching on the character that follows.
-	 */
-
-	if (p == '\\') {
-	    pattern++;
-	    if (*pattern == '\0') {
-		return 0;
-	    }
-	}
+		break;
 
-	/*
-	 * There's no special character. Just make sure that the next bytes of
-	 * each string match.
-	 */
+	    case '\\':
+		++pattern;
+		if (*pattern == '\0') {
+		    /* illegal pattern */
+		    return 0;
+		}
+		// fall through to literal match
 
-	str += TclUtfToUniChar(str, &ch1);
-	pattern += TclUtfToUniChar(pattern, &ch2);
-	if (nocase) {
-	    if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) {
-		return 0;
-	    }
-	} else if (ch1 != ch2) {
-	    return 0;
+	    default:
+		/* literal match */
+		str += TclUtfToUniChar(str, &sch);
+		pattern += TclUtfToUniChar(pattern, &pch);
+		if (nocase) {
+		    if (Tcl_UniCharToLower(sch) != Tcl_UniCharToLower(pch)) {
+			return 0;
+		    }
+		} else if (sch != pch) {
+		    return 0;
+		}
+		break;
 	}
     }
 }
author	aspect <aspect+tclcore@abstracted-spleen.org>	2017-05-06 00:36:49 (GMT)
committer	aspect <aspect+tclcore@abstracted-spleen.org>	2017-05-06 00:36:49 (GMT)
commit	e12f29cce2b1255cb3a6e788b67148d415a13f06 (patch)
tree	fdbc52dc37e410842489414695a4245630a43133
parent	a291109ed2c14d7a263b8b8132e2688266223c97 (diff)
download	tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.zip tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.tar.gz tcl-e12f29cce2b1255cb3a6e788b67148d415a13f06.tar.bz2