From 4ab5c4158044ba81cf3aa93c71d446fddd9c7ca5 Mon Sep 17 00:00:00 2001 From: hobbs Date: Tue, 18 Feb 2003 02:25:41 +0000 Subject: * generic/tclExecute.c (TclExecuteByteCode INST_STR_MATCH): * generic/tclCmdMZ.c (Tcl_StringObjCmd STR_MATCH): * generic/tclUtf.c (TclUniCharMatch): * generic/tclInt.decls: add private TclUniCharMatch function that * generic/tclIntDecls.h: does string match on counted unicode * generic/tclStubInit.c: strings. Tcl_UniCharCaseMatch has the * tests/string.test: failing that it can't handle strings or * tests/stringComp.test: patterns with embedded NULLs. Added tests that actually try strings/pats with NULLs. TclUniCharMatch should be TIPed and made public in the next minor version rev. --- ChangeLog | 13 ++++ generic/tclCmdMZ.c | 11 +-- generic/tclExecute.c | 12 +++- generic/tclInt.decls | 9 ++- generic/tclIntDecls.h | 12 +++- generic/tclStubInit.c | 3 +- generic/tclUtf.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++++- tests/string.test | 37 +++++++++- tests/stringComp.test | 48 ++++++++++++- 9 files changed, 324 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 447cbba..8a9a233 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2003-02-17 Jeff Hobbs + + * generic/tclExecute.c (TclExecuteByteCode INST_STR_MATCH): + * generic/tclCmdMZ.c (Tcl_StringObjCmd STR_MATCH): + * generic/tclUtf.c (TclUniCharMatch): + * generic/tclInt.decls: add private TclUniCharMatch function that + * generic/tclIntDecls.h: does string match on counted unicode + * generic/tclStubInit.c: strings. Tcl_UniCharCaseMatch has the + * tests/string.test: failing that it can't handle strings or + * tests/stringComp.test: patterns with embedded NULLs. Added + tests that actually try strings/pats with NULLs. TclUniCharMatch + should be TIPed and made public in the next minor version rev. + 2003-02-17 Miguel Sofer * generic/tclBasic.c (TclRenameCommand): 'oldFullName' object was diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d3deaae..2339965 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -14,7 +14,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.80 2003/01/17 14:19:44 vincentdarley Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.81 2003/02/18 02:25:43 hobbs Exp $ */ #include "tclInt.h" @@ -2008,6 +2008,7 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) break; } case STR_MATCH: { + Tcl_UniChar *ustring1, *ustring2; int nocase = 0; if (objc < 4 || objc > 5) { @@ -2027,10 +2028,10 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) return TCL_ERROR; } } - - Tcl_SetBooleanObj(resultPtr, - Tcl_UniCharCaseMatch(Tcl_GetUnicode(objv[objc-1]), - Tcl_GetUnicode(objv[objc-2]), nocase)); + ustring1 = Tcl_GetUnicodeFromObj(objv[objc-1], &length1); + ustring2 = Tcl_GetUnicodeFromObj(objv[objc-2], &length2); + Tcl_SetBooleanObj(resultPtr, TclUniCharMatch(ustring1, length1, + ustring2, length2, nocase)); break; } case STR_RANGE: { diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 93bf3a9..3333959 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -11,7 +11,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclExecute.c,v 1.92 2003/02/06 22:44:57 mdejong Exp $ + * RCS: @(#) $Id: tclExecute.c,v 1.93 2003/02/18 02:25:44 hobbs Exp $ */ #include "tclInt.h" @@ -2650,10 +2650,16 @@ TclExecuteByteCode(interp, codePtr) * Check that at least one of the objects is Unicode before * promoting both. */ + if ((valuePtr->typePtr == &tclStringType) || (value2Ptr->typePtr == &tclStringType)) { - match = Tcl_UniCharCaseMatch(Tcl_GetUnicode(valuePtr), - Tcl_GetUnicode(value2Ptr), nocase); + Tcl_UniChar *ustring1, *ustring2; + int length1, length2; + + ustring1 = Tcl_GetUnicodeFromObj(valuePtr, &length1); + ustring2 = Tcl_GetUnicodeFromObj(value2Ptr, &length2); + match = TclUniCharMatch(ustring1, length1, ustring2, length2, + nocase); } else { match = Tcl_StringCaseMatch(TclGetString(valuePtr), TclGetString(value2Ptr), nocase); diff --git a/generic/tclInt.decls b/generic/tclInt.decls index f2b2f99..8a68f70 100644 --- a/generic/tclInt.decls +++ b/generic/tclInt.decls @@ -12,7 +12,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: tclInt.decls,v 1.58 2002/12/06 23:22:59 hobbs Exp $ +# RCS: @(#) $Id: tclInt.decls,v 1.59 2003/02/18 02:25:45 hobbs Exp $ library tcl @@ -690,6 +690,13 @@ declare 172 generic { int TclInThreadExit(void) } +# added for 8.4.2 + +declare 173 generic { + int TclUniCharMatch (CONST Tcl_UniChar *string, int strLen, \ + CONST Tcl_UniChar *pattern, int ptnLen, int nocase) +} + ############################################################################## # Define the platform specific internal Tcl interface. These functions are diff --git a/generic/tclIntDecls.h b/generic/tclIntDecls.h index 5c23144..e41dca6 100644 --- a/generic/tclIntDecls.h +++ b/generic/tclIntDecls.h @@ -11,7 +11,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclIntDecls.h,v 1.48 2002/11/07 02:13:36 mdejong Exp $ + * RCS: @(#) $Id: tclIntDecls.h,v 1.49 2003/02/18 02:25:45 hobbs Exp $ */ #ifndef _TCLINTDECLS @@ -498,6 +498,11 @@ EXTERN int TclCheckExecutionTraces _ANSI_ARGS_(( Tcl_Obj *CONST objv[])); /* 172 */ EXTERN int TclInThreadExit _ANSI_ARGS_((void)); +/* 173 */ +EXTERN int TclUniCharMatch _ANSI_ARGS_(( + CONST Tcl_UniChar * string, int strLen, + CONST Tcl_UniChar * pattern, int ptnLen, + int nocase)); typedef struct TclIntStubs { int magic; @@ -700,6 +705,7 @@ typedef struct TclIntStubs { int (*tclCheckInterpTraces) _ANSI_ARGS_((Tcl_Interp * interp, CONST char * command, int numChars, Command * cmdPtr, int result, int traceFlags, int objc, Tcl_Obj *CONST objv[])); /* 170 */ int (*tclCheckExecutionTraces) _ANSI_ARGS_((Tcl_Interp * interp, CONST char * command, int numChars, Command * cmdPtr, int result, int traceFlags, int objc, Tcl_Obj *CONST objv[])); /* 171 */ int (*tclInThreadExit) _ANSI_ARGS_((void)); /* 172 */ + int (*tclUniCharMatch) _ANSI_ARGS_((CONST Tcl_UniChar * string, int strLen, CONST Tcl_UniChar * pattern, int ptnLen, int nocase)); /* 173 */ } TclIntStubs; #ifdef __cplusplus @@ -1306,6 +1312,10 @@ extern TclIntStubs *tclIntStubsPtr; #define TclInThreadExit \ (tclIntStubsPtr->tclInThreadExit) /* 172 */ #endif +#ifndef TclUniCharMatch +#define TclUniCharMatch \ + (tclIntStubsPtr->tclUniCharMatch) /* 173 */ +#endif #endif /* defined(USE_TCL_STUBS) && !defined(USE_TCL_STUB_PROCS) */ diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index c25f053..f303e0a 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclStubInit.c,v 1.78 2002/12/06 23:22:59 hobbs Exp $ + * RCS: @(#) $Id: tclStubInit.c,v 1.79 2003/02/18 02:25:45 hobbs Exp $ */ #include "tclInt.h" @@ -244,6 +244,7 @@ TclIntStubs tclIntStubs = { TclCheckInterpTraces, /* 170 */ TclCheckExecutionTraces, /* 171 */ TclInThreadExit, /* 172 */ + TclUniCharMatch, /* 173 */ }; TclIntPlatStubs tclIntPlatStubs = { diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 72a23ca..6c6835c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtf.c,v 1.29 2002/11/12 02:26:29 hobbs Exp $ + * RCS: @(#) $Id: tclUtf.c,v 1.30 2003/02/18 02:25:45 hobbs Exp $ */ #include "tclInt.h" @@ -1584,7 +1584,10 @@ Tcl_UniCharIsWordChar(ch) * * See if a particular Unicode string matches a particular pattern. * Allows case insensitivity. This is the Unicode equivalent of - * the char* Tcl_StringCaseMatch. + * the char* Tcl_StringCaseMatch. The UniChar strings must be + * NULL-terminated. This has no provision for counted UniChar + * strings, thus should not be used where NULLs are expected in the + * UniChar string. Use TclUniCharMatch where possible. * * Results: * The return value is 1 if string matches pattern, and @@ -1755,3 +1758,191 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) pattern++; } } + +/* + *---------------------------------------------------------------------- + * + * TclUniCharMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. This is the Unicode equivalent of the + * char* Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch + * uses counted Strings, so embedded NULLs are allowed. + * + * Results: + * The return value is 1 if string matches pattern, and + * 0 otherwise. The matching operation permits the following + * special characters in the pattern: *?\[] (see the manual + * entry for details on what these mean). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) + CONST Tcl_UniChar *string; /* Unicode String. */ + int strLen; /* length of String */ + CONST Tcl_UniChar *pattern; /* Pattern, which may contain special + * characters. */ + int ptnLen; /* length of Pattern */ + int nocase; /* 0 for case sensitive, 1 for insensitive */ +{ + CONST Tcl_UniChar *stringEnd, *patternEnd; + Tcl_UniChar p; + + stringEnd = string + strLen; + patternEnd = pattern + ptnLen; + + while (1) { + /* + * See if we're at the end of both the pattern and the string. If + * so, we succeeded. If we're at the end of the pattern but not at + * the end of the string, we failed. + */ + + if (pattern == patternEnd) { + return (string == stringEnd); + } + p = *pattern; + if ((string == stringEnd) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern + */ + while (*(++pattern) == '*') {} + if (pattern == patternEnd) { + return 1; + } + p = *pattern; + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while ((string < stringEnd) && (p != *string) + && (p != Tcl_UniCharToLower(*string))) { + string++; + } + } else { + while ((string < stringEnd) && (p != *string)) { + string++; + } + } + } + if (TclUniCharMatch(string, stringEnd - string, + pattern, patternEnd - pattern, nocase)) { + return 1; + } + if (string == stringEnd) { + return 0; + } + string++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches + * any single character. + */ + + if (p == '?') { + pattern++; + string++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed + * by a list of characters that are acceptable, or by a range + * (two characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar ch1, startChar, endChar; + + pattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); + string++; + while (1) { + if ((*pattern == ']') || (pattern == patternEnd)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); + pattern++; + if (*pattern == '-') { + pattern++; + if (pattern == patternEnd) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*pattern) + : *pattern); + pattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*pattern != ']') { + if (pattern == patternEnd) { + pattern--; + break; + } + pattern++; + } + pattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' + * so we do exact matching on the character that follows. + */ + + if (p == '\\') { + if (++pattern == patternEnd) { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next + * bytes of each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { + return 0; + } + } else if (*string != *pattern) { + return 0; + } + string++; + pattern++; + } +} diff --git a/tests/string.test b/tests/string.test index bffb623..ae84010 100644 --- a/tests/string.test +++ b/tests/string.test @@ -12,7 +12,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: string.test,v 1.35 2002/11/19 02:34:50 hobbs Exp $ +# RCS: @(#) $Id: string.test,v 1.36 2003/02/18 02:25:45 hobbs Exp $ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest @@ -895,7 +895,40 @@ test string-11.51 {string match; *, -nocase and UTF-8} { string match -nocase [binary format I 717316707] \ [binary format I 2028036707] } 1 - +test string-11.52 {string match, null char in string} { + set out "" + set ptn "*abc*" + foreach elem [list "\u0000@abc" "@abc" "\u0000@abc\u0000" "blahabcblah"] { + lappend out [string match $ptn $elem] + } + set out +} {1 1 1 1} +test string-11.53 {string match, null char in pattern} { + set out "" + foreach {ptn elem} [list \ + "*\u0000abc\u0000" "\u0000abc\u0000" \ + "*\u0000abc\u0000" "\u0000abc\u0000ef" \ + "*\u0000abc\u0000*" "\u0000abc\u0000ef" \ + "*\u0000abc\u0000" "@\u0000abc\u0000ef" \ + "*\u0000abc\u0000*" "@\u0000abc\u0000ef" \ + ] { + lappend out [string match $ptn $elem] + } + set out +} {1 0 1 0 1} +test string-11.54 {string match, failure} { + set longString "" + for {set i 0} {$i < 10} {incr i} { + append longString "abcdefghijklmnopqrstuvwxy\u0000z01234567890123" + } + string first $longString 123 + list [string match *cba* $longString] \ + [string match *a*l*\u0000* $longString] \ + [string match *a*l*\u0000*123 $longString] \ + [string match *a*l*\u0000*123* $longString] \ + [string match *a*l*\u0000*cba* $longString] \ + [string match *===* $longString] +} {0 1 1 1 0 0} test string-12.1 {string range} { list [catch {string range} msg] $msg diff --git a/tests/stringComp.test b/tests/stringComp.test index 20779e4..14b0107 100644 --- a/tests/stringComp.test +++ b/tests/stringComp.test @@ -15,7 +15,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: stringComp.test,v 1.5 2002/05/29 09:09:00 hobbs Exp $ +# RCS: @(#) $Id: stringComp.test,v 1.6 2003/02/18 02:25:45 hobbs Exp $ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest @@ -643,6 +643,52 @@ test string-11.50 {string match, *special case} { proc foo {} {string match "\\" "\\"} foo } 0 +test string-11.51 {string match; *, -nocase and UTF-8} { + proc foo {} {string match -nocase [binary format I 717316707] \ + [binary format I 2028036707]} + foo +} 1 +test string-11.52 {string match, null char in string} { + proc foo {} { + set ptn "*abc*" + foreach elem [list "\u0000@abc" "@abc" "\u0000@abc\u0000" "blahabcblah"] { + lappend out [string match $ptn $elem] + } + set out + } + foo +} {1 1 1 1} +test string-11.53 {string match, null char in pattern} { + proc foo {} { + set out "" + foreach {ptn elem} [list \ + "*\u0000abc\u0000" "\u0000abc\u0000" \ + "*\u0000abc\u0000" "\u0000abc\u0000ef" \ + "*\u0000abc\u0000*" "\u0000abc\u0000ef" \ + "*\u0000abc\u0000" "@\u0000abc\u0000ef" \ + "*\u0000abc\u0000*" "@\u0000abc\u0000ef" \ + ] { + lappend out [string match $ptn $elem] + } + set out + } + foo +} {1 0 1 0 1} +test string-11.54 {string match, failure} { + proc foo {} { + set longString "" + for {set i 0} {$i < 10} {incr i} { + append longString "abcdefghijklmnopqrstuvwxy\u0000z01234567890123" + } + list [string match *cba* $longString] \ + [string match *a*l*\u0000* $longString] \ + [string match *a*l*\u0000*123 $longString] \ + [string match *a*l*\u0000*123* $longString] \ + [string match *a*l*\u0000*cba* $longString] \ + [string match *===* $longString] + } + foo +} {0 1 1 1 0 0} ## string range ## not yet bc -- cgit v0.12