From 09f4c1de476f86324d54f2e8c31a66870ce1c8bc Mon Sep 17 00:00:00 2001 From: hobbs Date: Mon, 8 May 2000 21:59:58 +0000 Subject: * doc/Utf.3: * generic/tclStubInit.c: * generic/tcl.decls: * generic/tclDecls.h: * generic/tclUtf.c: Added new functions Tcl_UniCharNcasecmp and Tcl_UniCharCaseMatch (unicode parallel to Tcl_StringCaseMatch) * generic/tclUtil.c: rewrote Tcl_StringCaseMatch algorithm for optimization and made Tcl_StringMatch just call Tcl_StringCaseMatch --- ChangeLog | 11 +++ doc/Utf.3 | 42 +++++++--- generic/tcl.decls | 10 ++- generic/tclDecls.h | 20 ++++- generic/tclStubInit.c | 4 +- generic/tclUtf.c | 217 +++++++++++++++++++++++++++++++++++++++++++++++++- generic/tclUtil.c | 189 +++++++++++-------------------------------- 7 files changed, 335 insertions(+), 158 deletions(-) diff --git a/ChangeLog b/ChangeLog index 44a25cd..74763a4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2000-05-08 Jeff Hobbs + + * doc/Utf.3: + * generic/tclStubInit.c: + * generic/tcl.decls: + * generic/tclDecls.h: + * generic/tclUtf.c: Added new functions Tcl_UniCharNcasecmp and + Tcl_UniCharCaseMatch (unicode parallel to Tcl_StringCaseMatch) + * generic/tclUtil.c: rewrote Tcl_StringCaseMatch algorithm for + optimization and made Tcl_StringMatch just call Tcl_StringCaseMatch + 2000-05-08 Eric Melski * tests/set-old.test: diff --git a/doc/Utf.3 b/doc/Utf.3 index bd4d233..f00dd6b 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -4,7 +4,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: Utf.3,v 1.7 2000/04/25 17:55:29 hobbs Exp $ +'\" RCS: @(#) $Id: Utf.3,v 1.8 2000/05/08 22:00:01 hobbs Exp $ '\" .so man.macros .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" @@ -34,6 +34,14 @@ int .sp int \fBTcl_UniCharNcmp\fR(\fIuniStr, uniStr, num\fR) +.VS 8.4 +.sp +int +\fBTcl_UniCharNcasecmp\fR(\fIuniStr, uniStr, num\fR) +.sp +int +\fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR) +.VE 8.4 .sp int \fBTcl_UtfNcmp\fR(\fIsrc, src, num\fR) @@ -80,6 +88,8 @@ Filled with the Tcl_UniChar represented by the head of the UTF-8 string. Pointer to a UTF-8 string. .AP "CONST Tcl_UniChar" *uniStr in A NULL-terminated Unicode string. +.AP "CONST Tcl_UniChar" *uniPattern in +A NULL-terminated Unicode string. .AP int len in The length of the UTF-8 string in bytes (not UTF-8 characters). If negative, all bytes up to the first null byte are used. @@ -100,6 +110,11 @@ including the backslash character. .AP char *dst out Buffer in which the bytes represented by the backslash sequence are stored. At most TCL_UTF_MAX bytes are stored in the buffer. +.VS 8.4 +.AP int nocase in +Specifies whether the match should be done case-sensitive (0) or +case-insensitive (1). +.VE 8.4 .BE .SH DESCRIPTION @@ -147,15 +162,22 @@ is terminated with a Unicode NULL character. characters. It accepts a NULL-terminated Unicode string and returns the number of Unicode characters (not bytes) in that string. .PP -\fBTcl_UniCharNcmp\fR corresponds to \fBstrncmp\fR for Unicode -characters. It accepts two NULL-terminated Unicode strings -and the number of characters to compare. (Both strings are -assumed to be at least \fIlen\fR characters long.) -\fBTcl_UniCharNcmp\fR compares the two strings character-by-character -according to the Unicode character ordering. It returns an integer -greater than, equal to, -or less than 0 if the first string is greater than, equal to, or -less than the second string respectively. +\fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to +\fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters. +They accepts two NULL-terminated Unicode strings and the number of characters +to compare. Both strings are assumed to be at least \fIlen\fR characters +long. \fBTcl_UniCharNcmp\fR compares the two strings character-by-character +according to the Unicode character ordering. It returns an integer greater +than, equal to, or less than 0 if the first string is greater than, equal +to, or less than the second string respectively. \fBTcl_UniCharNcasecmp\fR +is the Unicode case insensitive version. +.PP +.VS 8.4 +\fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to +\fBTcl_StringCaseMatch\fR. It accepts a NULL-terminated Unicode string, +a Unicode pattern, and a boolean value specifying whether the match should +be case sensitive and returns whether the string matches the pattern. +.VE 8.4 .PP \fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It accepts two NULL-terminated UTF-8 strings and the number of characters diff --git a/generic/tcl.decls b/generic/tcl.decls index 1f09cfa..39a6c21 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -10,7 +10,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: tcl.decls,v 1.34 2000/05/02 22:02:33 kupries Exp $ +# RCS: @(#) $Id: tcl.decls,v 1.35 2000/05/08 21:59:58 hobbs Exp $ library tcl @@ -1379,6 +1379,14 @@ declare 400 generic { int Tcl_IsChannelExisting (CONST char* channelName) } +declare 401 generic { + int Tcl_UniCharNcasecmp(CONST Tcl_UniChar *cs, CONST Tcl_UniChar *ct,\ + unsigned long n) +} +declare 402 generic { + int Tcl_UniCharCaseMatch(CONST Tcl_UniChar *ustr, \ + CONST Tcl_UniChar *pattern, int nocase) +} ############################################################################## diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 150d471..56c1d76 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclDecls.h,v 1.35 2000/05/02 22:02:33 kupries Exp $ + * RCS: @(#) $Id: tclDecls.h,v 1.36 2000/05/08 21:59:59 hobbs Exp $ */ #ifndef _TCLDECLS @@ -1246,6 +1246,14 @@ EXTERN void Tcl_ClearChannelHandlers _ANSI_ARGS_(( /* 400 */ EXTERN int Tcl_IsChannelExisting _ANSI_ARGS_(( CONST char* channelName)); +/* 401 */ +EXTERN int Tcl_UniCharNcasecmp _ANSI_ARGS_(( + CONST Tcl_UniChar * cs, + CONST Tcl_UniChar * ct, unsigned long n)); +/* 402 */ +EXTERN int Tcl_UniCharCaseMatch _ANSI_ARGS_(( + CONST Tcl_UniChar * ustr, + CONST Tcl_UniChar * pattern, int nocase)); typedef struct TclStubHooks { struct TclPlatStubs *tclPlatStubs; @@ -1714,6 +1722,8 @@ typedef struct TclStubs { void (*tcl_SpliceChannel) _ANSI_ARGS_((Tcl_Channel channel)); /* 398 */ void (*tcl_ClearChannelHandlers) _ANSI_ARGS_((Tcl_Channel channel)); /* 399 */ int (*tcl_IsChannelExisting) _ANSI_ARGS_((CONST char* channelName)); /* 400 */ + int (*tcl_UniCharNcasecmp) _ANSI_ARGS_((CONST Tcl_UniChar * cs, CONST Tcl_UniChar * ct, unsigned long n)); /* 401 */ + int (*tcl_UniCharCaseMatch) _ANSI_ARGS_((CONST Tcl_UniChar * ustr, CONST Tcl_UniChar * pattern, int nocase)); /* 402 */ } TclStubs; #ifdef __cplusplus @@ -3363,6 +3373,14 @@ extern TclStubs *tclStubsPtr; #define Tcl_IsChannelExisting \ (tclStubsPtr->tcl_IsChannelExisting) /* 400 */ #endif +#ifndef Tcl_UniCharNcasecmp +#define Tcl_UniCharNcasecmp \ + (tclStubsPtr->tcl_UniCharNcasecmp) /* 401 */ +#endif +#ifndef Tcl_UniCharCaseMatch +#define Tcl_UniCharCaseMatch \ + (tclStubsPtr->tcl_UniCharCaseMatch) /* 402 */ +#endif #endif /* defined(USE_TCL_STUBS) && !defined(USE_TCL_STUB_PROCS) */ diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index e9d6577..c2ac1e5 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclStubInit.c,v 1.36 2000/05/02 22:02:35 kupries Exp $ + * RCS: @(#) $Id: tclStubInit.c,v 1.37 2000/05/08 21:59:59 hobbs Exp $ */ #include "tclInt.h" @@ -798,6 +798,8 @@ TclStubs tclStubs = { Tcl_SpliceChannel, /* 398 */ Tcl_ClearChannelHandlers, /* 399 */ Tcl_IsChannelExisting, /* 400 */ + Tcl_UniCharNcasecmp, /* 401 */ + Tcl_UniCharCaseMatch, /* 402 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 5fe3c41..b62a26c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtf.c,v 1.11 2000/01/11 22:09:00 hobbs Exp $ + * RCS: @(#) $Id: tclUtf.c,v 1.12 2000/05/08 21:59:58 hobbs Exp $ */ #include "tclInt.h" @@ -1301,7 +1301,43 @@ Tcl_UniCharNcmp(cs, ct, n) { for ( ; n != 0; n--, cs++, ct++) { if (*cs != *ct) { - return *cs - *ct; + return (*cs - *ct); + } + if (*cs == '\0') { + break; + } + } + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharNcasecmp -- + * + * Compare at most n unichars of string cs to string ct case + * insensitive. Both cs and ct are assumed to be at least n + * unichars long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharNcasecmp(cs, ct, n) + CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ + CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ + unsigned long n; /* Number of unichars to compare. */ +{ + for ( ; n != 0; n--, cs++, ct++) { + if ((*cs != *ct) && + (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) { + return (*cs - *ct); } if (*cs == '\0') { break; @@ -1584,3 +1620,180 @@ Tcl_UniCharIsWordChar(ch) return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharCaseMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. Thie is the Unicode equivalent of + * the char* Tcl_StringCaseMatch. + * + * Results: + * The return value is 1 if string matches pattern, and + * 0 otherwise. The matching operation permits the following + * special characters in the pattern: *?\[] (see the manual + * entry for details on what these mean). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharCaseMatch(string, pattern, nocase) + CONST Tcl_UniChar *string; /* Unicode String. */ + CONST Tcl_UniChar *pattern; /* Pattern, which may contain special + * characters. */ + int nocase; /* 0 for case sensitive, 1 for insensitive */ +{ + Tcl_UniChar ch1, p; + + while (1) { + p = *pattern; + + /* + * See if we're at the end of both the pattern and the string. If + * so, we succeeded. If we're at the end of the pattern but not at + * the end of the string, we failed. + */ + + if (p == 0) { + return (*string == 0); + } + if ((*string == 0) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + int pSpecial; + /* + * Skip all successive *'s in the pattern + */ + while (*(++pattern) == '*') {} + p = *pattern; + if (p == 0) { + return 1; + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while (*string && (p != *string) + && (p != Tcl_UniCharToLower(*string))) { + string++; + } + } else { + while (*string && (p != *string)) { string++; } + } + } + if (Tcl_UniCharCaseMatch(string, pattern, nocase)) { + return 1; + } + if (*string == 0) { + return 0; + } + string++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches + * any single character. + */ + + if (p == '?') { + pattern++; + string++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed + * by a list of characters that are acceptable, or by a range + * (two characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar startChar, endChar; + + pattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); + string++; + while (1) { + if ((*pattern == ']') || (*pattern == 0)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); + pattern++; + if (*pattern == '-') { + pattern++; + if (*pattern == 0) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*pattern) + : *pattern); + pattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*pattern != ']') { + if (*pattern == 0) { + pattern--; + break; + } + pattern++; + } + pattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' + * so we do exact matching on the character that follows. + */ + + if (p == '\\') { + if (*(++pattern) == '\0') { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next + * bytes of each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { + return 0; + } + } else if (*string != *pattern) { + return 0; + } + string++; + pattern++; + } +} diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 6e99f32..8a2aa94 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -10,7 +10,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtil.c,v 1.17 1999/12/12 02:26:43 hobbs Exp $ + * RCS: @(#) $Id: tclUtil.c,v 1.18 2000/05/08 21:59:59 hobbs Exp $ */ #include "tclInt.h" @@ -1136,131 +1136,7 @@ Tcl_StringMatch(string, pattern) CONST char *pattern; /* Pattern, which may contain special * characters. */ { - int p, s; - CONST char *pstart = pattern; - - while (1) { - p = *pattern; - s = *string; - - /* - * See if we're at the end of both the pattern and the string. If - * so, we succeeded. If we're at the end of the pattern but not at - * the end of the string, we failed. - */ - - if (p == '\0') { - if (s == '\0') { - return 1; - } else { - return 0; - } - } - if ((s == '\0') && (p != '*')) { - return 0; - } - - /* Check for a "*" as the next pattern character. It matches - * any substring. We handle this by calling ourselves - * recursively for each postfix of string, until either we - * match or we reach the end of the string. - */ - - if (p == '*') { - pattern++; - if (*pattern == '\0') { - return 1; - } - while (1) { - if (Tcl_StringMatch(string, pattern)) { - return 1; - } - if (*string == '\0') { - return 0; - } - string++; - } - } - - /* Check for a "?" as the next pattern character. It matches - * any single character. - */ - - if (p == '?') { - Tcl_UniChar ch; - - pattern++; - string += Tcl_UtfToUniChar(string, &ch); - continue; - } - - /* Check for a "[" as the next pattern character. It is followed - * by a list of characters that are acceptable, or by a range - * (two characters separated by "-"). - */ - - if (p == '[') { - Tcl_UniChar ch, startChar, endChar; - - pattern++; - string += Tcl_UtfToUniChar(string, &ch); - - while (1) { - if ((*pattern == ']') || (*pattern == '\0')) { - return 0; - } - pattern += Tcl_UtfToUniChar(pattern, &startChar); - if (*pattern == '-') { - pattern++; - if (*pattern == '\0') { - return 0; - } - pattern += Tcl_UtfToUniChar(pattern, &endChar); - if (((startChar <= ch) && (ch <= endChar)) - || ((endChar <= ch) && (ch <= startChar))) { - /* - * Matches ranges of form [a-z] or [z-a]. - */ - - break; - } - } else if (startChar == ch) { - break; - } - } - while (*pattern != ']') { - if (*pattern == '\0') { - pattern = Tcl_UtfPrev(pattern, pstart); - break; - } - pattern++; - } - pattern++; - continue; - } - - /* If the next pattern character is '\', just strip off the '\' - * so we do exact matching on the character that follows. - */ - - if (p == '\\') { - pattern++; - p = *pattern; - if (p == '\0') { - return 0; - } - } - - /* There's no special character. Just make sure that the next - * bytes of each string match. - */ - - if (s != p) { - return 0; - } - pattern++; - string++; - } + return Tcl_StringCaseMatch(string, pattern, 0); } /* @@ -1290,13 +1166,12 @@ Tcl_StringCaseMatch(string, pattern, nocase) * characters. */ int nocase; /* 0 for case sensitive, 1 for insensitive */ { - int p, s; + int p; CONST char *pstart = pattern; Tcl_UniChar ch1, ch2; while (1) { p = *pattern; - s = *string; /* * See if we're at the end of both the pattern and the string. If @@ -1305,35 +1180,61 @@ Tcl_StringCaseMatch(string, pattern, nocase) */ if (p == '\0') { - return (s == '\0'); + return (*string == '\0'); } - if ((s == '\0') && (p != '*')) { + if ((*string == '\0') && (p != '*')) { return 0; } - /* Check for a "*" as the next pattern character. It matches + /* + * Check for a "*" as the next pattern character. It matches * any substring. We handle this by calling ourselves * recursively for each postfix of string, until either we * match or we reach the end of the string. */ if (p == '*') { - pattern++; - if (*pattern == '\0') { + /* + * Skip all successive *'s in the pattern + */ + while (*(++pattern) == '*') {} + p = *pattern; + if (p == '\0') { return 1; } while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while (*string && (p != *string)) { + ch2 = Tcl_UtfToUniChar(string, &ch1); + if (p == Tcl_UniCharToLower(ch1)) { + break; + } + string += ch2; + } + } else { + while (*string && (p != *string)) { + string += Tcl_UtfToUniChar(string, &ch1); + } + } + } if (Tcl_StringCaseMatch(string, pattern, nocase)) { return 1; } if (*string == '\0') { return 0; } - string++; + string += Tcl_UtfToUniChar(string, &ch1); } } - /* Check for a "?" as the next pattern character. It matches + /* + * Check for a "?" as the next pattern character. It matches * any single character. */ @@ -1343,11 +1244,12 @@ Tcl_StringCaseMatch(string, pattern, nocase) continue; } - /* Check for a "[" as the next pattern character. It is followed + /* + * Check for a "[" as the next pattern character. It is followed * by a list of characters that are acceptable, or by a range * (two characters separated by "-"). */ - + if (p == '[') { Tcl_UniChar startChar, endChar; @@ -1396,22 +1298,23 @@ Tcl_StringCaseMatch(string, pattern, nocase) continue; } - /* If the next pattern character is '\', just strip off the '\' + /* + * If the next pattern character is '\', just strip off the '\' * so we do exact matching on the character that follows. */ - + if (p == '\\') { pattern++; - p = *pattern; - if (p == '\0') { + if (*pattern == '\0') { return 0; } } - /* There's no special character. Just make sure that the next + /* + * There's no special character. Just make sure that the next * bytes of each string match. */ - + string += Tcl_UtfToUniChar(string, &ch1); pattern += Tcl_UtfToUniChar(pattern, &ch2); if (nocase) { -- cgit v0.12