summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog11
-rw-r--r--doc/Utf.342
-rw-r--r--generic/tcl.decls10
-rw-r--r--generic/tclDecls.h20
-rw-r--r--generic/tclStubInit.c4
-rw-r--r--generic/tclUtf.c217
-rw-r--r--generic/tclUtil.c189
7 files changed, 335 insertions, 158 deletions
diff --git a/ChangeLog b/ChangeLog
index 44a25cd..74763a4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2000-05-08 Jeff Hobbs <hobbs@scriptics.com>
+
+ * doc/Utf.3:
+ * generic/tclStubInit.c:
+ * generic/tcl.decls:
+ * generic/tclDecls.h:
+ * generic/tclUtf.c: Added new functions Tcl_UniCharNcasecmp and
+ Tcl_UniCharCaseMatch (unicode parallel to Tcl_StringCaseMatch)
+ * generic/tclUtil.c: rewrote Tcl_StringCaseMatch algorithm for
+ optimization and made Tcl_StringMatch just call Tcl_StringCaseMatch
+
2000-05-08 Eric Melski <ericm@scriptics.com>
* tests/set-old.test:
diff --git a/doc/Utf.3 b/doc/Utf.3
index bd4d233..f00dd6b 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -4,7 +4,7 @@
'\" See the file "license.terms" for information on usage and redistribution
'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
'\"
-'\" RCS: @(#) $Id: Utf.3,v 1.7 2000/04/25 17:55:29 hobbs Exp $
+'\" RCS: @(#) $Id: Utf.3,v 1.8 2000/05/08 22:00:01 hobbs Exp $
'\"
.so man.macros
.TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
@@ -34,6 +34,14 @@ int
.sp
int
\fBTcl_UniCharNcmp\fR(\fIuniStr, uniStr, num\fR)
+.VS 8.4
+.sp
+int
+\fBTcl_UniCharNcasecmp\fR(\fIuniStr, uniStr, num\fR)
+.sp
+int
+\fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR)
+.VE 8.4
.sp
int
\fBTcl_UtfNcmp\fR(\fIsrc, src, num\fR)
@@ -80,6 +88,8 @@ Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
Pointer to a UTF-8 string.
.AP "CONST Tcl_UniChar" *uniStr in
A NULL-terminated Unicode string.
+.AP "CONST Tcl_UniChar" *uniPattern in
+A NULL-terminated Unicode string.
.AP int len in
The length of the UTF-8 string in bytes (not UTF-8 characters). If
negative, all bytes up to the first null byte are used.
@@ -100,6 +110,11 @@ including the backslash character.
.AP char *dst out
Buffer in which the bytes represented by the backslash sequence are stored.
At most TCL_UTF_MAX bytes are stored in the buffer.
+.VS 8.4
+.AP int nocase in
+Specifies whether the match should be done case-sensitive (0) or
+case-insensitive (1).
+.VE 8.4
.BE
.SH DESCRIPTION
@@ -147,15 +162,22 @@ is terminated with a Unicode NULL character.
characters. It accepts a NULL-terminated Unicode string and returns
the number of Unicode characters (not bytes) in that string.
.PP
-\fBTcl_UniCharNcmp\fR corresponds to \fBstrncmp\fR for Unicode
-characters. It accepts two NULL-terminated Unicode strings
-and the number of characters to compare. (Both strings are
-assumed to be at least \fIlen\fR characters long.)
-\fBTcl_UniCharNcmp\fR compares the two strings character-by-character
-according to the Unicode character ordering. It returns an integer
-greater than, equal to,
-or less than 0 if the first string is greater than, equal to, or
-less than the second string respectively.
+\fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to
+\fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters.
+They accepts two NULL-terminated Unicode strings and the number of characters
+to compare. Both strings are assumed to be at least \fIlen\fR characters
+long. \fBTcl_UniCharNcmp\fR compares the two strings character-by-character
+according to the Unicode character ordering. It returns an integer greater
+than, equal to, or less than 0 if the first string is greater than, equal
+to, or less than the second string respectively. \fBTcl_UniCharNcasecmp\fR
+is the Unicode case insensitive version.
+.PP
+.VS 8.4
+\fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to
+\fBTcl_StringCaseMatch\fR. It accepts a NULL-terminated Unicode string,
+a Unicode pattern, and a boolean value specifying whether the match should
+be case sensitive and returns whether the string matches the pattern.
+.VE 8.4
.PP
\fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It
accepts two NULL-terminated UTF-8 strings and the number of characters
diff --git a/generic/tcl.decls b/generic/tcl.decls
index 1f09cfa..39a6c21 100644
--- a/generic/tcl.decls
+++ b/generic/tcl.decls
@@ -10,7 +10,7 @@
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
-# RCS: @(#) $Id: tcl.decls,v 1.34 2000/05/02 22:02:33 kupries Exp $
+# RCS: @(#) $Id: tcl.decls,v 1.35 2000/05/08 21:59:58 hobbs Exp $
library tcl
@@ -1379,6 +1379,14 @@ declare 400 generic {
int Tcl_IsChannelExisting (CONST char* channelName)
}
+declare 401 generic {
+ int Tcl_UniCharNcasecmp(CONST Tcl_UniChar *cs, CONST Tcl_UniChar *ct,\
+ unsigned long n)
+}
+declare 402 generic {
+ int Tcl_UniCharCaseMatch(CONST Tcl_UniChar *ustr, \
+ CONST Tcl_UniChar *pattern, int nocase)
+}
##############################################################################
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 150d471..56c1d76 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclDecls.h,v 1.35 2000/05/02 22:02:33 kupries Exp $
+ * RCS: @(#) $Id: tclDecls.h,v 1.36 2000/05/08 21:59:59 hobbs Exp $
*/
#ifndef _TCLDECLS
@@ -1246,6 +1246,14 @@ EXTERN void Tcl_ClearChannelHandlers _ANSI_ARGS_((
/* 400 */
EXTERN int Tcl_IsChannelExisting _ANSI_ARGS_((
CONST char* channelName));
+/* 401 */
+EXTERN int Tcl_UniCharNcasecmp _ANSI_ARGS_((
+ CONST Tcl_UniChar * cs,
+ CONST Tcl_UniChar * ct, unsigned long n));
+/* 402 */
+EXTERN int Tcl_UniCharCaseMatch _ANSI_ARGS_((
+ CONST Tcl_UniChar * ustr,
+ CONST Tcl_UniChar * pattern, int nocase));
typedef struct TclStubHooks {
struct TclPlatStubs *tclPlatStubs;
@@ -1714,6 +1722,8 @@ typedef struct TclStubs {
void (*tcl_SpliceChannel) _ANSI_ARGS_((Tcl_Channel channel)); /* 398 */
void (*tcl_ClearChannelHandlers) _ANSI_ARGS_((Tcl_Channel channel)); /* 399 */
int (*tcl_IsChannelExisting) _ANSI_ARGS_((CONST char* channelName)); /* 400 */
+ int (*tcl_UniCharNcasecmp) _ANSI_ARGS_((CONST Tcl_UniChar * cs, CONST Tcl_UniChar * ct, unsigned long n)); /* 401 */
+ int (*tcl_UniCharCaseMatch) _ANSI_ARGS_((CONST Tcl_UniChar * ustr, CONST Tcl_UniChar * pattern, int nocase)); /* 402 */
} TclStubs;
#ifdef __cplusplus
@@ -3363,6 +3373,14 @@ extern TclStubs *tclStubsPtr;
#define Tcl_IsChannelExisting \
(tclStubsPtr->tcl_IsChannelExisting) /* 400 */
#endif
+#ifndef Tcl_UniCharNcasecmp
+#define Tcl_UniCharNcasecmp \
+ (tclStubsPtr->tcl_UniCharNcasecmp) /* 401 */
+#endif
+#ifndef Tcl_UniCharCaseMatch
+#define Tcl_UniCharCaseMatch \
+ (tclStubsPtr->tcl_UniCharCaseMatch) /* 402 */
+#endif
#endif /* defined(USE_TCL_STUBS) && !defined(USE_TCL_STUB_PROCS) */
diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c
index e9d6577..c2ac1e5 100644
--- a/generic/tclStubInit.c
+++ b/generic/tclStubInit.c
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclStubInit.c,v 1.36 2000/05/02 22:02:35 kupries Exp $
+ * RCS: @(#) $Id: tclStubInit.c,v 1.37 2000/05/08 21:59:59 hobbs Exp $
*/
#include "tclInt.h"
@@ -798,6 +798,8 @@ TclStubs tclStubs = {
Tcl_SpliceChannel, /* 398 */
Tcl_ClearChannelHandlers, /* 399 */
Tcl_IsChannelExisting, /* 400 */
+ Tcl_UniCharNcasecmp, /* 401 */
+ Tcl_UniCharCaseMatch, /* 402 */
};
/* !END!: Do not edit above this line. */
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 5fe3c41..b62a26c 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclUtf.c,v 1.11 2000/01/11 22:09:00 hobbs Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.12 2000/05/08 21:59:58 hobbs Exp $
*/
#include "tclInt.h"
@@ -1301,7 +1301,43 @@ Tcl_UniCharNcmp(cs, ct, n)
{
for ( ; n != 0; n--, cs++, ct++) {
if (*cs != *ct) {
- return *cs - *ct;
+ return (*cs - *ct);
+ }
+ if (*cs == '\0') {
+ break;
+ }
+ }
+ return 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharNcasecmp --
+ *
+ * Compare at most n unichars of string cs to string ct case
+ * insensitive. Both cs and ct are assumed to be at least n
+ * unichars long.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharNcasecmp(cs, ct, n)
+ CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
+ CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
+ unsigned long n; /* Number of unichars to compare. */
+{
+ for ( ; n != 0; n--, cs++, ct++) {
+ if ((*cs != *ct) &&
+ (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
+ return (*cs - *ct);
}
if (*cs == '\0') {
break;
@@ -1584,3 +1620,180 @@ Tcl_UniCharIsWordChar(ch)
return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharCaseMatch --
+ *
+ * See if a particular Unicode string matches a particular pattern.
+ * Allows case insensitivity. Thie is the Unicode equivalent of
+ * the char* Tcl_StringCaseMatch.
+ *
+ * Results:
+ * The return value is 1 if string matches pattern, and
+ * 0 otherwise. The matching operation permits the following
+ * special characters in the pattern: *?\[] (see the manual
+ * entry for details on what these mean).
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharCaseMatch(string, pattern, nocase)
+ CONST Tcl_UniChar *string; /* Unicode String. */
+ CONST Tcl_UniChar *pattern; /* Pattern, which may contain special
+ * characters. */
+ int nocase; /* 0 for case sensitive, 1 for insensitive */
+{
+ Tcl_UniChar ch1, p;
+
+ while (1) {
+ p = *pattern;
+
+ /*
+ * See if we're at the end of both the pattern and the string. If
+ * so, we succeeded. If we're at the end of the pattern but not at
+ * the end of the string, we failed.
+ */
+
+ if (p == 0) {
+ return (*string == 0);
+ }
+ if ((*string == 0) && (p != '*')) {
+ return 0;
+ }
+
+ /*
+ * Check for a "*" as the next pattern character. It matches any
+ * substring. We handle this by skipping all the characters up to the
+ * next matching one in the pattern, and then calling ourselves
+ * recursively for each postfix of string, until either we match or we
+ * reach the end of the string.
+ */
+
+ if (p == '*') {
+ int pSpecial;
+ /*
+ * Skip all successive *'s in the pattern
+ */
+ while (*(++pattern) == '*') {}
+ p = *pattern;
+ if (p == 0) {
+ return 1;
+ }
+ while (1) {
+ /*
+ * Optimization for matching - cruise through the string
+ * quickly if the next char in the pattern isn't a special
+ * character
+ */
+ if ((p != '[') && (p != '?') && (p != '\\')) {
+ if (nocase) {
+ while (*string && (p != *string)
+ && (p != Tcl_UniCharToLower(*string))) {
+ string++;
+ }
+ } else {
+ while (*string && (p != *string)) { string++; }
+ }
+ }
+ if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
+ return 1;
+ }
+ if (*string == 0) {
+ return 0;
+ }
+ string++;
+ }
+ }
+
+ /*
+ * Check for a "?" as the next pattern character. It matches
+ * any single character.
+ */
+
+ if (p == '?') {
+ pattern++;
+ string++;
+ continue;
+ }
+
+ /*
+ * Check for a "[" as the next pattern character. It is followed
+ * by a list of characters that are acceptable, or by a range
+ * (two characters separated by "-").
+ */
+
+ if (p == '[') {
+ Tcl_UniChar startChar, endChar;
+
+ pattern++;
+ ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
+ string++;
+ while (1) {
+ if ((*pattern == ']') || (*pattern == 0)) {
+ return 0;
+ }
+ startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
+ pattern++;
+ if (*pattern == '-') {
+ pattern++;
+ if (*pattern == 0) {
+ return 0;
+ }
+ endChar = (nocase ? Tcl_UniCharToLower(*pattern)
+ : *pattern);
+ pattern++;
+ if (((startChar <= ch1) && (ch1 <= endChar))
+ || ((endChar <= ch1) && (ch1 <= startChar))) {
+ /*
+ * Matches ranges of form [a-z] or [z-a].
+ */
+ break;
+ }
+ } else if (startChar == ch1) {
+ break;
+ }
+ }
+ while (*pattern != ']') {
+ if (*pattern == 0) {
+ pattern--;
+ break;
+ }
+ pattern++;
+ }
+ pattern++;
+ continue;
+ }
+
+ /*
+ * If the next pattern character is '\', just strip off the '\'
+ * so we do exact matching on the character that follows.
+ */
+
+ if (p == '\\') {
+ if (*(++pattern) == '\0') {
+ return 0;
+ }
+ }
+
+ /*
+ * There's no special character. Just make sure that the next
+ * bytes of each string match.
+ */
+
+ if (nocase) {
+ if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
+ return 0;
+ }
+ } else if (*string != *pattern) {
+ return 0;
+ }
+ string++;
+ pattern++;
+ }
+}
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 6e99f32..8a2aa94 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -10,7 +10,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclUtil.c,v 1.17 1999/12/12 02:26:43 hobbs Exp $
+ * RCS: @(#) $Id: tclUtil.c,v 1.18 2000/05/08 21:59:59 hobbs Exp $
*/
#include "tclInt.h"
@@ -1136,131 +1136,7 @@ Tcl_StringMatch(string, pattern)
CONST char *pattern; /* Pattern, which may contain special
* characters. */
{
- int p, s;
- CONST char *pstart = pattern;
-
- while (1) {
- p = *pattern;
- s = *string;
-
- /*
- * See if we're at the end of both the pattern and the string. If
- * so, we succeeded. If we're at the end of the pattern but not at
- * the end of the string, we failed.
- */
-
- if (p == '\0') {
- if (s == '\0') {
- return 1;
- } else {
- return 0;
- }
- }
- if ((s == '\0') && (p != '*')) {
- return 0;
- }
-
- /* Check for a "*" as the next pattern character. It matches
- * any substring. We handle this by calling ourselves
- * recursively for each postfix of string, until either we
- * match or we reach the end of the string.
- */
-
- if (p == '*') {
- pattern++;
- if (*pattern == '\0') {
- return 1;
- }
- while (1) {
- if (Tcl_StringMatch(string, pattern)) {
- return 1;
- }
- if (*string == '\0') {
- return 0;
- }
- string++;
- }
- }
-
- /* Check for a "?" as the next pattern character. It matches
- * any single character.
- */
-
- if (p == '?') {
- Tcl_UniChar ch;
-
- pattern++;
- string += Tcl_UtfToUniChar(string, &ch);
- continue;
- }
-
- /* Check for a "[" as the next pattern character. It is followed
- * by a list of characters that are acceptable, or by a range
- * (two characters separated by "-").
- */
-
- if (p == '[') {
- Tcl_UniChar ch, startChar, endChar;
-
- pattern++;
- string += Tcl_UtfToUniChar(string, &ch);
-
- while (1) {
- if ((*pattern == ']') || (*pattern == '\0')) {
- return 0;
- }
- pattern += Tcl_UtfToUniChar(pattern, &startChar);
- if (*pattern == '-') {
- pattern++;
- if (*pattern == '\0') {
- return 0;
- }
- pattern += Tcl_UtfToUniChar(pattern, &endChar);
- if (((startChar <= ch) && (ch <= endChar))
- || ((endChar <= ch) && (ch <= startChar))) {
- /*
- * Matches ranges of form [a-z] or [z-a].
- */
-
- break;
- }
- } else if (startChar == ch) {
- break;
- }
- }
- while (*pattern != ']') {
- if (*pattern == '\0') {
- pattern = Tcl_UtfPrev(pattern, pstart);
- break;
- }
- pattern++;
- }
- pattern++;
- continue;
- }
-
- /* If the next pattern character is '\', just strip off the '\'
- * so we do exact matching on the character that follows.
- */
-
- if (p == '\\') {
- pattern++;
- p = *pattern;
- if (p == '\0') {
- return 0;
- }
- }
-
- /* There's no special character. Just make sure that the next
- * bytes of each string match.
- */
-
- if (s != p) {
- return 0;
- }
- pattern++;
- string++;
- }
+ return Tcl_StringCaseMatch(string, pattern, 0);
}
/*
@@ -1290,13 +1166,12 @@ Tcl_StringCaseMatch(string, pattern, nocase)
* characters. */
int nocase; /* 0 for case sensitive, 1 for insensitive */
{
- int p, s;
+ int p;
CONST char *pstart = pattern;
Tcl_UniChar ch1, ch2;
while (1) {
p = *pattern;
- s = *string;
/*
* See if we're at the end of both the pattern and the string. If
@@ -1305,35 +1180,61 @@ Tcl_StringCaseMatch(string, pattern, nocase)
*/
if (p == '\0') {
- return (s == '\0');
+ return (*string == '\0');
}
- if ((s == '\0') && (p != '*')) {
+ if ((*string == '\0') && (p != '*')) {
return 0;
}
- /* Check for a "*" as the next pattern character. It matches
+ /*
+ * Check for a "*" as the next pattern character. It matches
* any substring. We handle this by calling ourselves
* recursively for each postfix of string, until either we
* match or we reach the end of the string.
*/
if (p == '*') {
- pattern++;
- if (*pattern == '\0') {
+ /*
+ * Skip all successive *'s in the pattern
+ */
+ while (*(++pattern) == '*') {}
+ p = *pattern;
+ if (p == '\0') {
return 1;
}
while (1) {
+ /*
+ * Optimization for matching - cruise through the string
+ * quickly if the next char in the pattern isn't a special
+ * character
+ */
+ if ((p != '[') && (p != '?') && (p != '\\')) {
+ if (nocase) {
+ while (*string && (p != *string)) {
+ ch2 = Tcl_UtfToUniChar(string, &ch1);
+ if (p == Tcl_UniCharToLower(ch1)) {
+ break;
+ }
+ string += ch2;
+ }
+ } else {
+ while (*string && (p != *string)) {
+ string += Tcl_UtfToUniChar(string, &ch1);
+ }
+ }
+ }
if (Tcl_StringCaseMatch(string, pattern, nocase)) {
return 1;
}
if (*string == '\0') {
return 0;
}
- string++;
+ string += Tcl_UtfToUniChar(string, &ch1);
}
}
- /* Check for a "?" as the next pattern character. It matches
+ /*
+ * Check for a "?" as the next pattern character. It matches
* any single character.
*/
@@ -1343,11 +1244,12 @@ Tcl_StringCaseMatch(string, pattern, nocase)
continue;
}
- /* Check for a "[" as the next pattern character. It is followed
+ /*
+ * Check for a "[" as the next pattern character. It is followed
* by a list of characters that are acceptable, or by a range
* (two characters separated by "-").
*/
-
+
if (p == '[') {
Tcl_UniChar startChar, endChar;
@@ -1396,22 +1298,23 @@ Tcl_StringCaseMatch(string, pattern, nocase)
continue;
}
- /* If the next pattern character is '\', just strip off the '\'
+ /*
+ * If the next pattern character is '\', just strip off the '\'
* so we do exact matching on the character that follows.
*/
-
+
if (p == '\\') {
pattern++;
- p = *pattern;
- if (p == '\0') {
+ if (*pattern == '\0') {
return 0;
}
}
- /* There's no special character. Just make sure that the next
+ /*
+ * There's no special character. Just make sure that the next
* bytes of each string match.
*/
-
+
string += Tcl_UtfToUniChar(string, &ch1);
pattern += Tcl_UtfToUniChar(pattern, &ch2);
if (nocase) {