summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhobbs <hobbs>2002-05-29 09:09:57 (GMT)
committerhobbs <hobbs>2002-05-29 09:09:57 (GMT)
commit3a1941e7d007e93449aa22085687a8fafaddad7e (patch)
tree9794bcbc0e094b9645add8339f0278666543ef7e
parent514dd2a61babad4abc78895bdadec335dd9b4c71 (diff)
downloadtcl-3a1941e7d007e93449aa22085687a8fafaddad7e.zip
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.gz
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.bz2
* generic/tclInt.decls:
* generic/tclIntDecls.h: * generic/tclStubInit.c: * generic/tclUtf.c: added TclpUtfNcmp2 private command that mirrors Tcl_UtfNcmp, but takes n in bytes, not utf-8 chars. This provides a faster alternative for comparing utf strings internally. (Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end of string check as it wasn't correct for the function (by doc and logic). * generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal comparison code to use TclpUtfNcmp2 as well as short-circuit for equal objects or unequal length strings in the equal case. Removed the use of goto and streamlined the other parts. * generic/tclExecute.c (TclExecuteByteCode): added check for object equality in the comparison instructions. Added short-circuit for != length strings in INST_EQ, INST_NEQ and INST_STR_CMP. Reworked INST_STR_CMP to use TclpUtfNcmp2 where appropriate, and only use Tcl_UniCharNcmp when at least one of the objects is a Unicode obj with no utf bytes.
-rw-r--r--ChangeLog32
-rw-r--r--generic/tclCmdMZ.c168
-rw-r--r--generic/tclExecute.c91
-rw-r--r--generic/tclInt.decls7
-rw-r--r--generic/tclIntDecls.h10
-rw-r--r--generic/tclStubInit.c3
-rw-r--r--generic/tclUtf.c67
7 files changed, 225 insertions, 153 deletions
diff --git a/ChangeLog b/ChangeLog
index 8effda2..2dad798 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2002-05-29 Jeff Hobbs <jeffh@ActiveState.com>
+
+ * generic/tclInt.decls:
+ * generic/tclIntDecls.h:
+ * generic/tclStubInit.c:
+ * generic/tclUtf.c: added TclpUtfNcmp2 private command that
+ mirrors Tcl_UtfNcmp, but takes n in bytes, not utf-8 chars. This
+ provides a faster alternative for comparing utf strings internally.
+ (Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end
+ of string check as it wasn't correct for the function (by doc and
+ logic).
+
+ * generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal
+ comparison code to use TclpUtfNcmp2 as well as short-circuit for
+ equal objects or unequal length strings in the equal case.
+ Removed the use of goto and streamlined the other parts.
+
+ * generic/tclExecute.c (TclExecuteByteCode): added check for
+ object equality in the comparison instructions. Added
+ short-circuit for != length strings in INST_EQ, INST_NEQ and
+ INST_STR_CMP. Reworked INST_STR_CMP to use TclpUtfNcmp2 where
+ appropriate, and only use Tcl_UniCharNcmp when at least one of the
+ objects is a Unicode obj with no utf bytes.
+
+ * generic/tclCompCmds.c (TclCompileStringCmd): removed error
+ creation in code that no longer throws an error.
+
+ * tests/string.test:
+ * tests/stringComp.test: added more string comparison checks.
+
+ * tests/clock.test: better qualified 9.1 constraint check for %s.
+
2002-05-28 Jeff Hobbs <jeffh@ActiveState.com>
* generic/tclThreadAlloc.c (TclpRealloc, TclpFree): protect
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 0f39ecb..a3b03df 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -14,7 +14,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclCmdMZ.c,v 1.67 2002/04/18 13:49:30 dkf Exp $
+ * RCS: @(#) $Id: tclCmdMZ.c,v 1.68 2002/05/29 09:09:57 hobbs Exp $
*/
#include "tclInt.h"
@@ -1094,6 +1094,11 @@ Tcl_SplitObjCmd(dummy, interp, objc, objv)
* that this command only functions correctly on properly formed
* Tcl UTF strings.
*
+ * Note that the primary methods here (equal, compare, match, ...)
+ * have bytecode equivalents. You will find the code for those in
+ * tclExecute.c. The code here will only be used in the non-bc
+ * case (like in an 'eval').
+ *
* Results:
* A standard Tcl result.
*
@@ -1153,6 +1158,7 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
* comparison in INST_EQ/INST_NEQ/INST_LT/...).
*/
int i, match, length, nocase = 0, reqlength = -1;
+ int (*strCmpFn)();
if (objc < 4 || objc > 7) {
str_cmp_args:
@@ -1183,134 +1189,84 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
}
}
- if (reqlength == 0) {
- /*
- * Anything matches at 0 chars, right?
- */
-
- match = 0;
- goto stringComparisonDone;
- }
-
/*
* From now on, we only access the two objects at the end
* of the argument array.
*/
objv += objc-2;
- /*
- * Use binary versions of comparisons since that won't
- * cause undue type conversions and it is much faster.
- * Only do this if we're case-sensitive (which is all
- * that really makes sense with byte arrays anyway, and
- * we have no memcasecmp() for some reason... :^)
- */
- if (!nocase && objv[0]->typePtr == &tclByteArrayType &&
- objv[1]->typePtr == &tclByteArrayType) {
- unsigned char *bytes1, *bytes2;
-
- bytes1 = Tcl_GetByteArrayFromObj(objv[0], &length1);
- bytes2 = Tcl_GetByteArrayFromObj(objv[1], &length2);
- length = (length1 < length2) ? length1 : length2;
-
- if ((reqlength > 0) && (reqlength < length)) {
- length = reqlength;
- } else if (reqlength < 0) {
- /*
- * The requested length is negative, so we ignore it by
- * setting it to the longer of the two lengths.
- */
-
- reqlength = (length1 > length2) ? length1 : length2;
- }
+ if ((reqlength == 0) || (objv[0] == objv[1])) {
+ /*
+ * Alway match at 0 chars of if it is the same obj.
+ */
- match = memcmp(bytes1, bytes2, (unsigned)length);
- if ((match == 0) && (reqlength > length)) {
- match = length1 - length2;
+ Tcl_SetBooleanObj(resultPtr,
+ ((enum options) index == STR_EQUAL));
+ break;
+ } else if (!nocase && objv[0]->typePtr == &tclByteArrayType &&
+ objv[1]->typePtr == &tclByteArrayType) {
+ /*
+ * Use binary versions of comparisons since that won't
+ * cause undue type conversions and it is much faster.
+ * Only do this if we're case-sensitive (which is all
+ * that really makes sense with byte arrays anyway, and
+ * we have no memcasecmp() for some reason... :^)
+ */
+ string1 = (char*) Tcl_GetByteArrayFromObj(objv[0], &length1);
+ string2 = (char*) Tcl_GetByteArrayFromObj(objv[1], &length2);
+ strCmpFn = memcmp;
+ } else if (((objv[0]->typePtr == &tclStringType)
+ && (objv[0]->bytes == NULL))
+ || ((objv[1]->typePtr == &tclStringType)
+ && (objv[1]->bytes == NULL))) {
+ /*
+ * Use UNICODE versions of string comparisons since that
+ * won't cause undue type conversions and we can work with
+ * characters all of a fixed size (much faster.), but only
+ * when one of the objects is a pure UNICODE object.
+ */
+ string1 = (char*) Tcl_GetUnicodeFromObj(objv[0], &length1);
+ string2 = (char*) Tcl_GetUnicodeFromObj(objv[1], &length2);
+ strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp;
+ } else {
+ /*
+ * As a catch-all we will work with UTF-8. We cannot use
+ * memcmp() as that is unsafe with any string containing
+ * NULL (\xC0\x80 in Tcl's utf rep). We can use the more
+ * efficient TclpUtfNcmp2 if we are case-sensitive and no
+ * specific length was requested.
+ */
+ string1 = (char*) Tcl_GetStringFromObj(objv[0], &length1);
+ string2 = (char*) Tcl_GetStringFromObj(objv[1], &length2);
+ if ((reqlength < 0) && !nocase) {
+ strCmpFn = TclpUtfNcmp2;
+ } else {
+ length1 = Tcl_NumUtfChars(string1, length1);
+ length2 = Tcl_NumUtfChars(string2, length2);
+ strCmpFn = nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp;
}
- goto stringComparisonDone;
}
- /*
- * Use UNICODE versions of string comparisons since that
- * won't cause undue type conversions and we can work with
- * characters all of a fixed size (much faster.) Also use
- * this code for untyped objects, since like that we'll
- * pick up many things that are used for comparison in
- * scripts and convert them (efficiently) to UNICODE
- * strings for comparison, but exclude case where both are
- * untyped as that is a little bit aggressive.
- */
- if ((objv[0]->typePtr == &tclStringType ||
- objv[0]->typePtr == NULL) &&
- (objv[1]->typePtr == &tclStringType ||
- objv[1]->typePtr == NULL) &&
- !(objv[0]->typePtr == NULL && objv[1]->typePtr == NULL)) {
- Tcl_UniChar *uni1, *uni2;
-
- uni1 = Tcl_GetUnicodeFromObj(objv[0], &length1);
- uni2 = Tcl_GetUnicodeFromObj(objv[1], &length2);
+ if (((enum options) index == STR_EQUAL)
+ && (reqlength < 0) && (length1 != length2)) {
+ match = 1; /* this will be reversed below */
+ } else {
length = (length1 < length2) ? length1 : length2;
-
if (reqlength > 0 && reqlength < length) {
length = reqlength;
} else if (reqlength < 0) {
/*
* The requested length is negative, so we ignore it by
- * setting it to the longer of the two lengths.
+ * setting it to length + 1 so we correct the match var.
*/
-
- reqlength = (length1 < length2) ? length2 : length1;
- }
-
- if (nocase) {
- match = Tcl_UniCharNcasecmp(uni1, uni2, (unsigned)length);
- } else {
- match = Tcl_UniCharNcmp(uni1, uni2, (unsigned)length);
+ reqlength = length + 1;
}
-
+ match = strCmpFn(string1, string2, (unsigned) length);
if ((match == 0) && (reqlength > length)) {
match = length1 - length2;
}
- goto stringComparisonDone;
- }
-
- /*
- * Strings to be compared are not both UNICODE or byte
- * arrays, so we will need to convert to UTF-8 and work
- * there (cannot use memcmp() as that is an unsafe
- * operation with any string containing \u0000 and the
- * safety test is equivalent in cost to the comparison
- * itself!)
- */
- string1 = Tcl_GetStringFromObj(objv[0], &length1);
- string2 = Tcl_GetStringFromObj(objv[1], &length2);
- length1 = Tcl_NumUtfChars(string1, length1);
- length2 = Tcl_NumUtfChars(string2, length2);
- length = (length1 < length2) ? length1 : length2;
-
- if ((reqlength > 0) && (reqlength < length)) {
- length = reqlength;
- } else if (reqlength < 0) {
- /*
- * The requested length is negative, so we ignore it by
- * setting it to the longer of the two lengths.
- */
-
- reqlength = (length1 > length2) ? length1 : length2;
- }
-
- if (nocase) {
- match = Tcl_UtfNcasecmp(string1, string2, (unsigned) length);
- } else {
- match = Tcl_UtfNcmp(string1, string2, (unsigned) length);
- }
-
- if ((match == 0) && (reqlength > length)) {
- match = length1 - length2;
}
- stringComparisonDone:
if ((enum options) index == STR_EQUAL) {
Tcl_SetBooleanObj(resultPtr, (match) ? 0 : 1);
} else {
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index f6d2ea2..bbe2728 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -11,7 +11,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclExecute.c,v 1.53 2002/04/18 13:04:20 msofer Exp $
+ * RCS: @(#) $Id: tclExecute.c,v 1.54 2002/05/29 09:09:57 hobbs Exp $
*/
#include "tclInt.h"
@@ -2718,7 +2718,7 @@ TclExecuteByteCode(interp, codePtr)
/*
* String compare
*/
- char *s1, *s2;
+ CONST char *s1, *s2;
int s1len, s2len, iResult;
value2Ptr = POP_OBJECT();
@@ -2728,18 +2728,25 @@ TclExecuteByteCode(interp, codePtr)
* The comparison function should compare up to the
* minimum byte length only.
*/
- if ((valuePtr->typePtr == &tclByteArrayType) &&
+ if (valuePtr == value2Ptr) {
+ /*
+ * In the pure equality case, set lengths too for
+ * the checks below (or we could goto beyond it).
+ */
+ iResult = s1len = s2len = 0;
+ } else if ((valuePtr->typePtr == &tclByteArrayType) &&
(value2Ptr->typePtr == &tclByteArrayType)) {
s1 = (char *) Tcl_GetByteArrayFromObj(valuePtr, &s1len);
s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len);
iResult = memcmp(s1, s2,
(size_t) ((s1len < s2len) ? s1len : s2len));
- } else if ((valuePtr->typePtr == &tclStringType) ||
- (value2Ptr->typePtr == &tclStringType)) {
+ } else if (((valuePtr->typePtr == &tclStringType)
+ && (valuePtr->bytes == NULL))
+ || ((value2Ptr->typePtr == &tclStringType)
+ && (value2Ptr->bytes == NULL))) {
/*
- * The alternative is to break this into more code
- * that does type sensitive comparison, as done in
- * Tcl_StringObjCmd.
+ * Do a unicode-specific comparison if one of the args
+ * only has the unicode rep.
*/
Tcl_UniChar *uni1, *uni2;
uni1 = Tcl_GetUnicodeFromObj(valuePtr, &s1len);
@@ -2748,21 +2755,12 @@ TclExecuteByteCode(interp, codePtr)
(unsigned) ((s1len < s2len) ? s1len : s2len));
} else {
/*
- * This solution is less mem intensive, but it is
- * computationally expensive as the string grows. The
- * reason that we can't use a memcmp is that UTF-8 strings
- * that contain a \u0000 can't be compared with memcmp. If
- * we knew that the string was ascii-7 or had no null byte,
- * we could just do memcmp and save all the hassle.
+ * We can't do a simple memcmp in order to handle the
+ * special Tcl \xC0\x80 null encoding for utf-8.
*/
s1 = Tcl_GetStringFromObj(valuePtr, &s1len);
s2 = Tcl_GetStringFromObj(value2Ptr, &s2len);
- /*
- * These have to be in true chars
- */
- s1len = Tcl_NumUtfChars(s1, s1len);
- s2len = Tcl_NumUtfChars(s2, s2len);
- iResult = Tcl_UtfNcmp(s1, s2,
+ iResult = TclpUtfNcmp2(s1, s2,
(size_t) ((s1len < s2len) ? s1len : s2len));
}
@@ -2931,6 +2929,26 @@ TclExecuteByteCode(interp, codePtr)
value2Ptr = POP_OBJECT();
valuePtr = POP_OBJECT();
+
+ if (valuePtr == value2Ptr) {
+ /*
+ * Optimize the equal object case.
+ */
+ switch (*pc) {
+ case INST_EQ:
+ case INST_LE:
+ case INST_GE:
+ iResult = 1;
+ break;
+ case INST_NEQ:
+ case INST_LT:
+ case INST_GT:
+ iResult = 0;
+ break;
+ }
+ goto foundResult;
+ }
+
t1Ptr = valuePtr->typePtr;
t2Ptr = value2Ptr->typePtr;
@@ -2967,30 +2985,39 @@ TclExecuteByteCode(interp, codePtr)
if (!IS_NUMERIC_TYPE(t1Ptr) || !IS_NUMERIC_TYPE(t2Ptr)) {
/*
* One operand is not numeric. Compare as strings.
- * NOTE: strcmp is not correct for \x00 < \x01.
+ * NOTE: strcmp is not correct for \x00 < \x01, but
+ * that is unlikely to occur here. We could use the
+ * TclUtfNCmp2 to handle this.
*/
- int cmpValue;
- s1 = TclGetString(valuePtr);
- s2 = TclGetString(value2Ptr);
- cmpValue = strcmp(s1, s2);
+ int s1len, s2len;
+ s1 = Tcl_GetStringFromObj(valuePtr, &s1len);
+ s2 = Tcl_GetStringFromObj(value2Ptr, &s2len);
switch (*pc) {
case INST_EQ:
- iResult = (cmpValue == 0);
+ if (s1len == s2len) {
+ iResult = (strcmp(s1, s2) == 0);
+ } else {
+ iResult = 0;
+ }
break;
case INST_NEQ:
- iResult = (cmpValue != 0);
+ if (s1len == s2len) {
+ iResult = (strcmp(s1, s2) != 0);
+ } else {
+ iResult = 1;
+ }
break;
case INST_LT:
- iResult = (cmpValue < 0);
+ iResult = (strcmp(s1, s2) < 0);
break;
case INST_GT:
- iResult = (cmpValue > 0);
+ iResult = (strcmp(s1, s2) > 0);
break;
case INST_LE:
- iResult = (cmpValue <= 0);
+ iResult = (strcmp(s1, s2) <= 0);
break;
case INST_GE:
- iResult = (cmpValue >= 0);
+ iResult = (strcmp(s1, s2) >= 0);
break;
}
} else if ((t1Ptr == &tclDoubleType)
@@ -3094,7 +3121,7 @@ TclExecuteByteCode(interp, codePtr)
/*
* Reuse the valuePtr object already on stack if possible.
*/
-
+ foundResult:
TRACE(("%.20s %.20s => %ld\n",
O2S(valuePtr), O2S(value2Ptr), iResult));
if (Tcl_IsShared(valuePtr)) {
diff --git a/generic/tclInt.decls b/generic/tclInt.decls
index b4d07fa..094b9df 100644
--- a/generic/tclInt.decls
+++ b/generic/tclInt.decls
@@ -12,7 +12,7 @@
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
-# RCS: @(#) $Id: tclInt.decls,v 1.49 2002/05/29 00:19:40 hobbs Exp $
+# RCS: @(#) $Id: tclInt.decls,v 1.50 2002/05/29 09:09:57 hobbs Exp $
library tcl
@@ -666,6 +666,11 @@ declare 168 generic {
Tcl_Obj *TclGetStartupScriptPath(void)
}
+# variant of Tcl_UtfNCmp that takes n as bytes, not chars
+declare 169 generic {
+ int TclpUtfNcmp2(CONST char *s1, CONST char *s2, unsigned long n)
+}
+
##############################################################################
diff --git a/generic/tclIntDecls.h b/generic/tclIntDecls.h
index a16cc73..db179b9 100644
--- a/generic/tclIntDecls.h
+++ b/generic/tclIntDecls.h
@@ -11,7 +11,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclIntDecls.h,v 1.40 2002/05/29 00:19:40 hobbs Exp $
+ * RCS: @(#) $Id: tclIntDecls.h,v 1.41 2002/05/29 09:09:57 hobbs Exp $
*/
#ifndef _TCLINTDECLS
@@ -503,6 +503,9 @@ EXTERN void TclSetStartupScriptPath _ANSI_ARGS_((
Tcl_Obj * pathPtr));
/* 168 */
EXTERN Tcl_Obj * TclGetStartupScriptPath _ANSI_ARGS_((void));
+/* 169 */
+EXTERN int TclpUtfNcmp2 _ANSI_ARGS_((CONST char * s1,
+ CONST char * s2, unsigned long n));
typedef struct TclIntStubs {
int magic;
@@ -709,6 +712,7 @@ typedef struct TclIntStubs {
int (*tclListObjSetElement) _ANSI_ARGS_((Tcl_Interp * interp, Tcl_Obj * listPtr, int index, Tcl_Obj * valuePtr)); /* 166 */
void (*tclSetStartupScriptPath) _ANSI_ARGS_((Tcl_Obj * pathPtr)); /* 167 */
Tcl_Obj * (*tclGetStartupScriptPath) _ANSI_ARGS_((void)); /* 168 */
+ int (*tclpUtfNcmp2) _ANSI_ARGS_((CONST char * s1, CONST char * s2, unsigned long n)); /* 169 */
} TclIntStubs;
#ifdef __cplusplus
@@ -1325,6 +1329,10 @@ extern TclIntStubs *tclIntStubsPtr;
#define TclGetStartupScriptPath \
(tclIntStubsPtr->tclGetStartupScriptPath) /* 168 */
#endif
+#ifndef TclpUtfNcmp2
+#define TclpUtfNcmp2 \
+ (tclIntStubsPtr->tclpUtfNcmp2) /* 169 */
+#endif
#endif /* defined(USE_TCL_STUBS) && !defined(USE_TCL_STUB_PROCS) */
diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c
index be2902e..90f51ba 100644
--- a/generic/tclStubInit.c
+++ b/generic/tclStubInit.c
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclStubInit.c,v 1.70 2002/05/24 21:19:06 dkf Exp $
+ * RCS: @(#) $Id: tclStubInit.c,v 1.71 2002/05/29 09:09:57 hobbs Exp $
*/
#include "tclInt.h"
@@ -248,6 +248,7 @@ TclIntStubs tclIntStubs = {
TclListObjSetElement, /* 166 */
TclSetStartupScriptPath, /* 167 */
TclGetStartupScriptPath, /* 168 */
+ TclpUtfNcmp2, /* 169 */
};
TclIntPlatStubs tclIntPlatStubs = {
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 0148ca6..56dcaca 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclUtf.c,v 1.23 2002/02/08 02:52:54 dgp Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.24 2002/05/29 09:09:57 hobbs Exp $
*/
#include "tclInt.h"
@@ -309,7 +309,7 @@ Tcl_UtfToUniChar(str, chPtr)
* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
* characters representing themselves.
*/
-
+
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xE0) {
@@ -317,7 +317,7 @@ Tcl_UtfToUniChar(str, chPtr)
/*
* Two-byte-character lead-byte followed by a trail-byte.
*/
-
+
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
return 2;
}
@@ -325,7 +325,7 @@ Tcl_UtfToUniChar(str, chPtr)
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
-
+
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xF0) {
@@ -671,7 +671,7 @@ Tcl_UtfPrev(str, start)
byte = *((unsigned char *) look);
if (byte < 0x80) {
break;
- }
+ }
if (byte >= 0xC0) {
return look;
}
@@ -1074,6 +1074,51 @@ Tcl_UtfToTitle(str)
/*
*----------------------------------------------------------------------
*
+ * TclpUtfNcmp2 --
+ *
+ * Compare at most n bytes of utf-8 strings cs and ct. Both cs
+ * and ct are assumed to be at least n bytes long.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclpUtfNcmp2(cs, ct, n)
+ CONST char *cs; /* UTF string to compare to ct. */
+ CONST char *ct; /* UTF string cs is compared to. */
+ unsigned long n; /* Number of *bytes* to compare. */
+{
+ /*
+ * We can't simply call 'memcmp(cs, ct, n);' because we need to check
+ * for Tcl's \xC0\x80 non-utf-8 null encoding.
+ * Otherwise utf-8 lexes fine in the strcmp manner.
+ */
+ register int result = 0;
+
+ for ( ; n != 0; n--, cs++, ct++) {
+ if (*cs != *ct) {
+ result = UCHAR(*cs) - UCHAR(*ct);
+ break;
+ }
+ }
+ if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
+ unsigned char c1, c2;
+ c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
+ c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
+ result = (c1 - c2);
+ }
+ return result;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UtfNcmp --
*
* Compare at most n UTF chars of string cs to string ct. Both cs
@@ -1096,7 +1141,7 @@ Tcl_UtfNcmp(cs, ct, n)
{
Tcl_UniChar ch1, ch2;
/*
- * Cannot use memcmp()-based approach as byte representation of
+ * Cannot use 'memcmp(cs, ct, n);' as byte representation of
* \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
* representation of \u0001 (the byte 0x01.)
*/
@@ -1306,13 +1351,14 @@ Tcl_UniCharNcmp(cs, ct, n)
CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
unsigned long n; /* Number of unichars to compare. */
{
+ /*
+ * We can't simply call 'memcmp(cs, ct, n*sizeof(Tcl_UniChar));'
+ * because that may not be lexically correct.
+ */
for ( ; n != 0; n--, cs++, ct++) {
if (*cs != *ct) {
return (*cs - *ct);
}
- if (*cs == '\0') {
- break;
- }
}
return 0;
}
@@ -1346,9 +1392,6 @@ Tcl_UniCharNcasecmp(cs, ct, n)
(Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
return (*cs - *ct);
}
- if (*cs == '\0') {
- break;
- }
}
return 0;
}