diff options
-rw-r--r-- | ChangeLog | 32 | ||||
-rw-r--r-- | generic/tclCmdMZ.c | 168 | ||||
-rw-r--r-- | generic/tclExecute.c | 91 | ||||
-rw-r--r-- | generic/tclInt.decls | 7 | ||||
-rw-r--r-- | generic/tclIntDecls.h | 10 | ||||
-rw-r--r-- | generic/tclStubInit.c | 3 | ||||
-rw-r--r-- | generic/tclUtf.c | 67 |
7 files changed, 225 insertions, 153 deletions
@@ -1,3 +1,35 @@ +2002-05-29 Jeff Hobbs <jeffh@ActiveState.com> + + * generic/tclInt.decls: + * generic/tclIntDecls.h: + * generic/tclStubInit.c: + * generic/tclUtf.c: added TclpUtfNcmp2 private command that + mirrors Tcl_UtfNcmp, but takes n in bytes, not utf-8 chars. This + provides a faster alternative for comparing utf strings internally. + (Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end + of string check as it wasn't correct for the function (by doc and + logic). + + * generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal + comparison code to use TclpUtfNcmp2 as well as short-circuit for + equal objects or unequal length strings in the equal case. + Removed the use of goto and streamlined the other parts. + + * generic/tclExecute.c (TclExecuteByteCode): added check for + object equality in the comparison instructions. Added + short-circuit for != length strings in INST_EQ, INST_NEQ and + INST_STR_CMP. Reworked INST_STR_CMP to use TclpUtfNcmp2 where + appropriate, and only use Tcl_UniCharNcmp when at least one of the + objects is a Unicode obj with no utf bytes. + + * generic/tclCompCmds.c (TclCompileStringCmd): removed error + creation in code that no longer throws an error. + + * tests/string.test: + * tests/stringComp.test: added more string comparison checks. + + * tests/clock.test: better qualified 9.1 constraint check for %s. + 2002-05-28 Jeff Hobbs <jeffh@ActiveState.com> * generic/tclThreadAlloc.c (TclpRealloc, TclpFree): protect diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 0f39ecb..a3b03df 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -14,7 +14,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.67 2002/04/18 13:49:30 dkf Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.68 2002/05/29 09:09:57 hobbs Exp $ */ #include "tclInt.h" @@ -1094,6 +1094,11 @@ Tcl_SplitObjCmd(dummy, interp, objc, objv) * that this command only functions correctly on properly formed * Tcl UTF strings. * + * Note that the primary methods here (equal, compare, match, ...) + * have bytecode equivalents. You will find the code for those in + * tclExecute.c. The code here will only be used in the non-bc + * case (like in an 'eval'). + * * Results: * A standard Tcl result. * @@ -1153,6 +1158,7 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) * comparison in INST_EQ/INST_NEQ/INST_LT/...). */ int i, match, length, nocase = 0, reqlength = -1; + int (*strCmpFn)(); if (objc < 4 || objc > 7) { str_cmp_args: @@ -1183,134 +1189,84 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) } } - if (reqlength == 0) { - /* - * Anything matches at 0 chars, right? - */ - - match = 0; - goto stringComparisonDone; - } - /* * From now on, we only access the two objects at the end * of the argument array. */ objv += objc-2; - /* - * Use binary versions of comparisons since that won't - * cause undue type conversions and it is much faster. - * Only do this if we're case-sensitive (which is all - * that really makes sense with byte arrays anyway, and - * we have no memcasecmp() for some reason... :^) - */ - if (!nocase && objv[0]->typePtr == &tclByteArrayType && - objv[1]->typePtr == &tclByteArrayType) { - unsigned char *bytes1, *bytes2; - - bytes1 = Tcl_GetByteArrayFromObj(objv[0], &length1); - bytes2 = Tcl_GetByteArrayFromObj(objv[1], &length2); - length = (length1 < length2) ? length1 : length2; - - if ((reqlength > 0) && (reqlength < length)) { - length = reqlength; - } else if (reqlength < 0) { - /* - * The requested length is negative, so we ignore it by - * setting it to the longer of the two lengths. - */ - - reqlength = (length1 > length2) ? length1 : length2; - } + if ((reqlength == 0) || (objv[0] == objv[1])) { + /* + * Alway match at 0 chars of if it is the same obj. + */ - match = memcmp(bytes1, bytes2, (unsigned)length); - if ((match == 0) && (reqlength > length)) { - match = length1 - length2; + Tcl_SetBooleanObj(resultPtr, + ((enum options) index == STR_EQUAL)); + break; + } else if (!nocase && objv[0]->typePtr == &tclByteArrayType && + objv[1]->typePtr == &tclByteArrayType) { + /* + * Use binary versions of comparisons since that won't + * cause undue type conversions and it is much faster. + * Only do this if we're case-sensitive (which is all + * that really makes sense with byte arrays anyway, and + * we have no memcasecmp() for some reason... :^) + */ + string1 = (char*) Tcl_GetByteArrayFromObj(objv[0], &length1); + string2 = (char*) Tcl_GetByteArrayFromObj(objv[1], &length2); + strCmpFn = memcmp; + } else if (((objv[0]->typePtr == &tclStringType) + && (objv[0]->bytes == NULL)) + || ((objv[1]->typePtr == &tclStringType) + && (objv[1]->bytes == NULL))) { + /* + * Use UNICODE versions of string comparisons since that + * won't cause undue type conversions and we can work with + * characters all of a fixed size (much faster.), but only + * when one of the objects is a pure UNICODE object. + */ + string1 = (char*) Tcl_GetUnicodeFromObj(objv[0], &length1); + string2 = (char*) Tcl_GetUnicodeFromObj(objv[1], &length2); + strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp; + } else { + /* + * As a catch-all we will work with UTF-8. We cannot use + * memcmp() as that is unsafe with any string containing + * NULL (\xC0\x80 in Tcl's utf rep). We can use the more + * efficient TclpUtfNcmp2 if we are case-sensitive and no + * specific length was requested. + */ + string1 = (char*) Tcl_GetStringFromObj(objv[0], &length1); + string2 = (char*) Tcl_GetStringFromObj(objv[1], &length2); + if ((reqlength < 0) && !nocase) { + strCmpFn = TclpUtfNcmp2; + } else { + length1 = Tcl_NumUtfChars(string1, length1); + length2 = Tcl_NumUtfChars(string2, length2); + strCmpFn = nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp; } - goto stringComparisonDone; } - /* - * Use UNICODE versions of string comparisons since that - * won't cause undue type conversions and we can work with - * characters all of a fixed size (much faster.) Also use - * this code for untyped objects, since like that we'll - * pick up many things that are used for comparison in - * scripts and convert them (efficiently) to UNICODE - * strings for comparison, but exclude case where both are - * untyped as that is a little bit aggressive. - */ - if ((objv[0]->typePtr == &tclStringType || - objv[0]->typePtr == NULL) && - (objv[1]->typePtr == &tclStringType || - objv[1]->typePtr == NULL) && - !(objv[0]->typePtr == NULL && objv[1]->typePtr == NULL)) { - Tcl_UniChar *uni1, *uni2; - - uni1 = Tcl_GetUnicodeFromObj(objv[0], &length1); - uni2 = Tcl_GetUnicodeFromObj(objv[1], &length2); + if (((enum options) index == STR_EQUAL) + && (reqlength < 0) && (length1 != length2)) { + match = 1; /* this will be reversed below */ + } else { length = (length1 < length2) ? length1 : length2; - if (reqlength > 0 && reqlength < length) { length = reqlength; } else if (reqlength < 0) { /* * The requested length is negative, so we ignore it by - * setting it to the longer of the two lengths. + * setting it to length + 1 so we correct the match var. */ - - reqlength = (length1 < length2) ? length2 : length1; - } - - if (nocase) { - match = Tcl_UniCharNcasecmp(uni1, uni2, (unsigned)length); - } else { - match = Tcl_UniCharNcmp(uni1, uni2, (unsigned)length); + reqlength = length + 1; } - + match = strCmpFn(string1, string2, (unsigned) length); if ((match == 0) && (reqlength > length)) { match = length1 - length2; } - goto stringComparisonDone; - } - - /* - * Strings to be compared are not both UNICODE or byte - * arrays, so we will need to convert to UTF-8 and work - * there (cannot use memcmp() as that is an unsafe - * operation with any string containing \u0000 and the - * safety test is equivalent in cost to the comparison - * itself!) - */ - string1 = Tcl_GetStringFromObj(objv[0], &length1); - string2 = Tcl_GetStringFromObj(objv[1], &length2); - length1 = Tcl_NumUtfChars(string1, length1); - length2 = Tcl_NumUtfChars(string2, length2); - length = (length1 < length2) ? length1 : length2; - - if ((reqlength > 0) && (reqlength < length)) { - length = reqlength; - } else if (reqlength < 0) { - /* - * The requested length is negative, so we ignore it by - * setting it to the longer of the two lengths. - */ - - reqlength = (length1 > length2) ? length1 : length2; - } - - if (nocase) { - match = Tcl_UtfNcasecmp(string1, string2, (unsigned) length); - } else { - match = Tcl_UtfNcmp(string1, string2, (unsigned) length); - } - - if ((match == 0) && (reqlength > length)) { - match = length1 - length2; } - stringComparisonDone: if ((enum options) index == STR_EQUAL) { Tcl_SetBooleanObj(resultPtr, (match) ? 0 : 1); } else { diff --git a/generic/tclExecute.c b/generic/tclExecute.c index f6d2ea2..bbe2728 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -11,7 +11,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclExecute.c,v 1.53 2002/04/18 13:04:20 msofer Exp $ + * RCS: @(#) $Id: tclExecute.c,v 1.54 2002/05/29 09:09:57 hobbs Exp $ */ #include "tclInt.h" @@ -2718,7 +2718,7 @@ TclExecuteByteCode(interp, codePtr) /* * String compare */ - char *s1, *s2; + CONST char *s1, *s2; int s1len, s2len, iResult; value2Ptr = POP_OBJECT(); @@ -2728,18 +2728,25 @@ TclExecuteByteCode(interp, codePtr) * The comparison function should compare up to the * minimum byte length only. */ - if ((valuePtr->typePtr == &tclByteArrayType) && + if (valuePtr == value2Ptr) { + /* + * In the pure equality case, set lengths too for + * the checks below (or we could goto beyond it). + */ + iResult = s1len = s2len = 0; + } else if ((valuePtr->typePtr == &tclByteArrayType) && (value2Ptr->typePtr == &tclByteArrayType)) { s1 = (char *) Tcl_GetByteArrayFromObj(valuePtr, &s1len); s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); iResult = memcmp(s1, s2, (size_t) ((s1len < s2len) ? s1len : s2len)); - } else if ((valuePtr->typePtr == &tclStringType) || - (value2Ptr->typePtr == &tclStringType)) { + } else if (((valuePtr->typePtr == &tclStringType) + && (valuePtr->bytes == NULL)) + || ((value2Ptr->typePtr == &tclStringType) + && (value2Ptr->bytes == NULL))) { /* - * The alternative is to break this into more code - * that does type sensitive comparison, as done in - * Tcl_StringObjCmd. + * Do a unicode-specific comparison if one of the args + * only has the unicode rep. */ Tcl_UniChar *uni1, *uni2; uni1 = Tcl_GetUnicodeFromObj(valuePtr, &s1len); @@ -2748,21 +2755,12 @@ TclExecuteByteCode(interp, codePtr) (unsigned) ((s1len < s2len) ? s1len : s2len)); } else { /* - * This solution is less mem intensive, but it is - * computationally expensive as the string grows. The - * reason that we can't use a memcmp is that UTF-8 strings - * that contain a \u0000 can't be compared with memcmp. If - * we knew that the string was ascii-7 or had no null byte, - * we could just do memcmp and save all the hassle. + * We can't do a simple memcmp in order to handle the + * special Tcl \xC0\x80 null encoding for utf-8. */ s1 = Tcl_GetStringFromObj(valuePtr, &s1len); s2 = Tcl_GetStringFromObj(value2Ptr, &s2len); - /* - * These have to be in true chars - */ - s1len = Tcl_NumUtfChars(s1, s1len); - s2len = Tcl_NumUtfChars(s2, s2len); - iResult = Tcl_UtfNcmp(s1, s2, + iResult = TclpUtfNcmp2(s1, s2, (size_t) ((s1len < s2len) ? s1len : s2len)); } @@ -2931,6 +2929,26 @@ TclExecuteByteCode(interp, codePtr) value2Ptr = POP_OBJECT(); valuePtr = POP_OBJECT(); + + if (valuePtr == value2Ptr) { + /* + * Optimize the equal object case. + */ + switch (*pc) { + case INST_EQ: + case INST_LE: + case INST_GE: + iResult = 1; + break; + case INST_NEQ: + case INST_LT: + case INST_GT: + iResult = 0; + break; + } + goto foundResult; + } + t1Ptr = valuePtr->typePtr; t2Ptr = value2Ptr->typePtr; @@ -2967,30 +2985,39 @@ TclExecuteByteCode(interp, codePtr) if (!IS_NUMERIC_TYPE(t1Ptr) || !IS_NUMERIC_TYPE(t2Ptr)) { /* * One operand is not numeric. Compare as strings. - * NOTE: strcmp is not correct for \x00 < \x01. + * NOTE: strcmp is not correct for \x00 < \x01, but + * that is unlikely to occur here. We could use the + * TclUtfNCmp2 to handle this. */ - int cmpValue; - s1 = TclGetString(valuePtr); - s2 = TclGetString(value2Ptr); - cmpValue = strcmp(s1, s2); + int s1len, s2len; + s1 = Tcl_GetStringFromObj(valuePtr, &s1len); + s2 = Tcl_GetStringFromObj(value2Ptr, &s2len); switch (*pc) { case INST_EQ: - iResult = (cmpValue == 0); + if (s1len == s2len) { + iResult = (strcmp(s1, s2) == 0); + } else { + iResult = 0; + } break; case INST_NEQ: - iResult = (cmpValue != 0); + if (s1len == s2len) { + iResult = (strcmp(s1, s2) != 0); + } else { + iResult = 1; + } break; case INST_LT: - iResult = (cmpValue < 0); + iResult = (strcmp(s1, s2) < 0); break; case INST_GT: - iResult = (cmpValue > 0); + iResult = (strcmp(s1, s2) > 0); break; case INST_LE: - iResult = (cmpValue <= 0); + iResult = (strcmp(s1, s2) <= 0); break; case INST_GE: - iResult = (cmpValue >= 0); + iResult = (strcmp(s1, s2) >= 0); break; } } else if ((t1Ptr == &tclDoubleType) @@ -3094,7 +3121,7 @@ TclExecuteByteCode(interp, codePtr) /* * Reuse the valuePtr object already on stack if possible. */ - + foundResult: TRACE(("%.20s %.20s => %ld\n", O2S(valuePtr), O2S(value2Ptr), iResult)); if (Tcl_IsShared(valuePtr)) { diff --git a/generic/tclInt.decls b/generic/tclInt.decls index b4d07fa..094b9df 100644 --- a/generic/tclInt.decls +++ b/generic/tclInt.decls @@ -12,7 +12,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: tclInt.decls,v 1.49 2002/05/29 00:19:40 hobbs Exp $ +# RCS: @(#) $Id: tclInt.decls,v 1.50 2002/05/29 09:09:57 hobbs Exp $ library tcl @@ -666,6 +666,11 @@ declare 168 generic { Tcl_Obj *TclGetStartupScriptPath(void) } +# variant of Tcl_UtfNCmp that takes n as bytes, not chars +declare 169 generic { + int TclpUtfNcmp2(CONST char *s1, CONST char *s2, unsigned long n) +} + ############################################################################## diff --git a/generic/tclIntDecls.h b/generic/tclIntDecls.h index a16cc73..db179b9 100644 --- a/generic/tclIntDecls.h +++ b/generic/tclIntDecls.h @@ -11,7 +11,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclIntDecls.h,v 1.40 2002/05/29 00:19:40 hobbs Exp $ + * RCS: @(#) $Id: tclIntDecls.h,v 1.41 2002/05/29 09:09:57 hobbs Exp $ */ #ifndef _TCLINTDECLS @@ -503,6 +503,9 @@ EXTERN void TclSetStartupScriptPath _ANSI_ARGS_(( Tcl_Obj * pathPtr)); /* 168 */ EXTERN Tcl_Obj * TclGetStartupScriptPath _ANSI_ARGS_((void)); +/* 169 */ +EXTERN int TclpUtfNcmp2 _ANSI_ARGS_((CONST char * s1, + CONST char * s2, unsigned long n)); typedef struct TclIntStubs { int magic; @@ -709,6 +712,7 @@ typedef struct TclIntStubs { int (*tclListObjSetElement) _ANSI_ARGS_((Tcl_Interp * interp, Tcl_Obj * listPtr, int index, Tcl_Obj * valuePtr)); /* 166 */ void (*tclSetStartupScriptPath) _ANSI_ARGS_((Tcl_Obj * pathPtr)); /* 167 */ Tcl_Obj * (*tclGetStartupScriptPath) _ANSI_ARGS_((void)); /* 168 */ + int (*tclpUtfNcmp2) _ANSI_ARGS_((CONST char * s1, CONST char * s2, unsigned long n)); /* 169 */ } TclIntStubs; #ifdef __cplusplus @@ -1325,6 +1329,10 @@ extern TclIntStubs *tclIntStubsPtr; #define TclGetStartupScriptPath \ (tclIntStubsPtr->tclGetStartupScriptPath) /* 168 */ #endif +#ifndef TclpUtfNcmp2 +#define TclpUtfNcmp2 \ + (tclIntStubsPtr->tclpUtfNcmp2) /* 169 */ +#endif #endif /* defined(USE_TCL_STUBS) && !defined(USE_TCL_STUB_PROCS) */ diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index be2902e..90f51ba 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclStubInit.c,v 1.70 2002/05/24 21:19:06 dkf Exp $ + * RCS: @(#) $Id: tclStubInit.c,v 1.71 2002/05/29 09:09:57 hobbs Exp $ */ #include "tclInt.h" @@ -248,6 +248,7 @@ TclIntStubs tclIntStubs = { TclListObjSetElement, /* 166 */ TclSetStartupScriptPath, /* 167 */ TclGetStartupScriptPath, /* 168 */ + TclpUtfNcmp2, /* 169 */ }; TclIntPlatStubs tclIntPlatStubs = { diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 0148ca6..56dcaca 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtf.c,v 1.23 2002/02/08 02:52:54 dgp Exp $ + * RCS: @(#) $Id: tclUtf.c,v 1.24 2002/05/29 09:09:57 hobbs Exp $ */ #include "tclInt.h" @@ -309,7 +309,7 @@ Tcl_UtfToUniChar(str, chPtr) * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid * characters representing themselves. */ - + *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xE0) { @@ -317,7 +317,7 @@ Tcl_UtfToUniChar(str, chPtr) /* * Two-byte-character lead-byte followed by a trail-byte. */ - + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); return 2; } @@ -325,7 +325,7 @@ Tcl_UtfToUniChar(str, chPtr) * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ - + *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xF0) { @@ -671,7 +671,7 @@ Tcl_UtfPrev(str, start) byte = *((unsigned char *) look); if (byte < 0x80) { break; - } + } if (byte >= 0xC0) { return look; } @@ -1074,6 +1074,51 @@ Tcl_UtfToTitle(str) /* *---------------------------------------------------------------------- * + * TclpUtfNcmp2 -- + * + * Compare at most n bytes of utf-8 strings cs and ct. Both cs + * and ct are assumed to be at least n bytes long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclpUtfNcmp2(cs, ct, n) + CONST char *cs; /* UTF string to compare to ct. */ + CONST char *ct; /* UTF string cs is compared to. */ + unsigned long n; /* Number of *bytes* to compare. */ +{ + /* + * We can't simply call 'memcmp(cs, ct, n);' because we need to check + * for Tcl's \xC0\x80 non-utf-8 null encoding. + * Otherwise utf-8 lexes fine in the strcmp manner. + */ + register int result = 0; + + for ( ; n != 0; n--, cs++, ct++) { + if (*cs != *ct) { + result = UCHAR(*cs) - UCHAR(*ct); + break; + } + } + if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { + unsigned char c1, c2; + c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); + c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); + result = (c1 - c2); + } + return result; +} + +/* + *---------------------------------------------------------------------- + * * Tcl_UtfNcmp -- * * Compare at most n UTF chars of string cs to string ct. Both cs @@ -1096,7 +1141,7 @@ Tcl_UtfNcmp(cs, ct, n) { Tcl_UniChar ch1, ch2; /* - * Cannot use memcmp()-based approach as byte representation of + * Cannot use 'memcmp(cs, ct, n);' as byte representation of * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte * representation of \u0001 (the byte 0x01.) */ @@ -1306,13 +1351,14 @@ Tcl_UniCharNcmp(cs, ct, n) CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ unsigned long n; /* Number of unichars to compare. */ { + /* + * We can't simply call 'memcmp(cs, ct, n*sizeof(Tcl_UniChar));' + * because that may not be lexically correct. + */ for ( ; n != 0; n--, cs++, ct++) { if (*cs != *ct) { return (*cs - *ct); } - if (*cs == '\0') { - break; - } } return 0; } @@ -1346,9 +1392,6 @@ Tcl_UniCharNcasecmp(cs, ct, n) (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) { return (*cs - *ct); } - if (*cs == '\0') { - break; - } } return 0; } |