From 8bccfa351c587aaf7e5d6aa27b0ef0141806c1cf Mon Sep 17 00:00:00 2001 From: dkf Date: Fri, 6 Apr 2001 10:50:00 +0000 Subject: Fixed problem with [string compare \x00 \x01] and hopefully sped the command up in a few cases too (notably byte arrays and UNICODE objects.) [Bug #219201] --- ChangeLog | 12 +++++ generic/tclCmdMZ.c | 132 +++++++++++++++++++++++++++++++++++++++++------------ generic/tclUtf.c | 10 ++-- tests/string.test | 11 +++-- 4 files changed, 128 insertions(+), 37 deletions(-) diff --git a/ChangeLog b/ChangeLog index 01a92fa..8e9ac3a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2001-04-06 Donal K. Fellows + + * tests/string.test (string-2.30): Test for this case + * generic/tclCmdMZ.c (Tcl_StringObjCmd, STR_COMPARE branch): Fixed + problem caused by Utf-rep of \x00 being more than Utf-rep of \x01 + fooling memcmp by forcing everything through Utf-based + comparisons. Added optimizations for case where objects have a + string/unicode-rep or a bytearray-rep (i.e. where we can perform + comparisons on fixed-size units.) [Bug #219201] + * generic/tclUtf.c (Tcl_UtfNcmp): Corrected seriously erroneous + comment. + 2001-04-05 Andreas Kupries * doc/Macintosh.3: Removed duplicates from .SH line diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 6cf9852..ff5362a 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.34 2001/04/05 10:20:18 dkf Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.35 2001/04/06 10:50:00 dkf Exp $ */ #include "tclInt.h" @@ -1091,35 +1091,45 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) } } - string1 = Tcl_GetStringFromObj(objv[objc-2], &length1); - string2 = Tcl_GetStringFromObj(objv[objc-1], &length2); - /* - * This is the min length IN BYTES of the two strings - */ - length = (length1 < length2) ? length1 : length2; - if (reqlength == 0) { /* * Anything matches at 0 chars, right? */ match = 0; - } else if (nocase || ((reqlength > 0) && (reqlength <= length))) { - /* - * with -nocase or -length we have to check true char length - * as it could be smaller than expected - */ + goto stringComparisonDone; + } - length1 = Tcl_NumUtfChars(string1, length1); - length2 = Tcl_NumUtfChars(string2, length2); - length = (length1 < length2) ? length1 : length2; + /* + * From now on, we only access the two objects at the end + * of the argument array. + */ + objv += objc-2; - /* - * Do the reqlength check again, against 0 as well for - * the benfit of nocase - */ + /* + * Use UNICODE versions of string comparisons since that + * won't cause undue type conversions and we can work with + * characters all of a fixed size (much faster.) Also use + * this code for untyped objects, since like that we'll + * pick up many things that are used for comparison in + * scripts and convert them (efficiently) to UNICODE + * strings for comparison, but exclude case where both are + * untyped as that is a little bit aggressive. + */ + if ((objv[0]->typePtr == &tclStringType || + objv[0]->typePtr == NULL) && + (objv[1]->typePtr == &tclStringType || + objv[1]->typePtr == NULL) && + !(objv[0]->typePtr == NULL && objv[1]->typePtr == NULL)) { + Tcl_UniChar *uni1, *uni2; + + length1 = Tcl_GetCharLength(objv[0]); + length2 = Tcl_GetCharLength(objv[1]); + length = (length1 < length2) ? length1 : length2; + uni1 = Tcl_GetUnicode(objv[0]); + uni2 = Tcl_GetUnicode(objv[1]); - if ((reqlength > 0) && (reqlength < length)) { + if (reqlength > 0 && reqlength < length) { length = reqlength; } else if (reqlength < 0) { /* @@ -1127,24 +1137,90 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) * setting it to the longer of the two lengths. */ - reqlength = (length1 > length2) ? length1 : length2; + reqlength = (length1 < length2) ? length2 : length1; } + if (nocase) { - match = Tcl_UtfNcasecmp(string1, string2, - (unsigned) length); + match = Tcl_UniCharNcasecmp(uni1, uni2, (unsigned)length); } else { - match = Tcl_UtfNcmp(string1, string2, (unsigned) length); + match = Tcl_UniCharNcmp(uni1, uni2, (unsigned)length); } + if ((match == 0) && (reqlength > length)) { match = length1 - length2; } - } else { - match = memcmp(string1, string2, (unsigned) length); - if (match == 0) { + goto stringComparisonDone; + } + + /* + * Use binary versions of comparisons since that won't + * cause undue type conversions and it is much faster. + * Only do this if we're case-sensitive (which is all + * that really makes sense with byte arrays anyway, and + * we have no memcasecmp() for some reason... :^) + */ + if (objv[0]->typePtr == &tclByteArrayType && + objv[1]->typePtr == &tclByteArrayType && !nocase) { + unsigned char *bytes1, *bytes2; + + bytes1 = Tcl_GetByteArrayFromObj(objv[0], &length1); + bytes2 = Tcl_GetByteArrayFromObj(objv[1], &length2); + length = (length1 < length2) ? length1 : length2; + + if ((reqlength > 0) && (reqlength < length)) { + length = reqlength; + } else if (reqlength < 0) { + /* + * The requested length is negative, so we ignore it by + * setting it to the longer of the two lengths. + */ + + reqlength = (length1 > length2) ? length1 : length2; + } + + match = memcmp(bytes1, bytes2, (unsigned)length); + if ((match == 0) && (reqlength > length)) { match = length1 - length2; } + goto stringComparisonDone; + } + + /* + * Strings to be compared are not both UNICODE or byte + * arrays, so we will need to convert to UTF-8 and work + * there (cannot use memcmp() as that is an unsafe + * operation with any string containing \u0000 and the + * safety test is equivalent in cost to the comparison + * itself!) + */ + string1 = Tcl_GetStringFromObj(objv[0], &length1); + string2 = Tcl_GetStringFromObj(objv[1], &length2); + length1 = Tcl_NumUtfChars(string1, length1); + length2 = Tcl_NumUtfChars(string2, length2); + length = (length1 < length2) ? length1 : length2; + + if ((reqlength > 0) && (reqlength < length)) { + length = reqlength; + } else if (reqlength < 0) { + /* + * The requested length is negative, so we ignore it by + * setting it to the longer of the two lengths. + */ + + reqlength = (length1 > length2) ? length1 : length2; + } + + if (nocase) { + match = Tcl_UtfNcasecmp(string1, string2, (unsigned) length); + } else { + match = Tcl_UtfNcmp(string1, string2, (unsigned) length); + } + + if ((match == 0) && (reqlength > length)) { + match = length1 - length2; } + stringComparisonDone: if ((enum options) index == STR_EQUAL) { Tcl_SetBooleanObj(resultPtr, (match) ? 0 : 1); } else { diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b11fd85..2a6c217 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtf.c,v 1.14 2000/06/05 23:36:21 ericm Exp $ + * RCS: @(#) $Id: tclUtf.c,v 1.15 2001/04/06 10:50:00 dkf Exp $ */ #include "tclInt.h" @@ -1087,11 +1087,9 @@ Tcl_UtfNcmp(cs, ct, n) { Tcl_UniChar ch1, ch2; /* - * Another approach that should work is: - * return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs)); - * That assumes that ct is a properly formed UTF, so we will just - * be comparing the bytes that compromise those strings to the - * char length n. + * Cannot use memcmp()-based approach as byte representation of + * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte + * representation of \u0001 (the byte 0x01.) */ while (n-- > 0) { /* diff --git a/tests/string.test b/tests/string.test index d725cc4..aad463e 100644 --- a/tests/string.test +++ b/tests/string.test @@ -11,7 +11,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: string.test,v 1.27 2001/03/12 15:58:01 dkf Exp $ +# RCS: @(#) $Id: string.test,v 1.28 2001/04/06 10:50:00 dkf Exp $ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest @@ -119,12 +119,17 @@ test string-2.26 {string compare -nocase, null strings} { test string-2.27 {string compare -nocase, null strings} { string compare -nocase foo "" } 1 -test string-2.28 {string equal with length, unequal strings} { +test string-2.28 {string compare with length, unequal strings} { string compare -length 2 abc abde } 0 -test string-2.29 {string equal with length, unequal strings} { +test string-2.29 {string compare with length, unequal strings} { string compare -length 2 ab abde } 0 +test string-2.30 {string compare with NUL character vs. other ASCII} { + # Be careful here, since UTF-8 rep comparison with memcmp() of + # these puts chars in the wrong order + string compare \x00 \x01 +} -1 # only need a few tests on equal, since it uses the same code as # string compare, but just modifies the return output -- cgit v0.12