summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog12
-rw-r--r--generic/tclCmdMZ.c132
-rw-r--r--generic/tclUtf.c10
-rw-r--r--tests/string.test11
4 files changed, 128 insertions, 37 deletions
diff --git a/ChangeLog b/ChangeLog
index 01a92fa..8e9ac3a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2001-04-06 Donal K. Fellows <fellowsd@cs.man.ac.uk>
+
+ * tests/string.test (string-2.30): Test for this case
+ * generic/tclCmdMZ.c (Tcl_StringObjCmd, STR_COMPARE branch): Fixed
+ problem caused by Utf-rep of \x00 being more than Utf-rep of \x01
+ fooling memcmp by forcing everything through Utf-based
+ comparisons. Added optimizations for case where objects have a
+ string/unicode-rep or a bytearray-rep (i.e. where we can perform
+ comparisons on fixed-size units.) [Bug #219201]
+ * generic/tclUtf.c (Tcl_UtfNcmp): Corrected seriously erroneous
+ comment.
+
2001-04-05 Andreas Kupries <andreas_kupries@users.sourceforge.net>
* doc/Macintosh.3: Removed duplicates from .SH line
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 6cf9852..ff5362a 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -13,7 +13,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclCmdMZ.c,v 1.34 2001/04/05 10:20:18 dkf Exp $
+ * RCS: @(#) $Id: tclCmdMZ.c,v 1.35 2001/04/06 10:50:00 dkf Exp $
*/
#include "tclInt.h"
@@ -1091,35 +1091,45 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
}
}
- string1 = Tcl_GetStringFromObj(objv[objc-2], &length1);
- string2 = Tcl_GetStringFromObj(objv[objc-1], &length2);
- /*
- * This is the min length IN BYTES of the two strings
- */
- length = (length1 < length2) ? length1 : length2;
-
if (reqlength == 0) {
/*
* Anything matches at 0 chars, right?
*/
match = 0;
- } else if (nocase || ((reqlength > 0) && (reqlength <= length))) {
- /*
- * with -nocase or -length we have to check true char length
- * as it could be smaller than expected
- */
+ goto stringComparisonDone;
+ }
- length1 = Tcl_NumUtfChars(string1, length1);
- length2 = Tcl_NumUtfChars(string2, length2);
- length = (length1 < length2) ? length1 : length2;
+ /*
+ * From now on, we only access the two objects at the end
+ * of the argument array.
+ */
+ objv += objc-2;
- /*
- * Do the reqlength check again, against 0 as well for
- * the benfit of nocase
- */
+ /*
+ * Use UNICODE versions of string comparisons since that
+ * won't cause undue type conversions and we can work with
+ * characters all of a fixed size (much faster.) Also use
+ * this code for untyped objects, since like that we'll
+ * pick up many things that are used for comparison in
+ * scripts and convert them (efficiently) to UNICODE
+ * strings for comparison, but exclude case where both are
+ * untyped as that is a little bit aggressive.
+ */
+ if ((objv[0]->typePtr == &tclStringType ||
+ objv[0]->typePtr == NULL) &&
+ (objv[1]->typePtr == &tclStringType ||
+ objv[1]->typePtr == NULL) &&
+ !(objv[0]->typePtr == NULL && objv[1]->typePtr == NULL)) {
+ Tcl_UniChar *uni1, *uni2;
+
+ length1 = Tcl_GetCharLength(objv[0]);
+ length2 = Tcl_GetCharLength(objv[1]);
+ length = (length1 < length2) ? length1 : length2;
+ uni1 = Tcl_GetUnicode(objv[0]);
+ uni2 = Tcl_GetUnicode(objv[1]);
- if ((reqlength > 0) && (reqlength < length)) {
+ if (reqlength > 0 && reqlength < length) {
length = reqlength;
} else if (reqlength < 0) {
/*
@@ -1127,24 +1137,90 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
* setting it to the longer of the two lengths.
*/
- reqlength = (length1 > length2) ? length1 : length2;
+ reqlength = (length1 < length2) ? length2 : length1;
}
+
if (nocase) {
- match = Tcl_UtfNcasecmp(string1, string2,
- (unsigned) length);
+ match = Tcl_UniCharNcasecmp(uni1, uni2, (unsigned)length);
} else {
- match = Tcl_UtfNcmp(string1, string2, (unsigned) length);
+ match = Tcl_UniCharNcmp(uni1, uni2, (unsigned)length);
}
+
if ((match == 0) && (reqlength > length)) {
match = length1 - length2;
}
- } else {
- match = memcmp(string1, string2, (unsigned) length);
- if (match == 0) {
+ goto stringComparisonDone;
+ }
+
+ /*
+ * Use binary versions of comparisons since that won't
+ * cause undue type conversions and it is much faster.
+ * Only do this if we're case-sensitive (which is all
+ * that really makes sense with byte arrays anyway, and
+ * we have no memcasecmp() for some reason... :^)
+ */
+ if (objv[0]->typePtr == &tclByteArrayType &&
+ objv[1]->typePtr == &tclByteArrayType && !nocase) {
+ unsigned char *bytes1, *bytes2;
+
+ bytes1 = Tcl_GetByteArrayFromObj(objv[0], &length1);
+ bytes2 = Tcl_GetByteArrayFromObj(objv[1], &length2);
+ length = (length1 < length2) ? length1 : length2;
+
+ if ((reqlength > 0) && (reqlength < length)) {
+ length = reqlength;
+ } else if (reqlength < 0) {
+ /*
+ * The requested length is negative, so we ignore it by
+ * setting it to the longer of the two lengths.
+ */
+
+ reqlength = (length1 > length2) ? length1 : length2;
+ }
+
+ match = memcmp(bytes1, bytes2, (unsigned)length);
+ if ((match == 0) && (reqlength > length)) {
match = length1 - length2;
}
+ goto stringComparisonDone;
+ }
+
+ /*
+ * Strings to be compared are not both UNICODE or byte
+ * arrays, so we will need to convert to UTF-8 and work
+ * there (cannot use memcmp() as that is an unsafe
+ * operation with any string containing \u0000 and the
+ * safety test is equivalent in cost to the comparison
+ * itself!)
+ */
+ string1 = Tcl_GetStringFromObj(objv[0], &length1);
+ string2 = Tcl_GetStringFromObj(objv[1], &length2);
+ length1 = Tcl_NumUtfChars(string1, length1);
+ length2 = Tcl_NumUtfChars(string2, length2);
+ length = (length1 < length2) ? length1 : length2;
+
+ if ((reqlength > 0) && (reqlength < length)) {
+ length = reqlength;
+ } else if (reqlength < 0) {
+ /*
+ * The requested length is negative, so we ignore it by
+ * setting it to the longer of the two lengths.
+ */
+
+ reqlength = (length1 > length2) ? length1 : length2;
+ }
+
+ if (nocase) {
+ match = Tcl_UtfNcasecmp(string1, string2, (unsigned) length);
+ } else {
+ match = Tcl_UtfNcmp(string1, string2, (unsigned) length);
+ }
+
+ if ((match == 0) && (reqlength > length)) {
+ match = length1 - length2;
}
+ stringComparisonDone:
if ((enum options) index == STR_EQUAL) {
Tcl_SetBooleanObj(resultPtr, (match) ? 0 : 1);
} else {
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b11fd85..2a6c217 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclUtf.c,v 1.14 2000/06/05 23:36:21 ericm Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.15 2001/04/06 10:50:00 dkf Exp $
*/
#include "tclInt.h"
@@ -1087,11 +1087,9 @@ Tcl_UtfNcmp(cs, ct, n)
{
Tcl_UniChar ch1, ch2;
/*
- * Another approach that should work is:
- * return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
- * That assumes that ct is a properly formed UTF, so we will just
- * be comparing the bytes that compromise those strings to the
- * char length n.
+ * Cannot use memcmp()-based approach as byte representation of
+ * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
+ * representation of \u0001 (the byte 0x01.)
*/
while (n-- > 0) {
/*
diff --git a/tests/string.test b/tests/string.test
index d725cc4..aad463e 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -11,7 +11,7 @@
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
-# RCS: @(#) $Id: string.test,v 1.27 2001/03/12 15:58:01 dkf Exp $
+# RCS: @(#) $Id: string.test,v 1.28 2001/04/06 10:50:00 dkf Exp $
if {[lsearch [namespace children] ::tcltest] == -1} {
package require tcltest
@@ -119,12 +119,17 @@ test string-2.26 {string compare -nocase, null strings} {
test string-2.27 {string compare -nocase, null strings} {
string compare -nocase foo ""
} 1
-test string-2.28 {string equal with length, unequal strings} {
+test string-2.28 {string compare with length, unequal strings} {
string compare -length 2 abc abde
} 0
-test string-2.29 {string equal with length, unequal strings} {
+test string-2.29 {string compare with length, unequal strings} {
string compare -length 2 ab abde
} 0
+test string-2.30 {string compare with NUL character vs. other ASCII} {
+ # Be careful here, since UTF-8 rep comparison with memcmp() of
+ # these puts chars in the wrong order
+ string compare \x00 \x01
+} -1
# only need a few tests on equal, since it uses the same code as
# string compare, but just modifies the return output