From 8bccfa351c587aaf7e5d6aa27b0ef0141806c1cf Mon Sep 17 00:00:00 2001
From: dkf <donal.k.fellows@manchester.ac.uk>
Date: Fri, 6 Apr 2001 10:50:00 +0000
Subject: Fixed problem with [string compare \x00 \x01] and hopefully sped the
 command up in a few cases too (notably byte arrays and UNICODE objects.) 
 [Bug #219201]

---
 ChangeLog          |  12 +++++
 generic/tclCmdMZ.c | 132 +++++++++++++++++++++++++++++++++++++++++------------
 generic/tclUtf.c   |  10 ++--
 tests/string.test  |  11 +++--
 4 files changed, 128 insertions(+), 37 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 01a92fa..8e9ac3a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2001-04-06  Donal K. Fellows  <fellowsd@cs.man.ac.uk>
+
+	* tests/string.test (string-2.30): Test for this case
+	* generic/tclCmdMZ.c (Tcl_StringObjCmd, STR_COMPARE branch): Fixed
+	problem caused by Utf-rep of \x00 being more than Utf-rep of \x01
+	fooling memcmp by forcing everything through Utf-based
+	comparisons.  Added optimizations for case where objects have a
+	string/unicode-rep or a bytearray-rep (i.e. where we can perform
+	comparisons on fixed-size units.) [Bug #219201]
+	* generic/tclUtf.c (Tcl_UtfNcmp): Corrected seriously erroneous
+	comment.
+
 2001-04-05  Andreas Kupries  <andreas_kupries@users.sourceforge.net>
 
 	* doc/Macintosh.3: Removed duplicates from .SH line
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 6cf9852..ff5362a 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -13,7 +13,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclCmdMZ.c,v 1.34 2001/04/05 10:20:18 dkf Exp $
+ * RCS: @(#) $Id: tclCmdMZ.c,v 1.35 2001/04/06 10:50:00 dkf Exp $
  */
 
 #include "tclInt.h"
@@ -1091,35 +1091,45 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 		}
 	    }
 
-	    string1 = Tcl_GetStringFromObj(objv[objc-2], &length1);
-	    string2 = Tcl_GetStringFromObj(objv[objc-1], &length2);
-	    /*
-	     * This is the min length IN BYTES of the two strings
-	     */
-	    length = (length1 < length2) ? length1 : length2;
-
 	    if (reqlength == 0) {
 		/*
 		 * Anything matches at 0 chars, right?
 		 */
 
 		match = 0;
-	    } else if (nocase || ((reqlength > 0) && (reqlength <= length))) {
-		/*
-		 * with -nocase or -length we have to check true char length
-		 * as it could be smaller than expected
-		 */
+		goto stringComparisonDone;
+	    }
 
-		length1 = Tcl_NumUtfChars(string1, length1);
-		length2 = Tcl_NumUtfChars(string2, length2);
-		length = (length1 < length2) ? length1 : length2;
+	    /*
+	     * From now on, we only access the two objects at the end
+	     * of the argument array.
+	     */
+	    objv += objc-2;
 
-		/*
-		 * Do the reqlength check again, against 0 as well for
-		 * the benfit of nocase
-		 */
+	    /*
+	     * Use UNICODE versions of string comparisons since that
+	     * won't cause undue type conversions and we can work with
+	     * characters all of a fixed size (much faster.)  Also use
+	     * this code for untyped objects, since like that we'll
+	     * pick up many things that are used for comparison in
+	     * scripts and convert them (efficiently) to UNICODE
+	     * strings for comparison, but exclude case where both are
+	     * untyped as that is a little bit aggressive.
+	     */
+	    if ((objv[0]->typePtr == &tclStringType ||
+		 objv[0]->typePtr == NULL) &&
+		(objv[1]->typePtr == &tclStringType ||
+		 objv[1]->typePtr == NULL) &&
+		!(objv[0]->typePtr == NULL && objv[1]->typePtr == NULL)) {
+		Tcl_UniChar *uni1, *uni2;
+
+		length1 = Tcl_GetCharLength(objv[0]);
+		length2 = Tcl_GetCharLength(objv[1]);
+		length = (length1 < length2) ? length1 : length2;
+		uni1 = Tcl_GetUnicode(objv[0]);
+		uni2 = Tcl_GetUnicode(objv[1]);
 
-		if ((reqlength > 0) && (reqlength < length)) {
+		if (reqlength > 0 && reqlength < length) {
 		    length = reqlength;
 		} else if (reqlength < 0) {
 		    /*
@@ -1127,24 +1137,90 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 		     * setting it to the longer of the two lengths.
 		     */
 
-		    reqlength = (length1 > length2) ? length1 : length2;
+		    reqlength = (length1 < length2) ? length2 : length1;
 		}
+
 		if (nocase) {
-		    match = Tcl_UtfNcasecmp(string1, string2,
-			    (unsigned) length);
+		    match = Tcl_UniCharNcasecmp(uni1, uni2, (unsigned)length);
 		} else {
-		    match = Tcl_UtfNcmp(string1, string2, (unsigned) length);
+		    match = Tcl_UniCharNcmp(uni1, uni2, (unsigned)length);
 		}
+
 		if ((match == 0) && (reqlength > length)) {
 		    match = length1 - length2;
 		}
-	    } else {
-		match = memcmp(string1, string2, (unsigned) length);
-		if (match == 0) {
+		goto stringComparisonDone;
+	    }
+
+	    /*
+	     * Use binary versions of comparisons since that won't
+	     * cause undue type conversions and it is much faster.
+	     * Only do this if we're case-sensitive (which is all
+	     * that really makes sense with byte arrays anyway, and
+	     * we have no memcasecmp() for some reason... :^)
+	     */
+	    if (objv[0]->typePtr == &tclByteArrayType &&
+		objv[1]->typePtr == &tclByteArrayType && !nocase) {
+		unsigned char *bytes1, *bytes2;
+
+		bytes1 = Tcl_GetByteArrayFromObj(objv[0], &length1);
+		bytes2 = Tcl_GetByteArrayFromObj(objv[1], &length2);
+		length = (length1 < length2) ? length1 : length2;
+
+		if ((reqlength > 0) && (reqlength < length)) {
+		    length = reqlength;
+		} else if (reqlength < 0) {
+		    /*
+		     * The requested length is negative, so we ignore it by
+		     * setting it to the longer of the two lengths.
+		     */
+
+		    reqlength = (length1 > length2) ? length1 : length2;
+		}
+
+		match = memcmp(bytes1, bytes2, (unsigned)length);
+		if ((match == 0) && (reqlength > length)) {
 		    match = length1 - length2;
 		}
+		goto stringComparisonDone;
+	    }
+
+	    /*
+	     * Strings to be compared are not both UNICODE or byte
+	     * arrays, so we will need to convert to UTF-8 and work
+	     * there (cannot use memcmp() as that is an unsafe
+	     * operation with any string containing \u0000 and the
+	     * safety test is equivalent in cost to the comparison
+	     * itself!)
+	     */
+	    string1 = Tcl_GetStringFromObj(objv[0], &length1);
+	    string2 = Tcl_GetStringFromObj(objv[1], &length2);
+	    length1 = Tcl_NumUtfChars(string1, length1);
+	    length2 = Tcl_NumUtfChars(string2, length2);
+	    length = (length1 < length2) ? length1 : length2;
+
+	    if ((reqlength > 0) && (reqlength < length)) {
+		length = reqlength;
+	    } else if (reqlength < 0) {
+		/*
+		 * The requested length is negative, so we ignore it by
+		 * setting it to the longer of the two lengths.
+		 */
+
+		reqlength = (length1 > length2) ? length1 : length2;
+	    }
+
+	    if (nocase) {
+		match = Tcl_UtfNcasecmp(string1, string2, (unsigned) length);
+	    } else {
+		match = Tcl_UtfNcmp(string1, string2, (unsigned) length);
+	    }
+
+	    if ((match == 0) && (reqlength > length)) {
+		match = length1 - length2;
 	    }
 
+	stringComparisonDone:
 	    if ((enum options) index == STR_EQUAL) {
 		Tcl_SetBooleanObj(resultPtr, (match) ? 0 : 1);
 	    } else {
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b11fd85..2a6c217 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclUtf.c,v 1.14 2000/06/05 23:36:21 ericm Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.15 2001/04/06 10:50:00 dkf Exp $
  */
 
 #include "tclInt.h"
@@ -1087,11 +1087,9 @@ Tcl_UtfNcmp(cs, ct, n)
 {
     Tcl_UniChar ch1, ch2;
     /*
-     * Another approach that should work is:
-     *   return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
-     * That assumes that ct is a properly formed UTF, so we will just
-     * be comparing the bytes that compromise those strings to the
-     * char length n.
+     * Cannot use memcmp()-based approach as byte representation of
+     * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
+     * representation of \u0001 (the byte 0x01.)
      */
     while (n-- > 0) {
 	/*
diff --git a/tests/string.test b/tests/string.test
index d725cc4..aad463e 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -11,7 +11,7 @@
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 #
-# RCS: @(#) $Id: string.test,v 1.27 2001/03/12 15:58:01 dkf Exp $
+# RCS: @(#) $Id: string.test,v 1.28 2001/04/06 10:50:00 dkf Exp $
 
 if {[lsearch [namespace children] ::tcltest] == -1} {
     package require tcltest
@@ -119,12 +119,17 @@ test string-2.26 {string compare -nocase, null strings} {
 test string-2.27 {string compare -nocase, null strings} {
     string compare -nocase foo ""
 } 1
-test string-2.28 {string equal with length, unequal strings} {
+test string-2.28 {string compare with length, unequal strings} {
     string compare -length 2 abc abde
 } 0
-test string-2.29 {string equal with length, unequal strings} {
+test string-2.29 {string compare with length, unequal strings} {
     string compare -length 2 ab abde
 } 0
+test string-2.30 {string compare with NUL character vs. other ASCII} {
+    # Be careful here, since UTF-8 rep comparison with memcmp() of
+    # these puts chars in the wrong order
+    string compare \x00 \x01
+} -1
 
 # only need a few tests on equal, since it uses the same code as
 # string compare, but just modifies the return output
-- 
cgit v0.12