From 2dac256b72f43d6d47ba902d97c17f0a7445a12e Mon Sep 17 00:00:00 2001
From: hobbs <hobbs>
Date: Sat, 25 Sep 2010 02:25:54 +0000
Subject: 	* tests/stringComp.test: improved string test coverage 	*
 generic/tclExecute.c (TclExecuteByteCode): merge INST_STR_CMP 	and
 INST_STR_EQ/INST_STR_NEQ paths.  Speeds up eq/ne/[string eq] 	with obj-aware
 comparisons and eq/==/ne/!= with length equality 	check.

---
 ChangeLog             |   8 ++
 generic/tclExecute.c  | 168 +++++++++++---------------
 tests/stringComp.test | 320 ++++++++++++++++++++++++--------------------------
 3 files changed, 232 insertions(+), 264 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 20c82ff..7163a85 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2010-09-24  Jeff Hobbs  <jeffh@ActiveState.com>
+
+	* tests/stringComp.test: improved string eq/cmp test coverage
+	* generic/tclExecute.c (TclExecuteByteCode): merge INST_STR_CMP
+	and INST_STR_EQ/INST_STR_NEQ paths.  Speeds up eq/ne/[string eq]
+	with obj-aware comparisons and eq/==/ne/!= with length equality
+	check.
+
 2010-09-24  Andreas Kupries  <andreask@activestate.com>
 
 	* tclWinsock.c: [Bug 3056775]: Fixed race condition between thread
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index a6fae67..fa792df 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -14,7 +14,7 @@
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclExecute.c,v 1.497 2010/09/22 18:37:27 msofer Exp $
+ * RCS: @(#) $Id: tclExecute.c,v 1.498 2010/09/25 02:25:54 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -4528,114 +4528,90 @@ TclExecuteByteCode(
 
     case INST_STR_EQ:
     case INST_STR_NEQ:		/* String (in)equality check */
-	/*
-	 * TODO: Consider merging into INST_STR_CMP
-	 */
-
+    case INST_STR_CMP:		/* String compare. */
+    stringCompare:
 	value2Ptr = OBJ_AT_TOS;
 	valuePtr = OBJ_UNDER_TOS;
 
 	if (valuePtr == value2Ptr) {
+	    match = 0;
+	} else {
 	    /*
-	     * On the off-chance that the objects are the same, we don't
-	     * really have to think hard about equality.
+	     * We only need to check (in)equality when we have equal length
+	     * strings.  We can use memcmp in all (n)eq cases because we
+	     * don't need to worry about lexical LE/BE variance.
 	     */
-
-	    match = (*pc == INST_STR_EQ);
-	} else {
-	    s1 = TclGetStringFromObj(valuePtr, &s1len);
-	    s2 = TclGetStringFromObj(value2Ptr, &s2len);
-	    if (s1len == s2len) {
+	    typedef int (*memCmpFn_t)(const void*, const void*, size_t);
+	    memCmpFn_t memCmpFn;
+	    int checkEq = ((*pc == INST_EQ) || (*pc == INST_NEQ)
+		    || (*pc == INST_STR_EQ) || (*pc == INST_STR_NEQ));
+
+	    if (TclIsPureByteArray(valuePtr)
+		    && TclIsPureByteArray(value2Ptr)) {
+		s1 = (char *) Tcl_GetByteArrayFromObj(valuePtr, &s1len);
+		s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len);
+		memCmpFn = memcmp;
+	    } else if (((valuePtr->typePtr == &tclStringType)
+			    && (value2Ptr->typePtr == &tclStringType))) {
 		/*
-		 * We only need to check (in)equality when we have equal
-		 * length strings.
+		 * Do a unicode-specific comparison if both of the args are of
+		 * String type. If the char length == byte length, we can do a
+		 * memcmp. In benchmark testing this proved the most efficient
+		 * check between the unicode and string comparison operations.
 		 */
 
-		if (*pc == INST_STR_NEQ) {
-		    match = (memcmp(s1, s2, s1len) != 0);
+		s1len = Tcl_GetCharLength(valuePtr);
+		s2len = Tcl_GetCharLength(value2Ptr);
+		if ((s1len == valuePtr->length)
+			&& (s2len == value2Ptr->length)) {
+		    s1 = valuePtr->bytes;
+		    s2 = value2Ptr->bytes;
+		    memCmpFn = memcmp;
 		} else {
-		    /* INST_STR_EQ */
-		    match = (memcmp(s1, s2, s1len) == 0);
+		    s1 = (char *) Tcl_GetUnicode(valuePtr);
+		    s2 = (char *) Tcl_GetUnicode(value2Ptr);
+		    if (
+#ifdef WORDS_BIGENDIAN
+			1
+#else
+			checkEq
+#endif
+			) {
+			memCmpFn = memcmp;
+			s1len *= sizeof(Tcl_UniChar);
+			s2len *= sizeof(Tcl_UniChar);
+		    } else {
+			memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp;
+		    }
 		}
 	    } else {
-		match = (*pc == INST_STR_NEQ);
-	    }
-	}
-
-	TRACE(("%.20s %.20s => %d\n", O2S(valuePtr),O2S(value2Ptr),match));
-
-	/*
-	 * Peep-hole optimisation: if you're about to jump, do jump from here.
-	 */
-
-	pc++;
-#ifndef TCL_COMPILE_DEBUG
-	switch (*pc) {
-	case INST_JUMP_FALSE1:
-	    NEXT_INST_F((match? 2 : TclGetInt1AtPtr(pc+1)), 2, 0);
-	case INST_JUMP_TRUE1:
-	    NEXT_INST_F((match? TclGetInt1AtPtr(pc+1) : 2), 2, 0);
-	case INST_JUMP_FALSE4:
-	    NEXT_INST_F((match? 5 : TclGetInt4AtPtr(pc+1)), 2, 0);
-	case INST_JUMP_TRUE4:
-	    NEXT_INST_F((match? TclGetInt4AtPtr(pc+1) : 5), 2, 0);
-	}
-#endif
-	objResultPtr = TCONST(match);
-	NEXT_INST_F(0, 2, 1);
-
-    stringCompare:
-    case INST_STR_CMP:		/* String compare. */
-	value2Ptr = OBJ_AT_TOS;
-	valuePtr = OBJ_UNDER_TOS;
-
-	/*
-	 * The comparison function should compare up to the minimum byte
-	 * length only.
-	 */
-
-	if (valuePtr == value2Ptr) {
-	    /*
-	     * In the pure equality case, set lengths too for the checks below
-	     * (or we could goto beyond it).
-	     */
+		/*
+		 * strcmp can't do a simple memcmp in order to handle the
+		 * special Tcl \xC0\x80 null encoding for utf-8.
+		 */
 
-	    match = s1len = s2len = 0;
-	} else if (TclIsPureByteArray(valuePtr)
-		&& TclIsPureByteArray(value2Ptr)) {
-	    s1 = (char *) Tcl_GetByteArrayFromObj(valuePtr, &s1len);
-	    s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len);
-	    match = memcmp(s1, s2,
-		    (size_t) ((s1len < s2len) ? s1len : s2len));
-	} else if (((valuePtr->typePtr == &tclStringType)
-		&& (value2Ptr->typePtr == &tclStringType))) {
-	    /*
-	     * Do a unicode-specific comparison if both of the args are of
-	     * String type. If the char length == byte length, we can do a
-	     * memcmp. In benchmark testing this proved the most efficient
-	     * check between the unicode and string comparison operations.
-	     */
+		s1 = TclGetStringFromObj(valuePtr, &s1len);
+		s2 = TclGetStringFromObj(value2Ptr, &s2len);
+		if (checkEq) {
+		    memCmpFn = memcmp;
+		} else {
+		    memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
+		}
+	    }
 
-	    s1len = Tcl_GetCharLength(valuePtr);
-	    s2len = Tcl_GetCharLength(value2Ptr);
-	    if ((s1len == valuePtr->length) && (s2len == value2Ptr->length)) {
-		match = memcmp(valuePtr->bytes, value2Ptr->bytes,
-			(unsigned) ((s1len < s2len) ? s1len : s2len));
+	    if (checkEq && (s1len != s2len)) {
+		match = 1;
 	    } else {
-		match = TclUniCharNcmp(Tcl_GetUnicode(valuePtr),
-			Tcl_GetUnicode(value2Ptr),
-			(unsigned) ((s1len < s2len) ? s1len : s2len));
+		/*
+		 * The comparison function should compare up to the minimum
+		 * byte length only.
+		 */
+		match = memCmpFn(s1, s2,
+			(size_t) ((s1len < s2len) ? s1len : s2len));
+		if (match == 0) {
+		    match = s1len - s2len;
+		}
 	    }
-	} else {
-	    /*
-	     * We can't do a simple memcmp in order to handle the special Tcl
-	     * \xC0\x80 null encoding for utf-8.
-	     */
-
-	    s1 = TclGetStringFromObj(valuePtr, &s1len);
-	    s2 = TclGetStringFromObj(value2Ptr, &s2len);
-	    match = TclpUtfNcmp2(s1, s2,
-		    (size_t) ((s1len < s2len) ? s1len : s2len));
 	}
 
 	/*
@@ -4643,19 +4619,17 @@ TclExecuteByteCode(
 	 * TODO: consider peephole opt.
 	 */
 
-	if (match == 0) {
-	    match = s1len - s2len;
-	}
-
 	if (*pc != INST_STR_CMP) {
 	    /*
 	     * Take care of the opcodes that goto'ed into here.
 	     */
 
 	    switch (*pc) {
+	    case INST_STR_EQ:
 	    case INST_EQ:
 		match = (match == 0);
 		break;
+	    case INST_STR_NEQ:
 	    case INST_NEQ:
 		match = (match != 0);
 		break;
diff --git a/tests/stringComp.test b/tests/stringComp.test
index 2f187be..6ef94ee 100644
--- a/tests/stringComp.test
+++ b/tests/stringComp.test
@@ -15,7 +15,7 @@
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 #
-# RCS: @(#) $Id: stringComp.test,v 1.17 2009/06/24 15:17:41 dgp Exp $
+# RCS: @(#) $Id: stringComp.test,v 1.18 2010/09/25 02:25:54 hobbs Exp $
 
 if {[lsearch [namespace children] ::tcltest] == -1} {
     package require tcltest
@@ -44,180 +44,166 @@ test stringComp-1.3 {error condition - undefined method during compile} {
     foo abc 0
 } a
 
-test stringComp-2.1 {string compare, too few args} {
-    proc foo {} {string compare a}
-    list [catch {foo} msg] $msg
-} {1 {wrong # args: should be "string compare ?-nocase? ?-length int? string1 string2"}}
-test stringComp-2.2 {string compare, bad args} {
-    proc foo {} {string compare a b c}
-    list [catch {foo} msg] $msg
-} {1 {bad option "a": must be -nocase or -length}}
-test stringComp-2.3 {string compare, bad args} {
-    list [catch {string compare -length -nocase str1 str2} msg] $msg
-} {1 {expected integer but got "-nocase"}}
-test stringComp-2.4 {string compare, too many args} {
-    list [catch {string compare -length 10 -nocase str1 str2 str3} msg] $msg
-} {1 {wrong # args: should be "string compare ?-nocase? ?-length int? string1 string2"}}
-test stringComp-2.5 {string compare with length unspecified} {
-    list [catch {string compare -length 10 10} msg] $msg
-} {1 {wrong # args: should be "string compare ?-nocase? ?-length int? string1 string2"}}
-test stringComp-2.6 {string compare} {
-    proc foo {} {string compare abcde abdef}
-    foo
-} -1
-test stringComp-2.7 {string compare, shortest method name} {
-    proc foo {} {string c abcde ABCDE}
-    foo
-} 1
-test stringComp-2.8 {string compare} {
-    proc foo {} {string compare abcde abcde}
-    foo
-} 0
-test stringComp-2.9 {string compare with length} {
-    proc foo {} {string compare -length 2 abcde abxyz}
-    foo
-} 0
-test stringComp-2.10 {string compare with special index} {
-    proc foo {} {string compare -length end-3 abcde abxyz}
-    list [catch {foo} msg] $msg
-} {1 {expected integer but got "end-3"}}
-test stringComp-2.11 {string compare, unicode} {
-    proc foo {} {string compare ab\u7266 ab\u7267}
-    foo
-} -1
-test stringComp-2.12 {string compare, high bit} {
-    # This test will fail if the underlying comparaison
-    # is using signed chars instead of unsigned chars.
-    # (like SunOS's default memcmp thus the compat/memcmp.c)
-    proc foo {} {string compare "\x80" "@"}
-    foo
-    # Nb this tests works also in utf8 space because \x80 is
-    # translated into a 2 or more bytelength but whose first byte has
-    # the high bit set.
-} 1
-test stringComp-2.13 {string compare -nocase} {
-    proc foo {} {string compare -nocase abcde abdef}
-    foo
-} -1
-test stringComp-2.14 {string compare -nocase} {
-    proc foo {} {string c -nocase abcde ABCDE}
-    foo
-} 0
-test stringComp-2.15 {string compare -nocase} {
-    proc foo {} {string compare -nocase abcde abcde}
-    foo
-} 0
-test stringComp-2.16 {string compare -nocase with length} {
-    proc foo {} {string compare -length 2 -nocase abcde Abxyz}
-    foo
-} 0
-test stringComp-2.17 {string compare -nocase with length} {
-    proc foo {} {string compare -nocase -length 3 abcde Abxyz}
-    foo
-} -1
-test stringComp-2.18 {string compare -nocase with length <= 0} {
-    proc foo {} {string compare -nocase -length -1 abcde AbCdEf}
-    foo
-} -1
-test stringComp-2.19 {string compare -nocase with excessive length} {
-    proc foo {} {string compare -nocase -length 50 AbCdEf abcde}
-    foo
-} 1
-test stringComp-2.20 {string compare -len unicode} {
-    # These are strings that are 6 BYTELENGTH long, but the length
-    # shouldn't make a different because there are actually 3 CHARS long
-    proc foo {} {string compare -len 5 \334\334\334 \334\334\374}
-    foo
-} -1
-test stringComp-2.21 {string compare -nocase with special index} {
-    proc foo {} {string compare -nocase -length end-3 Abcde abxyz}
-    list [catch {foo} msg] $msg
-} {1 {expected integer but got "end-3"}}
-test stringComp-2.22 {string compare, null strings} {
-    proc foo {} {string compare "" ""}
-    foo
-} 0
-test stringComp-2.23 {string compare, null strings} {
-    proc foo {} {string compare "" foo}
-    foo
-} -1
-test stringComp-2.24 {string compare, null strings} {
-    proc foo {} {string compare foo ""}
-    foo
-} 1
-test stringComp-2.25 {string compare -nocase, null strings} {
-    proc foo {} {string compare -nocase "" ""}
-    foo
-} 0
-test stringComp-2.26 {string compare -nocase, null strings} {
-    proc foo {} {string compare -nocase "" foo}
-    foo
-} -1
-test stringComp-2.27 {string compare -nocase, null strings} {
-    proc foo {} {string compare -nocase foo ""}
-    foo
-} 1
-test stringComp-2.28 {string compare with length, unequal strings} {
-    proc foo {} {string compare -length 2 abc abde}
-    foo
-} 0
-test stringComp-2.29 {string compare with length, unequal strings} {
-    proc foo {} {string compare -length 2 ab abde}
-    foo
-} 0
-test stringComp-2.30 {string compare with NUL character vs. other ASCII} {
-    # Be careful here, since UTF-8 rep comparison with memcmp() of
-    # these puts chars in the wrong order
-    proc foo {} {string compare \x00 \x01}
-    foo
-} -1
-test stringComp-2.31 {string compare, high bit} {
-    proc foo {} {string compare "a\x80" "a@"}
-    foo
-} 1
-test stringComp-2.32 {string compare, high bit} {
-    proc foo {} {string compare "a\x00" "a\x01"}
-    foo
-} -1
-test stringComp-2.33 {string compare, high bit} {
-    proc foo {} {string compare "\x00\x00" "\x00\x01"}
-    foo
-} -1
+## Test string compare|equal over equal constraints
+## Use result for string compare, and negate it for string equal
+## The body will be tested both in and outside a proc
+set i 0
+foreach {tname tbody tresult tcode} {
+    {too few args} {
+	string compare a
+    } {wrong # args: should be "string compare ?-nocase? ?-length int? string1 string2"} {error}
+    {bad args} {
+	string compare a b c
+    } {bad option "a": must be -nocase or -length} {error}
+    {bad args} {
+	string compare -length -nocase str1 str2
+    } {expected integer but got "-nocase"} {error}
+    {too many args} {
+	string compare -length 10 -nocase str1 str2 str3
+    } {wrong # args: should be "string compare ?-nocase? ?-length int? string1 string2"} {error}
+    {compare with length unspecified} {
+	string compare -length 10 10
+    } {wrong # args: should be "string compare ?-nocase? ?-length int? string1 string2"} {error}
+    {basic operation fail} {
+	string compare abcde abdef
+    } {-1} {}
+    {basic operation success} {
+	string compare abcde abcde
+    } {0} {}
+    {with length} {
+	string compare -length 2 abcde abxyz
+    } {0} {}
+    {with special index} {
+	string compare -length end-3 abcde abxyz
+    } {expected integer but got "end-3"} {error}
+    {unicode} {
+	string compare ab\u7266 ab\u7267
+    } {-1} {}
+    {unicode} {string compare \334 \u00dc} 0 {}
+    {unicode} {string compare \334 \u00fc} -1 {}
+    {unicode} {string compare \334\334\334\374\374 \334\334\334\334\334} 1 {}
+    {high bit} {
+	# This test will fail if the underlying comparaison
+	# is using signed chars instead of unsigned chars.
+	# (like SunOS's default memcmp thus the compat/memcmp.c)
+	string compare "\x80" "@"
+	# Nb this tests works also in utf8 space because \x80 is
+	# translated into a 2 or more bytelength but whose first byte has
+	# the high bit set.
+    } {1} {}
+    {-nocase 1} {string compare -nocase abcde abdef} {-1} {}
+    {-nocase 2} {string compare -nocase abcde Abdef} {-1} {}
+    {-nocase 3} {string compare -nocase abcde ABCDE} {0} {}
+    {-nocase 4} {string compare -nocase abcde abcde} {0} {}
+    {-nocase unicode} {
+	string compare -nocase \334 \u00dc
+    } 0 {}
+    {-nocase unicode} {
+	string compare -nocase \334\334\334\374\u00fc \334\334\334\334\334
+    } 0 {}
+    {-nocase with length} {
+	string compare -length 2 -nocase abcde Abxyz
+    } {0} {}
+    {-nocase with length} {
+	string compare -nocase -length 3 abcde Abxyz
+    } {-1} {}
+    {-nocase with length <= 0} {
+	string compare -nocase -length -1 abcde AbCdEf
+    } {-1} {}
+    {-nocase with excessive length} {
+	string compare -nocase -length 50 AbCdEf abcde
+    } {1} {}
+    {-len unicode} {
+	# These are strings that are 6 BYTELENGTH long, but the length
+	# shouldn't make a different because there are actually 3 CHARS long
+	string compare -len 5 \334\334\334 \334\334\374
+    } -1 {}
+    {-nocase with special index} {
+	string compare -nocase -length end-3 Abcde abxyz
+    } {expected integer but got "end-3"} error
+    {null strings} {
+	string compare "" ""
+    } 0 {}
+    {null strings} {
+	string compare "" foo
+    } -1 {}
+    {null strings} {
+	string compare foo ""
+    } 1 {}
+    {-nocase null strings} {
+	string compare -nocase "" ""
+    } 0 {}
+    {-nocase null strings} {
+	string compare -nocase "" foo
+    } -1 {}
+    {-nocase null strings} {
+	string compare -nocase foo ""
+    } 1 {}
+    {with length, unequal strings} {
+	string compare -length 2 abc abde
+    } 0 {}
+    {with length, unequal strings} {
+	string compare -length 2 ab abde
+    } 0 {}
+    {with NUL character vs. other ASCII} {
+	# Be careful here, since UTF-8 rep comparison with memcmp() of
+	# these puts chars in the wrong order
+	string compare \x00 \x01
+    } -1 {}
+    {high bit} {
+	string compare "a\x80" "a@"
+    } 1 {}
+    {high bit} {
+	string compare "a\x00" "a\x01"
+    } -1 {}
+    {high bit} {
+	string compare "\x00\x00" "\x00\x01"
+    } -1 {}
+    {binary equal} {
+	string compare [binary format a100 0] [binary format a100 0]
+    } 0 {}
+    {binary neq} {
+	string compare [binary format a100a 0 1] [binary format a100a 0 0]
+    } 1 {}
+    {binary neq inequal length} {
+	string compare [binary format a20a 0 1] [binary format a100a 0 0]
+    } 1 {}
+} {
+    if {$tname eq ""} { continue }
+    if {$tcode eq ""} { set tcode ok }
+    test stringComp-2.[incr i] "string compare, $tname" \
+	-body [list eval $tbody] \
+	-returnCodes $tcode -result $tresult
+    test stringComp-2.[incr i] "string compare bc, $tname" \
+	-body "[list proc foo {} $tbody];foo" \
+	-returnCodes $tcode -result $tresult
+    if {"error" ni $tcode} {
+	set tresult [expr {!$tresult}]
+    } else {
+	set tresult [string map {compare equal} $tresult]
+    }
+    set tbody [string map {compare equal} $tbody]
+    test stringComp-2.[incr i] "string equal, $tname" \
+	-body [list eval $tbody] \
+	-returnCodes $tcode -result $tresult
+    test stringComp-2.[incr i] "string equal bc, $tname" \
+	-body "[list proc foo {} $tbody];foo" \
+	-returnCodes $tcode -result $tresult
+}
 
-# only need a few tests on equal, since it uses the same code as
-# string compare, but just modifies the return output
-test stringComp-3.1 {string equal} {
-    proc foo {} {string equal abcde abdef}
-    foo
-} 0
-test stringComp-3.2 {string equal} {
-    proc foo {} {string eq abcde ABCDE}
-    foo
-} 0
-test stringComp-3.3 {string equal} {
-    proc foo {} {string equal abcde abcde}
-    foo
-} 1
-test stringComp-3.4 {string equal -nocase} {
-    proc foo {} {string equal -nocase \334\334\334\334\374\374\374\374 \334\334\334\334\334\334\334\334}
+# need a few extra tests short abbr cmd
+test stringComp-3.1 {string compare, shortest method name} {
+    proc foo {} {string c abcde ABCDE}
     foo
 } 1
-test stringComp-3.5 {string equal -nocase} {
-    proc foo {} {string equal -nocase abcde abdef}
+test stringComp-3.2 {string equal, shortest method name} {
+    proc foo {} {string e abcde ABCDE}
     foo
 } 0
-test stringComp-3.6 {string equal -nocase} {
+test stringComp-3.3 {string equal -nocase} {
     proc foo {} {string eq -nocase abcde ABCDE}
     foo
 } 1
-test stringComp-3.7 {string equal -nocase} {
-    proc foo {} {string equal -nocase abcde abcde}
-    foo
-} 1
-test stringComp-3.8 {string equal with length, unequal strings} {
-    proc foo {} {string equal -length 2 abc abde}
-    foo
-} 1
 
 test stringComp-4.1 {string first, too few args} {
     proc foo {} {string first a}
-- 
cgit v0.12