* generic/tclInt.decls:

* generic/tclIntDecls.h: * generic/tclStubInit.c: * generic/tclUtf.c: added TclpUtfNcmp2 private command that mirrors Tcl_UtfNcmp, but takes n in bytes, not utf-8 chars. This provides a faster alternative for comparing utf strings internally. (Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end of string check as it wasn't correct for the function (by doc and logic). * generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal comparison code to use TclpUtfNcmp2 as well as short-circuit for equal objects or unequal length strings in the equal case. Removed the use of goto and streamlined the other parts. * generic/tclExecute.c (TclExecuteByteCode): added check for object equality in the comparison instructions. Added short-circuit for != length strings in INST_EQ, INST_NEQ and INST_STR_CMP. Reworked INST_STR_CMP to use TclpUtfNcmp2 where appropriate, and only use Tcl_UniCharNcmp when at least one of the objects is a Unicode obj with no utf bytes.
author: hobbs <hobbs> 2002-05-29 09:09:57 (GMT)
committer: hobbs <hobbs> 2002-05-29 09:09:57 (GMT)
commit: 3a1941e7d007e93449aa22085687a8fafaddad7e (patch)
tree: 9794bcbc0e094b9645add8339f0278666543ef7e
parent: 514dd2a61babad4abc78895bdadec335dd9b4c71 (diff)
download: tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.zip
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.gz
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.bz2
7 files changed, 225 insertions, 153 deletions
diff --git a/ChangeLog b/ChangeLog
index 8effda2..2dad798 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2002-05-29  Jeff Hobbs  <jeffh@ActiveState.com>
+
+	* generic/tclInt.decls:
+	* generic/tclIntDecls.h:
+	* generic/tclStubInit.c:
+	* generic/tclUtf.c: added TclpUtfNcmp2 private command that
+	mirrors Tcl_UtfNcmp, but takes n in bytes, not utf-8 chars.  This
+	provides a faster alternative for comparing utf strings	internally.
+	(Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end
+	of string check as it wasn't correct for the function (by doc and
+	logic).
+
+	* generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal
+	comparison code to use TclpUtfNcmp2 as well as short-circuit for
+	equal objects or unequal length strings in the equal case.
+	Removed the use of goto and streamlined the other parts.
+
+	* generic/tclExecute.c (TclExecuteByteCode): added check for
+	object equality in the comparison instructions.  Added
+	short-circuit for != length strings in INST_EQ, INST_NEQ and
+	INST_STR_CMP.  Reworked INST_STR_CMP to use TclpUtfNcmp2 where
+	appropriate, and only use Tcl_UniCharNcmp when at least one of the
+	objects is a Unicode obj with no utf bytes.
+
+	* generic/tclCompCmds.c (TclCompileStringCmd): removed error
+	creation in code that no longer throws an error.
+
+	* tests/string.test:
+	* tests/stringComp.test: added more string comparison checks.
+
+	* tests/clock.test: better qualified 9.1 constraint check for %s.
+
 2002-05-28  Jeff Hobbs  <jeffh@ActiveState.com>
 
 	* generic/tclThreadAlloc.c (TclpRealloc, TclpFree): protect
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 0f39ecb..a3b03df 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -14,7 +14,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclCmdMZ.c,v 1.67 2002/04/18 13:49:30 dkf Exp $
+ * RCS: @(#) $Id: tclCmdMZ.c,v 1.68 2002/05/29 09:09:57 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -1094,6 +1094,11 @@ Tcl_SplitObjCmd(dummy, interp, objc, objv)
  *	that this command only functions correctly on properly formed
  *	Tcl UTF strings.
  *
+ *	Note that the primary methods here (equal, compare, match, ...)
+ *	have bytecode equivalents.  You will find the code for those in
+ *	tclExecute.c.  The code here will only be used in the non-bc
+ *	case (like in an 'eval').
+ *
  * Results:
  *	A standard Tcl result.
  *
@@ -1153,6 +1158,7 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	     * comparison in INST_EQ/INST_NEQ/INST_LT/...).
 	     */
 	    int i, match, length, nocase = 0, reqlength = -1;
+	    int (*strCmpFn)();
 
 	    if (objc < 4 || objc > 7) {
 	    str_cmp_args:
@@ -1183,134 +1189,84 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 		}
 	    }
 
-	    if (reqlength == 0) {
-		/*
-		 * Anything matches at 0 chars, right?
-		 */
-
-		match = 0;
-		goto stringComparisonDone;
-	    }
-
 	    /*
 	     * From now on, we only access the two objects at the end
 	     * of the argument array.
 	     */
 	    objv += objc-2;
 
-	    /*
-	     * Use binary versions of comparisons since that won't
-	     * cause undue type conversions and it is much faster.
-	     * Only do this if we're case-sensitive (which is all
-	     * that really makes sense with byte arrays anyway, and
-	     * we have no memcasecmp() for some reason... :^)
-	     */
-	    if (!nocase && objv[0]->typePtr == &tclByteArrayType &&
-		    objv[1]->typePtr == &tclByteArrayType) {
-		unsigned char *bytes1, *bytes2;
-
-		bytes1 = Tcl_GetByteArrayFromObj(objv[0], &length1);
-		bytes2 = Tcl_GetByteArrayFromObj(objv[1], &length2);
-		length = (length1 < length2) ? length1 : length2;
-
-		if ((reqlength > 0) && (reqlength < length)) {
-		    length = reqlength;
-		} else if (reqlength < 0) {
-		    /*
-		     * The requested length is negative, so we ignore it by
-		     * setting it to the longer of the two lengths.
-		     */
-
-		    reqlength = (length1 > length2) ? length1 : length2;
-		}
+	    if ((reqlength == 0) || (objv[0] == objv[1])) {
+		/*
+		 * Alway match at 0 chars of if it is the same obj.
+		 */
 
-		match = memcmp(bytes1, bytes2, (unsigned)length);
-		if ((match == 0) && (reqlength > length)) {
-		    match = length1 - length2;
+		Tcl_SetBooleanObj(resultPtr,
+			((enum options) index == STR_EQUAL));
+		break;
+	    } else if (!nocase && objv[0]->typePtr == &tclByteArrayType &&
+		    objv[1]->typePtr == &tclByteArrayType) {
+		/*
+		 * Use binary versions of comparisons since that won't
+		 * cause undue type conversions and it is much faster.
+		 * Only do this if we're case-sensitive (which is all
+		 * that really makes sense with byte arrays anyway, and
+		 * we have no memcasecmp() for some reason... :^)
+		 */
+		string1 = (char*) Tcl_GetByteArrayFromObj(objv[0], &length1);
+		string2 = (char*) Tcl_GetByteArrayFromObj(objv[1], &length2);
+		strCmpFn = memcmp;
+	    } else if (((objv[0]->typePtr == &tclStringType)
+		    && (objv[0]->bytes == NULL))
+		    || ((objv[1]->typePtr == &tclStringType)
+			    && (objv[1]->bytes == NULL))) {
+		/*
+		 * Use UNICODE versions of string comparisons since that
+		 * won't cause undue type conversions and we can work with
+		 * characters all of a fixed size (much faster.), but only
+		 * when one of the objects is a pure UNICODE object.
+		 */
+		string1 = (char*) Tcl_GetUnicodeFromObj(objv[0], &length1);
+		string2 = (char*) Tcl_GetUnicodeFromObj(objv[1], &length2);
+		strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp;
+	    } else {
+		/*
+		 * As a catch-all we will work with UTF-8.  We cannot use
+		 * memcmp() as that is unsafe with any string containing
+		 * NULL (\xC0\x80 in Tcl's utf rep).  We can use the more
+		 * efficient TclpUtfNcmp2 if we are case-sensitive and no
+		 * specific length was requested.
+		 */
+		string1 = (char*) Tcl_GetStringFromObj(objv[0], &length1);
+		string2 = (char*) Tcl_GetStringFromObj(objv[1], &length2);
+		if ((reqlength < 0) && !nocase) {
+		    strCmpFn = TclpUtfNcmp2;
+		} else {
+		    length1 = Tcl_NumUtfChars(string1, length1);
+		    length2 = Tcl_NumUtfChars(string2, length2);
+		    strCmpFn = nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp;
 		}
-		goto stringComparisonDone;
 	    }
 
-	    /*
-	     * Use UNICODE versions of string comparisons since that
-	     * won't cause undue type conversions and we can work with
-	     * characters all of a fixed size (much faster.)  Also use
-	     * this code for untyped objects, since like that we'll
-	     * pick up many things that are used for comparison in
-	     * scripts and convert them (efficiently) to UNICODE
-	     * strings for comparison, but exclude case where both are
-	     * untyped as that is a little bit aggressive.
-	     */
-	    if ((objv[0]->typePtr == &tclStringType ||
-		    objv[0]->typePtr == NULL) &&
-		    (objv[1]->typePtr == &tclStringType ||
-			    objv[1]->typePtr == NULL) &&
-		    !(objv[0]->typePtr == NULL && objv[1]->typePtr == NULL)) {
-		Tcl_UniChar *uni1, *uni2;
-
-		uni1 = Tcl_GetUnicodeFromObj(objv[0], &length1);
-		uni2 = Tcl_GetUnicodeFromObj(objv[1], &length2);
+	    if (((enum options) index == STR_EQUAL)
+		    && (reqlength < 0) && (length1 != length2)) {
+		match = 1; /* this will be reversed below */
+	    } else {
 		length = (length1 < length2) ? length1 : length2;
-
 		if (reqlength > 0 && reqlength < length) {
 		    length = reqlength;
 		} else if (reqlength < 0) {
 		    /*
 		     * The requested length is negative, so we ignore it by
-		     * setting it to the longer of the two lengths.
+		     * setting it to length + 1 so we correct the match var.
 		     */
-
-		    reqlength = (length1 < length2) ? length2 : length1;
-		}
-
-		if (nocase) {
-		    match = Tcl_UniCharNcasecmp(uni1, uni2, (unsigned)length);
-		} else {
-		    match = Tcl_UniCharNcmp(uni1, uni2, (unsigned)length);
+		    reqlength = length + 1;
 		}
-
+		match = strCmpFn(string1, string2, (unsigned) length);
 		if ((match == 0) && (reqlength > length)) {
 		    match = length1 - length2;
 		}
-		goto stringComparisonDone;
-	    }
-
-	    /*
-	     * Strings to be compared are not both UNICODE or byte
-	     * arrays, so we will need to convert to UTF-8 and work
-	     * there (cannot use memcmp() as that is an unsafe
-	     * operation with any string containing \u0000 and the
-	     * safety test is equivalent in cost to the comparison
-	     * itself!)
-	     */
-	    string1 = Tcl_GetStringFromObj(objv[0], &length1);
-	    string2 = Tcl_GetStringFromObj(objv[1], &length2);
-	    length1 = Tcl_NumUtfChars(string1, length1);
-	    length2 = Tcl_NumUtfChars(string2, length2);
-	    length = (length1 < length2) ? length1 : length2;
-
-	    if ((reqlength > 0) && (reqlength < length)) {
-		length = reqlength;
-	    } else if (reqlength < 0) {
-		/*
-		 * The requested length is negative, so we ignore it by
-		 * setting it to the longer of the two lengths.
-		 */
-
-		reqlength = (length1 > length2) ? length1 : length2;
-	    }
-
-	    if (nocase) {
-		match = Tcl_UtfNcasecmp(string1, string2, (unsigned) length);
-	    } else {
-		match = Tcl_UtfNcmp(string1, string2, (unsigned) length);
-	    }
-
-	    if ((match == 0) && (reqlength > length)) {
-		match = length1 - length2;
 	    }
 
-	stringComparisonDone:
 	    if ((enum options) index == STR_EQUAL) {
 		Tcl_SetBooleanObj(resultPtr, (match) ? 0 : 1);
 	    } else {
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index f6d2ea2..bbe2728 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -11,7 +11,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclExecute.c,v 1.53 2002/04/18 13:04:20 msofer Exp $
+ * RCS: @(#) $Id: tclExecute.c,v 1.54 2002/05/29 09:09:57 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -2718,7 +2718,7 @@ TclExecuteByteCode(interp, codePtr)
 		/*
 		 * String compare
 		 */
-		char *s1, *s2;
+		CONST char *s1, *s2;
 		int s1len, s2len, iResult;
 
 		value2Ptr = POP_OBJECT();
@@ -2728,18 +2728,25 @@ TclExecuteByteCode(interp, codePtr)
 		 * The comparison function should compare up to the
 		 * minimum byte length only.
 		 */
-		if ((valuePtr->typePtr == &tclByteArrayType) &&
+		if (valuePtr == value2Ptr) {
+		    /*
+		     * In the pure equality case, set lengths too for
+		     * the checks below (or we could goto beyond it).
+		     */
+		    iResult = s1len = s2len = 0;
+		} else if ((valuePtr->typePtr == &tclByteArrayType) &&
 			(value2Ptr->typePtr == &tclByteArrayType)) {
 		    s1 = (char *) Tcl_GetByteArrayFromObj(valuePtr, &s1len);
 		    s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len);
 		    iResult = memcmp(s1, s2,
 			    (size_t) ((s1len < s2len) ? s1len : s2len));
-		} else if ((valuePtr->typePtr == &tclStringType) ||
-			(value2Ptr->typePtr == &tclStringType)) {
+		} else if (((valuePtr->typePtr == &tclStringType)
+			&& (valuePtr->bytes == NULL))
+			|| ((value2Ptr->typePtr == &tclStringType)
+				&& (value2Ptr->bytes == NULL))) {
 		    /*
-		     * The alternative is to break this into more code
-		     * that does type sensitive comparison, as done in
-		     * Tcl_StringObjCmd.
+		     * Do a unicode-specific comparison if one of the args
+		     * only has the unicode rep.
 		     */
 		    Tcl_UniChar *uni1, *uni2;
 		    uni1 = Tcl_GetUnicodeFromObj(valuePtr, &s1len);
@@ -2748,21 +2755,12 @@ TclExecuteByteCode(interp, codePtr)
 			    (unsigned) ((s1len < s2len) ? s1len : s2len));
 		} else {
 		    /*
-		     * This solution is less mem intensive, but it is
-		     * computationally expensive as the string grows.  The
-		     * reason that we can't use a memcmp is that UTF-8 strings
-		     * that contain a \u0000 can't be compared with memcmp.  If
-		     * we knew that the string was ascii-7 or had no null byte,
-		     * we could just do memcmp and save all the hassle.
+		     * We can't do a simple memcmp in order to handle the
+		     * special Tcl \xC0\x80 null encoding for utf-8.
 		     */
 		    s1 = Tcl_GetStringFromObj(valuePtr, &s1len);
 		    s2 = Tcl_GetStringFromObj(value2Ptr, &s2len);
-		    /*
-		     * These have to be in true chars
-		     */
-		    s1len = Tcl_NumUtfChars(s1, s1len);
-		    s2len = Tcl_NumUtfChars(s2, s2len);
-		    iResult = Tcl_UtfNcmp(s1, s2,
+		    iResult = TclpUtfNcmp2(s1, s2,
 			    (size_t) ((s1len < s2len) ? s1len : s2len));
 		}
 
@@ -2931,6 +2929,26 @@ TclExecuteByteCode(interp, codePtr)
 
 		value2Ptr = POP_OBJECT();
 		valuePtr  = POP_OBJECT();
+
+		if (valuePtr == value2Ptr) {
+		    /*
+		     * Optimize the equal object case.
+		     */
+		    switch (*pc) {
+			case INST_EQ:
+			case INST_LE:
+			case INST_GE:
+			    iResult = 1;
+			    break;
+			case INST_NEQ:
+			case INST_LT:
+			case INST_GT:
+			    iResult = 0;
+			    break;
+		    }
+		    goto foundResult;
+		}
+
 		t1Ptr = valuePtr->typePtr;
 		t2Ptr = value2Ptr->typePtr;
 
@@ -2967,30 +2985,39 @@ TclExecuteByteCode(interp, codePtr)
 		if (!IS_NUMERIC_TYPE(t1Ptr) || !IS_NUMERIC_TYPE(t2Ptr)) {
 		    /*
 		     * One operand is not numeric. Compare as strings.
-		     * NOTE: strcmp is not correct for \x00 < \x01.
+		     * NOTE: strcmp is not correct for \x00 < \x01, but
+		     * that is unlikely to occur here.  We could use the
+		     * TclUtfNCmp2 to handle this.
 		     */
-		    int cmpValue;
-		    s1 = TclGetString(valuePtr);
-		    s2 = TclGetString(value2Ptr);
-		    cmpValue = strcmp(s1, s2);
+		    int s1len, s2len;
+		    s1 = Tcl_GetStringFromObj(valuePtr, &s1len);
+		    s2 = Tcl_GetStringFromObj(value2Ptr, &s2len);
 		    switch (*pc) {
 		    case INST_EQ:
-			iResult = (cmpValue == 0);
+			if (s1len == s2len) {
+			    iResult = (strcmp(s1, s2) == 0);
+			} else {
+			    iResult = 0;
+			}
 			break;
 		    case INST_NEQ:
-			iResult = (cmpValue != 0);
+			if (s1len == s2len) {
+			    iResult = (strcmp(s1, s2) != 0);
+			} else {
+			    iResult = 1;
+			}
 			break;
 		    case INST_LT:
-			iResult = (cmpValue < 0);
+			iResult = (strcmp(s1, s2) < 0);
 			break;
 		    case INST_GT:
-			iResult = (cmpValue > 0);
+			iResult = (strcmp(s1, s2) > 0);
 			break;
 		    case INST_LE:
-			iResult = (cmpValue <= 0);
+			iResult = (strcmp(s1, s2) <= 0);
 			break;
 		    case INST_GE:
-			iResult = (cmpValue >= 0);
+			iResult = (strcmp(s1, s2) >= 0);
 			break;
 		    }
 		} else if ((t1Ptr == &tclDoubleType)
@@ -3094,7 +3121,7 @@ TclExecuteByteCode(interp, codePtr)
 		/*
 		 * Reuse the valuePtr object already on stack if possible.
 		 */
-		
+		foundResult:
 		TRACE(("%.20s %.20s => %ld\n",
 		       O2S(valuePtr), O2S(value2Ptr), iResult));
 		if (Tcl_IsShared(valuePtr)) {
diff --git a/generic/tclInt.decls b/generic/tclInt.decls
index b4d07fa..094b9df 100644
--- a/generic/tclInt.decls
+++ b/generic/tclInt.decls
@@ -12,7 +12,7 @@
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 # 
-# RCS: @(#) $Id: tclInt.decls,v 1.49 2002/05/29 00:19:40 hobbs Exp $
+# RCS: @(#) $Id: tclInt.decls,v 1.50 2002/05/29 09:09:57 hobbs Exp $
 
 library tcl
 
@@ -666,6 +666,11 @@ declare 168 generic {
     Tcl_Obj *TclGetStartupScriptPath(void)
 }
 
+# variant of Tcl_UtfNCmp that takes n as bytes, not chars
+declare 169 generic {
+    int TclpUtfNcmp2(CONST char *s1, CONST char *s2, unsigned long n)
+}
+
 
 ##############################################################################
 
diff --git a/generic/tclIntDecls.h b/generic/tclIntDecls.h
index a16cc73..db179b9 100644
--- a/generic/tclIntDecls.h
+++ b/generic/tclIntDecls.h
@@ -11,7 +11,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclIntDecls.h,v 1.40 2002/05/29 00:19:40 hobbs Exp $
+ * RCS: @(#) $Id: tclIntDecls.h,v 1.41 2002/05/29 09:09:57 hobbs Exp $
  */
 
 #ifndef _TCLINTDECLS
@@ -503,6 +503,9 @@ EXTERN void		TclSetStartupScriptPath _ANSI_ARGS_((
 				Tcl_Obj * pathPtr));
 /* 168 */
 EXTERN Tcl_Obj *	TclGetStartupScriptPath _ANSI_ARGS_((void));
+/* 169 */
+EXTERN int		TclpUtfNcmp2 _ANSI_ARGS_((CONST char * s1, 
+				CONST char * s2, unsigned long n));
 
 typedef struct TclIntStubs {
     int magic;
@@ -709,6 +712,7 @@ typedef struct TclIntStubs {
     int (*tclListObjSetElement) _ANSI_ARGS_((Tcl_Interp * interp, Tcl_Obj * listPtr, int index, Tcl_Obj * valuePtr)); /* 166 */
     void (*tclSetStartupScriptPath) _ANSI_ARGS_((Tcl_Obj * pathPtr)); /* 167 */
     Tcl_Obj * (*tclGetStartupScriptPath) _ANSI_ARGS_((void)); /* 168 */
+    int (*tclpUtfNcmp2) _ANSI_ARGS_((CONST char * s1, CONST char * s2, unsigned long n)); /* 169 */
 } TclIntStubs;
 
 #ifdef __cplusplus
@@ -1325,6 +1329,10 @@ extern TclIntStubs *tclIntStubsPtr;
 #define TclGetStartupScriptPath \
 	(tclIntStubsPtr->tclGetStartupScriptPath) /* 168 */
 #endif
+#ifndef TclpUtfNcmp2
+#define TclpUtfNcmp2 \
+	(tclIntStubsPtr->tclpUtfNcmp2) /* 169 */
+#endif
 
 #endif /* defined(USE_TCL_STUBS) && !defined(USE_TCL_STUB_PROCS) */
 
diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c
index be2902e..90f51ba 100644
--- a/generic/tclStubInit.c
+++ b/generic/tclStubInit.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclStubInit.c,v 1.70 2002/05/24 21:19:06 dkf Exp $
+ * RCS: @(#) $Id: tclStubInit.c,v 1.71 2002/05/29 09:09:57 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -248,6 +248,7 @@ TclIntStubs tclIntStubs = {
     TclListObjSetElement, /* 166 */
     TclSetStartupScriptPath, /* 167 */
     TclGetStartupScriptPath, /* 168 */
+    TclpUtfNcmp2, /* 169 */
 };
 
 TclIntPlatStubs tclIntPlatStubs = {
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 0148ca6..56dcaca 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclUtf.c,v 1.23 2002/02/08 02:52:54 dgp Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.24 2002/05/29 09:09:57 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -309,7 +309,7 @@ Tcl_UtfToUniChar(str, chPtr)
 	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
 	 * characters representing themselves.
 	 */
-	 
+
 	*chPtr = (Tcl_UniChar) byte;
 	return 1;
     } else if (byte < 0xE0) {
@@ -317,7 +317,7 @@ Tcl_UtfToUniChar(str, chPtr)
 	    /*
 	     * Two-byte-character lead-byte followed by a trail-byte.
 	     */
-	     
+
 	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
 	    return 2;
 	}
@@ -325,7 +325,7 @@ Tcl_UtfToUniChar(str, chPtr)
 	 * A two-byte-character lead-byte not followed by trail-byte
 	 * represents itself.
 	 */
-	 
+
 	*chPtr = (Tcl_UniChar) byte;
 	return 1;
     } else if (byte < 0xF0) {
@@ -671,7 +671,7 @@ Tcl_UtfPrev(str, start)
 	byte = *((unsigned char *) look);
 	if (byte < 0x80) {
 	    break;
-	} 
+	}
 	if (byte >= 0xC0) {
 	    return look;
 	}
@@ -1074,6 +1074,51 @@ Tcl_UtfToTitle(str)
 /*
  *----------------------------------------------------------------------
  *
+ * TclpUtfNcmp2 --
+ *
+ *	Compare at most n bytes of utf-8 strings cs and ct.  Both cs
+ *	and ct are assumed to be at least n bytes long.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclpUtfNcmp2(cs, ct, n)
+    CONST char *cs;		/* UTF string to compare to ct. */
+    CONST char *ct;		/* UTF string cs is compared to. */
+    unsigned long n;		/* Number of *bytes* to compare. */
+{
+    /*
+     * We can't simply call 'memcmp(cs, ct, n);' because we need to check
+     * for Tcl's \xC0\x80 non-utf-8 null encoding.
+     * Otherwise utf-8 lexes fine in the strcmp manner.
+     */
+    register int result = 0;
+
+    for ( ; n != 0; n--, cs++, ct++) {
+	if (*cs != *ct) {
+	    result = UCHAR(*cs) - UCHAR(*ct);
+	    break;
+	}
+    }
+    if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
+	unsigned char c1, c2;
+	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
+	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
+	result = (c1 - c2);
+    }
+    return result;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
  * Tcl_UtfNcmp --
  *
  *	Compare at most n UTF chars of string cs to string ct.  Both cs
@@ -1096,7 +1141,7 @@ Tcl_UtfNcmp(cs, ct, n)
 {
     Tcl_UniChar ch1, ch2;
     /*
-     * Cannot use memcmp()-based approach as byte representation of
+     * Cannot use 'memcmp(cs, ct, n);' as byte representation of
      * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
      * representation of \u0001 (the byte 0x01.)
      */
@@ -1306,13 +1351,14 @@ Tcl_UniCharNcmp(cs, ct, n)
     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
     unsigned long n;			/* Number of unichars to compare. */
 {
+    /*
+     * We can't simply call 'memcmp(cs, ct, n*sizeof(Tcl_UniChar));'
+     * because that may not be lexically correct.
+     */
     for ( ; n != 0; n--, cs++, ct++) {
 	if (*cs != *ct) {
 	    return (*cs - *ct);
 	}
-	if (*cs == '\0') {
-	    break;
-	}
     }
     return 0;
 }
@@ -1346,9 +1392,6 @@ Tcl_UniCharNcasecmp(cs, ct, n)
 		(Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
 	    return (*cs - *ct);
 	}
-	if (*cs == '\0') {
-	    break;
-	}
     }
     return 0;
 }
author	hobbs <hobbs>	2002-05-29 09:09:57 (GMT)
committer	hobbs <hobbs>	2002-05-29 09:09:57 (GMT)
commit	3a1941e7d007e93449aa22085687a8fafaddad7e (patch)
tree	9794bcbc0e094b9645add8339f0278666543ef7e
parent	514dd2a61babad4abc78895bdadec335dd9b4c71 (diff)
download	tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.zip tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.gz tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.bz2