* generic/tclInt.decls:

* generic/tclIntDecls.h:
* generic/tclStubInit.c:
* generic/tclUtf.c: added TclpUtfNcmp2 private command that mirrors Tcl_UtfNcmp, but takes n in bytes, not utf-8 chars. This provides a faster alternative for comparing utf strings internally.
  (Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end of string check as it wasn't correct for the function (by doc and logic).
* generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal comparison code to use TclpUtfNcmp2 as well as short-circuit for equal objects or unequal length strings in the equal case. Removed the use of goto and streamlined the other parts.
* generic/tclExecute.c (TclExecuteByteCode): added check for object equality in the comparison instructions. Added short-circuit for != length strings in INST_EQ, INST_NEQ and INST_STR_CMP. Reworked INST_STR_CMP to use TclpUtfNcmp2 where appropriate, and only use Tcl_UniCharNcmp when at least one of the objects is a Unicode obj with no utf bytes.
author: hobbs <hobbs> 2002-05-29 09:09:57 (GMT)
committer: hobbs <hobbs> 2002-05-29 09:09:57 (GMT)
commit: 3a1941e7d007e93449aa22085687a8fafaddad7e (patch)
tree: 9794bcbc0e094b9645add8339f0278666543ef7e /generic/tclUtf.c
parent: 514dd2a61babad4abc78895bdadec335dd9b4c71 (diff)
download: tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.zip
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.gz
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.bz2
1 files changed, 55 insertions, 12 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 0148ca6..56dcaca 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclUtf.c,v 1.23 2002/02/08 02:52:54 dgp Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.24 2002/05/29 09:09:57 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -309,7 +309,7 @@ Tcl_UtfToUniChar(str, chPtr)
 	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
 	 * characters representing themselves.
 	 */
-	 
+
 	*chPtr = (Tcl_UniChar) byte;
 	return 1;
     } else if (byte < 0xE0) {
@@ -317,7 +317,7 @@ Tcl_UtfToUniChar(str, chPtr)
 	    /*
 	     * Two-byte-character lead-byte followed by a trail-byte.
 	     */
-	     
+
 	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
 	    return 2;
 	}
@@ -325,7 +325,7 @@ Tcl_UtfToUniChar(str, chPtr)
 	 * A two-byte-character lead-byte not followed by trail-byte
 	 * represents itself.
 	 */
-	 
+
 	*chPtr = (Tcl_UniChar) byte;
 	return 1;
     } else if (byte < 0xF0) {
@@ -671,7 +671,7 @@ Tcl_UtfPrev(str, start)
 	byte = *((unsigned char *) look);
 	if (byte < 0x80) {
 	    break;
-	} 
+	}
 	if (byte >= 0xC0) {
 	    return look;
 	}
@@ -1074,6 +1074,51 @@ Tcl_UtfToTitle(str)
 /*
  *----------------------------------------------------------------------
  *
+ * TclpUtfNcmp2 --
+ *
+ *	Compare at most n bytes of utf-8 strings cs and ct.  Both cs
+ *	and ct are assumed to be at least n bytes long.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclpUtfNcmp2(cs, ct, n)
+    CONST char *cs;		/* UTF string to compare to ct. */
+    CONST char *ct;		/* UTF string cs is compared to. */
+    unsigned long n;		/* Number of *bytes* to compare. */
+{
+    /*
+     * We can't simply call 'memcmp(cs, ct, n);' because we need to check
+     * for Tcl's \xC0\x80 non-utf-8 null encoding.
+     * Otherwise utf-8 lexes fine in the strcmp manner.
+     */
+    register int result = 0;
+
+    for ( ; n != 0; n--, cs++, ct++) {
+	if (*cs != *ct) {
+	    result = UCHAR(*cs) - UCHAR(*ct);
+	    break;
+	}
+    }
+    if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
+	unsigned char c1, c2;
+	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
+	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
+	result = (c1 - c2);
+    }
+    return result;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
  * Tcl_UtfNcmp --
  *
  *	Compare at most n UTF chars of string cs to string ct.  Both cs
@@ -1096,7 +1141,7 @@ Tcl_UtfNcmp(cs, ct, n)
 {
     Tcl_UniChar ch1, ch2;
     /*
-     * Cannot use memcmp()-based approach as byte representation of
+     * Cannot use 'memcmp(cs, ct, n);' as byte representation of
      * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
      * representation of \u0001 (the byte 0x01.)
      */
@@ -1306,13 +1351,14 @@ Tcl_UniCharNcmp(cs, ct, n)
     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
     unsigned long n;			/* Number of unichars to compare. */
 {
+    /*
+     * We can't simply call 'memcmp(cs, ct, n*sizeof(Tcl_UniChar));'
+     * because that may not be lexically correct.
+     */
     for ( ; n != 0; n--, cs++, ct++) {
 	if (*cs != *ct) {
 	    return (*cs - *ct);
 	}
-	if (*cs == '\0') {
-	    break;
-	}
     }
     return 0;
 }
@@ -1346,9 +1392,6 @@ Tcl_UniCharNcasecmp(cs, ct, n)
 		(Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
 	    return (*cs - *ct);
 	}
-	if (*cs == '\0') {
-	    break;
-	}
     }
     return 0;
 }
author	hobbs <hobbs>	2002-05-29 09:09:57 (GMT)
committer	hobbs <hobbs>	2002-05-29 09:09:57 (GMT)
commit	3a1941e7d007e93449aa22085687a8fafaddad7e (patch)
tree	9794bcbc0e094b9645add8339f0278666543ef7e /generic/tclUtf.c
parent	514dd2a61babad4abc78895bdadec335dd9b4c71 (diff)
download	tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.zip tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.gz tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.bz2