summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorhobbs <hobbs>2002-05-29 09:09:57 (GMT)
committerhobbs <hobbs>2002-05-29 09:09:57 (GMT)
commit3a1941e7d007e93449aa22085687a8fafaddad7e (patch)
tree9794bcbc0e094b9645add8339f0278666543ef7e /generic/tclUtf.c
parent514dd2a61babad4abc78895bdadec335dd9b4c71 (diff)
downloadtcl-3a1941e7d007e93449aa22085687a8fafaddad7e.zip
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.gz
tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.bz2
* generic/tclInt.decls:
* generic/tclIntDecls.h:
* generic/tclStubInit.c:
* generic/tclUtf.c: added TclpUtfNcmp2, a private function that mirrors Tcl_UtfNcmp but takes n in bytes, not utf-8 chars. This provides a faster alternative for comparing utf strings internally. (Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end-of-string check, as it wasn't correct for these functions (by doc and logic).
* generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal comparison code to use TclpUtfNcmp2, and to short-circuit for equal objects or strings of unequal length in the equal case. Removed the use of goto and streamlined the other parts.
* generic/tclExecute.c (TclExecuteByteCode): added a check for object equality in the comparison instructions. Added a short-circuit for strings of unequal length in INST_EQ, INST_NEQ and INST_STR_CMP. Reworked INST_STR_CMP to use TclpUtfNcmp2 where appropriate, and to use Tcl_UniCharNcmp only when at least one of the objects is a Unicode obj with no utf bytes.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c67
1 files changed, 55 insertions, 12 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 0148ca6..56dcaca 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclUtf.c,v 1.23 2002/02/08 02:52:54 dgp Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.24 2002/05/29 09:09:57 hobbs Exp $
*/
#include "tclInt.h"
@@ -309,7 +309,7 @@ Tcl_UtfToUniChar(str, chPtr)
* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
* characters representing themselves.
*/
-
+
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xE0) {
@@ -317,7 +317,7 @@ Tcl_UtfToUniChar(str, chPtr)
/*
* Two-byte-character lead-byte followed by a trail-byte.
*/
-
+
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
return 2;
}
@@ -325,7 +325,7 @@ Tcl_UtfToUniChar(str, chPtr)
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
-
+
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xF0) {
@@ -671,7 +671,7 @@ Tcl_UtfPrev(str, start)
byte = *((unsigned char *) look);
if (byte < 0x80) {
break;
- }
+ }
if (byte >= 0xC0) {
return look;
}
@@ -1074,6 +1074,51 @@ Tcl_UtfToTitle(str)
/*
*----------------------------------------------------------------------
*
+ * TclpUtfNcmp2 --
+ *
+ * Compare at most n bytes of utf-8 strings cs and ct. Both cs
+ * and ct are assumed to be at least n bytes long.
+ *
+ * Bytes are compared one at a time as unsigned values, stopping
+ * at the first position where the two strings differ. If the
+ * differing byte of either string is 0xC0, that string's byte is
+ * taken as 0 when it is followed by 0x80 (the two-byte sequence
+ * Tcl uses to encode the null character), so that an embedded
+ * null orders below every other byte value.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ * The magnitude is the difference of the two byte values that
+ * decided the comparison; 0 is returned when all n bytes match.
+ *
+ * Side effects:
+ * None.
+ *
+ * NOTE(review): when the first difference falls on the last of the
+ * n bytes and that byte is 0xC0, cs[1] / ct[1] is read one byte
+ * past the n-byte window. Presumably safe because callers pass
+ * NUL-terminated buffers -- confirm against callers.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclpUtfNcmp2(cs, ct, n)
+ CONST char *cs; /* UTF string to compare to ct. */
+ CONST char *ct; /* UTF string cs is compared to. */
+ unsigned long n; /* Number of *bytes* (not characters) to compare. */
+{
+ /*
+ * We can't simply call 'memcmp(cs, ct, n);' because we need to check
+ * for Tcl's \xC0\x80 non-utf-8 null encoding.
+ * Otherwise utf-8 lexes fine in the strcmp manner.
+ *
+ * The loop below walks both strings in step and leaves cs/ct
+ * pointing at the first differing byte, with n still counting that
+ * byte. If the loop runs to completion n is 0 and result keeps its
+ * initial value of 0 (the strings are equal over n bytes).
+ */
+ register int result = 0;
+
+ for ( ; n != 0; n--, cs++, ct++) {
+ if (*cs != *ct) {
+ result = UCHAR(*cs) - UCHAR(*ct); /* provisional plain byte difference */
+ break; /* skips the n-- step, so n stays non-zero */
+ }
+ }
+ if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { /* n != 0 only after an early break */
+ unsigned char c1, c2;
+ c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); /* \xC0\x80 counts as 0 */
+ c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); /* \xC0\x80 counts as 0 */
+ result = (c1 - c2); /* overrides the provisional difference; operands promote to int */
+ }
+ return result;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UtfNcmp --
*
* Compare at most n UTF chars of string cs to string ct. Both cs
@@ -1096,7 +1141,7 @@ Tcl_UtfNcmp(cs, ct, n)
{
Tcl_UniChar ch1, ch2;
/*
- * Cannot use memcmp()-based approach as byte representation of
+ * Cannot use 'memcmp(cs, ct, n);' as byte representation of
* \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
* representation of \u0001 (the byte 0x01.)
*/
@@ -1306,13 +1351,14 @@ Tcl_UniCharNcmp(cs, ct, n)
CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
unsigned long n; /* Number of unichars to compare. */
{
+ /*
+ * We can't simply call 'memcmp(cs, ct, n*sizeof(Tcl_UniChar));'
+ * because that may not be lexically correct.
+ */
for ( ; n != 0; n--, cs++, ct++) {
if (*cs != *ct) {
return (*cs - *ct);
}
- if (*cs == '\0') {
- break;
- }
}
return 0;
}
@@ -1346,9 +1392,6 @@ Tcl_UniCharNcasecmp(cs, ct, n)
(Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
return (*cs - *ct);
}
- if (*cs == '\0') {
- break;
- }
}
return 0;
}