diff options
author | hobbs <hobbs> | 2002-05-29 09:09:57 (GMT) |
---|---|---|
committer | hobbs <hobbs> | 2002-05-29 09:09:57 (GMT) |
commit | 3a1941e7d007e93449aa22085687a8fafaddad7e (patch) | |
tree | 9794bcbc0e094b9645add8339f0278666543ef7e /generic/tclUtf.c | |
parent | 514dd2a61babad4abc78895bdadec335dd9b4c71 (diff) | |
download | tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.zip tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.gz tcl-3a1941e7d007e93449aa22085687a8fafaddad7e.tar.bz2 |
* generic/tclInt.decls:
* generic/tclIntDecls.h:
* generic/tclStubInit.c:
* generic/tclUtf.c: added TclpUtfNcmp2, a private function that
mirrors Tcl_UtfNcmp but takes n in bytes, not UTF-8 chars. This
provides a faster alternative for comparing UTF-8 strings internally.
(Tcl_UniCharNcmp, Tcl_UniCharNcasecmp): removed the explicit end
of string check as it wasn't correct for the function (by doc and
logic).
* generic/tclCmdMZ.c (Tcl_StringObjCmd): reworked the string equal
comparison code to use TclpUtfNcmp2 as well as short-circuit for
equal objects or unequal length strings in the equal case.
Removed the use of goto and streamlined the other parts.
* generic/tclExecute.c (TclExecuteByteCode): added check for
object equality in the comparison instructions. Added
short-circuit for unequal-length strings in INST_EQ, INST_NEQ and
INST_STR_CMP. Reworked INST_STR_CMP to use TclpUtfNcmp2 where
appropriate, and only use Tcl_UniCharNcmp when at least one of the
objects is a Unicode obj with no utf bytes.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 67 |
1 files changed, 55 insertions, 12 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 0148ca6..56dcaca 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUtf.c,v 1.23 2002/02/08 02:52:54 dgp Exp $ + * RCS: @(#) $Id: tclUtf.c,v 1.24 2002/05/29 09:09:57 hobbs Exp $ */ #include "tclInt.h" @@ -309,7 +309,7 @@ Tcl_UtfToUniChar(str, chPtr) * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid * characters representing themselves. */ - + *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xE0) { @@ -317,7 +317,7 @@ Tcl_UtfToUniChar(str, chPtr) /* * Two-byte-character lead-byte followed by a trail-byte. */ - + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); return 2; } @@ -325,7 +325,7 @@ Tcl_UtfToUniChar(str, chPtr) * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ - + *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xF0) { @@ -671,7 +671,7 @@ Tcl_UtfPrev(str, start) byte = *((unsigned char *) look); if (byte < 0x80) { break; - } + } if (byte >= 0xC0) { return look; } @@ -1074,6 +1074,51 @@ Tcl_UtfToTitle(str) /* *---------------------------------------------------------------------- * + * TclpUtfNcmp2 -- + * + * Compare at most n bytes of utf-8 strings cs and ct. Both cs + * and ct are assumed to be at least n bytes long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclpUtfNcmp2(cs, ct, n) + CONST char *cs; /* UTF string to compare to ct. */ + CONST char *ct; /* UTF string cs is compared to. */ + unsigned long n; /* Number of *bytes* to compare. */ +{ + /* + * We can't simply call 'memcmp(cs, ct, n);' because we need to check + * for Tcl's \xC0\x80 non-utf-8 null encoding. 
+ * Otherwise utf-8 lexes fine in the strcmp manner. + */ + register int result = 0; + + for ( ; n != 0; n--, cs++, ct++) { + if (*cs != *ct) { + result = UCHAR(*cs) - UCHAR(*ct); + break; + } + } + if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { + unsigned char c1, c2; + c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); + c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); + result = (c1 - c2); + } + return result; +} + +/* + *---------------------------------------------------------------------- + * * Tcl_UtfNcmp -- * * Compare at most n UTF chars of string cs to string ct. Both cs @@ -1096,7 +1141,7 @@ Tcl_UtfNcmp(cs, ct, n) { Tcl_UniChar ch1, ch2; /* - * Cannot use memcmp()-based approach as byte representation of + * Cannot use 'memcmp(cs, ct, n);' as byte representation of * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte * representation of \u0001 (the byte 0x01.) */ @@ -1306,13 +1351,14 @@ Tcl_UniCharNcmp(cs, ct, n) CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ unsigned long n; /* Number of unichars to compare. */ { + /* + * We can't simply call 'memcmp(cs, ct, n*sizeof(Tcl_UniChar));' + * because that may not be lexically correct. + */ for ( ; n != 0; n--, cs++, ct++) { if (*cs != *ct) { return (*cs - *ct); } - if (*cs == '\0') { - break; - } } return 0; } @@ -1346,9 +1392,6 @@ Tcl_UniCharNcasecmp(cs, ct, n) (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) { return (*cs - *ct); } - if (*cs == '\0') { - break; - } } return 0; } |