diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 71 |
1 files changed, 36 insertions, 35 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 525cd50..28f725a 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -3,7 +3,7 @@ * * Routines for manipulating UTF-8 strings. * - * Copyright (c) 1997-1998 Sun Microsystems, Inc. + * Copyright © 1997-1998 Sun Microsystems, Inc. * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. @@ -64,20 +64,12 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -/* End of "continuation byte section" */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 1,1,1,1,1, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + static const unsigned char complete[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -88,15 +80,9 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 3,3,3,3,3, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + /* * Functions used only in this module. */ @@ -694,7 +680,7 @@ Tcl_UtfToUniCharDString( p += TclUtfToUCS4(p, &ch); *w++ = ch; } - while ((p < endPtr) && TclUCS4Complete(p, endPtr-p)) { + while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { p += TclUtfToUCS4(p, &ch); *w++ = ch; } @@ -752,7 +738,7 @@ Tcl_UtfToChar16DString( *w++ = ch; } while (p < endPtr) { - if (TclChar16Complete(p, endPtr-p)) { + if (Tcl_UtfCharComplete(p, endPtr-p)) { p += Tcl_UtfToChar16(p, &ch); *w++ = ch; } else { @@ -833,7 +819,7 @@ Tcl_NumUtfChars( /* Pointer to the end of string. Never read endPtr[0] */ const char *endPtr = src + length; /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; + const char *optPtr = endPtr - 4; /* * Optimize away the call in this loop. Justified because... @@ -970,6 +956,10 @@ Tcl_UtfNext( const char *next; if (((*src) & 0xC0) == 0x80) { + /* Continuation byte, so we start 'inside' a (possible valid) UTF-8 + * sequence. Since we are not allowed to access src[-1], we cannot + * check if the sequence is actually valid, the best we can do is + * just assume it is valid and locate the end. */ if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) { ++src; } @@ -1064,7 +1054,7 @@ Tcl_UtfPrev( * it (the fallback) is correct. */ - || (trailBytesSeen >= complete[byte])) { + || (trailBytesSeen >= totalBytes[byte])) { /* * That is, (1 + trailBytesSeen > needed). * We've examined more bytes than needed to complete @@ -1105,19 +1095,14 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < TCL_UTF_MAX); + } while (trailBytesSeen < 4); /* - * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * We've seen 4 trail bytes, so we know there will not be a * properly formed byte sequence to find, and we can stop looking, - * accepting the fallback (for TCL_UTF_MAX > 3) or just go back as - * far as we can. + * accepting the fallback. */ -#if TCL_UTF_MAX > 3 return fallback; -#else - return src - TCL_UTF_MAX; -#endif } /* @@ -1744,7 +1729,7 @@ Tcl_UniCharToLower( /* Clear away extension bits, if any */ return ch & 0x1FFFFF; } - + /* *---------------------------------------------------------------------- * @@ -1838,7 +1823,7 @@ Tcl_UniCharNcmp( const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { -#ifdef WORDS_BIGENDIAN +#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) /* * We are definitely on a big-endian machine; memcmp() is safe */ @@ -1852,6 +1837,14 @@ Tcl_UniCharNcmp( for ( ; numChars != 0; ucs++, uct++, numChars--) { if (*ucs != *uct) { +#if TCL_UTF_MAX < 4 + /* special case for handling upper surrogates */ + if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) { + return 1; + } else if (((*uct & 0xFC00) == 0xD800)) { + return -1; + } +#endif return (*ucs - *uct); } } @@ -1889,6 +1882,14 @@ Tcl_UniCharNcasecmp( Tcl_UniChar lct = Tcl_UniCharToLower(*uct); if (lcs != lct) { +#if TCL_UTF_MAX < 4 + /* special case for handling upper surrogates */ + if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { + return 1; + } else if (((lct & 0xFC00) == 0xD800)) { + return -1; + } +#endif return (lcs - lct); } } |
