diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 556 |
1 files changed, 418 insertions, 138 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index ce80bd0..8bc4d49 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -67,25 +67,13 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 3,3,3,3,3, /* Tcl_UtfCharComplete() only checks TCL_UTF_MAX bytes */ -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - -/* - * Functions used only in this module. - */ - -static int UtfCount(int ch); /* *--------------------------------------------------------------------------- * - * UtfCount -- + * TclUtfCount -- * * Find the number of bytes in the Utf character "ch". * @@ -98,8 +86,8 @@ static int UtfCount(int ch); *--------------------------------------------------------------------------- */ -static inline int -UtfCount( +int +TclUtfCount( int ch) /* The Unicode character whose size is returned. */ { if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { @@ -108,11 +96,9 @@ UtfCount( if (ch <= 0x7FF) { return 2; } -#if TCL_UTF_MAX > 3 if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { return 4; } -#endif return 3; } @@ -124,6 +110,19 @@ UtfCount( * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the * provided buffer. Equivalent to Plan 9 runetochar(). * + * Special handling of Surrogate pairs is handled as follows: + * When this function is called for ch being a high surrogate, + * the first byte of the 4-byte UTF-8 sequence is produced and + * the function returns 1. Calling the function again with a + * low surrogate, the remaining 3 bytes of the 4-byte UTF-8 + * sequence is produced, and the function returns 3. The buffer + * is used to remember the high surrogate between the two calls. + * + * If no low surrogate follows the high surrogate (which is actually + * illegal), this can be handled reasonably by calling Tcl_UniCharToUtf + * again with ch = -1. This will produce a 3-byte UTF-8 sequence + * representing the high surrogate. + * * Results: * The return values is the number of bytes in the buffer that were * consumed. @@ -141,7 +140,7 @@ Tcl_UniCharToUtf( char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. Buffer must be * large enough to hold the UTF-8 character - * (at most TCL_UTF_MAX bytes). */ + * (at most 4 bytes). */ { if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; @@ -154,7 +153,6 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX > 3 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ @@ -176,11 +174,8 @@ Tcl_UniCharToUtf( return 1; } } -#endif goto three; } - -#if TCL_UTF_MAX > 3 if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); @@ -198,7 +193,6 @@ Tcl_UniCharToUtf( buf[-1] = (char) ((ch >> 12) | 0xE0); return 2; } -#endif } ch = 0xFFFD; @@ -227,25 +221,35 @@ three: *--------------------------------------------------------------------------- */ +#undef Tcl_UniCharToUtfDString char * Tcl_UniCharToUtfDString( - const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ - int uniLength, /* Length of Unicode string in Tcl_UniChars - * (must be >= 0). */ + const int *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string. */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - const Tcl_UniChar *w, *wEnd; + const int *w, *wEnd; char *p, *string; int oldLength; /* - * UTF-8 string length in bytes will be <= Unicode string length * - * TCL_UTF_MAX. + * UTF-8 string length in bytes will be <= Unicode string length * 4. */ + if (uniStr == NULL) { + return NULL; + } + if (uniLength < 0) { + uniLength = 0; + w = uniStr; + while (*w != '\0') { + uniLength++; + w++; + } + } oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * TCL_UTF_MAX); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); string = Tcl_DStringValue(dsPtr) + oldLength; p = string; @@ -259,6 +263,59 @@ Tcl_UniCharToUtfDString( return string; } +char * +Tcl_Char16ToUtfDString( + const unsigned short *uniStr,/* Utf-16 string to convert to UTF-8. */ + int uniLength, /* Length of Utf-16 string. */ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ +{ + const unsigned short *w, *wEnd; + char *p, *string; + int oldLength, len = 1; + + /* + * UTF-8 string length in bytes will be <= Utf16 string length * 3. + */ + + if (uniStr == NULL) { + return NULL; + } + if (uniLength < 0) { + + uniLength = 0; + w = uniStr; + while (*w != '\0') { + uniLength++; + w++; + } + } + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 3); + string = Tcl_DStringValue(dsPtr) + oldLength; + + p = string; + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { + if (!len && ((*w & 0xFC00) != 0xDC00)) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + len = Tcl_UniCharToUtf(*w, p); + p += len; + if ((*w >= 0xD800) && (len < 3)) { + len = 0; /* Indication that high surrogate was found */ + } + w++; + } + if (!len) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); + + return string; +} /* *--------------------------------------------------------------------------- * @@ -278,8 +335,8 @@ Tcl_UniCharToUtfDString( * If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done: * For any UTF-8 string containing a character outside of the BMP, the * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 0. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 4. Because *chPtr + * and generate a return value of 1. Calling Tcl_UtfToUniChar again + * will produce the low surrogate and a return value of 3. Because *chPtr * is used to remember whether the high surrogate is already produced, it * is recommended to initialize the variable it points to as 0 before * the first call to Tcl_UtfToUniChar is done. @@ -294,27 +351,120 @@ Tcl_UniCharToUtfDString( *--------------------------------------------------------------------------- */ +static const unsigned short cp1252[32] = { + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 +}; + +#undef Tcl_UtfToUniChar int Tcl_UtfToUniChar( - register const char *src, /* The UTF-8 string. */ - register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by + const char *src, /* The UTF-8 string. */ + int *chPtr)/* Filled with the unsigned int represented by * the UTF-8 string. */ { - Tcl_UniChar byte; + int byte; /* - * Unroll 1 to 3 (or 4) byte UTF-8 sequences. + * Unroll 1 to 4 byte UTF-8 sequences. */ byte = *((unsigned char *) src); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid + * characters representing themselves. + */ + + if ((unsigned)(byte-0x80) < (unsigned)0x20) { + *chPtr = cp1252[byte-0x80]; + } else { + *chPtr = byte; + } + return 1; + } else if (byte < 0xE0) { + if ((src[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { + return 2; + } + } + + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + } else if (byte < 0xF0) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + if (*chPtr > 0x7FF) { + return 3; + } + } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ + } + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { + return 4; + } + } + + /* + * A four-byte-character lead-byte not followed by three trail-bytes + * represents itself. + */ + } + + *chPtr = byte; + return 1; +} + +int +Tcl_UtfToChar16( + const char *src, /* The UTF-8 string. */ + unsigned short *chPtr)/* Filled with the unsigned short represented by + * the UTF-8 string. */ +{ + unsigned short byte; + + /* + * Unroll 1 to 4 byte UTF-8 sequences. + */ + + byte = *((unsigned char *) src); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid * characters representing themselves. */ -#if TCL_UTF_MAX <= 4 /* If *chPtr contains a high surrogate (produced by a previous * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation * bytes, then we must produce a follow-up low surrogate. We only @@ -327,8 +477,11 @@ Tcl_UtfToUniChar( *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; return 3; } -#endif - *chPtr = byte; + if ((unsigned)(byte-0x80) < (unsigned)0x20) { + *chPtr = cp1252[byte-0x80]; + } else { + *chPtr = byte; + } return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { @@ -369,8 +522,7 @@ Tcl_UtfToUniChar( /* * Four-byte-character lead byte followed by three trail bytes. */ -#if TCL_UTF_MAX <= 4 - Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + unsigned short high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; if (high >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ @@ -379,13 +531,6 @@ Tcl_UtfToUniChar( *chPtr = 0xD800 + high; return 1; } -#else - *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { - return 4; - } -#endif } /* @@ -397,7 +542,7 @@ Tcl_UtfToUniChar( *chPtr = byte; return 1; } - + /* *--------------------------------------------------------------------------- * @@ -416,7 +561,8 @@ Tcl_UtfToUniChar( *--------------------------------------------------------------------------- */ -Tcl_UniChar * +#undef Tcl_UtfToUniCharDString +int * Tcl_UtfToUniCharDString( const char *src, /* UTF-8 string to convert to Unicode. */ int length, /* Length of UTF-8 string in bytes, or -1 for @@ -425,10 +571,13 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar ch = 0, *w, *wString; + int ch = 0, *w, *wString; const char *p, *end; int oldLength; + if (src == NULL) { + return NULL; + } if (length < 0) { length = strlen(src); } @@ -441,20 +590,20 @@ Tcl_UtfToUniCharDString( oldLength = Tcl_DStringLength(dsPtr); Tcl_DStringSetLength(dsPtr, - oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar))); - wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); + oldLength + ((length + 1) * sizeof(int))); + wString = (int *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; p = src; - end = src + length - TCL_UTF_MAX; + end = src + length - 4; while (p < end) { - p += TclUtfToUniChar(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); *w++ = ch; } - end += TCL_UTF_MAX; + end += 4; while (p < end) { if (Tcl_UtfCharComplete(p, end-p)) { - p += TclUtfToUniChar(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); } else { ch = UCHAR(*p++); } @@ -462,11 +611,65 @@ Tcl_UtfToUniCharDString( } *w = '\0'; Tcl_DStringSetLength(dsPtr, - (oldLength + ((char *) w - (char *) wString))); + oldLength + ((char *) w - (char *) wString)); + + return wString; +} + +unsigned short * +Tcl_UtfToChar16DString( + const char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ + Tcl_DString *dsPtr) /* Unicode representation of string is + * appended to this previously initialized + * DString. */ +{ + unsigned short ch = 0; + unsigned short *w, *wString; + const char *p, *end; + int oldLength; + + if (src == NULL) { + return NULL; + } + if (length < 0) { + length = strlen(src); + } + + /* + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in + * bytes. + */ + + oldLength = Tcl_DStringLength(dsPtr); + + Tcl_DStringSetLength(dsPtr, + oldLength + ((length + 1) * sizeof(unsigned short))); + wString = (unsigned short *) (Tcl_DStringValue(dsPtr) + oldLength); + + w = wString; + p = src; + end = src + length - 4; + while (p < end) { + p += Tcl_UtfToChar16(p, &ch); + *w++ = ch; + } + end += 4; + while (p < end) { + if (Tcl_UtfCharComplete(p, end-p)) { + p += Tcl_UtfToChar16(p, &ch); + } else { + ch = UCHAR(*p++); + } + *w++ = ch; + } + *w = '\0'; + Tcl_DStringSetLength(dsPtr, + oldLength + ((char *) w - (char *) wString)); return wString; } - /* *--------------------------------------------------------------------------- * @@ -515,12 +718,12 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register const char *src, /* The UTF-8 string to measure. */ + const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { Tcl_UniChar ch = 0; - register int i = 0; + int i = 0; /* * The separate implementations are faster. @@ -536,13 +739,13 @@ Tcl_NumUtfChars( } if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { - register const char *endPtr = src + length - TCL_UTF_MAX; + const char *endPtr = src + length - 4; while (src < endPtr) { src += TclUtfToUniChar(src, &ch); i++; } - endPtr += TCL_UTF_MAX; + endPtr += 4; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { src += TclUtfToUniChar(src, &ch); i++; @@ -750,17 +953,33 @@ Tcl_UtfPrev( *--------------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharAtIndex( - register const char *src, /* The UTF-8 string to dereference. */ - register int index) /* The position of the desired character. */ + const char *src, /* The UTF-8 string to dereference. */ + int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; + int fullchar = 0; +#if TCL_UTF_MAX <= 4 + int len = 0; +#endif while (index-- >= 0) { +#if TCL_UTF_MAX <= 4 + src += (len = TclUtfToUniChar(src, &ch)); +#else src += TclUtfToUniChar(src, &ch); +#endif } - return ch; + fullchar = ch; +#if TCL_UTF_MAX <= 4 + if ((ch >= 0xD800) && (len < 3)) { + /* If last Tcl_UniChar was a high surrogate, combine with low surrogate */ + (void)TclUtfToUniChar(src, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + return fullchar; } /* @@ -769,7 +988,9 @@ Tcl_UniCharAtIndex( * Tcl_UtfAtIndex -- * * Returns a pointer to the specified character (not byte) position in - * the UTF-8 string. + * the UTF-8 string. If TCL_UTF_MAX <= 4, characters > U+FFFF count as + * 2 positions, but then the pointer should never be placed between + * the two positions. * * Results: * As above. @@ -782,8 +1003,8 @@ Tcl_UniCharAtIndex( const char * Tcl_UtfAtIndex( - register const char *src, /* The UTF-8 string. */ - register int index) /* The position of the desired character. */ + const char *src, /* The UTF-8 string. */ + int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; int len = 0; @@ -792,7 +1013,7 @@ Tcl_UtfAtIndex( len = TclUtfToUniChar(src, &ch); src += len; } -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 if ((ch >= 0xD800) && (len < 3)) { /* Index points at character following high Surrogate */ src += TclUtfToUniChar(src, &ch); @@ -810,7 +1031,7 @@ Tcl_UtfAtIndex( * * Results: * Stores the bytes represented by the backslash sequence in dst and - * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes + * returns the number of bytes written to dst. At most 4 bytes * are written to dst; dst must have been large enough to accept those * bytes. If readPtr isn't NULL then it is filled in with a count of the * number of bytes in the backslash sequence. @@ -876,7 +1097,8 @@ int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, upChar; + Tcl_UniChar ch = 0; + int upChar; char *src, *dst; int len; @@ -887,7 +1109,15 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { len = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); + upChar = ch; +#if TCL_UTF_MAX <= 4 + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); + /* Combine surrogates */ + upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + upChar = Tcl_UniCharToUpper(upChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -895,7 +1125,7 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if (len < UtfCount(upChar)) { + if ((len < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -929,7 +1159,8 @@ int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, lowChar; + Tcl_UniChar ch = 0; + int lowChar; char *src, *dst; int len; @@ -940,7 +1171,15 @@ Tcl_UtfToLower( src = dst = str; while (*src) { len = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; +#if TCL_UTF_MAX <= 4 + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + lowChar = Tcl_UniCharToLower(lowChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -948,7 +1187,7 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if (len < UtfCount(lowChar)) { + if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -983,7 +1222,8 @@ int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, titleChar, lowChar; + Tcl_UniChar ch = 0; + int titleChar, lowChar; char *src, *dst; int len; @@ -996,9 +1236,17 @@ Tcl_UtfToTitle( if (*src) { len = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); + titleChar = ch; +#if TCL_UTF_MAX <= 4 + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); + /* Combine surrogates */ + titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + titleChar = Tcl_UniCharToTitle(titleChar); - if (len < UtfCount(titleChar)) { + if ((len < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1009,12 +1257,19 @@ Tcl_UtfToTitle( while (*src) { len = TclUtfToUniChar(src, &ch); lowChar = ch; +#if TCL_UTF_MAX <= 4 + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif /* Special exception for Georgian Asomtavruli chars, no titlecase. */ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { lowChar = Tcl_UniCharToLower(lowChar); } - if (len < UtfCount(lowChar)) { + if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1055,7 +1310,7 @@ TclpUtfNcmp2( * fine in the strcmp manner. */ - register int result = 0; + int result = 0; for ( ; numBytes != 0; numBytes--, cs++, ct++) { if (*cs != *ct) { @@ -1114,7 +1369,7 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* Surrogates always report higher than non-surrogates */ if (((ch1 & 0xFC00) == 0xD800)) { if ((ch2 & 0xFC00) != 0xD800) { @@ -1165,7 +1420,7 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* Surrogates always report higher than non-surrogates */ if (((ch1 & 0xFC00) == 0xD800)) { if ((ch2 & 0xFC00) != 0xD800) { @@ -1184,6 +1439,52 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { +#if TCL_UTF_MAX <= 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- @@ -1214,7 +1515,7 @@ TclUtfCasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* Surrogates always report higher than non-surrogates */ if (((ch1 & 0xFC00) == 0xD800)) { if ((ch2 & 0xFC00) != 0xD800) { @@ -1251,16 +1552,18 @@ TclUtfCasecmp( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToUpper( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); + if (GetCaseType(info) & 0x04) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1279,17 +1582,19 @@ Tcl_UniCharToUpper( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if ((mode & 0x02) && (mode != 0x7)) { - ch += GetDelta(info); + if ((mode & 0x02) && (mode != 0x7)) { + ch += GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1308,25 +1613,27 @@ Tcl_UniCharToLower( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToTitle( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ - if (mode != 0x7) { - ch += ((mode & 0x4) ? -1 : 1); + if (mode != 0x7) { + ch += ((mode & 0x4) ? -1 : 1); + } + } else if (mode == 0x4) { + ch -= GetDelta(info); } - } else if (mode == 0x4) { - ch -= GetDelta(info); } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1460,11 +1767,9 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -1488,11 +1793,9 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -1516,7 +1819,6 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { ch &= 0x1FFFFF; if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) { @@ -1527,7 +1829,6 @@ Tcl_UniCharIsControl( } return 0; } -#endif return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -1551,11 +1852,9 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -1579,12 +1878,10 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { ch &= 0x1FFFFF; return (ch >= 0xE0100) && (ch <= 0xE01EF); } -#endif return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -1608,11 +1905,9 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -1636,12 +1931,10 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { ch &= 0x1FFFFF; return (ch >= 0xE0100) && (ch <= 0xE01EF); } -#endif return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -1665,11 +1958,9 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -1693,13 +1984,8 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 /* Ignore upper 11 bits. */ ch &= 0x1FFFFF; -#else - /* Ignore upper 16 bits. */ - ch &= 0xFFFF; -#endif /* * If the character is within the first 127 characters, just use the @@ -1708,10 +1994,8 @@ Tcl_UniCharIsSpace( if (ch < 0x80) { return TclIsSpaceProc((char) ch); -#if TCL_UTF_MAX > 3 } else if (UNICODE_OUT_OF_RANGE(ch)) { return 0; -#endif } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { return 1; @@ -1740,11 +2024,9 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -1768,11 +2050,9 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((WORD_BITS >> GetCategory(ch)) & 1); } |