diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2018-10-28 19:36:34 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2018-10-28 19:36:34 (GMT) |
| commit | 1f6c360968fab22aa02a30a72c196a8f8eb19c0c (patch) | |
| tree | c490d9cba824ba803b713fe06b6ccfb7f98caca3 /generic/tclUtf.c | |
| parent | 63b8d1466dca7693cf0f6b61d39fada6fd1e0e7f (diff) | |
| parent | ccb97d88ffefe602e7eb5a9610bd356d66bc2f20 (diff) | |
| download | tcl-1f6c360968fab22aa02a30a72c196a8f8eb19c0c.zip tcl-1f6c360968fab22aa02a30a72c196a8f8eb19c0c.tar.gz tcl-1f6c360968fab22aa02a30a72c196a8f8eb19c0c.tar.bz2 | |
Merge tip-468 branch
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 544 |
1 files changed, 376 insertions, 168 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b33bf6a..ce67db7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -68,11 +68,7 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 4,4,4,4,4,4,4,4, -#else - 1,1,1,1,1,1,1,1, -#endif 1,1,1,1,1,1,1,1 }; @@ -94,7 +90,7 @@ static const unsigned char totalBytes[256] = { int TclUtfCount( - int ch) /* The Tcl_UniChar whose size is returned. */ + int ch) /* The Unicode character whose size is returned. */ { if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { return 1; @@ -102,11 +98,9 @@ TclUtfCount( if (ch <= 0x7FF) { return 2; } -#if TCL_UTF_MAX > 3 - if (((unsigned)(ch - 0x10000) <= 0xfffff)) { + if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { return 4; } -#endif return 3; } @@ -135,7 +129,7 @@ Tcl_UniCharToUtf( char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. Buffer must be * large enough to hold the UTF-8 character - * (at most TCL_UTF_MAX bytes). */ + * (at most 4 bytes). */ { if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; @@ -148,27 +142,30 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a High surrogate, so combine */ + buf[3] = (char) ((ch & 0x3F) | 0x80); + buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 4; + } + /* Previous Tcl_UniChar was not a High surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + /* Fill buffer with specific 3-byte (invalid) byte combination, + so following Low surrogate can recognize it and combine */ + buf[2] = (char) ((ch << 4) & 0x30); + buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); + buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); return 0; } } -#endif goto three; } - -#if TCL_UTF_MAX > 3 if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); @@ -176,7 +173,13 @@ Tcl_UniCharToUtf( buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } -#endif + } else if (ch == -1) { + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2) + + ((buf[2] & 0x30) >> 4); + goto three; + } } ch = 0xFFFD; @@ -215,23 +218,31 @@ Tcl_UniCharToUtfDString( { const Tcl_UniChar *w, *wEnd; char *p, *string; - int oldLength; + int oldLength, len = 1; /* - * UTF-8 string length in bytes will be <= Unicode string length * - * TCL_UTF_MAX. + * UTF-8 string length in bytes will be <= Unicode string length * 4. */ oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); string = Tcl_DStringValue(dsPtr) + oldLength; p = string; wEnd = uniStr + uniLength; for (w = uniStr; w < wEnd; ) { - p += Tcl_UniCharToUtf(*w, p); + if (!len && ((*w & 0xFC00) != 0xDC00)) { + /* Special case for handling upper surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + len = Tcl_UniCharToUtf(*w, p); + p += len; w++; } + if (!len) { + /* Special case for handling upper surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); return string; @@ -253,6 +264,15 @@ Tcl_UniCharToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * + * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done: + * For any UTF-8 string containing a character outside of the BMP, the + * first call to this function will fill *chPtr with the high surrogate + * and generate a return value of 0. Calling Tcl_UtfToUniChar again + * will produce the low surrogate and a return value of 4. Because *chPtr + * is used to remember whether the high surrogate is already produced, it + * is recommended to initialize the variable it points to as 0 before + * the first call to Tcl_UtfToUniChar is done. + * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -263,6 +283,13 @@ Tcl_UniCharToUtfDString( *--------------------------------------------------------------------------- */ +static const unsigned short cp1252[32] = { + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 +}; + int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ @@ -272,18 +299,24 @@ Tcl_UtfToUniChar( register int byte; /* - * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. + * Unroll 1 to 3 (or 4) byte UTF-8 sequences. */ byte = *((unsigned char *) src); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid * characters representing themselves. */ - *chPtr = (Tcl_UniChar) byte; + if ((unsigned)(byte-0x80) < (unsigned) 0x20) { + *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; + } else { + *chPtr = (Tcl_UniChar) byte; + } return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { @@ -292,7 +325,9 @@ Tcl_UtfToUniChar( */ *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); - return 2; + if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { + return 2; + } } /* @@ -307,7 +342,9 @@ Tcl_UtfToUniChar( *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); - return 3; + if (*chPtr > 0x7FF) { + return 3; + } } /* @@ -315,24 +352,42 @@ Tcl_UtfToUniChar( * represents itself. */ } -#if TCL_UTF_MAX > 3 else if (byte < 0xF8) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { /* * Four-byte-character lead byte followed by three trail bytes. */ - - *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12) +#if TCL_UTF_MAX <= 4 + Tcl_UniChar surrogate; + + byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; + surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); + if (byte & 0x100000) { + /* out of range, < 0x10000 or > 0x10ffff */ + } else if (*chPtr != surrogate) { + /* produce high surrogate, but don't advance source pointer */ + *chPtr = surrogate; + return 0; + } else { + /* produce low surrogate, and advance source pointer */ + *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); + return 4; + } +#else + *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - return 4; + if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { + return 4; + } +#endif } /* - * A three-byte-character lead-byte not followed by two trail-bytes + * A four-byte-character lead-byte not followed by two trail-bytes * represents itself. */ } -#endif *chPtr = (Tcl_UniChar) byte; return 1; @@ -365,7 +420,7 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar *w, *wString; + Tcl_UniChar ch = 0, *w, *wString; const char *p, *end; int oldLength; @@ -379,20 +434,32 @@ Tcl_UtfToUniCharDString( */ oldLength = Tcl_DStringLength(dsPtr); -/* TODO: fix overreach! */ + Tcl_DStringSetLength(dsPtr, - (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); + oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; - end = src + length; - for (p = src; p < end; ) { - p += TclUtfToUniChar(p, w); - w++; + p = src; + end = src + length - 4; + while (p < end) { + p += TclUtfToUniChar(p, &ch); + *w++ = ch; + } + end += 4; + while (p < end) { + if (Tcl_UtfCharComplete(p, end-p)) { + p += TclUtfToUniChar(p, &ch); + } else if ((unsigned)((UCHAR(*p)-0x80)) < (unsigned) 0x20) { + ch = (Tcl_UniChar) cp1252[UCHAR(*p++)-0x80]; + } else { + ch = UCHAR(*p++); + } + *w++ = ch; } *w = '\0'; Tcl_DStringSetLength(dsPtr, - (oldLength + ((char *) w - (char *) wString))); + oldLength + ((char *) w - (char *) wString)); return wString; } @@ -422,10 +489,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - int ch; - - ch = *((unsigned char *) src); - return length >= totalBytes[ch]; + return length >= totalBytes[(unsigned char)*src]; } /* @@ -452,9 +516,8 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { - Tcl_UniChar ch; - register Tcl_UniChar *chPtr = &ch; - register int i; + Tcl_UniChar ch = 0; + register int i = 0; /* * The separate implementations are faster. @@ -463,26 +526,27 @@ Tcl_NumUtfChars( * single-byte char case specially. */ - i = 0; if (length < 0) { while (*src != '\0') { - src += TclUtfToUniChar(src, chPtr); + src += TclUtfToUniChar(src, &ch); i++; } + if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { - register int n; + register const char *endPtr = src + length - 4; - while (length > 0) { - if (UCHAR(*src) < 0xC0) { - length--; - src++; - } else { - n = Tcl_UtfToUniChar(src, chPtr); - length -= n; - src += n; - } + while (src < endPtr) { + src += TclUtfToUniChar(src, &ch); + i++; + } + endPtr += 4; + while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { + src += TclUtfToUniChar(src, &ch); i++; } + if (src < endPtr) { + i += endPtr - src; + } } return i; } @@ -492,13 +556,13 @@ Tcl_NumUtfChars( * * Tcl_UtfFindFirst -- * - * Returns a pointer to the first occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the first occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the - * return value is NULL. + * As above. If the Unicode character does not exist in the given string, + * the return value is NULL. * * Side effects: * None. @@ -509,14 +573,21 @@ Tcl_NumUtfChars( const char * Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; - Tcl_UniChar find; + int len, fullchar; + Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX <= 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; + } +#endif + if (fullchar == ch) { return src; } if (*src == '\0') { @@ -531,12 +602,12 @@ Tcl_UtfFindFirst( * * Tcl_UtfFindLast -- * - * Returns a pointer to the last occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the last occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the + * As above. If the Unicode character does not exist in the given string, the * return value is NULL. * * Side effects: @@ -548,16 +619,23 @@ Tcl_UtfFindFirst( const char * Tcl_UtfFindLast( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; - Tcl_UniChar find; + int len, fullchar; + Tcl_UniChar find = 0; const char *last; last = NULL; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX <= 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; + } +#endif + if (fullchar == ch) { last = src; } if (*src == '\0') { @@ -591,9 +669,15 @@ const char * Tcl_UtfNext( const char *src) /* The current location in the string. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; + int len = TclUtfToUniChar(src, &ch); - return src + TclUtfToUniChar(src, &ch); +#if TCL_UTF_MAX <= 4 + if (len == 0) { + len = TclUtfToUniChar(src, &ch); + } +#endif + return src + len; } /* @@ -626,9 +710,8 @@ Tcl_UtfPrev( const char *look; int i, byte; - src--; - look = src; - for (i = 0; i < TCL_UTF_MAX; i++) { + look = --src; + for (i = 0; i < 4; i++) { if (look < start) { if (src < start) { src = start; @@ -652,7 +735,7 @@ Tcl_UtfPrev( * * Tcl_UniCharAtIndex -- * - * Returns the Unicode character represented at the specified character + * Returns the Tcl_UniChar represented at the specified character * (not byte) position in the UTF-8 string. * * Results: @@ -664,18 +747,33 @@ Tcl_UtfPrev( *--------------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharAtIndex( register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; + int fullchar = 0; +#if TCL_UTF_MAX <= 4 + int len = 1; +#endif - while (index >= 0) { - index--; + while (index-- >= 0) { +#if TCL_UTF_MAX <= 4 + src += (len = TclUtfToUniChar(src, &ch)); +#else src += TclUtfToUniChar(src, &ch); +#endif } - return ch; + fullchar = ch; +#if TCL_UTF_MAX <= 4 + if (!len) { + /* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */ + (void)TclUtfToUniChar(src, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + return fullchar; } /* @@ -684,7 +782,9 @@ Tcl_UniCharAtIndex( * Tcl_UtfAtIndex -- * * Returns a pointer to the specified character (not byte) position in - * the UTF-8 string. + * the UTF-8 string. If TCL_UTF_MAX <= 4, characters > U+FFFF count as + * 2 positions, but then the pointer should never be placed between + * the two positions. * * Results: * As above. @@ -700,12 +800,19 @@ Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; + int len = 1; - while (index > 0) { - index--; + while (index-- > 0) { + len = TclUtfToUniChar(src, &ch); + src += len; + } +#if TCL_UTF_MAX <= 4 + if (!len) { + /* Index points at character following High Surrogate */ src += TclUtfToUniChar(src, &ch); } +#endif return src; } @@ -784,7 +891,8 @@ int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { - Tcl_UniChar ch, upChar; + Tcl_UniChar ch = 0; + int upChar; char *src, *dst; int bytes; @@ -795,7 +903,16 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); + upChar = ch; +#if TCL_UTF_MAX <= 4 + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + upChar = Tcl_UniCharToUpper(upChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -803,7 +920,7 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if (bytes < TclUtfCount(upChar)) { + if ((bytes < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -837,7 +954,8 @@ int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - Tcl_UniChar ch, lowChar; + Tcl_UniChar ch = 0; + int lowChar; char *src, *dst; int bytes; @@ -848,7 +966,16 @@ Tcl_UtfToLower( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; +#if TCL_UTF_MAX <= 4 + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + lowChar = Tcl_UniCharToLower(lowChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -856,7 +983,7 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if (bytes < TclUtfCount(lowChar)) { + if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -891,7 +1018,8 @@ int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - Tcl_UniChar ch, titleChar, lowChar; + Tcl_UniChar ch = 0; + int titleChar, lowChar; char *src, *dst; int bytes; @@ -904,9 +1032,18 @@ Tcl_UtfToTitle( if (*src) { bytes = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); + titleChar = ch; +#if TCL_UTF_MAX <= 4 + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + titleChar = Tcl_UniCharToTitle(titleChar); - if (bytes < TclUtfCount(titleChar)) { + if ((bytes < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -916,9 +1053,21 @@ Tcl_UtfToTitle( } while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; +#if TCL_UTF_MAX <= 4 + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + /* Special exception for Georgian Asomtavruli chars, no titlecase. */ + if ((unsigned)(lowChar - 0x1C90) >= 0x30) { + lowChar = Tcl_UniCharToLower(lowChar); + } - if (bytes < TclUtfCount(lowChar)) { + if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -1000,11 +1149,11 @@ Tcl_UtfNcmp( const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; /* * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the - * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 + * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001 * (the byte 0x01.) */ @@ -1018,6 +1167,16 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX <= 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif return (ch1 - ch2); } } @@ -1048,7 +1207,8 @@ Tcl_UtfNcasecmp( const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1058,6 +1218,16 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX <= 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1067,11 +1237,57 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { +#if TCL_UTF_MAX <= 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- * - * Tcl_UtfNcasecmp -- + * TclUtfCasecmp -- * * Compare UTF chars of string cs to string ct case insensitively. * Replacement for strcasecmp in Tcl core, in places where UTF-8 should @@ -1091,12 +1307,22 @@ TclUtfCasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct) /* UTF string cs is compared to. */ { - while (*cs && *ct) { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX <= 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1124,16 +1350,18 @@ TclUtfCasecmp( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToUpper( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); + if (GetCaseType(info) & 0x04) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1152,16 +1380,19 @@ Tcl_UniCharToUpper( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (GetCaseType(info) & 0x02) { - ch += GetDelta(info); + if ((mode & 0x02) && (mode != 0x7)) { + ch += GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1180,23 +1411,27 @@ Tcl_UniCharToLower( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToTitle( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ - ch += ((mode & 0x4) ? -1 : 1); - } else if (mode == 0x4) { - ch -= GetDelta(info); + if (mode != 0x7) { + ch += ((mode & 0x4) ? -1 : 1); + } + } else if (mode == 0x4) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1330,11 +1565,9 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -1358,11 +1591,9 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -1386,18 +1617,16 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1fffff; - if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) { + ch &= 0x1FFFFF; + if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) { return 1; } - if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) { + if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) { return 1; } return 0; } -#endif return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -1421,11 +1650,9 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -1449,12 +1676,10 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1fffff; - return (ch >= 0xe0100) && (ch <= 0xe01ef); + ch &= 0x1FFFFF; + return (ch >= 0xE0100) && (ch <= 0xE01EF); } -#endif return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -1478,11 +1703,9 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -1506,12 +1729,10 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1fffff; - return (ch >= 0xe0100) && (ch <= 0xe01ef); + ch &= 0x1FFFFF; + return (ch >= 0xE0100) && (ch <= 0xE01EF); } -#endif return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -1535,11 +1756,9 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -1563,13 +1782,8 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 /* Ignore upper 11 bits. */ - ch &= 0x1fffff; -#else - /* Ignore upper 16 bits. */ - ch &= 0xffff; -#endif + ch &= 0x1FFFFF; /* * If the character is within the first 127 characters, just use the @@ -1578,12 +1792,10 @@ Tcl_UniCharIsSpace( if (ch < 0x80) { return TclIsSpaceProc((char) ch); -#if TCL_UTF_MAX > 3 } else if (UNICODE_OUT_OF_RANGE(ch)) { return 0; -#endif - } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b - || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) { + } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B + || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -1610,11 +1822,9 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -1638,11 +1848,9 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((WORD_BITS >> GetCategory(ch)) & 1); } @@ -1677,7 +1885,7 @@ Tcl_UniCharCaseMatch( * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - Tcl_UniChar ch1, p; + Tcl_UniChar ch1 = 0, p; while (1) { p = *uniPattern; |
