diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 388 |
1 files changed, 312 insertions, 76 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 80f3be8..4103eff 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,11 +64,16 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#if TCL_UTF_MAX != 4 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#endif 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 4,4,4,4,4, #else 1,1,1,1,1, @@ -82,6 +87,8 @@ static const unsigned char totalBytes[256] = { static int UtfCount(int ch); static int Invalid(const char *src); +static int UCS4ToUpper(int ch); +static int UCS4ToTitle(int ch); /* *--------------------------------------------------------------------------- @@ -99,7 +106,7 @@ static int Invalid(const char *src); *--------------------------------------------------------------------------- */ -static INLINE int +static inline int UtfCount( int ch) /* The Unicode character whose size is returned. */ { @@ -160,7 +167,7 @@ static const unsigned char bounds[28] = { #endif }; -static INLINE int +static int Invalid( const char *src) /* Points to lead byte of a UTF-8 byte sequence */ { @@ -197,7 +204,7 @@ Invalid( *--------------------------------------------------------------------------- */ -INLINE int +int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the * buffer. 
*/ @@ -217,6 +224,29 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { +#if TCL_UTF_MAX > 3 + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x0400) { + /* Low surrogate */ + if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a high surrogate, so combine */ + buf[2] = (char) ((ch & 0x3F) | 0x80); + buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 3; + } + /* Previous Tcl_UniChar was not a high surrogate, so just output */ + } else { + /* High surrogate */ + ch += 0x40; + /* Fill buffer with specific 3-byte (invalid) byte combination, + so following low surrogate can recognize it and combine */ + buf[2] = (char) ((ch << 4) & 0x30); + buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); + buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); + return 1; + } + } +#endif goto three; } @@ -228,6 +258,16 @@ Tcl_UniCharToUtf( buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } + } else if (ch == -1) { + if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0) + && ((buf[-1] & 0xF8) == 0xF0)) { + ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2) + + ((buf[1] & 0x30) >> 4); + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[-1] = (char) ((ch >> 12) | 0xE0); + return 2; + } #endif } @@ -305,6 +345,15 @@ Tcl_UniCharToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * + * If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done: + * For any UTF-8 string containing a character outside of the BMP, the + * first call to this function will fill *chPtr with the high surrogate + * and generate a return value of 1. Calling Tcl_UtfToUniChar again + * will produce the low surrogate and a return value of 3. Because *chPtr + * is used to remember whether the high surrogate is already produced, it + * is recommended to initialize the variable it points to as 0 before + * the first call to Tcl_UtfToUniChar is done. 
+ * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -335,6 +384,20 @@ Tcl_UtfToUniChar( * characters representing themselves. */ +#if TCL_UTF_MAX <= 4 + /* If *chPtr contains a high surrogate (produced by a previous + * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation + * bytes, then we must produce a follow-up low surrogate. We only + * do that if the high surrogate matches the bits we encounter. + */ + if (((byte & 0xC0) == 0x80) + && ((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) + && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) + && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))) { + *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; + return 3; + } +#endif *chPtr = byte; return 1; } else if (byte < 0xE0) { @@ -371,17 +434,30 @@ Tcl_UtfToUniChar( * represents itself. */ } -#if TCL_UTF_MAX > 3 else if (byte < 0xF5) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* - * Four-byte-character lead byte followed by three trail bytes. + * Four-byte-character lead byte followed by at least two trail bytes. 
+ * We don't test the validity of the 3rd trail byte, see [ed29806ba] */ - *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { - return 4; +#if TCL_UTF_MAX <= 4 + Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + | ((src[2] & 0x3F) >> 4)) - 0x40; + if (high < 0x400) { + /* produce high surrogate, advance source pointer */ + *chPtr = 0xD800 + high; + return 1; } + /* out of range, < 0x10000 or > 0x10FFFF */ +#else + if ((src[3] & 0xC0) == 0x80) { + *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { + return 4; + } + } +#endif } /* @@ -389,7 +465,6 @@ Tcl_UtfToUniChar( * represents itself. */ } -#endif *chPtr = byte; return 1; @@ -422,13 +497,13 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar *w, *wString; + Tcl_UniChar ch = 0, *w, *wString; const char *p; int oldLength; - /* Pointer to the end of string. Never read endPtr[0] */ - const char *endPtr = src + length; - /* Pointer to breakpoint in scan where optimization is lost */ - const char *optPtr = endPtr - TCL_UTF_MAX; + /* Pointer to the end of string. 
Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to last byte where optimization still can be used */ + const char *optPtr = endPtr - TCL_UTF_MAX; if (length < 0) { length = strlen(src); @@ -450,11 +525,12 @@ Tcl_UtfToUniCharDString( endPtr = src + length; optPtr = endPtr - TCL_UTF_MAX; while (p <= optPtr) { - p += TclUtfToUniChar(p, w); - w++; + p += TclUtfToUniChar(p, &ch); + *w++ = ch; } while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { - p += TclUtfToUniChar(p, w++); + p += TclUtfToUniChar(p, &ch); + *w++ = ch; } while (p < endPtr) { *w++ = UCHAR(*p++); @@ -518,7 +594,7 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; int i = 0; if (length < 0) { @@ -544,12 +620,26 @@ Tcl_NumUtfChars( */ while (src <= optPtr /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { +#if TCL_UTF_MAX < 4 + if (((unsigned)UCHAR(*src) - 0xF0) < 5) { + /* treat F0 - F4 as single character */ + ch = 0; + src++; + } else +#endif src += TclUtfToUniChar(src, &ch); i++; } /* Loop over the remaining string where call must happen */ while (src < endPtr) { if (Tcl_UtfCharComplete(src, endPtr - src)) { +#if TCL_UTF_MAX < 4 + if (((unsigned)UCHAR(*src) - 0xF0) < 5) { + /* treat F0 - F4 as single character */ + ch = 0; + src++; + } else +#endif src += TclUtfToUniChar(src, &ch); } else { /* @@ -586,11 +676,11 @@ Tcl_NumUtfChars( const char * Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. 
*/ { while (1) { - Tcl_UniChar find; - int len = TclUtfToUniChar(src, &find); + int find, len = TclUtfToUCS4(src, &find); + if (find == ch) { return src; } @@ -628,8 +718,7 @@ Tcl_UtfFindLast( const char *last = NULL; while (1) { - Tcl_UniChar find; - int len = TclUtfToUniChar(src, &find); + int find, len = TclUtfToUCS4(src, &find); if (find == ch) { last = src; @@ -799,15 +888,19 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < TCL_UTF_MAX); + } while (trailBytesSeen < ((TCL_UTF_MAX > 4) ? 4 : 3)); /* - * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * We've seen 3 (or 4) trail bytes, so we know there will not be a * properly formed byte sequence to find, and we can stop looking, - * accepting the fallback. + * accepting the fallback (for TCL_UTF_MAX > 4) or just go back as + * far as we can. */ - +#if TCL_UTF_MAX > 4 return fallback; +#else + return src - 3; +#endif } /* @@ -832,7 +925,7 @@ Tcl_UniCharAtIndex( const char *src, /* The UTF-8 string to dereference. */ int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; TclUtfToUniChar(Tcl_UtfAtIndex(src, index), &ch); return ch; @@ -860,11 +953,19 @@ Tcl_UtfAtIndex( const char *src, /* The UTF-8 string. */ int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; + int len = 0; while (index-- > 0) { + len = TclUtfToUniChar(src, &ch); + src += len; + } +#if TCL_UTF_MAX == 4 + if ((ch >= 0xD800) && (len < 3)) { + /* Index points at character following high Surrogate */ src += TclUtfToUniChar(src, &ch); } +#endif return src; } @@ -943,7 +1044,7 @@ int Tcl_UtfToUpper( char *str) /* String to convert in place. 
*/ { - Tcl_UniChar ch, upChar; + int ch, upChar; char *src, *dst; int len; @@ -953,8 +1054,8 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { - len = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); + len = TclUtfToUCS4(src, &ch); + upChar = UCS4ToUpper(ch); /* * To keep badly formed Utf strings from getting inflated by the @@ -962,7 +1063,7 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if (len < UtfCount(upChar)) { + if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -996,7 +1097,7 @@ int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - Tcl_UniChar ch, lowChar; + int ch, lowChar; char *src, *dst; int len; @@ -1006,8 +1107,8 @@ Tcl_UtfToLower( src = dst = str; while (*src) { - len = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + len = TclUtfToUCS4(src, &ch); + lowChar = TclUCS4ToLower(ch); /* * To keep badly formed Utf strings from getting inflated by the @@ -1015,7 +1116,7 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if (len < UtfCount(lowChar)) { + if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1050,7 +1151,7 @@ int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - Tcl_UniChar ch, titleChar, lowChar; + int ch, titleChar, lowChar; char *src, *dst; int len; @@ -1062,10 +1163,10 @@ Tcl_UtfToTitle( src = dst = str; if (*src) { - len = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); + len = TclUtfToUCS4(src, &ch); + titleChar = UCS4ToTitle(ch); - if (len < UtfCount(titleChar)) { + if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1074,14 +1175,14 @@ Tcl_UtfToTitle( src += len; } while (*src) { - len = TclUtfToUniChar(src, &ch); + len = TclUtfToUCS4(src, &ch); lowChar = ch; /* Special exception for Georgian Asomtavruli chars, no titlecase. 
*/ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { - lowChar = Tcl_UniCharToLower(lowChar); + lowChar = TclUCS4ToLower(lowChar); } - if (len < UtfCount(lowChar)) { + if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1163,7 +1264,7 @@ Tcl_UtfNcmp( const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; /* * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the @@ -1181,6 +1282,16 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & ~0x3FF) == 0xD800)) { + if ((ch2 & ~0x3FF) != 0xD800) { + return ch1; + } + } else if ((ch2 & ~0x3FF) == 0xD800) { + return -ch2; + } +#endif return (ch1 - ch2); } } @@ -1211,7 +1322,8 @@ Tcl_UtfNcasecmp( const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1221,6 +1333,16 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1254,12 +1376,22 @@ TclUtfCasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct) /* UTF string cs is compared to. 
*/ { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1287,24 +1419,26 @@ TclUtfCasecmp( *---------------------------------------------------------------------- */ -Tcl_UniChar -Tcl_UniCharToUpper( +static int +UCS4ToUpper( int ch) /* Unicode character to convert. */ { -#if TCL_UTF_MAX > 3 if (!UNICODE_OUT_OF_RANGE(ch)) { -#endif int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x04) { ch -= GetDelta(info); } -#if TCL_UTF_MAX > 3 } /* Clear away extension bits, if any */ - ch &= 0x1FFFFF; -#endif - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; +} + +Tcl_UniChar +Tcl_UniCharToUpper( + int ch) /* Unicode character to convert. */ +{ + return (Tcl_UniChar) UCS4ToUpper(ch); } /* @@ -1323,25 +1457,27 @@ Tcl_UniCharToUpper( *---------------------------------------------------------------------- */ -Tcl_UniChar -Tcl_UniCharToLower( +int +TclUCS4ToLower( int ch) /* Unicode character to convert. */ { -#if TCL_UTF_MAX > 3 if (!UNICODE_OUT_OF_RANGE(ch)) { -#endif int info = GetUniCharInfo(ch); int mode = GetCaseType(info); if ((mode & 0x02) && (mode != 0x7)) { ch += GetDelta(info); } -#if TCL_UTF_MAX > 3 } /* Clear away extension bits, if any */ - ch &= 0x1FFFFF; -#endif - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; +} + +Tcl_UniChar +Tcl_UniCharToLower( + int ch) /* Unicode character to convert. 
*/ +{ + return (Tcl_UniChar) TclUCS4ToLower(ch); } /* @@ -1360,13 +1496,11 @@ Tcl_UniCharToLower( *---------------------------------------------------------------------- */ -Tcl_UniChar -Tcl_UniCharToTitle( +static int +UCS4ToTitle( int ch) /* Unicode character to convert. */ { -#if TCL_UTF_MAX > 3 if (!UNICODE_OUT_OF_RANGE(ch)) { -#endif int info = GetUniCharInfo(ch); int mode = GetCaseType(info); @@ -1381,12 +1515,16 @@ Tcl_UniCharToTitle( } else if (mode == 0x4) { ch -= GetDelta(info); } -#if TCL_UTF_MAX > 3 } /* Clear away extension bits, if any */ - ch &= 0x1FFFFF; -#endif - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; +} + +Tcl_UniChar +Tcl_UniCharToTitle( + int ch) /* Unicode character to convert. */ +{ + return (Tcl_UniChar) UCS4ToTitle(ch); } /* @@ -1771,7 +1909,8 @@ Tcl_UniCharIsSpace( } else if (UNICODE_OUT_OF_RANGE(ch)) { return 0; #endif - } else if (ch == 0x180E || ch == 0x202F) { + } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B + || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -1865,7 +2004,7 @@ Tcl_UniCharCaseMatch( * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - Tcl_UniChar ch1, p; + Tcl_UniChar ch1 = 0, p; while (1) { p = *uniPattern; @@ -2216,6 +2355,103 @@ TclUniCharMatch( } /* + *--------------------------------------------------------------------------- + * + * TclUtfToUCS4 -- + * + * Extract the 4-byte codepoint from the leading bytes of the + * Modified UTF-8 string "src". This is a utility routine to + * contain the surrogate gymnastics in one place. + * + * The caller must ensure that the source buffer is long enough that this + * routine does not run off the end and dereference non-existent memory + * looking for trail bytes. If the source buffer is known to be '\0' + * terminated, this cannot happen. 
Otherwise, the caller should call + * TclUCS4Complete() before calling this routine to ensure that + * enough bytes remain in the string. + * + * Results: + * *ucs4Ptr is filled with the UCS4 code point, and the return value is + * the number of bytes from the UTF-8 string that were consumed. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +TclUtfToUCS4( + const char *src, /* The UTF-8 string. */ + int *ucs4Ptr) /* Filled with the UCS4 codepoint represented + * by the UTF-8 string. */ +{ + Tcl_UniChar ch = 0; + int len = Tcl_UtfToUniChar(src, &ch); + +#if TCL_UTF_MAX <= 4 + if ((ch & ~0x3FF) == 0xD800) { + Tcl_UniChar low = ch; + int len2 = Tcl_UtfToUniChar(src+len, &low); + if ((low & ~0x3FF) == 0xDC00) { + *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; + return len + len2; + } + } +#endif + *ucs4Ptr = (int)ch; + return len; +} + +/* + *--------------------------------------------------------------------------- + * + * TclUCS4ToUtf -- + * + * Store the given Unicode character as a sequence of UTF-8 bytes in the + * provided buffer. Might output 6 bytes, if the code point > 0xFFFF. + * + * Results: + * The return value is the number of bytes in the buffer that were + * consumed. If ch == -1, this function outputs 0 bytes (empty string), + * since TclGetUCS4 returns -1 for out-of-range indices. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +TclUCS4ToUtf( + int ch, /* Unicode character to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the Unicode character is stored. Buffer must be + * large enough to hold the UTF-8 character(s) + * (at most 6 bytes). 
*/ +{ +#if TCL_UTF_MAX <= 4 + if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { + /* Spit out a 4-byte UTF-8 character or 2 x 3-byte UTF-8 characters, depending on Tcl + * version and/or TCL_UTF_MAX build value */ + int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf); + return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x7FF), buf + len); + } +#endif + if ((ch & ~0x7FF) == 0xD800) { + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + if (ch == -1) { + return 0; + } + return Tcl_UniCharToUtf(ch, buf); +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 |