diff options
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 210 |
1 files changed, 201 insertions, 9 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 7a55724..4871b85 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -191,6 +191,14 @@ static Tcl_Encoding systemEncoding; static unsigned short emptyPage[256]; /* + * Constants used in the (external) UTF-8 <--> (internal) Modified UTF-8 + * conversion code. + */ + +#define FROM_STANDARD_UTF8 0 +#define TO_STANDARD_UTF8 1 + +/* * Functions used only in this module. */ @@ -2160,7 +2168,7 @@ UtfIntToUtfExtProc( * output buffer. */ { return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 1); + srcReadPtr, dstWrotePtr, dstCharsPtr, TO_STANDARD_UTF8); } /* @@ -2209,7 +2217,7 @@ UtfExtToUtfIntProc( * output buffer. */ { return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 0); + srcReadPtr, dstWrotePtr, dstCharsPtr, FROM_STANDARD_UTF8); } /* @@ -2230,6 +2238,153 @@ UtfExtToUtfIntProc( *------------------------------------------------------------------------- */ +static INLINE int +IntToUtf( + unsigned ch, /* The character to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the character is stored. Buffer must be + * large enough to hold the UTF-8 character + * (at most 6 bytes). */ +{ + if ((ch > 0) && (ch < 0x80)) { + buf[0] = (char) ch; + return 1; + } + if (ch <= 0x7FF) { + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 6) | 0xC0); + return 2; + } + if (ch <= 0xFFFF) { + three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + if (ch <= 0x1FFFFF) { + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 18) | 0xF0); + return 4; + } + if (ch <= 0x3FFFFFF) { + buf[4] = (char) ((ch | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + buf[5] = (char) ((ch | 0x80) & 0xBF); + buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 30) | 0xFC); + return 6; + } + + ch = 0xFFFD; + goto three; +} + +static INLINE int +UtfToInt( + const char *src, /* The UTF-8 string. */ + unsigned *chPtr) /* Filled with the character represented by + * the front of the UTF-8 string. */ +{ + register int byte; + + /* + * Unroll 1 to 6 byte UTF-8 sequences, use loop to handle longer ones. + */ + + byte = *((unsigned char *) src); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * characters representing themselves. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xE0) { + if ((src[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + return 2; + } + + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xF0) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + return 3; + } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else { + int ch, total, trail; + static const unsigned char totalBytes[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 + }; + + total = totalBytes[byte]; + trail = total - 1; + if (trail > 0) { + ch = byte & (0x3F >> trail); + do { + src++; + if ((*src & 0xC0) != 0x80) { + *chPtr = byte; + return 1; + } + ch <<= 6; + ch |= (*src & 0x3F); + trail--; + } while (trail > 0); + *chPtr = ch; + return total; + } else { + *chPtr = (Tcl_UniChar) byte; + return 1; + } + } +} + static int UtfToUtfProc( ClientData clientData, /* Not used. */ @@ -2256,7 +2411,7 @@ UtfToUtfProc( int *dstCharsPtr, /* Filled with the number of characters that * correspond to the bytes stored in the * output buffer. */ - int pureNullMode) /* Convert embedded nulls from internal + int conversionMode) /* Convert embedded nulls from internal * representation to real null-bytes or vice * versa. */ { @@ -2291,15 +2446,16 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) { + if (UCHAR(*src) < 0x80 && + !(UCHAR(*src) == 0 && conversionMode == FROM_STANDARD_UTF8)) { /* * Copy 7bit chatacters, but skip null-bytes when we are in input * mode, so that they get converted to 0xc080. */ *dst++ = *src++; - } else if (pureNullMode == 1 && UCHAR(*src) == 0xc0 && - UCHAR(*(src+1)) == 0x80) { + } else if (conversionMode == TO_STANDARD_UTF8 && UCHAR(*src) == 0xc0 + && UCHAR(*(src+1)) == 0x80) { /* * Convert 0xc080 to real nulls when we are in output mode. */ @@ -2310,15 +2466,51 @@ UtfToUtfProc( /* * Always check before using Tcl_UtfToUniChar. Not doing can so * cause it run beyond the endof the buffer! If we happen such an - * incomplete char its byts are made to represent themselves. + * incomplete char, its bytes are made to represent themselves. */ ch = (unsigned char) *src; src += 1; dst += Tcl_UniCharToUtf(ch, dst); } else { - src += Tcl_UtfToUniChar(src, &ch); - dst += Tcl_UniCharToUtf(ch, dst); + /* + * This is where we ought to do surrogate pair handling, with the + * correct way of doing it depending on the conversionMode + * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE! + */ + + if (conversionMode == TO_STANDARD_UTF8) { + const char *origin = src; + + src += Tcl_UtfToUniChar(src, &ch); + if (ch >= 0xD800 && ch < 0xDBFF) { + unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10; + + src += Tcl_UtfToUniChar(src, &ch); + if (ch >= 0xDC00 && ch < 0xDFFF) { + fullChar += 0x2400 + (unsigned) ch; + dst += IntToUtf(fullChar, dst); + continue; + } else { + src = origin + Tcl_UtfToUniChar(origin, &ch); + } + } + dst += Tcl_UniCharToUtf(ch, dst); + } else { + unsigned fullChar; + + src += UtfToInt(src, &fullChar); + if (fullChar > 0xFFFF) { + fullChar -= 0x10000; + ch = (Tcl_UniChar) ((fullChar >> 10) + 0xD800); + dst += Tcl_UniCharToUtf(ch, dst); + ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00); + dst += Tcl_UniCharToUtf(ch, dst); + } else { + ch = (Tcl_UniChar) fullChar; + dst += Tcl_UniCharToUtf(ch, dst); + } + } } } |