diff options
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 83 |
1 files changed, 71 insertions, 12 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 0bce51b..e68aa25 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -512,6 +512,8 @@ FillEncodingFileMap(void) /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ #define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */ +#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */ +#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void TclInitEncodingSubsystem(void) @@ -553,7 +555,13 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(TCL_ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF); + type.encodingName = "wtf-8"; + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; @@ -577,12 +585,21 @@ TclInitEncodingSubsystem(void) type.encodingName = "utf-16le"; type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16le"; + type.clientData = INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16be"; + type.clientData = INT2PTR(TCL_ENCODING_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16"; + type.clientData = INT2PTR(isLe.c + TCL_ENCODING_WTF); + Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED type.encodingName = "unicode"; @@ -1153,7 +1170,7 @@ Tcl_ExternalToUtfDStringEx( srcLen = encodingPtr->lengthProc(src); } - flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; while (1) { result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, @@ -1268,12 +1285,12 @@ Tcl_ExternalToUtf( dstLen--; } - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; do { Tcl_EncodingState savedState = *statePtr; result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags , statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); if (*dstCharsPtr <= maxChars) { break; @@ -2200,7 +2217,7 @@ BinaryProc( static int UtfToUtfProc( - TCL_UNUSED(ClientData), + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2239,7 +2256,8 @@ UtfToUtfProc( } dstStart = dst; - dstEnd = dst + dstLen - TCL_UTF_MAX; + flags |= PTR2INT(clientData); + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2286,21 +2304,44 @@ UtfToUtfProc( src += 1; dst += Tcl_UniCharToUtf(ch, dst); } else { + int low; + const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) { result = TCL_CONVERT_SYNTAX; break; } src += len; - if ((ch | 0x7FF) == 0xDFFF) { + if (!(flags & TCL_ENCODING_UTF)) { + if (ch > 0xFFFF) { + /* CESU-8 6-byte sequence for chars > U+FFFF */ + ch -= 0x10000; + *dst++ = 0xED; + *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); + *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); + ch = (ch & 0x0CFF) | 0xDC00; + } + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ - int low = ch; + low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + if (!(flags & TCL_ENCODING_WTF)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } + } + cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); @@ -2309,6 +2350,15 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; + } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } } dst += Tcl_UniCharToUtf(ch, dst); } @@ -2444,7 +2494,7 @@ Utf16ToUtfProc( static int UtfToUtf16Proc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2467,7 +2517,7 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - int ch; + int ch, len; srcStart = src; srcEnd = src + srcLen; @@ -2478,6 +2528,7 @@ UtfToUtf16Proc( dstStart = dst; dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + flags |= PTR2INT(clientData); result = TCL_OK; for (numChars = 0; src < srcEnd; numChars++) { @@ -2494,8 +2545,16 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUCS4(src, &ch); - if (clientData) { + len = TclUtfToUCS4(src, &ch); + if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = 0xFFFD; + } + src += len; + if (flags & TCL_ENCODING_LE) { if (ch <= 0xFFFF) { *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); |