diff options
| -rw-r--r-- | generic/tclEncoding.c | 30 | ||||
| -rw-r--r-- | tests/encoding.test | 12 |
2 files changed, 35 insertions, 7 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index fc3397a..4f334bb 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2603,6 +2603,7 @@ Utf32ToUtfProc( dst += Tcl_UniCharToUtf(-1, dst); } #endif + if ((unsigned)ch > 0x10FFFF) { ch = 0xFFFD; if (STOPONERROR) { @@ -2639,6 +2640,7 @@ Utf32ToUtfProc( dst += Tcl_UniCharToUtf(-1, dst); } #endif + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { /* We have a single byte left-over at the end */ if (dst > dstEnd) { @@ -2846,6 +2848,13 @@ Utf16ToUtfProc( ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) { + if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + result = TCL_CONVERT_UNKNOWN; + src -= 2; /* Go back to before the high surrogate */ + dst--; /* Also undo writing a single byte too much */ + numChars--; + break; + } /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } @@ -2855,17 +2864,30 @@ Utf16ToUtfProc( * unsigned short-size data. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (ch & 0xFF); - } else { + } else if (((prev & ~0x3FF) == 0xD800) || ((ch & ~0x3FF) == 0xD800)) { dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); + } else if (((ch & ~0x3FF) == 0xDC00) && ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + /* Lo surrogate not preceded by Hi surrogate */ + result = TCL_CONVERT_UNKNOWN; + break; + } else { + dst += Tcl_UniCharToUtf(ch, dst); } src += sizeof(unsigned short); } if ((ch & ~0x3FF) == 0xD800) { - /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); + if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) { + result = TCL_CONVERT_UNKNOWN; + src -= 2; + dst--; + numChars--; + } else { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } } if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { /* We have a single byte left-over at the end */ diff --git a/tests/encoding.test b/tests/encoding.test index 8b14353..68b5dcd 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -554,15 +554,21 @@ test encoding-16.18 { return done } [namespace current]] } -result done -test encoding-16.19 {UnicodeToUtfProc, bug [d19fe0a5b]} -body { +test encoding-16.19 {Utf16ToUtfProc, bug [d19fe0a5b]} -body { encoding convertfrom utf-16 "\x41\x41\x41" } -result \u4141\uFFFD -test encoding-16.20 {UnicodeToUtfProc, bug [d19fe0a5b]} -constraints deprecated -body { +test encoding-16.20 {Utf16ToUtfProc, bug [d19fe0a5b]} -constraints deprecated -body { encoding convertfrom utf-16 "\xD8\xD8" } -result \uD8D8 -test encoding-16.21 {UnicodeToUtfProc, bug [d19fe0a5b]} -body { +test encoding-16.21 {Utf16ToUtfProc, bug [d19fe0a5b]} -body { encoding convertfrom utf-32 "\x00\x00\x00\x00\x41\x41" } -result \x00\uFFFD +test encoding-16.22 {Utf16ToUtfProc, strict, bug [db7a085bd9]} -body { + encoding convertfrom -strict utf-16le \x00\xD8 +} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\x00'} +test encoding-16.23 {Utf16ToUtfProc, strict, bug [db7a085bd9]} -body { + encoding convertfrom -strict utf-16le \x00\xDC +} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\x00'} test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" |
