diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-03-12 12:37:29 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-03-12 12:37:29 (GMT) |
commit | 77e951f2b86cf0b20c2b38c753726d149a1c4ea9 (patch) | |
tree | 62f596a906461e4a52c2fc37bce67bef0217e922 /generic | |
parent | 8bf2e2ace2224e4066dfe647f47b531591fe8666 (diff) | |
parent | 1889ded1144a4dbd44d0c6f03e72a01d70115a51 (diff) | |
download | tcl-77e951f2b86cf0b20c2b38c753726d149a1c4ea9.zip tcl-77e951f2b86cf0b20c2b38c753726d149a1c4ea9.tar.gz tcl-77e951f2b86cf0b20c2b38c753726d149a1c4ea9.tar.bz2 |
Fix [db7a085bd9]: encoding convertfrom -strict utf-16 accepts partial surrogates
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclEncoding.c | 39 |
1 file changed, 34 insertions, 5 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index f32baac..b3409d6 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2602,8 +2602,15 @@ Utf32ToUtfProc( /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } - if ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && ((ch & ~0x7FF) == 0xD800))) { + + if ((unsigned)ch > 0x10FFFF) { + ch = 0xFFFD; + if (STOPONERROR) { + result = TCL_CONVERT_SYNTAX; + break; + } + } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) + && ((ch & ~0x7FF) == 0xD800)) { if (STOPONERROR) { result = TCL_CONVERT_SYNTAX; ch = 0; @@ -2628,6 +2635,7 @@ Utf32ToUtfProc( /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { /* We have a single byte left-over at the end */ if (dst > dstEnd) { @@ -2835,6 +2843,13 @@ Utf16ToUtfProc( ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) { + if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + result = TCL_CONVERT_UNKNOWN; + src -= 2; /* Go back to before the high surrogate */ + dst--; /* Also undo writing a single byte too much */ + numChars--; + break; + } /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } @@ -2844,17 +2859,31 @@ Utf16ToUtfProc( * unsigned short-size data. 
*/ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (ch & 0xFF); + } else if (((prev & ~0x3FF) == 0xD800) || ((ch & ~0x3FF) == 0xD800)) { + dst += Tcl_UniCharToUtf(ch, dst); + } else if (((ch & ~0x3FF) == 0xDC00) && ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + /* Lo surrogate not preceded by Hi surrogate */ + result = TCL_CONVERT_UNKNOWN; + break; } else { + *dst = 0; /* In case of lower surrogate, don't try to combine */ dst += Tcl_UniCharToUtf(ch, dst); } src += sizeof(unsigned short); } if ((ch & ~0x3FF) == 0xD800) { - /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); + if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) { + result = TCL_CONVERT_UNKNOWN; + src -= 2; + dst--; + numChars--; + } else { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } } if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { /* We have a single byte left-over at the end */ |