summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--generic/tclEncoding.c30
-rw-r--r--tests/encoding.test12
2 files changed, 35 insertions, 7 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index fc3397a..4f334bb 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2603,6 +2603,7 @@ Utf32ToUtfProc(
dst += Tcl_UniCharToUtf(-1, dst);
}
#endif
+
if ((unsigned)ch > 0x10FFFF) {
ch = 0xFFFD;
if (STOPONERROR) {
@@ -2639,6 +2640,7 @@ Utf32ToUtfProc(
dst += Tcl_UniCharToUtf(-1, dst);
}
#endif
+
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
/* We have a single byte left-over at the end */
if (dst > dstEnd) {
@@ -2846,6 +2848,13 @@ Utf16ToUtfProc(
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
}
if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) {
+ if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src -= 2; /* Go back to before the high surrogate */
+ dst--; /* Also undo writing a single byte too much */
+ numChars--;
+ break;
+ }
/* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}
@@ -2855,17 +2864,30 @@ Utf16ToUtfProc(
* unsigned short-size data.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (ch & 0xFF);
- } else {
+ } else if (((prev & ~0x3FF) == 0xD800) || ((ch & ~0x3FF) == 0xD800)) {
dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
+ } else if (((ch & ~0x3FF) == 0xDC00) && ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
+ /* Lo surrogate not preceded by Hi surrogate */
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ } else {
+ dst += Tcl_UniCharToUtf(ch, dst);
}
src += sizeof(unsigned short);
}
if ((ch & ~0x3FF) == 0xD800) {
- /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
- dst += Tcl_UniCharToUtf(-1, dst);
+ if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) {
+ result = TCL_CONVERT_UNKNOWN;
+ src -= 2;
+ dst--;
+ numChars--;
+ } else {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
}
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
/* We have a single byte left-over at the end */
diff --git a/tests/encoding.test b/tests/encoding.test
index 8b14353..68b5dcd 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -554,15 +554,21 @@ test encoding-16.18 {
return done
} [namespace current]]
} -result done
-test encoding-16.19 {UnicodeToUtfProc, bug [d19fe0a5b]} -body {
+test encoding-16.19 {Utf16ToUtfProc, bug [d19fe0a5b]} -body {
encoding convertfrom utf-16 "\x41\x41\x41"
} -result \u4141\uFFFD
-test encoding-16.20 {UnicodeToUtfProc, bug [d19fe0a5b]} -constraints deprecated -body {
+test encoding-16.20 {Utf16ToUtfProc, bug [d19fe0a5b]} -constraints deprecated -body {
encoding convertfrom utf-16 "\xD8\xD8"
} -result \uD8D8
-test encoding-16.21 {UnicodeToUtfProc, bug [d19fe0a5b]} -body {
+test encoding-16.21 {Utf16ToUtfProc, bug [d19fe0a5b]} -body {
encoding convertfrom utf-32 "\x00\x00\x00\x00\x41\x41"
} -result \x00\uFFFD
+test encoding-16.22 {Utf16ToUtfProc, strict, bug [db7a085bd9]} -body {
+ encoding convertfrom -strict utf-16le \x00\xD8
+} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\x00'}
+test encoding-16.23 {Utf16ToUtfProc, strict, bug [db7a085bd9]} -body {
+ encoding convertfrom -strict utf-16le \x00\xDC
+} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\x00'}
test encoding-17.1 {UtfToUtf16Proc} -body {
encoding convertto utf-16 "\U460DC"