From c606ae1574a7d66bcbf8666506e91840875f6d45 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans"
Date: Tue, 21 Feb 2023 17:50:36 +0000
Subject: Proposed fix for [d19fe0a5b]: Handling incomplete byte sequences for
 utf-16/utf-32

---
 generic/tclEncoding.c | 27 ++++++++++++++++++++++++---
 tests/encoding.test   |  6 ++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index dfa7907..ecec6e9 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2457,21 +2457,27 @@ UnicodeToUtfProc(
     }
     result = TCL_OK;
 
-    /* check alignment with utf-16 (2 == sizeof(UTF-16)) */
+    /*
+     * Check alignment with utf-16 (2 == sizeof(UTF-16))
+     */
+
     if ((srcLen % 2) != 0) {
 	result = TCL_CONVERT_MULTIBYTE;
 	srcLen--;
     }
 
+#if TCL_UTF_MAX > 3
     /*
-     * If last code point is a high surrogate, we cannot handle that yet.
+     * If last code point is a high surrogate, we cannot handle that yet,
+     * unless we are at the end.
      */
 
-    if ((srcLen >= 2) &&
+    if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) &&
 	    ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) {
 	result = TCL_CONVERT_MULTIBYTE;
 	srcLen-= 2;
     }
+#endif
 
     srcStart = src;
     srcEnd = src + srcLen;
@@ -2504,6 +2510,21 @@ UnicodeToUtfProc(
 	src += sizeof(unsigned short);
     }
 
+    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
+	/* We have a single byte left-over at the end */
+	if (dst > dstEnd) {
+	    result = TCL_CONVERT_NOSPACE;
+	} else {
+	    /* destination is not full, so we really are at the end now */
+	    if (flags & TCL_ENCODING_STOPONERROR) {
+		result = TCL_CONVERT_SYNTAX;
+	    } else {
+		dst += Tcl_UniCharToUtf(0xFFFD, dst);
+		numChars++;
+		src++;
+	    }
+	}
+    }
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
diff --git a/tests/encoding.test b/tests/encoding.test
index f558e01..f6f9abc 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -419,6 +419,12 @@ test encoding-16.3 {UnicodeToUtfProc} -body {
     set val [encoding convertfrom unicode "\xDC\xDC"]
     list $val [format %X [scan $val %c]]
 } -result "\uDCDC DCDC"
+test encoding-16.4 {UnicodeToUtfProc, bug [d19fe0a5b]} -body {
+    encoding convertfrom unicode "\x41\x41\x41"
+} -result \u4141\uFFFD
+test encoding-16.5 {UnicodeToUtfProc, bug [d19fe0a5b]} -constraints ucs2 -body {
+    encoding convertfrom unicode "\xD8\xD8"
+} -result \uD8D8
 
 test encoding-17.1 {UtfToUnicodeProc} -constraints fullutf -body {
     encoding convertto unicode "\U460DC"
-- 
cgit v0.12