From 95f6d90cc6009c3c8510b8fe22e09b328c188d61 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 12 Feb 2024 14:43:01 +0000 Subject: Fix last "knownBug" in UTF-16 encoder (backported from 9.0) --- generic/tclEncoding.c | 41 +++++++++++++++++++++++++++++++++-------- tests/encodingVectors.tcl | 8 ++++---- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index ba36f55..78d613d 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -200,12 +200,16 @@ static const struct TclEncodingProfiles { {"strict", TCL_ENCODING_PROFILE_STRICT}, {"tcl8", TCL_ENCODING_PROFILE_TCL8}, }; + #define PROFILE_STRICT(flags_) \ ((flags_) & TCL_ENCODING_PROFILE_STRICT) #define PROFILE_REPLACE(flags_) \ ((ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) && !PROFILE_STRICT(flags_)) +#define PROFILE_TCL8(flags_) \ + ((ENCODING_PROFILE_GET(flags_) != TCL_ENCODING_PROFILE_REPLACE) && !PROFILE_STRICT(flags_)) + #define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD) #define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) #define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) @@ -3012,7 +3016,7 @@ Utf16ToUtfProc( dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; @@ -3031,9 +3035,23 @@ Utf16ToUtfProc( dst--; /* Also undo writing a single byte too much */ numChars--; break; - } + } else if (PROFILE_REPLACE(flags)) { + /* + * Previous loop wrote a single byte to mark the high surrogate. + * Replace it with the replacement character. Further, restart + * current loop iteration since need to recheck destination space + * and reset processing of current character. + */ + ch = UNICODE_REPLACE_CHAR; + dst--; + dst += Tcl_UniCharToUtf(ch, dst); + src -= 2; + numChars--; + continue; + } else { /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); + dst += Tcl_UniCharToUtf(-1, dst); + } } /* @@ -3045,15 +3063,19 @@ Utf16ToUtfProc( *dst++ = (ch & 0xFF); } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) { dst += Tcl_UniCharToUtf(ch, dst); - } else if (LOW_SURROGATE(ch) && PROFILE_STRICT(flags)) { - /* Lo surrogate not preceded by Hi surrogate */ - result = TCL_CONVERT_SYNTAX; - break; + } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) { + /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */ + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + break; + } else { + /* PROFILE_REPLACE */ + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + } } else { *dst = 0; /* In case of lower surrogate, don't try to combine */ dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned short); } if (HIGH_SURROGATE(ch)) { @@ -3062,6 +3084,9 @@ Utf16ToUtfProc( src -= 2; dst--; numChars--; + } else if (PROFILE_REPLACE(flags)) { + dst--; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); } else { /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); diff --git a/tests/encodingVectors.tcl b/tests/encodingVectors.tcl index dfceab4..9b62f84 100644 --- a/tests/encodingVectors.tcl +++ b/tests/encodingVectors.tcl @@ -553,20 +553,20 @@ lappend encInvalidBytes {*}{ utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} utf-16le 41 strict {} 0 {solo tail} {Truncated} utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate} - utf-16le 00D8 replace \uFFFD -1 {knownBug} {Missing low surrogate} + utf-16le 00D8 replace \uFFFD -1 {} {Missing low surrogate} utf-16le 00D8 strict {} 0 {} {Missing low surrogate} utf-16le 00DC tcl8 \uDC00 -1 {} {Missing high surrogate} - utf-16le 00DC replace \uFFFD -1 {knownBug} {Missing high surrogate} + utf-16le 00DC replace \uFFFD -1 {} {Missing high surrogate} utf-16le 00DC strict {} 0 {} {Missing high surrogate} utf-16be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} utf-16be 41 replace \uFFFD -1 {solo tail} {Truncated} utf-16be 41 strict {} 0 {solo tail} {Truncated} utf-16be D800 tcl8 \uD800 -1 {} {Missing low surrogate} - utf-16be D800 replace \uFFFD -1 {knownBug} {Missing low surrogate} + utf-16be D800 replace \uFFFD -1 {} {Missing low surrogate} utf-16be D800 strict {} 0 {} {Missing low surrogate} utf-16be DC00 tcl8 \uDC00 -1 {} {Missing high surrogate} - utf-16be DC00 replace \uFFFD -1 {knownBug} {Missing high surrogate} + utf-16be DC00 replace \uFFFD -1 {} {Missing high surrogate} utf-16be DC00 strict {} 0 {} {Missing high surrogate} } -- cgit v0.12