diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2024-02-12 14:43:01 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2024-02-12 14:43:01 (GMT) |
commit | 95f6d90cc6009c3c8510b8fe22e09b328c188d61 (patch) | |
tree | 3608e78513e7075e0e50fd8c2eacc15ad1ff70b1 /generic/tclEncoding.c | |
parent | 60879f541756d142e554f2a58fc4b229c31453ff (diff) | |
download | tcl-95f6d90cc6009c3c8510b8fe22e09b328c188d61.zip tcl-95f6d90cc6009c3c8510b8fe22e09b328c188d61.tar.gz tcl-95f6d90cc6009c3c8510b8fe22e09b328c188d61.tar.bz2 |
Fix last "knownBug" in UTF-16 encoder (backported from 9.0)
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 41 |
1 files changed, 33 insertions, 8 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index ba36f55..78d613d 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -200,12 +200,16 @@ static const struct TclEncodingProfiles { {"strict", TCL_ENCODING_PROFILE_STRICT}, {"tcl8", TCL_ENCODING_PROFILE_TCL8}, }; + #define PROFILE_STRICT(flags_) \ ((flags_) & TCL_ENCODING_PROFILE_STRICT) #define PROFILE_REPLACE(flags_) \ ((ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) && !PROFILE_STRICT(flags_)) +#define PROFILE_TCL8(flags_) \ + ((ENCODING_PROFILE_GET(flags_) != TCL_ENCODING_PROFILE_REPLACE) && !PROFILE_STRICT(flags_)) + #define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD) #define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) #define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) @@ -3012,7 +3016,7 @@ Utf16ToUtfProc( dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; @@ -3031,9 +3035,23 @@ Utf16ToUtfProc( dst--; /* Also undo writing a single byte too much */ numChars--; break; - } + } else if (PROFILE_REPLACE(flags)) { + /* + * Previous loop wrote a single byte to mark the high surrogate. + * Replace it with the replacement character. Further, restart + * current loop iteration since need to recheck destination space + * and reset processing of current character. + */ + ch = UNICODE_REPLACE_CHAR; + dst--; + dst += Tcl_UniCharToUtf(ch, dst); + src -= 2; + numChars--; + continue; + } else { /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); + dst += Tcl_UniCharToUtf(-1, dst); + } } /* @@ -3045,15 +3063,19 @@ Utf16ToUtfProc( *dst++ = (ch & 0xFF); } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) { dst += Tcl_UniCharToUtf(ch, dst); - } else if (LOW_SURROGATE(ch) && PROFILE_STRICT(flags)) { - /* Lo surrogate not preceded by Hi surrogate */ - result = TCL_CONVERT_SYNTAX; - break; + } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) { + /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */ + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + break; + } else { + /* PROFILE_REPLACE */ + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + } } else { *dst = 0; /* In case of lower surrogate, don't try to combine */ dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned short); } if (HIGH_SURROGATE(ch)) { @@ -3062,6 +3084,9 @@ Utf16ToUtfProc( src -= 2; dst--; numChars--; + } else if (PROFILE_REPLACE(flags)) { + dst--; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); } else { /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); |