diff options
| author | apnadkarni <apnmbx-wits@yahoo.com> | 2024-08-22 18:12:15 (GMT) |
|---|---|---|
| committer | apnadkarni <apnmbx-wits@yahoo.com> | 2024-08-22 18:12:15 (GMT) |
| commit | 831e4a87d5c00d565ecf83e361961f3664847edf (patch) | |
| tree | 3ad9b3f0692b06a5d7558a99c48dabb19aef0e6d /generic/tclEncoding.c | |
| parent | 11d48cf9a94ffd0f61f698c1c81fca711e1a65b3 (diff) | |
| download | tcl-831e4a87d5c00d565ecf83e361961f3664847edf.zip tcl-831e4a87d5c00d565ecf83e361961f3664847edf.tar.gz tcl-831e4a87d5c00d565ecf83e361961f3664847edf.tar.bz2 | |
Proposed fixes for [945d2387d7], [f2e924e881], [33f9b4de51]
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 222 |
1 files changed, 140 insertions, 82 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index d235911..cb5dd20 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -3009,6 +3009,7 @@ Utf16ToUtfProc( srcLen--; } +#if 0 /* * If last code point is a high surrogate, we cannot handle that yet, * unless we are at the end. @@ -3019,6 +3020,7 @@ Utf16ToUtfProc( result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; } +#endif srcStart = src; srcEnd = src + srcLen; @@ -3028,7 +3030,7 @@ Utf16ToUtfProc( for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) { - if (dst > dstEnd) { + if (dst > dstEnd && !HIGH_SURROGATE(ch)) { result = TCL_CONVERT_NOSPACE; break; } @@ -3039,94 +3041,150 @@ Utf16ToUtfProc( } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } - if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - src -= 2; /* Go back to beginning of high surrogate */ - dst--; /* Also undo writing a single byte too much */ - numChars--; - break; - } else if (PROFILE_REPLACE(flags)) { - /* - * Previous loop wrote a single byte to mark the high surrogate. - * Replace it with the replacement character. Further, restart - * current loop iteration since need to recheck destination - * space and reset processing of current character. - */ - ch = UNICODE_REPLACE_CHAR; - dst--; - dst += Tcl_UniCharToUtf(ch, dst); - src -= 2; - numChars--; - continue; - } else { - /* - * Bug [10c2c17c32]. If Hi surrogate not followed by Lo - * surrogate, finish 3-byte UTF-8 - */ - dst += Tcl_UniCharToUtf(-1, dst); - } - } - - /* - * Special case for 1-byte utf chars for speed. Make sure we work with - * unsigned short-size data. 
- */ - - if ((unsigned)ch - 1 < 0x7F) { - *dst++ = (ch & 0xFF); - } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) { - dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); - if (HIGH_SURROGATE(prev) && LOW_SURROGATE(ch)) { - --numChars; /* Character has been combined, so compensage count */ + if (HIGH_SURROGATE(prev)) { + if (LOW_SURROGATE(ch)) { + /* + * High surrogate was followed by a low surrogate. + * Tcl_UniCharToUtf would have stashed away the state in dst. + * Call it again to combine that state with the low surrogate. + * We also have to compensate the numChars as two UTF-16 units + * have been combined into one character. + */ + dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); + --numChars; + } else { + /* High surrogate was not followed by a low surrogate */ + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; /* Go back to beginning of high surrogate */ + dst--; /* Also undo writing a single byte too much */ + numChars--; + break; + } + if (PROFILE_REPLACE(flags)) { + /* + * Previous loop wrote a single byte to mark the high surrogate. + * Replace it with the replacement character. Further, restart + * current loop iteration since need to recheck destination + * space and reset processing of current character. + */ + ch = UNICODE_REPLACE_CHAR; + dst--; + dst += Tcl_UniCharToUtf(ch, dst); + } else { + /* + * Bug [10c2c17c32]. 
If Hi surrogate not followed by Lo + * surrogate, finish 3-byte UTF-8 + */ + dst += Tcl_UniCharToUtf(-1, dst); + } + /* Loop around again so destination space and other checks are done */ + prev = 0; /* Reset high surrogate tracker */ + src -= 2; + numChars--; } - } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) { - /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */ - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - break; - } else { - /* PROFILE_REPLACE */ - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - } } else { - dst += Tcl_UniCharToUtf(ch, dst); - } - } + /* Previous char was not a high surrogate */ - if (HIGH_SURROGATE(ch)) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - src -= 2; - dst--; - numChars--; - } else if (PROFILE_REPLACE(flags)) { - dst--; - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - } else { - /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); - } + /* + * Special case for 1-byte utf chars for speed. Make sure we work with + * unsigned short-size data. Order checks based on expected frequency. + */ + if ((unsigned)ch - 1 < 0x7F) { + /* ASCII except nul */ + *dst++ = (ch & 0xFF); + } else if (!SURROGATE(ch)) { + /* Not ASCII, not surrogate */ + dst += Tcl_UniCharToUtf(ch, dst); + } else if (HIGH_SURROGATE(ch)) { + dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); + } else { + assert(LOW_SURROGATE(ch)); + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + break; + } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } + dst += Tcl_UniCharToUtf(ch, dst); + } + } } /* - * If we had a truncated code unit at the end AND this is the last - * fragment AND profile is not "strict", stick FFFD in its place. + * When the above loop ends, result may have the following values: + * 1. TCL_OK - full source buffer was completely processed. 
+ * src, dst, numChars will hold values up to that point BUT + * there may be a leftover high surrogate we need to deal with. + * 2. TCL_CONVERT_NOSPACE - Ran out of room in the destination buffer. + * Same considerations as (1) + * 3. TCL_CONVERT_SYNTAX - decoding error. src, dst, numChars will + * hold the correct values up to the point of error even if + * the last character decoded was a high surrogate. + * 4. TCL_CONVERT_MULTIBYTE - the buffer passed in was not fully + * processed, because there was a trailing single byte. However, + * we may have processed the requested number of characters already + * in which case the trailing byte does not matter. There + * may still be a leftover high surrogate as in (1) and (2). */ - if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - if (dst > dstEnd) { - result = TCL_CONVERT_NOSPACE; - } else { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - } else { - /* PROFILE_REPLACE or PROFILE_TCL8 */ - result = TCL_OK; - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - numChars++; - src++; /* Go past truncated code unit */ - } - } + switch (result) { + case TCL_CONVERT_MULTIBYTE: /* FALLTHRU */ + case TCL_OK: /* FALLTHRU */ + case TCL_CONVERT_NOSPACE: + if (HIGH_SURROGATE(ch)) { + if (flags & TCL_ENCODING_END) { + /* + * No more data expected. There will be space for output of + * one character (essentially overwriting the dst area holding + * high surrogate state) + */ + assert((dst-1) <= dstEnd); + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; + dst--; + numChars--; + } else if (PROFILE_REPLACE(flags)) { + dst--; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + } else { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } + } else { + /* More data is expected. 
Revert the surrogate state */ + src -= 2; + dst--; + numChars--; + /* Note: leave result of TCL_CONVERT_NOSPACE as is */ + if (result == TCL_OK) { + result = TCL_CONVERT_MULTIBYTE; + } + } + } else if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { + /* + * If we had a trailing byte at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + * Note in this case we DO need to check for room in dst. + */ + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + } else { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src++; + } + } + } + break; + case TCL_CONVERT_SYNTAX: + break; /* Nothing to do */ } *srcReadPtr = src - srcStart; |
