diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-09 19:52:00 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-09 19:52:00 (GMT) |
commit | fd83fb931e43901b77f4e480ef63841e10b39f22 (patch) | |
tree | d35fd1c792d990de927ea8f74b85dbb9f80d2bb2 | |
parent | b185a55c3b335a847e148680c628136c7c16640f (diff) | |
download | tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.zip tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.tar.gz tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.tar.bz2 |
Add 4 more testcases, showing that the same bug is present in utf-16 as well. Also fix the bug (really, now!)
-rw-r--r-- | generic/tclEncoding.c | 44 | ||||
-rw-r--r-- | tests/encoding.test | 12 |
2 files changed, 48 insertions, 8 deletions
```diff
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index d19e237..0941f14 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2531,7 +2531,7 @@ Utf32ToUtfProc(
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
     int result, numChars, charLimit = INT_MAX;
-    int ch;
+    int ch = 0;
 
     flags |= PTR2INT(clientData);
     if (flags & TCL_ENCODING_CHAR_LIMIT) {
@@ -2548,6 +2548,19 @@ Utf32ToUtfProc(
         srcLen &= -4;
     }
 
+    /*
+     * If last code point is a high surrogate, we cannot handle that yet,
+     * unless we are at the end.
+     */
+
+    if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) &&
+            ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) &&
+            ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) &&
+            ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) {
+        result = TCL_CONVERT_MULTIBYTE;
+        srcLen-= 4;
+    }
+
     srcStart = src;
     srcEnd = src + srcLen;
 
@@ -2560,11 +2573,16 @@ Utf32ToUtfProc(
             break;
         }
 
+        int prev = ch;
         if (flags & TCL_ENCODING_LE) {
             ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
         } else {
             ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
         }
+        if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) {
+            /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+            dst += Tcl_UniCharToUtf(-1, dst);
+        }
         if ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
                 && ((ch & ~0x7FF) == 0xD800))) {
             if (STOPONERROR) {
@@ -2582,14 +2600,14 @@ Utf32ToUtfProc(
             *dst++ = (ch & 0xFF);
         } else {
             dst += Tcl_UniCharToUtf(ch, dst);
-            if ((ch & ~0x3FF) == 0xD800) {
-                /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
-                dst += Tcl_UniCharToUtf(-1, dst);
-            }
         }
         src += sizeof(unsigned int);
     }
 
+    if ((ch & ~0x3FF) == 0xD800) {
+        /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+        dst += Tcl_UniCharToUtf(-1, dst);
+    }
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
@@ -2734,7 +2752,7 @@ Utf16ToUtfProc(
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
     int result, numChars, charLimit = INT_MAX;
-    unsigned short ch;
+    unsigned short ch = 0;
 
     flags |= PTR2INT(clientData);
     if (flags & TCL_ENCODING_CHAR_LIMIT) {
@@ -2752,10 +2770,11 @@ Utf16ToUtfProc(
     }
 
     /*
-     * If last code point is a high surrogate, we cannot handle that yet.
+     * If last code point is a high surrogate, we cannot handle that yet,
+     * unless we are at the end.
      */
 
-    if ((srcLen >= 2) &&
+    if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) &&
             ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
         result = TCL_CONVERT_MULTIBYTE;
         srcLen-= 2;
@@ -2773,11 +2792,16 @@ Utf16ToUtfProc(
             break;
         }
 
+        unsigned short prev = ch;
         if (flags & TCL_ENCODING_LE) {
             ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
         } else {
             ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
         }
+        if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) {
+            /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+            dst += Tcl_UniCharToUtf(-1, dst);
+        }
 
         /*
          * Special case for 1-byte utf chars for speed. Make sure we work with
@@ -2792,6 +2816,10 @@ Utf16ToUtfProc(
         src += sizeof(unsigned short);
     }
 
+    if ((ch & ~0x3FF) == 0xD800) {
+        /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+        dst += Tcl_UniCharToUtf(-1, dst);
+    }
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
diff --git a/tests/encoding.test b/tests/encoding.test
index e42c3b9..b2b029e 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -497,6 +497,18 @@ test encoding-16.11 {Utf32ToUtfProc} -body {
 test encoding-16.12 {Utf32ToUtfProc} -body {
     encoding convertfrom utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00
 } -result \uDC00\uD800
+test encoding-16.13 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xD8
+} -result \uD800
+test encoding-16.14 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xDC
+} -result \uDC00
+test encoding-16.15 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xD8\x00\xDC
+} -result \uD800\uDC00
+test encoding-16.16 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xDC\x00\xD8
+} -result \uDC00\uD800
 
 test encoding-16.9 {
     Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16
```