Add 4 more test cases showing that the same bug is present in UTF-16 as well. Also fix the bug (really, now!)

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-09 19:52:00 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-09 19:52:00 (GMT)
commit: fd83fb931e43901b77f4e480ef63841e10b39f22 (patch)
tree: d35fd1c792d990de927ea8f74b85dbb9f80d2bb2
parent: b185a55c3b335a847e148680c628136c7c16640f (diff)
download: tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.zip
tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.tar.gz
tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.tar.bz2
2 files changed, 48 insertions, 8 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index d19e237..0941f14 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2531,7 +2531,7 @@ Utf32ToUtfProc(
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
     int result, numChars, charLimit = INT_MAX;
-    int ch;
+    int ch = 0;
 
     flags |= PTR2INT(clientData);
     if (flags & TCL_ENCODING_CHAR_LIMIT) {
@@ -2548,6 +2548,19 @@ Utf32ToUtfProc(
 	srcLen &= -4;
     }
 
+    /*
+     * If last code point is a high surrogate, we cannot handle that yet,
+     * unless we are at the end.
+     */
+
+    if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) &&
+	    ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) &&
+	    ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) &&
+	    ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) {
+	result = TCL_CONVERT_MULTIBYTE;
+	srcLen-= 4;
+    }
+
     srcStart = src;
     srcEnd = src + srcLen;
 
@@ -2560,11 +2573,16 @@ Utf32ToUtfProc(
 	    break;
 	}
 
+	int prev = ch;
 	if (flags & TCL_ENCODING_LE) {
 	    ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
 	} else {
 	    ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
 	}
+	if (((prev  & ~0x3FF) == 0xD800) && ((ch  & ~0x3FF) != 0xDC00)) {
+	    /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+	    dst += Tcl_UniCharToUtf(-1, dst);
+	}
 	if  ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
 		&& ((ch  & ~0x7FF) == 0xD800))) {
 	    if (STOPONERROR) {
@@ -2582,14 +2600,14 @@ Utf32ToUtfProc(
 	    *dst++ = (ch & 0xFF);
 	} else {
 	    dst += Tcl_UniCharToUtf(ch, dst);
-	    if ((ch  & ~0x3FF) == 0xD800) {
-		/* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
-		dst += Tcl_UniCharToUtf(-1, dst);
-	    }
 	}
 	src += sizeof(unsigned int);
     }
 
+    if ((ch  & ~0x3FF) == 0xD800) {
+	/* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+	dst += Tcl_UniCharToUtf(-1, dst);
+    }
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
@@ -2734,7 +2752,7 @@ Utf16ToUtfProc(
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
     int result, numChars, charLimit = INT_MAX;
-    unsigned short ch;
+    unsigned short ch = 0;
 
     flags |= PTR2INT(clientData);
     if (flags & TCL_ENCODING_CHAR_LIMIT) {
@@ -2752,10 +2770,11 @@ Utf16ToUtfProc(
     }
 
     /*
-     * If last code point is a high surrogate, we cannot handle that yet.
+     * If last code point is a high surrogate, we cannot handle that yet,
+     * unless we are at the end.
      */
 
-    if ((srcLen >= 2) &&
+    if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) &&
 	    ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
 	result = TCL_CONVERT_MULTIBYTE;
 	srcLen-= 2;
@@ -2773,11 +2792,16 @@ Utf16ToUtfProc(
 	    break;
 	}
 
+	unsigned short prev = ch;
 	if (flags & TCL_ENCODING_LE) {
 	    ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
 	} else {
 	    ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
 	}
+	if (((prev  & ~0x3FF) == 0xD800) && ((ch  & ~0x3FF) != 0xDC00)) {
+	    /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+	    dst += Tcl_UniCharToUtf(-1, dst);
+	}
 
 	/*
 	 * Special case for 1-byte utf chars for speed. Make sure we work with
@@ -2792,6 +2816,10 @@ Utf16ToUtfProc(
 	src += sizeof(unsigned short);
     }
 
+    if ((ch  & ~0x3FF) == 0xD800) {
+	/* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+	dst += Tcl_UniCharToUtf(-1, dst);
+    }
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
diff --git a/tests/encoding.test b/tests/encoding.test
index e42c3b9..b2b029e 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -497,6 +497,18 @@ test encoding-16.11 {Utf32ToUtfProc} -body {
 test encoding-16.12 {Utf32ToUtfProc} -body {
     encoding convertfrom utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00
 } -result \uDC00\uD800
+test encoding-16.13 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xD8
+} -result \uD800
+test encoding-16.14 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xDC
+} -result \uDC00
+test encoding-16.15 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xD8\x00\xDC
+} -result \uD800\uDC00
+test encoding-16.16 {Utf16ToUtfProc} -body {
+    encoding convertfrom utf-16le \x00\xDC\x00\xD8
+} -result \uDC00\uD800
 
 test encoding-16.9 {
     Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2023-02-09 19:52:00 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2023-02-09 19:52:00 (GMT)
commit	fd83fb931e43901b77f4e480ef63841e10b39f22 (patch)
tree	d35fd1c792d990de927ea8f74b85dbb9f80d2bb2
parent	b185a55c3b335a847e148680c628136c7c16640f (diff)
download	tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.zip tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.tar.gz tcl-fd83fb931e43901b77f4e480ef63841e10b39f22.tar.bz2