summary | refs | log | tree | commit | diff | stats
path: root/generic
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-03-11 22:00:29 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-03-11 22:00:29 (GMT)
commit: 1889ded1144a4dbd44d0c6f03e72a01d70115a51 (patch)
tree: ef65cd952283efecf6e7fce938de557033f75a4e /generic
parent: 21388fa31e1099692f83009f664791387f2c387d (diff)
download: tcl-1889ded1144a4dbd44d0c6f03e72a01d70115a51.zip
tcl-1889ded1144a4dbd44d0c6f03e72a01d70115a51.tar.gz
tcl-1889ded1144a4dbd44d0c6f03e72a01d70115a51.tar.bz2
Proposed fix for [db7a085bd9]: encoding convertfrom -strict utf-16 accepts partial surrogates. TODO: testcases, and implement for 8.7 too
Diffstat (limited to 'generic')
-rw-r--r-- generic/tclCmdAH.c 2
-rw-r--r-- generic/tclEncoding.c 30
2 files changed, 27 insertions, 5 deletions
diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c
index 4df1216..ac504d0 100644
--- a/generic/tclCmdAH.c
+++ b/generic/tclCmdAH.c
@@ -514,7 +514,7 @@ EncodingConvertfromObjCmd(
char buf[TCL_INTEGER_SPACE];
sprintf(buf, "%" TCL_Z_MODIFIER "u", result);
Tcl_SetObjResult(interp, Tcl_ObjPrintf("unexpected byte sequence starting at index %"
- TCL_Z_MODIFIER "u: '\\x%X'", result, UCHAR(bytesPtr[result])));
+ TCL_Z_MODIFIER "u: '\\x%02X'", result, UCHAR(bytesPtr[result])));
Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE",
buf, NULL);
Tcl_DStringFree(&ds);
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index fc3397a..4f334bb 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2603,6 +2603,7 @@ Utf32ToUtfProc(
dst += Tcl_UniCharToUtf(-1, dst);
}
#endif
+
if ((unsigned)ch > 0x10FFFF) {
ch = 0xFFFD;
if (STOPONERROR) {
@@ -2639,6 +2640,7 @@ Utf32ToUtfProc(
dst += Tcl_UniCharToUtf(-1, dst);
}
#endif
+
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
/* We have a single byte left-over at the end */
if (dst > dstEnd) {
@@ -2846,6 +2848,13 @@ Utf16ToUtfProc(
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
}
if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) {
+ if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src -= 2; /* Go back to before the high surrogate */
+ dst--; /* Also undo writing a single byte too much */
+ numChars--;
+ break;
+ }
/* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}
@@ -2855,17 +2864,30 @@ Utf16ToUtfProc(
* unsigned short-size data.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (ch & 0xFF);
- } else {
+ } else if (((prev & ~0x3FF) == 0xD800) || ((ch & ~0x3FF) == 0xD800)) {
dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
+ } else if (((ch & ~0x3FF) == 0xDC00) && ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
+ /* Lo surrogate not preceded by Hi surrogate */
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ } else {
+ dst += Tcl_UniCharToUtf(ch, dst);
}
src += sizeof(unsigned short);
}
if ((ch & ~0x3FF) == 0xD800) {
- /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
- dst += Tcl_UniCharToUtf(-1, dst);
+ if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) {
+ result = TCL_CONVERT_UNKNOWN;
+ src -= 2;
+ dst--;
+ numChars--;
+ } else {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
}
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
/* We have a single byte left-over at the end */