From 19eb51a4cbc12b6d10a2ff7c488ae0471bc30503 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 16 Sep 2019 12:47:19 +0000 Subject: Fix Utf16ToUtfProc() (from TIP #548): If the last code point is a high surrogate, make sure that the actual conversion is delayed until the next round, ensuring proper merging of the two surrogates into a single UTF-8 character. --- generic/tclEncoding.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 9896f85..0ec0649 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2485,10 +2485,16 @@ Utf16ToUtfProc( charLimit = *dstCharsPtr; } result = TCL_OK; - if ((srcLen % sizeof(unsigned short)) != 0) { + + /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ + if ((srcLen % 2) != 0) { + result = TCL_CONVERT_MULTIBYTE; + srcLen--; + } + /* If last code point is a high surrogate, we cannot handle that yet */ + if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; - srcLen /= sizeof(unsigned short); - srcLen *= sizeof(unsigned short); + srcLen-= 2; } srcStart = src; -- cgit v0.12