From 37076881576fb0897e3c1c257ca37cd87685da0c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 14 Nov 2019 14:01:13 +0000 Subject: Better - more complete - fix for [d433c0e0ad]: TCL_UTF_MAX == 4 problems. It allows emoji to be produced by the system encoding, even for other values of TCL_UTF_MAX. Also added test-cases for this. --- generic/tclEncoding.c | 37 +++++++++++++++++++++++++------------ tests/encoding.test | 16 ++++++++++++++++ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index f159b32..00b97f5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2155,7 +2155,7 @@ BinaryProc( /* *------------------------------------------------------------------------- * - * UtfExtToUtfIntProc -- + * UtfIntToUtfExtProc -- * * Convert from UTF-8 to UTF-8. While converting null-bytes from the * Tcl's internal representation (0xc0, 0x80) to the official @@ -2296,7 +2296,7 @@ UtfToUtfProc( * output buffer. */ int pureNullMode) /* Convert embedded nulls from internal * representation to real null-bytes or vice - * versa. */ + * versa. Also combine or separate surrogate pairs */ { const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; @@ -2312,14 +2312,14 @@ UtfToUtfProc( srcEnd = src + srcLen; srcClose = srcEnd; if ((flags & TCL_ENCODING_END) == 0) { - srcClose -= TCL_UTF_MAX; + srcClose -= 6; } if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } dstStart = dst; - dstEnd = dst + dstLen - TCL_UTF_MAX; + dstEnd = dst + dstLen - ((pureNullMode == 1) ? 4 : TCL_UTF_MAX); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2361,15 +2361,28 @@ UtfToUtfProc( src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { - int len = TclUtfToUniChar(src, chPtr); - src += len; - dst += Tcl_UniCharToUtf(*chPtr, dst); -#if TCL_UTF_MAX == 4 - if ((*chPtr >= 0xD800) && (len < 3)) { - src += Tcl_UtfToUniChar(src, chPtr); - dst += Tcl_UniCharToUtf(*chPtr, dst); + src += TclUtfToUniChar(src, chPtr); + if ((*chPtr & 0xFC00) == 0xD800) { + /* A high surrogate character is detected, handle especially */ + Tcl_UniChar low = *chPtr; + size_t len = Tcl_UtfToUniChar(src, &low); + if ((low & 0xFC00) != 0xDC00) { + *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((*chPtr | 0x80) & 0xBF); + continue; + } else if (pureNullMode == 1) { + int full = (((*chPtr & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; + *dst++ = (char) (((full >> 18) | 0xF0) & 0xF7); + *dst++ = (char) (((full >> 12) | 0x80) & 0xBF); + *dst++ = (char) (((full >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((full | 0x80) & 0xBF); + *chPtr = 0; + src += len; + continue; + } } -#endif + dst += Tcl_UniCharToUtf(*chPtr, dst); } } diff --git a/tests/encoding.test b/tests/encoding.test index ed0e6a4..cf27190 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -328,6 +328,22 @@ test encoding-15.3 {UtfToUtfProc null character input} { binary scan [encoding convertto identity $y] H* z list [string bytelength $x] [string bytelength $y] $z } {1 2 c080} +test encoding-15.4 {UtfToUtfProc emoji character input} { + set x \xED\xA0\xBD\xED\xB8\x82 + set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82] + list [string length $x] [string length $y] $y +} "6 2 \uD83D\uDE02" +test encoding-15.5 {UtfToUtfProc emoji character input} { + set x \xF0\x9F\x98\x82 + set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] + list [string length $x] [string length $y] $y +} "4 2 \uD83D\uDE02" +test encoding-15.6 {UtfToUtfProc emoji character output} { + set x \uD83D\uDE02 + set y [encoding convertto utf-8 \uD83D\uDE02] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 4 f09f9882} test encoding-16.1 {UnicodeToUtfProc} { set val [encoding convertfrom unicode NN] -- cgit v0.12