From ded849032ef8207aae7f1a111b78041de49fffcf Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 13 Sep 2019 12:20:13 +0000 Subject: Attempt to improve [a179564826]: Tk 8.6: prevent issues when encountering non-BMP Unicode characters. In combination with latest Tcl 8.6 (tip of core-8-6-branch) this should handle non-BMP characters better, they should be handled as 2 surrogates instead of 4 separate bytes on all platforms. --- unix/tkUnixFont.c | 22 +++++++++++++--------- unix/tkUnixSelect.c | 4 ++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/unix/tkUnixFont.c b/unix/tkUnixFont.c index f43e086..41b2df1 100644 --- a/unix/tkUnixFont.c +++ b/unix/tkUnixFont.c @@ -171,7 +171,7 @@ static Tcl_ThreadDataKey dataKey; * encodings into the names expected by the Tcl encoding package. */ -static EncodingAlias encodingAliases[] = { +static const EncodingAlias encodingAliases[] = { {"gb2312-raw", "gb2312*"}, {"big5", "big5*"}, {"cns11643-1", "cns11643*-1"}, @@ -572,7 +572,11 @@ UtfToUcs2beProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar ch; + Tcl_UniChar *chPtr = (Tcl_UniChar *)statePtr; + + if (flags & TCL_ENCODING_START) { + *statePtr = 0; + } srcStart = src; srcEnd = src + srcLen; @@ -591,15 +595,14 @@ UtfToUcs2beProc( * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. */ - result = TCL_CONVERT_MULTIBYTE; break; } if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; - } - src += Tcl_UtfToUniChar(src, &ch); + } + src += Tcl_UtfToUniChar(src, chPtr); /* * Ensure big-endianness (store big bits first). @@ -607,8 +610,9 @@ UtfToUcs2beProc( * sure to work in char* for Tcl_UtfToUniChar alignment. 
[Bug 1122671] */ - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); + + *dst++ = (char)(*chPtr >> 8); + *dst++ = (char)*chPtr; } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; @@ -2997,10 +3001,10 @@ static const char * GetEncodingAlias( const char *name) /* The name to look up. */ { - EncodingAlias *aliasPtr; + const EncodingAlias *aliasPtr; for (aliasPtr = encodingAliases; aliasPtr->aliasPattern != NULL; ) { - if (Tcl_StringMatch(name, aliasPtr->aliasPattern)) { + if (Tcl_StringCaseMatch(name, aliasPtr->aliasPattern, 0)) { return aliasPtr->realName; } aliasPtr++; diff --git a/unix/tkUnixSelect.c b/unix/tkUnixSelect.c index dfbb895..4819183 100644 --- a/unix/tkUnixSelect.c +++ b/unix/tkUnixSelect.c @@ -21,7 +21,7 @@ typedef struct ConvertInfo { * offset of the next chunk of data to * transfer. */ Tcl_EncodingState state; /* The encoding state needed across chunks. */ - char buffer[TCL_UTF_MAX]; /* A buffer to hold part of a UTF character + char buffer[4]; /* A buffer to hold part of a UTF character * that is split across chunks.*/ } ConvertInfo; @@ -446,7 +446,7 @@ TkSelPropProc( * Preserve any left-over bytes. */ - if (srcLen > TCL_UTF_MAX) { + if (srcLen > 3) { Tcl_Panic("selection conversion left too many bytes unconverted"); } memcpy(incrPtr->converts[i].buffer, src, (size_t) srcLen+1); -- cgit v0.12 From 744eaf4fe647fb18ea2bfcfae3371a301136db47 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 13 Sep 2019 17:54:06 +0000 Subject: increase FONTMAP_PAGES so it can hold 3 planes of Unicode characters instead of just one. This appears to be one cause for crashes. 
--- unix/tkUnixFont.c | 8 ++++---- win/tkWinFont.c | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/unix/tkUnixFont.c b/unix/tkUnixFont.c index 41b2df1..c1b1b14 100644 --- a/unix/tkUnixFont.c +++ b/unix/tkUnixFont.c @@ -37,8 +37,8 @@ static const char *const encodingList[] = { #define FONTMAP_SHIFT 10 -#define FONTMAP_PAGES (1 << (sizeof(Tcl_UniChar)*8 - FONTMAP_SHIFT)) #define FONTMAP_BITSPERPAGE (1 << FONTMAP_SHIFT) +#define FONTMAP_PAGES (0x30000 / FONTMAP_BITSPERPAGE) typedef struct FontFamily { struct FontFamily *nextPtr; /* Next in list of all known font families. */ @@ -1972,11 +1972,11 @@ FindSubFontForChar( SubFont *subFontPtr; Tcl_DString ds; - if (FontMapLookup(&fontPtr->subFontArray[0], ch)) { - return &fontPtr->subFontArray[0]; + if (ch > 0x30000) { + ch = 0xfffd; } - for (i = 1; i < fontPtr->numSubFonts; i++) { + for (i = 0; i < fontPtr->numSubFonts; i++) { if (FontMapLookup(&fontPtr->subFontArray[i], ch)) { return &fontPtr->subFontArray[i]; } diff --git a/win/tkWinFont.c b/win/tkWinFont.c index ad7738f..4351f99 100644 --- a/win/tkWinFont.c +++ b/win/tkWinFont.c @@ -28,8 +28,8 @@ #define FONTMAP_SHIFT 10 -#define FONTMAP_PAGES (1 << (sizeof(Tcl_UniChar)*8 - FONTMAP_SHIFT)) #define FONTMAP_BITSPERPAGE (1 << FONTMAP_SHIFT) +#define FONTMAP_PAGES (0x30000 / FONTMAP_BITSPERPAGE) typedef struct FontFamily { struct FontFamily *nextPtr; /* Next in list of all known font families. */ @@ -1943,8 +1943,7 @@ FindSubFontForChar( SubFont *subFontPtr; Tcl_DString ds; - - if ((ch < BASE_CHARS) || (ch >= 0x10000)) { + if ((ch < BASE_CHARS) || (ch >= 0x30000)) { return &fontPtr->subFontArray[0]; } -- cgit v0.12 From 932a496abd3c78e24e4723792850205193d43f4d Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 16 Sep 2019 12:44:55 +0000 Subject: Fix Ucs2beToUtfProc(): If last code-point is higher surrogate, make sure that actual conversion is delayed until the next round, assuring proper merging of two surrogates into a single UTF-8 character. 
--- unix/tkUnixFont.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/unix/tkUnixFont.c b/unix/tkUnixFont.c index b500eb6..1e80231 100644 --- a/unix/tkUnixFont.c +++ b/unix/tkUnixFont.c @@ -391,7 +391,7 @@ ControlUtfProc( const char *srcStart, *srcEnd; char *dstStart, *dstEnd; int ch, result; - static char hexChars[] = "0123456789ABCDEF"; + static char hexChars[] = "0123456789abcdef"; static char mapChars[] = { 0, 0, 0, 0, 0, 0, 0, 'a', 'b', 't', 'n', 'v', 'f', 'r' @@ -430,10 +430,10 @@ ControlUtfProc( } else { /* TODO we can do better here */ dst[1] = 'u'; - dst[2] = 'F'; - dst[3] = 'F'; - dst[4] = 'F'; - dst[5] = 'D'; + dst[2] = 'f'; + dst[3] = 'f'; + dst[4] = 'f'; + dst[5] = 'd'; dst += 6; } } @@ -449,7 +449,6 @@ ControlUtfProc( * Ucs2beToUtfProc -- * * Convert from UCS-2BE (big-endian 16-bit Unicode) to UTF-8. - * This is only defined on LE machines. * * Results: * Returns TCL_OK if conversion was successful. @@ -498,6 +497,11 @@ Ucs2beToUtfProc( result = TCL_CONVERT_MULTIBYTE; srcLen--; } + /* If last code point is a high surrogate, we cannot handle that yet */ + if ((srcLen >= 2) && ((src[srcLen - 2] & 0xFC) == 0xD8)) { + result = TCL_CONVERT_MULTIBYTE; + srcLen -= 2; + } srcStart = src; srcEnd = src + srcLen; -- cgit v0.12