diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-02-18 20:48:59 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-02-18 20:48:59 (GMT) |
commit | 2473a591bfbd5b346e1900e3c1088496b0d17590 (patch) | |
tree | d4351873a596cd351ae0f6ade2cdbc3ae732aec7 | |
parent | 52ed230e6f1f0f90a1ee63afa7a4d9948fd336ae (diff) | |
download | tcl-2473a591bfbd5b346e1900e3c1088496b0d17590.zip tcl-2473a591bfbd5b346e1900e3c1088496b0d17590.tar.gz tcl-2473a591bfbd5b346e1900e3c1088496b0d17590.tar.bz2 |
Proposed fix for [bd94500678]: SEGFAULT when converting Unicode characters outside the BMP to a byte array.
-rw-r--r-- | generic/tclCmdMZ.c | 8 |
-rw-r--r-- | generic/tclEncoding.c | 8 |
-rw-r--r-- | generic/tclScan.c | 4 |
-rw-r--r-- | generic/tclUtf.c | 75 |
4 files changed, 49 insertions, 46 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index dac82b8..c17c4f1 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1221,8 +1221,8 @@ Tcl_SplitObjCmd( fullchar = ch; #if TCL_UTF_MAX <= 4 - if (!len) { - len += TclUtfToUniChar(stringPtr, &ch); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(stringPtr + len, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif @@ -1854,8 +1854,8 @@ StringIsCmd( length2 = TclUtfToUniChar(string1, &ch); fullchar = ch; #if TCL_UTF_MAX <= 4 - if (!length2) { - length2 = TclUtfToUniChar(string1, &ch); + if ((length2 == 1) && ((ch & 0xFC00) == 0xD800)) { + length2 += TclUtfToUniChar(string1 + length2, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index e601c3a..b5517bc 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2384,8 +2384,8 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(*chPtr, dst); #if TCL_UTF_MAX <= 4 - if (!len) { - src += TclUtfToUniChar(src, chPtr); + if ((len == 1) && ((*chPtr & 0xFC00) == 0xD800)) { + src += TclUtfToUniChar(src + len, chPtr); dst += Tcl_UniCharToUtf(*chPtr, dst); } #endif @@ -3006,7 +3006,7 @@ Iso88591FromUtfProc( if (ch > 0xff #if TCL_UTF_MAX <= 4 - || !len + || ((len == 1) && ((ch & 0xFC00) == 0xD800)) #endif ) { if (flags & TCL_ENCODING_STOPONERROR) { @@ -3014,7 +3014,7 @@ Iso88591FromUtfProc( break; } #if TCL_UTF_MAX <= 4 - if (!len) len = 4; + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) len = 4; #endif /* * Plunge on, using '?' as a fallback character. 
diff --git a/generic/tclScan.c b/generic/tclScan.c index fbfba2d..45035f1 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -882,8 +882,8 @@ Tcl_ScanObjCmd( offset = TclUtfToUniChar(string, &sch); i = (int)sch; #if TCL_UTF_MAX == 4 - if (!offset) { - offset = TclUtfToUniChar(string, &sch); + if ((offset == 1) && ((sch & 0xFC00) == 0xD800)) { + offset += TclUtfToUniChar(string+offset, &sch); i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); } #endif diff --git a/generic/tclUtf.c b/generic/tclUtf.c index ce67db7..2227d45 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -312,6 +312,20 @@ Tcl_UtfToUniChar( * characters representing themselves. */ +#if TCL_UTF_MAX <= 4 + /* If *chPtr contains a high surrogate (produced by a previous + * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation + * bytes, then we must produce a follow-up low surrogate. We only + * do that if the high surrogate matches the bits we encounter. + */ + if ((byte >= 0x80) + && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) + && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) + && ((src[2] & 0xC0) == 0x80)) { + *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; + return 3; + } +#endif if ((unsigned)(byte-0x80) < (unsigned) 0x20) { *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; } else { @@ -358,21 +372,14 @@ Tcl_UtfToUniChar( * Four-byte-character lead byte followed by three trail bytes. 
*/ #if TCL_UTF_MAX <= 4 - Tcl_UniChar surrogate; - - byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; - surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); - if (byte & 0x100000) { + byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + | ((src[2] & 0x3F) >> 4)) - 0x40; + if ((unsigned) byte >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ - } else if (*chPtr != surrogate) { - /* produce high surrogate, but don't advance source pointer */ - *chPtr = surrogate; - return 0; } else { - /* produce low surrogate, and advance source pointer */ - *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); - return 4; + /* produce high surrogate, advance source pointer */ + *chPtr = 0xD800 + byte; + return 1; } #else *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) @@ -582,8 +589,8 @@ Tcl_UtfFindFirst( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX <= 4 - if (!len) { - len += TclUtfToUniChar(src, &find); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif @@ -630,8 +637,8 @@ Tcl_UtfFindLast( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX <= 4 - if (!len) { - len += TclUtfToUniChar(src, &find); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif @@ -673,8 +680,8 @@ Tcl_UtfNext( int len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX <= 4 - if (len == 0) { - len = TclUtfToUniChar(src, &ch); + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { + len += TclUtfToUniChar(src + len, &ch); } #endif return src + len; @@ -755,7 +762,7 @@ Tcl_UniCharAtIndex( Tcl_UniChar ch = 0; int fullchar = 0; #if TCL_UTF_MAX <= 4 - int len = 1; + int len = 0; #endif while (index-- >= 0) { @@ -767,9 +774,9 @@ Tcl_UniCharAtIndex( } fullchar = ch; #if 
TCL_UTF_MAX <= 4 - if (!len) { + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { /* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */ - (void)TclUtfToUniChar(src, &ch); + (void)TclUtfToUniChar(src + len, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif @@ -801,14 +808,14 @@ Tcl_UtfAtIndex( register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; - int len = 1; + int len = 0; while (index-- > 0) { len = TclUtfToUniChar(src, &ch); src += len; } #if TCL_UTF_MAX <= 4 - if (!len) { + if ((len == 1) && ((ch & 0xFC00) == 0xD800)) { /* Index points at character following High Surrogate */ src += TclUtfToUniChar(src, &ch); } @@ -905,9 +912,8 @@ Tcl_UtfToUpper( bytes = TclUtfToUniChar(src, &ch); upChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -968,9 +974,8 @@ Tcl_UtfToLower( bytes = TclUtfToUniChar(src, &ch); lowChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -1034,9 +1039,8 @@ Tcl_UtfToTitle( bytes = TclUtfToUniChar(src, &ch); titleChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! 
*/ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } @@ -1055,9 +1059,8 @@ Tcl_UtfToTitle( bytes = TclUtfToUniChar(src, &ch); lowChar = ch; #if TCL_UTF_MAX <= 4 - if (!bytes) { - /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ - bytes = TclUtfToUniChar(src, &ch); + if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) { + bytes += TclUtfToUniChar(src + bytes, &ch); /* Combine surrogates */ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } |