summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-02-18 20:48:59 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-02-18 20:48:59 (GMT)
commit: 2473a591bfbd5b346e1900e3c1088496b0d17590 (patch)
tree: d4351873a596cd351ae0f6ade2cdbc3ae732aec7
parent: 52ed230e6f1f0f90a1ee63afa7a4d9948fd336ae (diff)
download: tcl-2473a591bfbd5b346e1900e3c1088496b0d17590.zip
tcl-2473a591bfbd5b346e1900e3c1088496b0d17590.tar.gz
tcl-2473a591bfbd5b346e1900e3c1088496b0d17590.tar.bz2
Proposed fix for [bd94500678]: SEGFAULT by conversion of unicode (out of BMP) to byte-array.
-rw-r--r-- generic/tclCmdMZ.c 8
-rw-r--r-- generic/tclEncoding.c 8
-rw-r--r-- generic/tclScan.c 4
-rw-r--r-- generic/tclUtf.c 75
4 files changed, 49 insertions, 46 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index dac82b8..c17c4f1 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1221,8 +1221,8 @@ Tcl_SplitObjCmd(
fullchar = ch;
#if TCL_UTF_MAX <= 4
- if (!len) {
- len += TclUtfToUniChar(stringPtr, &ch);
+ if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+ len += TclUtfToUniChar(stringPtr + len, &ch);
fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
}
#endif
@@ -1854,8 +1854,8 @@ StringIsCmd(
length2 = TclUtfToUniChar(string1, &ch);
fullchar = ch;
#if TCL_UTF_MAX <= 4
- if (!length2) {
- length2 = TclUtfToUniChar(string1, &ch);
+ if ((length2 == 1) && ((ch & 0xFC00) == 0xD800)) {
+ length2 += TclUtfToUniChar(string1 + length2, &ch);
fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
}
#endif
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index e601c3a..b5517bc 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2384,8 +2384,8 @@ UtfToUtfProc(
src += len;
dst += Tcl_UniCharToUtf(*chPtr, dst);
#if TCL_UTF_MAX <= 4
- if (!len) {
- src += TclUtfToUniChar(src, chPtr);
+ if ((len == 1) && ((*chPtr & 0xFC00) == 0xD800)) {
+ src += TclUtfToUniChar(src + len, chPtr);
dst += Tcl_UniCharToUtf(*chPtr, dst);
}
#endif
@@ -3006,7 +3006,7 @@ Iso88591FromUtfProc(
if (ch > 0xff
#if TCL_UTF_MAX <= 4
- || !len
+ || ((len == 1) && ((ch & 0xFC00) == 0xD800))
#endif
) {
if (flags & TCL_ENCODING_STOPONERROR) {
@@ -3014,7 +3014,7 @@ Iso88591FromUtfProc(
break;
}
#if TCL_UTF_MAX <= 4
- if (!len) len = 4;
+ if ((len == 1) && ((ch & 0xFC00) == 0xD800)) len = 4;
#endif
/*
* Plunge on, using '?' as a fallback character.
diff --git a/generic/tclScan.c b/generic/tclScan.c
index fbfba2d..45035f1 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -882,8 +882,8 @@ Tcl_ScanObjCmd(
offset = TclUtfToUniChar(string, &sch);
i = (int)sch;
#if TCL_UTF_MAX == 4
- if (!offset) {
- offset = TclUtfToUniChar(string, &sch);
+ if ((offset == 1) && ((sch & 0xFC00) == 0xD800)) {
+ offset += TclUtfToUniChar(string+offset, &sch);
i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
}
#endif
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ce67db7..2227d45 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -312,6 +312,20 @@ Tcl_UtfToUniChar(
* characters representing themselves.
*/
+#if TCL_UTF_MAX <= 4
+ /* If *chPtr contains a high surrogate (produced by a previous
+ * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
+ * bytes, then we must produce a follow-up low surrogate. We only
+ * do that if the high surrogate matches the bits we encounter.
+ */
+ if ((byte >= 0x80)
+ && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
+ && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
+ && ((src[2] & 0xC0) == 0x80)) {
+ *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
+ return 3;
+ }
+#endif
if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
*chPtr = (Tcl_UniChar) cp1252[byte-0x80];
} else {
@@ -358,21 +372,14 @@ Tcl_UtfToUniChar(
* Four-byte-character lead byte followed by three trail bytes.
*/
#if TCL_UTF_MAX <= 4
- Tcl_UniChar surrogate;
-
- byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
- | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
- surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
- if (byte & 0x100000) {
+ byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+ | ((src[2] & 0x3F) >> 4)) - 0x40;
+ if ((unsigned) byte >= 0x400) {
/* out of range, < 0x10000 or > 0x10ffff */
- } else if (*chPtr != surrogate) {
- /* produce high surrogate, but don't advance source pointer */
- *chPtr = surrogate;
- return 0;
} else {
- /* produce low surrogate, and advance source pointer */
- *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
- return 4;
+ /* produce high surrogate, advance source pointer */
+ *chPtr = 0xD800 + byte;
+ return 1;
}
#else
*chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
@@ -582,8 +589,8 @@ Tcl_UtfFindFirst(
len = TclUtfToUniChar(src, &find);
fullchar = find;
#if TCL_UTF_MAX <= 4
- if (!len) {
- len += TclUtfToUniChar(src, &find);
+ if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+ len += TclUtfToUniChar(src + len, &find);
fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
}
#endif
@@ -630,8 +637,8 @@ Tcl_UtfFindLast(
len = TclUtfToUniChar(src, &find);
fullchar = find;
#if TCL_UTF_MAX <= 4
- if (!len) {
- len += TclUtfToUniChar(src, &find);
+ if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+ len += TclUtfToUniChar(src + len, &find);
fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
}
#endif
@@ -673,8 +680,8 @@ Tcl_UtfNext(
int len = TclUtfToUniChar(src, &ch);
#if TCL_UTF_MAX <= 4
- if (len == 0) {
- len = TclUtfToUniChar(src, &ch);
+ if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+ len += TclUtfToUniChar(src + len, &ch);
}
#endif
return src + len;
@@ -755,7 +762,7 @@ Tcl_UniCharAtIndex(
Tcl_UniChar ch = 0;
int fullchar = 0;
#if TCL_UTF_MAX <= 4
- int len = 1;
+ int len = 0;
#endif
while (index-- >= 0) {
@@ -767,9 +774,9 @@ Tcl_UniCharAtIndex(
}
fullchar = ch;
#if TCL_UTF_MAX <= 4
- if (!len) {
+ if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
/* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */
- (void)TclUtfToUniChar(src, &ch);
+ (void)TclUtfToUniChar(src + len, &ch);
fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
}
#endif
@@ -801,14 +808,14 @@ Tcl_UtfAtIndex(
register int index) /* The position of the desired character. */
{
Tcl_UniChar ch = 0;
- int len = 1;
+ int len = 0;
while (index-- > 0) {
len = TclUtfToUniChar(src, &ch);
src += len;
}
#if TCL_UTF_MAX <= 4
- if (!len) {
+ if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
/* Index points at character following High Surrogate */
src += TclUtfToUniChar(src, &ch);
}
@@ -905,9 +912,8 @@ Tcl_UtfToUpper(
bytes = TclUtfToUniChar(src, &ch);
upChar = ch;
#if TCL_UTF_MAX <= 4
- if (!bytes) {
- /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
- bytes = TclUtfToUniChar(src, &ch);
+ if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+ bytes += TclUtfToUniChar(src + bytes, &ch);
/* Combine surrogates */
upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
}
@@ -968,9 +974,8 @@ Tcl_UtfToLower(
bytes = TclUtfToUniChar(src, &ch);
lowChar = ch;
#if TCL_UTF_MAX <= 4
- if (!bytes) {
- /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
- bytes = TclUtfToUniChar(src, &ch);
+ if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+ bytes += TclUtfToUniChar(src + bytes, &ch);
/* Combine surrogates */
lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
}
@@ -1034,9 +1039,8 @@ Tcl_UtfToTitle(
bytes = TclUtfToUniChar(src, &ch);
titleChar = ch;
#if TCL_UTF_MAX <= 4
- if (!bytes) {
- /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
- bytes = TclUtfToUniChar(src, &ch);
+ if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+ bytes += TclUtfToUniChar(src + bytes, &ch);
/* Combine surrogates */
titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
}
@@ -1055,9 +1059,8 @@ Tcl_UtfToTitle(
bytes = TclUtfToUniChar(src, &ch);
lowChar = ch;
#if TCL_UTF_MAX <= 4
- if (!bytes) {
- /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
- bytes = TclUtfToUniChar(src, &ch);
+ if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+ bytes += TclUtfToUniChar(src + bytes, &ch);
/* Combine surrogates */
lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
}