From 2473a591bfbd5b346e1900e3c1088496b0d17590 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Mon, 18 Feb 2019 20:48:59 +0000
Subject: Proposed fix for [bd94500678]: SEGFAULT on conversion of Unicode
 (outside the BMP) to byte-array.

---
 generic/tclCmdMZ.c    |  8 +++---
 generic/tclEncoding.c |  8 +++---
 generic/tclScan.c     |  4 +--
 generic/tclUtf.c      | 75 ++++++++++++++++++++++++++-------------------------
 4 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index dac82b8..c17c4f1 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1221,8 +1221,8 @@ Tcl_SplitObjCmd(
 	    fullchar = ch;
 
 #if TCL_UTF_MAX <= 4
-	    if (!len) {
-		len += TclUtfToUniChar(stringPtr, &ch);
+	    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+		len += TclUtfToUniChar(stringPtr + len, &ch);
 		fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	    }
 #endif
@@ -1854,8 +1854,8 @@ StringIsCmd(
 	    length2 = TclUtfToUniChar(string1, &ch);
 	    fullchar = ch;
 #if TCL_UTF_MAX <= 4
-	    if (!length2) {
-	    	length2 = TclUtfToUniChar(string1, &ch);
+	    if ((length2 == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    	length2 += TclUtfToUniChar(string1 + length2, &ch);
 	    	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	    }
 #endif
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index e601c3a..b5517bc 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2384,8 +2384,8 @@ UtfToUtfProc(
 	    src += len;
 	    dst += Tcl_UniCharToUtf(*chPtr, dst);
 #if TCL_UTF_MAX <= 4
-	    if (!len) {
-		src += TclUtfToUniChar(src, chPtr);
+	    if ((len == 1) && ((*chPtr & 0xFC00) == 0xD800)) {
+		src += TclUtfToUniChar(src + len, chPtr);
 		dst += Tcl_UniCharToUtf(*chPtr, dst);
 	    }
 #endif
@@ -3006,7 +3006,7 @@ Iso88591FromUtfProc(
 
 	if (ch > 0xff
 #if TCL_UTF_MAX <= 4
-		|| !len
+		|| ((len == 1) && ((ch & 0xFC00) == 0xD800))
 #endif
 		) {
 	    if (flags & TCL_ENCODING_STOPONERROR) {
@@ -3014,7 +3014,7 @@ Iso88591FromUtfProc(
 		break;
 	    }
 #if TCL_UTF_MAX <= 4
-	    if (!len) len = 4;
+	    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) len = 4;
 #endif
 	    /*
 	     * Plunge on, using '?' as a fallback character.
diff --git a/generic/tclScan.c b/generic/tclScan.c
index fbfba2d..45035f1 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -882,8 +882,8 @@ Tcl_ScanObjCmd(
 	    offset = TclUtfToUniChar(string, &sch);
 	    i = (int)sch;
 #if TCL_UTF_MAX == 4
-	    if (!offset) {
-		offset = TclUtfToUniChar(string, &sch);
+	    if ((offset == 1) && ((sch & 0xFC00) == 0xD800)) {
+		offset += TclUtfToUniChar(string+offset, &sch);
 		i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
 	    }
 #endif
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ce67db7..2227d45 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -312,6 +312,20 @@ Tcl_UtfToUniChar(
 	 * characters representing themselves.
 	 */
 
+#if TCL_UTF_MAX <= 4
+	/* If *chPtr contains a high surrogate (produced by a previous
+	 * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
+	 * bytes, then we must produce a follow-up low surrogate. We only
+	 * do that if the high surrogate matches the bits we encounter.
+	 */
+	if ((byte >= 0x80)
+		&& (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
+		&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
+		&& ((src[2] & 0xC0) == 0x80)) {
+	    *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
+	    return 3;
+	}
+#endif
 	if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
 	    *chPtr = (Tcl_UniChar) cp1252[byte-0x80];
 	} else {
@@ -358,21 +372,14 @@ Tcl_UtfToUniChar(
 	     * Four-byte-character lead byte followed by three trail bytes.
 	     */
 #if TCL_UTF_MAX <= 4
-	    Tcl_UniChar surrogate;
-
-	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
-		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
-	    surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
-	    if (byte & 0x100000) {
+	    byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+		    | ((src[2] & 0x3F) >> 4)) - 0x40;
+	    if ((unsigned) byte >= 0x400) {
 		/* out of range, < 0x10000 or > 0x10ffff */
-	    } else if (*chPtr != surrogate) {
-		/* produce high surrogate, but don't advance source pointer */
-		*chPtr = surrogate;
-		return 0;
 	    } else {
-		/* produce low surrogate, and advance source pointer */
-		*chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
-		return 4;
+		/* produce high surrogate, advance source pointer */
+		*chPtr = 0xD800 + byte;
+		return 1;
 	    }
 #else
 	    *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
@@ -582,8 +589,8 @@ Tcl_UtfFindFirst(
 	len = TclUtfToUniChar(src, &find);
 	fullchar = find;
 #if TCL_UTF_MAX <= 4
-	if (!len) {
-	    len += TclUtfToUniChar(src, &find);
+	if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    len += TclUtfToUniChar(src + len, &find);
 	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
 	}
 #endif
@@ -630,8 +637,8 @@ Tcl_UtfFindLast(
 	len = TclUtfToUniChar(src, &find);
 	fullchar = find;
 #if TCL_UTF_MAX <= 4
-	if (!len) {
-	    len += TclUtfToUniChar(src, &find);
+	if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    len += TclUtfToUniChar(src + len, &find);
 	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
 	}
 #endif
@@ -673,8 +680,8 @@ Tcl_UtfNext(
     int len = TclUtfToUniChar(src, &ch);
 
 #if TCL_UTF_MAX <= 4
-    if (len == 0) {
-      len = TclUtfToUniChar(src, &ch);
+    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+      len += TclUtfToUniChar(src + len, &ch);
     }
 #endif
     return src + len;
@@ -755,7 +762,7 @@ Tcl_UniCharAtIndex(
     Tcl_UniChar ch = 0;
     int fullchar = 0;
 #if TCL_UTF_MAX <= 4
-	int len = 1;
+	int len = 0;
 #endif
 
     while (index-- >= 0) {
@@ -767,9 +774,9 @@ Tcl_UniCharAtIndex(
     }
     fullchar = ch;
 #if TCL_UTF_MAX <= 4
-     if (!len) {
+    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
 	/* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */
-	(void)TclUtfToUniChar(src, &ch);
+	(void)TclUtfToUniChar(src + len, &ch);
 	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
     }
 #endif
@@ -801,14 +808,14 @@ Tcl_UtfAtIndex(
     register int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch = 0;
-    int len = 1;
+    int len = 0;
 
     while (index-- > 0) {
 	len = TclUtfToUniChar(src, &ch);
 	src += len;
     }
 #if TCL_UTF_MAX <= 4
-     if (!len) {
+    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
 	/* Index points at character following High Surrogate */
 	src += TclUtfToUniChar(src, &ch);
     }
@@ -905,9 +912,8 @@ Tcl_UtfToUpper(
 	bytes = TclUtfToUniChar(src, &ch);
 	upChar = ch;
 #if TCL_UTF_MAX <= 4
-	if (!bytes) {
-	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
-	    bytes = TclUtfToUniChar(src, &ch);
+	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    bytes += TclUtfToUniChar(src + bytes, &ch);
 	    /* Combine surrogates */
 	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
@@ -968,9 +974,8 @@ Tcl_UtfToLower(
 	bytes = TclUtfToUniChar(src, &ch);
 	lowChar = ch;
 #if TCL_UTF_MAX <= 4
-	if (!bytes) {
-	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
-	    bytes = TclUtfToUniChar(src, &ch);
+	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    bytes += TclUtfToUniChar(src + bytes, &ch);
 	    /* Combine surrogates */
 	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
@@ -1034,9 +1039,8 @@ Tcl_UtfToTitle(
 	bytes = TclUtfToUniChar(src, &ch);
 	titleChar = ch;
 #if TCL_UTF_MAX <= 4
-	if (!bytes) {
-	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
-	    bytes = TclUtfToUniChar(src, &ch);
+	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    bytes += TclUtfToUniChar(src + bytes, &ch);
 	    /* Combine surrogates */
 	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
@@ -1055,9 +1059,8 @@ Tcl_UtfToTitle(
 	bytes = TclUtfToUniChar(src, &ch);
 	lowChar = ch;
 #if TCL_UTF_MAX <= 4
-	if (!bytes) {
-	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
-	    bytes = TclUtfToUniChar(src, &ch);
+	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    bytes += TclUtfToUniChar(src + bytes, &ch);
 	    /* Combine surrogates */
 	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
-- 
cgit v0.12


From 9589c85462da7e8d01fe0154de892c6d30d92f0d Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Tue, 19 Feb 2019 19:38:10 +0000
Subject: Minor optimizations

---
 generic/tclCmdMZ.c     |  20 ++++-----
 generic/tclEncoding.c  |  14 +++---
 generic/tclExecute.c   |   4 +-
 generic/tclParse.c     |   6 +--
 generic/tclScan.c      |   2 +-
 generic/tclStringObj.c |   8 ++--
 generic/tclStubInit.c  |   4 +-
 generic/tclUtf.c       | 113 +++++++++++++++++++++++++------------------------
 generic/tclZipfs.c     |  22 +---------
 win/tclWin32Dll.c      |   8 +++-
 10 files changed, 95 insertions(+), 106 deletions(-)

diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index c17c4f1..a289a5c 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1221,7 +1221,7 @@ Tcl_SplitObjCmd(
 	    fullchar = ch;
 
 #if TCL_UTF_MAX <= 4
-	    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    if ((ch >= 0xD800) && (len < 3)) {
 		len += TclUtfToUniChar(stringPtr + len, &ch);
 		fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	    }
@@ -1447,8 +1447,8 @@ StringIndexCmd(
 	    char buf[4];
 
 	    length = Tcl_UniCharToUtf(ch, buf);
-	    if (!length) {
-		length = Tcl_UniCharToUtf(-1, buf);
+	    if ((ch >= 0xD800) && (length < 3)) {
+		length = Tcl_UniCharToUtf(-1, buf + length);
 	    }
 	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
 	}
@@ -1854,7 +1854,7 @@ StringIsCmd(
 	    length2 = TclUtfToUniChar(string1, &ch);
 	    fullchar = ch;
 #if TCL_UTF_MAX <= 4
-	    if ((length2 == 1) && ((ch & 0xFC00) == 0xD800)) {
+	    if ((ch >= 0xD800) && (length2 < 3)) {
 	    	length2 += TclUtfToUniChar(string1 + length2, &ch);
 	    	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	    }
@@ -1935,7 +1935,7 @@ StringMapCmd(
 	const char *string = TclGetStringFromObj(objv[1], &length2);
 
 	if ((length2 > 1) &&
-		strncmp(string, "-nocase", (size_t) length2) == 0) {
+		strncmp(string, "-nocase", length2) == 0) {
 	    nocase = 1;
 	} else {
 	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
@@ -2203,7 +2203,7 @@ StringMatchCmd(
 	const char *string = TclGetStringFromObj(objv[1], &length);
 
 	if ((length > 1) &&
-	    strncmp(string, "-nocase", (size_t) length) == 0) {
+	    strncmp(string, "-nocase", length) == 0) {
 	    nocase = TCL_MATCH_NOCASE;
 	} else {
 	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
@@ -2605,10 +2605,10 @@ StringEqualCmd(
 
     for (i = 1; i < objc-2; i++) {
 	string2 = TclGetStringFromObj(objv[i], &length);
-	if ((length > 1) && !strncmp(string2, "-nocase", (size_t)length)) {
+	if ((length > 1) && !strncmp(string2, "-nocase", length)) {
 	    nocase = 1;
 	} else if ((length > 1)
-		&& !strncmp(string2, "-length", (size_t)length)) {
+		&& !strncmp(string2, "-length", length)) {
 	    if (i+1 >= objc-2) {
 		goto str_cmp_args;
 	    }
@@ -2703,10 +2703,10 @@ TclStringCmpOpts(
 
     for (i = 1; i < objc-2; i++) {
 	string = TclGetStringFromObj(objv[i], &length);
-	if ((length > 1) && !strncmp(string, "-nocase", (size_t)length)) {
+	if ((length > 1) && !strncmp(string, "-nocase", length)) {
 	    *nocase = 1;
 	} else if ((length > 1)
-		&& !strncmp(string, "-length", (size_t)length)) {
+		&& !strncmp(string, "-length", length)) {
 	    if (i+1 >= objc-2) {
 		goto str_cmp_args;
 	    }
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index b5517bc..08a0de3 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2167,7 +2167,7 @@ BinaryProc(
     *srcReadPtr = srcLen;
     *dstWrotePtr = srcLen;
     *dstCharsPtr = srcLen;
-    memcpy(dst, src, (size_t) srcLen);
+    memcpy(dst, src, srcLen);
     return result;
 }
 
@@ -2384,7 +2384,7 @@ UtfToUtfProc(
 	    src += len;
 	    dst += Tcl_UniCharToUtf(*chPtr, dst);
 #if TCL_UTF_MAX <= 4
-	    if ((len == 1) && ((*chPtr & 0xFC00) == 0xD800)) {
+	    if ((*chPtr >= 0xD800) && (len < 3)) {
 		src += TclUtfToUniChar(src + len, chPtr);
 		dst += Tcl_UniCharToUtf(*chPtr, dst);
 	    }
@@ -3006,7 +3006,7 @@ Iso88591FromUtfProc(
 
 	if (ch > 0xff
 #if TCL_UTF_MAX <= 4
-		|| ((len == 1) && ((ch & 0xFC00) == 0xD800))
+		|| ((ch >= 0xD800) && (len < 3))
 #endif
 		) {
 	    if (flags & TCL_ENCODING_STOPONERROR) {
@@ -3014,7 +3014,7 @@ Iso88591FromUtfProc(
 		break;
 	    }
 #if TCL_UTF_MAX <= 4
-	    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) len = 4;
+	    if ((ch >= 0xD800) && (len < 3)) len = 4;
 #endif
 	    /*
 	     * Plunge on, using '?' as a fallback character.
@@ -3364,7 +3364,7 @@ EscapeFromUtfProc(
 	    *dstWrotePtr = 0;
 	    return TCL_CONVERT_NOSPACE;
 	}
-	memcpy(dst, dataPtr->init, (size_t)dataPtr->initLen);
+	memcpy(dst, dataPtr->init, dataPtr->initLen);
 	dst += dataPtr->initLen;
     } else {
 	state = PTR2INT(*statePtr);
@@ -3443,7 +3443,7 @@ EscapeFromUtfProc(
 		    break;
 		}
 		memcpy(dst, subTablePtr->sequence,
-			(size_t) subTablePtr->sequenceLen);
+			subTablePtr->sequenceLen);
 		dst += subTablePtr->sequenceLen;
 	    }
 	}
@@ -3486,7 +3486,7 @@ EscapeFromUtfProc(
 		memcpy(dst, dataPtr->subTables[0].sequence, len);
 		dst += len;
 	    }
-	    memcpy(dst, dataPtr->final, (size_t) dataPtr->finalLen);
+	    memcpy(dst, dataPtr->final, dataPtr->finalLen);
 	    dst += dataPtr->finalLen;
 	    state &= ~TCL_ENCODING_END;
 	}
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index 17ad0bb..3ae5571 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -5227,8 +5227,8 @@ TEBCresume(
 		objResultPtr = Tcl_NewObj();
 	    } else {
 		length = Tcl_UniCharToUtf(ch, buf);
-		if (!length) {
-		    length = Tcl_UniCharToUtf(-1, buf);
+		if ((ch >= 0xD800) && (length < 3)) {
+		    length = Tcl_UniCharToUtf(-1, buf + length);
 		}
 		objResultPtr = Tcl_NewStringObj(buf, length);
 	    }
diff --git a/generic/tclParse.c b/generic/tclParse.c
index ccb648c..8d07f7f 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -939,9 +939,9 @@ TclParseBackslash(
 	*readPtr = count;
     }
     count = Tcl_UniCharToUtf(result, dst);
-    if (!count) {
-	/* Special case for handling upper surrogates. */
-	count = Tcl_UniCharToUtf(-1, dst);
+    if ((result >= 0xD800) && (count < 3)) {
+	/* Special case for handling high surrogates. */
+	count += Tcl_UniCharToUtf(-1, dst + count);
     }
     return count;
 }
diff --git a/generic/tclScan.c b/generic/tclScan.c
index 45035f1..21ad953 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -882,7 +882,7 @@ Tcl_ScanObjCmd(
 	    offset = TclUtfToUniChar(string, &sch);
 	    i = (int)sch;
 #if TCL_UTF_MAX == 4
-	    if ((offset == 1) && ((sch & 0xFC00) == 0xD800)) {
+	    if (((sch & 0xFC00) == 0xD800) && (offset < 3)) {
 		offset += TclUtfToUniChar(string+offset, &sch);
 		i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
 	    }
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 72ca7cd..1ef7b9a 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -2048,9 +2048,9 @@ Tcl_AppendFormatToObj(
 		goto error;
 	    }
 	    length = Tcl_UniCharToUtf(code, buf);
-	    if (!length) {
-		/* Special case for handling upper surrogates. */
-		length = Tcl_UniCharToUtf(-1, buf);
+	    if ((code >= 0xD800) && (length < 3)) {
+		/* Special case for handling high surrogates. */
+		length += Tcl_UniCharToUtf(-1, buf + length);
 	    }
 	    segment = Tcl_NewStringObj(buf, length);
 	    Tcl_IncrRefCount(segment);
@@ -4287,7 +4287,7 @@ ExtendStringRepWithUnicode(
   copyBytes:
     dst = objPtr->bytes + origLength;
     for (i = 0; i < numChars; i++) {
-	dst += Tcl_UniCharToUtf((int) unicode[i], dst);
+	dst += Tcl_UniCharToUtf(unicode[i], dst);
     }
     *dst = '\0';
     objPtr->length = dst - objPtr->bytes;
diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c
index 8429a2f..66bb305 100644
--- a/generic/tclStubInit.c
+++ b/generic/tclStubInit.c
@@ -333,7 +333,7 @@ Tcl_WinTCharToUtf(
     wEnd = (wchar_t *)string + len;
     for (w = (wchar_t *)string; w < wEnd; ) {
 	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
-	    /* Special case for handling upper surrogates. */
+	    /* Special case for handling high surrogates. */
 	    p += Tcl_UniCharToUtf(-1, p);
 	}
 	blen = Tcl_UniCharToUtf(*w, p);
@@ -341,7 +341,7 @@ Tcl_WinTCharToUtf(
 	w++;
     }
     if (!blen) {
-	/* Special case for handling upper surrogates. */
+	/* Special case for handling high surrogates. */
 	p += Tcl_UniCharToUtf(-1, p);
     }
     Tcl_DStringSetLength(dsPtr, oldLength + (p - result));
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 2227d45..6b63ecb 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -147,17 +147,17 @@ Tcl_UniCharToUtf(
 		    /* Low surrogate */
 		    if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
 			    && ((buf[2] & 0xCF) == 0)) {
-			/* Previous Tcl_UniChar was a High surrogate, so combine */
+			/* Previous Tcl_UniChar was a high surrogate, so combine */
 			buf[3] = (char) ((ch & 0x3F) | 0x80);
 			buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80);
 			return 4;
 		    }
-		    /* Previous Tcl_UniChar was not a High surrogate, so just output */
+		    /* Previous Tcl_UniChar was not a high surrogate, so just output */
 		} else {
 		    /* High surrogate */
 		    ch += 0x40;
 		    /* Fill buffer with specific 3-byte (invalid) byte combination,
-		       so following Low surrogate can recognize it and combine */
+		       so following low surrogate can recognize it and combine */
 		    buf[2] = (char) ((ch << 4) & 0x30);
 		    buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
 		    buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
@@ -232,15 +232,18 @@ Tcl_UniCharToUtfDString(
     wEnd = uniStr + uniLength;
     for (w = uniStr; w < wEnd; ) {
 	if (!len && ((*w & 0xFC00) != 0xDC00)) {
-	    /* Special case for handling upper surrogates. */
+	    /* Special case for handling high surrogates. */
 	    p += Tcl_UniCharToUtf(-1, p);
 	}
 	len = Tcl_UniCharToUtf(*w, p);
 	p += len;
+	if ((*w >= 0xD800) && (len < 3)) {
+	    len = 0; /* Indication that high surrogate was found */
+	}
 	w++;
     }
     if (!len) {
-	/* Special case for handling upper surrogates. */
+	/* Special case for handling high surrogates. */
 	p += Tcl_UniCharToUtf(-1, p);
     }
     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
@@ -296,7 +299,7 @@ Tcl_UtfToUniChar(
     register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
 				 * the UTF-8 string. */
 {
-    register int byte;
+    Tcl_UniChar byte;
 
     /*
      * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
@@ -326,10 +329,10 @@ Tcl_UtfToUniChar(
 	    return 3;
 	}
 #endif
-	if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
-	    *chPtr = (Tcl_UniChar) cp1252[byte-0x80];
+	if (byte-0x80 < 0x20) {
+	    *chPtr = cp1252[byte-0x80];
 	} else {
-	    *chPtr = (Tcl_UniChar) byte;
+	    *chPtr = byte;
 	}
 	return 1;
     } else if (byte < 0xE0) {
@@ -338,7 +341,7 @@ Tcl_UtfToUniChar(
 	     * Two-byte-character lead-byte followed by a trail-byte.
 	     */
 
-	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
+	    *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F));
 	    if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
 		return 2;
 	    }
@@ -354,7 +357,7 @@ Tcl_UtfToUniChar(
 	     * Three-byte-character lead byte followed by two trail bytes.
 	     */
 
-	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
+	    *chPtr = (((byte & 0x0F) << 12)
 		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
 	    if (*chPtr > 0x7FF) {
 		return 3;
@@ -374,7 +377,7 @@ Tcl_UtfToUniChar(
 #if TCL_UTF_MAX <= 4
 	    byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
 		    | ((src[2] & 0x3F) >> 4)) - 0x40;
-	    if ((unsigned) byte >= 0x400) {
+	    if (byte >= 0x400) {
 		/* out of range, < 0x10000 or > 0x10ffff */
 	    } else {
 		/* produce high surrogate, advance source pointer */
@@ -382,9 +385,9 @@ Tcl_UtfToUniChar(
 		return 1;
 	    }
 #else
-	    *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+	    *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
 		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
-	    if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+	    if ((*chPtr - 0x10000) <= 0xFFFFF) {
 		return 4;
 	    }
 #endif
@@ -396,7 +399,7 @@ Tcl_UtfToUniChar(
 	 */
     }
 
-    *chPtr = (Tcl_UniChar) byte;
+    *chPtr = byte;
     return 1;
 }
 
@@ -457,8 +460,8 @@ Tcl_UtfToUniCharDString(
     while (p < end) {
 	if (Tcl_UtfCharComplete(p, end-p)) {
 	    p += TclUtfToUniChar(p, &ch);
-	} else if ((unsigned)((UCHAR(*p)-0x80)) < (unsigned) 0x20) {
-	    ch = (Tcl_UniChar) cp1252[UCHAR(*p++)-0x80];
+	} else if (((UCHAR(*p)-0x80)) < 0x20) {
+	    ch = cp1252[UCHAR(*p++)-0x80];
 	} else {
 	    ch = UCHAR(*p++);
 	}
@@ -589,7 +592,7 @@ Tcl_UtfFindFirst(
 	len = TclUtfToUniChar(src, &find);
 	fullchar = find;
 #if TCL_UTF_MAX <= 4
-	if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+	if ((ch >= 0xD800) && (len < 3)) {
 	    len += TclUtfToUniChar(src + len, &find);
 	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
 	}
@@ -637,7 +640,7 @@ Tcl_UtfFindLast(
 	len = TclUtfToUniChar(src, &find);
 	fullchar = find;
 #if TCL_UTF_MAX <= 4
-	if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+	if ((ch >= 0xD800) && (len < 3)) {
 	    len += TclUtfToUniChar(src + len, &find);
 	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
 	}
@@ -680,8 +683,8 @@ Tcl_UtfNext(
     int len = TclUtfToUniChar(src, &ch);
 
 #if TCL_UTF_MAX <= 4
-    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
-      len += TclUtfToUniChar(src + len, &ch);
+    if ((ch >= 0xD800) && (len < 3)) {
+	len += TclUtfToUniChar(src + len, &ch);
     }
 #endif
     return src + len;
@@ -774,8 +777,8 @@ Tcl_UniCharAtIndex(
     }
     fullchar = ch;
 #if TCL_UTF_MAX <= 4
-    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
-	/* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */
+    if ((ch >= 0xD800) && (len < 3)) {
+	/* If last Tcl_UniChar was an high surrogate, combine with low surrogate */
 	(void)TclUtfToUniChar(src + len, &ch);
 	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
     }
@@ -815,7 +818,7 @@ Tcl_UtfAtIndex(
 	src += len;
     }
 #if TCL_UTF_MAX <= 4
-    if ((len == 1) && ((ch & 0xFC00) == 0xD800)) {
+    if ((ch >= 0xD800) && (len < 3)) {
 	/* Index points at character following High Surrogate */
 	src += TclUtfToUniChar(src, &ch);
     }
@@ -901,7 +904,7 @@ Tcl_UtfToUpper(
     Tcl_UniChar ch = 0;
     int upChar;
     char *src, *dst;
-    int bytes;
+    int len;
 
     /*
      * Iterate over the string until we hit the terminating null.
@@ -909,11 +912,11 @@ Tcl_UtfToUpper(
 
     src = dst = str;
     while (*src) {
-	bytes = TclUtfToUniChar(src, &ch);
+	len = TclUtfToUniChar(src, &ch);
 	upChar = ch;
 #if TCL_UTF_MAX <= 4
-	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
-	    bytes += TclUtfToUniChar(src + bytes, &ch);
+	if ((ch >= 0xD800) && (len < 3)) {
+	    len += TclUtfToUniChar(src + len, &ch);
 	    /* Combine surrogates */
 	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
@@ -926,13 +929,13 @@ Tcl_UtfToUpper(
 	 * char to dst if its size is <= the original char.
 	 */
 
-	if ((bytes < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) {
-	    memcpy(dst, src, (size_t) bytes);
-	    dst += bytes;
+	if ((len < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) {
+	    memcpy(dst, src, len);
+	    dst += len;
 	} else {
 	    dst += Tcl_UniCharToUtf(upChar, dst);
 	}
-	src += bytes;
+	src += len;
     }
     *dst = '\0';
     return (dst - str);
@@ -963,7 +966,7 @@ Tcl_UtfToLower(
     Tcl_UniChar ch = 0;
     int lowChar;
     char *src, *dst;
-    int bytes;
+    int len;
 
     /*
      * Iterate over the string until we hit the terminating null.
@@ -971,11 +974,11 @@ Tcl_UtfToLower(
 
     src = dst = str;
     while (*src) {
-	bytes = TclUtfToUniChar(src, &ch);
+	len = TclUtfToUniChar(src, &ch);
 	lowChar = ch;
 #if TCL_UTF_MAX <= 4
-	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
-	    bytes += TclUtfToUniChar(src + bytes, &ch);
+	if ((ch >= 0xD800) && (len < 3)) {
+	    len += TclUtfToUniChar(src + len, &ch);
 	    /* Combine surrogates */
 	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
@@ -988,13 +991,13 @@ Tcl_UtfToLower(
 	 * char to dst if its size is <= the original char.
 	 */
 
-	if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
-	    memcpy(dst, src, (size_t) bytes);
-	    dst += bytes;
+	if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
+	    memcpy(dst, src, len);
+	    dst += len;
 	} else {
 	    dst += Tcl_UniCharToUtf(lowChar, dst);
 	}
-	src += bytes;
+	src += len;
     }
     *dst = '\0';
     return (dst - str);
@@ -1026,7 +1029,7 @@ Tcl_UtfToTitle(
     Tcl_UniChar ch = 0;
     int titleChar, lowChar;
     char *src, *dst;
-    int bytes;
+    int len;
 
     /*
      * Capitalize the first character and then lowercase the rest of the
@@ -1036,31 +1039,31 @@ Tcl_UtfToTitle(
     src = dst = str;
 
     if (*src) {
-	bytes = TclUtfToUniChar(src, &ch);
+	len = TclUtfToUniChar(src, &ch);
 	titleChar = ch;
 #if TCL_UTF_MAX <= 4
-	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
-	    bytes += TclUtfToUniChar(src + bytes, &ch);
+	if ((ch >= 0xD800) && (len < 3)) {
+	    len += TclUtfToUniChar(src + len, &ch);
 	    /* Combine surrogates */
 	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
 #endif
 	titleChar = Tcl_UniCharToTitle(titleChar);
 
-	if ((bytes < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) {
-	    memcpy(dst, src, (size_t) bytes);
-	    dst += bytes;
+	if ((len < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) {
+	    memcpy(dst, src, len);
+	    dst += len;
 	} else {
 	    dst += Tcl_UniCharToUtf(titleChar, dst);
 	}
-	src += bytes;
+	src += len;
     }
     while (*src) {
-	bytes = TclUtfToUniChar(src, &ch);
+	len = TclUtfToUniChar(src, &ch);
 	lowChar = ch;
 #if TCL_UTF_MAX <= 4
-	if ((bytes == 1) && ((ch & 0xFC00) == 0xD800)) {
-	    bytes += TclUtfToUniChar(src + bytes, &ch);
+	if ((ch >= 0xD800) && (len < 3)) {
+	    len += TclUtfToUniChar(src + len, &ch);
 	    /* Combine surrogates */
 	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
 	}
@@ -1070,13 +1073,13 @@ Tcl_UtfToTitle(
 	    lowChar = Tcl_UniCharToLower(lowChar);
 	}
 
-	if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
-	    memcpy(dst, src, (size_t) bytes);
-	    dst += bytes;
+	if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
+	    memcpy(dst, src, len);
+	    dst += len;
 	} else {
 	    dst += Tcl_UniCharToUtf(lowChar, dst);
 	}
-	src += bytes;
+	src += len;
     }
     *dst = '\0';
     return (dst - str);
diff --git a/generic/tclZipfs.c b/generic/tclZipfs.c
index d02a2da..64a12a3 100644
--- a/generic/tclZipfs.c
+++ b/generic/tclZipfs.c
@@ -3155,21 +3155,6 @@ ZipFSListObjCmd(
 
 #ifdef _WIN32
 #define LIBRARY_SIZE	    64
-
-static inline int
-WCharToUtf(
-    const WCHAR *wSrc,
-    char *dst)
-{
-    char *start = dst;
-
-    while (*wSrc != '\0') {
-	dst += Tcl_UniCharToUtf(*wSrc, dst);
-	wSrc++;
-    }
-    *dst = '\0';
-    return (int) (dst - start);
-}
 #endif /* _WIN32 */
 
 Tcl_Obj *
@@ -3213,11 +3198,8 @@ TclZipfs_TclLibrary(void)
 
 #if defined(_WIN32)
     hModule = TclWinGetTclInstance();
-    if (GetModuleFileNameW(hModule, wName, MAX_PATH) == 0) {
-	GetModuleFileNameA(hModule, dllName, MAX_PATH);
-    } else {
-	WCharToUtf(wName, dllName);
-    }
+    GetModuleFileNameW(hModule, wName, MAX_PATH);
+    WideCharToMultiByte(CP_UTF8, 0, wName, -1, dllName, sizeof(dllName), NULL, NULL);
 
     if (ZipfsAppHookFindTclInit(dllName) == TCL_OK) {
 	return Tcl_NewStringObj(zipfs_literal_tcl_library, -1);
diff --git a/win/tclWin32Dll.c b/win/tclWin32Dll.c
index ddfa0d6..c39d2c1 100644
--- a/win/tclWin32Dll.c
+++ b/win/tclWin32Dll.c
@@ -567,15 +567,19 @@ Tcl_WinTCharToUtf(
     wEnd = (TCHAR *)string + len;
     for (w = (TCHAR *)string; w < wEnd; ) {
 	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
-	    /* Special case for handling upper surrogates. */
+	    /* Special case for handling high surrogates. */
 	    p += Tcl_UniCharToUtf(-1, p);
 	}
 	blen = Tcl_UniCharToUtf(*w, p);
 	p += blen;
+	if ((*w >= 0xD800) && (blen < 3)) {
+	    /* Indication that high surrogate is handled */
+	    blen = 0;
+	}
 	w++;
     }
     if (!blen) {
-	/* Special case for handling upper surrogates. */
+	/* Special case for handling high surrogates. */
 	p += Tcl_UniCharToUtf(-1, p);
     }
     Tcl_DStringSetLength(dsPtr, oldLength + (p - result));
-- 
cgit v0.12


From 57d9952ece8f81fc6802097bace965a196bb849b Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Mon, 25 Feb 2019 21:10:33 +0000
Subject: Finish the complete fix; all corner cases are correct now. Also fix
 the spurious UTF-8 test-case failure (as seen on Travis).

---
 generic/tclBinary.c     |  4 ++--
 generic/tclCmdMZ.c      |  4 ++--
 generic/tclCompCmdsSZ.c |  2 +-
 generic/tclCompile.c    |  4 ++--
 generic/tclExecute.c    |  4 ++--
 generic/tclParse.c      |  4 ++--
 generic/tclScan.c       |  2 +-
 generic/tclUtf.c        | 38 ++++++++++++++++++++------------------
 generic/tclUtil.c       |  2 +-
 9 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/generic/tclBinary.c b/generic/tclBinary.c
index 677213e..3590af4 100644
--- a/generic/tclBinary.c
+++ b/generic/tclBinary.c
@@ -1354,7 +1354,7 @@ BinaryFormatCmd(
  badField:
     {
 	Tcl_UniChar ch = 0;
-	char buf[TCL_UTF_MAX + 1];
+	char buf[TCL_UTF_MAX + 1] = "";
 
 	TclUtfToUniChar(errorString, &ch);
 	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
@@ -1724,7 +1724,7 @@ BinaryScanCmd(
  badField:
     {
 	Tcl_UniChar ch = 0;
-	char buf[TCL_UTF_MAX + 1];
+	char buf[TCL_UTF_MAX + 1] = "";
 
 	TclUtfToUniChar(errorString, &ch);
 	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index a289a5c..38689fd 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1444,11 +1444,11 @@ StringIndexCmd(
 
 	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
 	} else {
-	    char buf[4];
+	    char buf[TCL_UTF_MAX] = "";
 
 	    length = Tcl_UniCharToUtf(ch, buf);
 	    if ((ch >= 0xD800) && (length < 3)) {
-		length = Tcl_UniCharToUtf(-1, buf + length);
+		length += Tcl_UniCharToUtf(-1, buf + length);
 	    }
 	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
 	}
diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c
index daab0d5..b97121e 100644
--- a/generic/tclCompCmdsSZ.c
+++ b/generic/tclCompCmdsSZ.c
@@ -1502,7 +1502,7 @@ TclSubstCompile(
     for (endTokenPtr = tokenPtr + parse.numTokens;
 	    tokenPtr < endTokenPtr; tokenPtr = TokenAfter(tokenPtr)) {
 	int length, literal, catchRange, breakJump;
-	char buf[TCL_UTF_MAX];
+	char buf[TCL_UTF_MAX] = "";
 	JumpFixup startFixup, okFixup, returnFixup, breakFixup;
 	JumpFixup continueFixup, otherFixup, endFixup;
 
diff --git a/generic/tclCompile.c b/generic/tclCompile.c
index f6e6b81..d940ff7 100644
--- a/generic/tclCompile.c
+++ b/generic/tclCompile.c
@@ -1744,7 +1744,7 @@ TclWordKnownAtCompileTime(
 
 	case TCL_TOKEN_BS:
 	    if (tempPtr != NULL) {
-		char utfBuf[TCL_UTF_MAX];
+		char utfBuf[TCL_UTF_MAX] = "";
 		int length = TclParseBackslash(tokenPtr->start,
 			tokenPtr->size, NULL, utfBuf);
 
@@ -2358,7 +2358,7 @@ TclCompileTokens(
 {
     Tcl_DString textBuffer;	/* Holds concatenated chars from adjacent
 				 * TCL_TOKEN_TEXT, TCL_TOKEN_BS tokens. */
-    char buffer[TCL_UTF_MAX];
+    char buffer[TCL_UTF_MAX] = "";
     int i, numObjsToConcat, length, adjust;
     unsigned char *entryCodeNext = envPtr->codeNext;
 #define NUM_STATIC_POS 20
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index 3ae5571..78012f0 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -5215,7 +5215,7 @@ TEBCresume(
 	    objResultPtr = Tcl_NewStringObj((const char *)
 		    valuePtr->bytes+index, 1);
 	} else {
-	    char buf[4];
+	    char buf[TCL_UTF_MAX] = "";
 	    int ch = Tcl_GetUniChar(valuePtr, index);
 
 	    /*
@@ -5228,7 +5228,7 @@ TEBCresume(
 	    } else {
 		length = Tcl_UniCharToUtf(ch, buf);
 		if ((ch >= 0xD800) && (length < 3)) {
-		    length = Tcl_UniCharToUtf(-1, buf + length);
+		    length += Tcl_UniCharToUtf(-1, buf + length);
 		}
 		objResultPtr = Tcl_NewStringObj(buf, length);
 	    }
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 8d07f7f..c791585 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -791,7 +791,7 @@ TclParseBackslash(
     Tcl_UniChar unichar = 0;
     int result;
     int count;
-    char buf[TCL_UTF_MAX];
+    char buf[TCL_UTF_MAX] = "";
 
     if (numBytes == 0) {
 	if (readPtr != NULL) {
@@ -2151,7 +2151,7 @@ TclSubstTokens(
 	Tcl_Obj *appendObj = NULL;
 	const char *append = NULL;
 	int appendByteLength = 0;
-	char utfCharBytes[TCL_UTF_MAX];
+	char utfCharBytes[TCL_UTF_MAX] = "";
 
 	switch (tokenPtr->type) {
 	case TCL_TOKEN_TEXT:
diff --git a/generic/tclScan.c b/generic/tclScan.c
index 21ad953..acf1a58 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -261,7 +261,7 @@ ValidateFormat(
     Tcl_UniChar ch = 0;
     int objIndex, xpgSize, nspace = numVars;
     int *nassign = TclStackAlloc(interp, nspace * sizeof(int));
-    char buf[TCL_UTF_MAX+1];
+    char buf[TCL_UTF_MAX+1] = "";
     Tcl_Obj *errorMsg;		/* Place to build an error messages. Note that
 				 * these are messy operations because we do
 				 * not want to use the formatting engine;
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6b63ecb..67c0b08 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -145,12 +145,11 @@ Tcl_UniCharToUtf(
 	    if ((ch & 0xF800) == 0xD800) {
 		if (ch & 0x0400) {
 		    /* Low surrogate */
-		    if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
-			    && ((buf[2] & 0xCF) == 0)) {
+		    if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {
 			/* Previous Tcl_UniChar was a high surrogate, so combine */
-			buf[3] = (char) ((ch & 0x3F) | 0x80);
-			buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80);
-			return 4;
+			buf[2] = (char) ((ch & 0x3F) | 0x80);
+			buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80);
+			return 3;
 		    }
 		    /* Previous Tcl_UniChar was not a high surrogate, so just output */
 		} else {
@@ -161,7 +160,7 @@ Tcl_UniCharToUtf(
 		    buf[2] = (char) ((ch << 4) & 0x30);
 		    buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
 		    buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
-		    return 0;
+		    return 1;
 		}
 	    }
 	    goto three;
@@ -174,11 +173,14 @@ Tcl_UniCharToUtf(
 	    return 4;
 	}
     } else if (ch == -1) {
-	if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
-		&& ((buf[2] & 0xCF) == 0)) {
-	    ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2)
-		    + ((buf[2] & 0x30) >> 4);
-	    goto three;
+	if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)
+		&& ((buf[-1] & 0xF8) == 0xF0)) {
+	    ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2)
+		    + ((buf[1] & 0x30) >> 4);
+	    buf[1] = (char) ((ch | 0x80) & 0xBF);
+	    buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	    buf[-1] = (char) ((ch >> 12) | 0xE0);
+	    return 2;
 	}
     }
 
@@ -302,7 +304,7 @@ Tcl_UtfToUniChar(
     Tcl_UniChar byte;
 
     /*
-     * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
+     * Unroll 1 to 4 byte UTF-8 sequences.
      */
 
     byte = *((unsigned char *) src);
@@ -375,13 +377,13 @@ Tcl_UtfToUniChar(
 	     * Four-byte-character lead byte followed by three trail bytes.
 	     */
 #if TCL_UTF_MAX <= 4
-	    byte = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+	    Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
 		    | ((src[2] & 0x3F) >> 4)) - 0x40;
-	    if (byte >= 0x400) {
+	    if (high >= 0x400) {
 		/* out of range, < 0x10000 or > 0x10ffff */
 	    } else {
 		/* produce high surrogate, advance source pointer */
-		*chPtr = 0xD800 + byte;
+		*chPtr = 0xD800 + high;
 		return 1;
 	    }
 #else
@@ -778,8 +780,8 @@ Tcl_UniCharAtIndex(
     fullchar = ch;
 #if TCL_UTF_MAX <= 4
     if ((ch >= 0xD800) && (len < 3)) {
-	/* If last Tcl_UniChar was an high surrogate, combine with low surrogate */
-	(void)TclUtfToUniChar(src + len, &ch);
+	/* If last Tcl_UniChar was a high surrogate, combine with low surrogate */
+	(void)TclUtfToUniChar(src, &ch);
 	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
     }
 #endif
@@ -819,7 +821,7 @@ Tcl_UtfAtIndex(
     }
 #if TCL_UTF_MAX <= 4
     if ((ch >= 0xD800) && (len < 3)) {
-	/* Index points at character following High Surrogate */
+	/* Index points at character following high surrogate */
 	src += TclUtfToUniChar(src, &ch);
     }
 #endif
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 3d4298e..4590e8f 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -1654,7 +1654,7 @@ Tcl_Backslash(
     int *readPtr)		/* Fill in with number of characters read from
 				 * src, unless NULL. */
 {
-    char buf[TCL_UTF_MAX];
+    char buf[TCL_UTF_MAX] = "";
     Tcl_UniChar ch = 0;
 
     Tcl_UtfBackslash(src, readPtr, buf);
-- 
cgit v0.12