7 files changed, 179 insertions, 46 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 935d42f..d7394fb 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -2529,7 +2529,7 @@ StringStartCmd(
 		break;
 	    }
 
-	    next = Tcl_UtfPrev(p, string);
+	    next = TclUtfPrev(p, string);
 	    do {
 		next += delta;
 		delta = TclUtfToUCS4(next, &ch);
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 8d2e68a..2ff644e 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -4689,6 +4689,14 @@ MODULE_SCOPE const TclFileAttrProcs	tclpFileAttrProcs[];
 	(numChars) = _count; \
     } while (0);
 
+#define TclUtfPrev(src, start) \
+	(((src) < (start) + 2) ? (start) : \
+	((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \
+	Tcl_UtfPrev(src, start))
+
+#define TclUtfNext(src)	\
+	((((unsigned char) *(src)) < 0x80) ? (src) + 1 : Tcl_UtfNext(src))
+
 /*
  *----------------------------------------------------------------
  * Macro that encapsulates the logic that determines when it is safe to
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 2025674..78e49f9 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -1171,10 +1171,10 @@ Tcl_AppendLimitedToObj(
 	}
 	eLen = strlen(ellipsis);
 	while (eLen > limit) {
-	    eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis;
+	    eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis;
 	}
 
-	toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes;
+	toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes;
     }
 
     /*
@@ -2614,7 +2614,7 @@ AppendPrintfToObjVA(
 		 * multi-byte characters.
 		 */
 
-		q = Tcl_UtfPrev(end, bytes);
+		q = TclUtfPrev(end, bytes);
 		if (!Tcl_UtfCharComplete(q, (int)(end - q))) {
 		    end = q;
 		}
diff --git a/generic/tclTest.c b/generic/tclTest.c
index 008fdb7..19de8ee 100644
--- a/generic/tclTest.c
+++ b/generic/tclTest.c
@@ -6849,10 +6849,10 @@ TestUtfNextCmd(
     memcpy(buffer + 1, bytes, numBytes);
     buffer[0] = buffer[numBytes + 1] = '\x00';
 
-    first = Tcl_UtfNext(buffer + 1);
+    first = TclUtfNext(buffer + 1);
     while ((buffer[0] = *p++) != '\0') {
 	/* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */
-	result = Tcl_UtfNext(buffer + 1);
+	result = TclUtfNext(buffer + 1);
 	if (first != result) {
 	    Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL);
 	    return TCL_ERROR;
@@ -6908,7 +6908,7 @@ TestUtfPrevCmd(
     bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1);
     bytes[numBytes] = '\0';
 
-    result = Tcl_UtfPrev(bytes + offset, bytes);
+    result = TclUtfPrev(bytes + offset, bytes);
     Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes));
 
     Tcl_DecrRefCount(copy);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6adfba9..20b0ebe 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -90,6 +90,12 @@ static const unsigned char complete[256] = {
 #endif
     1,1,1,1,1,1,1,1,1,1,1
 };
+
+/*
+ * Functions used only in this module.
+ */
+
+static int		Invalid(unsigned char *src);
 
 /*
  *---------------------------------------------------------------------------
@@ -122,7 +128,56 @@ TclUtfCount(
     }
     return 3;
 }
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Invalid --
+ *
+ *	Utility routine to report whether /src/ points to the start of an
+ *	invald byte sequence that should be rejected. This might be because
+ *	it is an overlong encoding, or because it encodes something out of
+ *	the proper range. Caller guarantees that src[0] and src[1] are
+ *	readable, and
+ *
+ *	(src[0] >= 0xC0) && (src[0] != 0xC1)
+ * 	(src[1] >= 0x80) && (src[1] < 0xC0)
+ *	(src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0))
+ *
+ * Results:
+ *	A boolean.
+ *---------------------------------------------------------------------------
+ */
+
+static const unsigned char bounds[28] = {
+    0x80, 0x80,		/* \xC0 accepts \x80 only */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
+    0x80, 0xBF,		/* (\xC4 - \xDC) -- all sequences valid */
+    0xA0, 0xBF,	/* \xE0\x80 through \xE0\x9F are invalid prefixes */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */
+    0x90, 0xBF,	/* \xF0\x80 through \xF0\x8F are invalid prefixes */
+    0x80, 0x8F  /* \xF4\x90 and higher are invalid prefixes */
+};
 
+INLINE static int
+Invalid(
+    unsigned char *src)	/* Points to lead byte of a UTF-8 byte sequence */
+{
+    unsigned char byte = *src;
+    int index;
+
+    if (byte % 0x04) {
+	/* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */
+	return 0;
+    }
+    index = (byte - 0xC0) >> 1;
+    if (src[1] < bounds[index] || src[1] > bounds[index+1]) {
+	/* Out of bounds - report invalid. */
+	return 1;
+    }
+    return 0;
+}
+
 /*
  *---------------------------------------------------------------------------
  *
@@ -860,7 +915,7 @@ Tcl_UtfFindLast(
  *
  * Tcl_UtfNext --
  *
- * 	Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
+ *	Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
  *	returns a pointer to the next UTF-8 character in the string.
  *	The caller must not ask for the next character after the last
  *	character in the string if the string is not terminated by a null
@@ -880,8 +935,8 @@ const char *
 Tcl_UtfNext(
     const char *src)		/* The current location in the string. */
 {
-    Tcl_UniChar ch = 0;
-    int len;
+    int left = totalBytes[UCHAR(*src)];
+    const char *next = src + 1;
 
     if (((*src) & 0xC0) == 0x80) {
 	if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
@@ -889,14 +944,22 @@ Tcl_UtfNext(
 	}
 	return src;
     }
-    len = TclUtfToUniChar(src, &ch);
 
-#if TCL_UTF_MAX <= 3
-    if ((ch >= 0xD800) && (len < 3)) {
-	len += TclUtfToUniChar(src + len, &ch);
+    while (--left) {
+	if ((*next & 0xC0) != 0x80) {
+	    /*
+	     * src points to non-trail byte; We ran out of trail bytes
+	     * before the needs of the lead byte were satisfied.
+	     * Let the (malformed) lead byte alone be a character
+	     */
+	    return src + 1;
+	}
+	next++;
     }
-#endif
-    return src + len;
+    if (Invalid((unsigned char *)src)) {
+	return src + 1;
+    }
+    return next;
 }
 
 /*
@@ -953,30 +1016,92 @@ Tcl_UtfPrev(
     const char *src,		/* A location in a UTF-8 string. */
     const char *start)		/* Pointer to the beginning of the string */
 {
-    const char *look;
-    int i, byte;
-
-    look = --src;
-    for (i = 0; i < 4; i++) {
-	if (look < start) {
-	    if (src < start) {
-		src = start;
-	    }
-	    break;
-	}
-	byte = *((unsigned char *) look);
+    int trailBytesSeen = 0;	/* How many trail bytes have been verified? */
+    CONST char *fallback = src - 1;
+				/* If we cannot find a lead byte that might
+				 * start a prefix of a valid UTF byte sequence,
+				 * we will fallback to a one-byte back step */
+    unsigned char *look = (unsigned char *)fallback;
+				/* Start search at the fallback position */
+
+    /* Quick boundary case exit. */
+    if (fallback <= start) {
+	return start;
+    }
+
+    do {
+	unsigned char byte = look[0];
+
 	if (byte < 0x80) {
-	    break;
+	    /*
+	     * Single byte character. Either this is a correct previous
+	     * character, or it is followed by at least one trail byte
+	     * which indicates a malformed sequence. In either case the
+	     * correct result is to return the fallback.
+	     */
+	    return fallback;
 	}
 	if (byte >= 0xC0) {
-	    if (totalBytes[byte] <= i) {
-		break;
+	    /* Non-trail byte; May be multibyte lead. */
+
+	    if ((trailBytesSeen == 0)
+		/*
+		 * We've seen no trailing context to use to check
+		 * anything. From what we know, this non-trail byte
+		 * is a prefix of a previous character, and accepting
+		 * it (the fallback) is correct.
+		 */
+
+		    || (trailBytesSeen >= totalBytes[byte])) {
+		/*
+		 * That is, (1 + trailBytesSeen > needed).
+		 * We've examined more bytes than needed to complete
+		 * this lead byte. No matter about well-formedness or
+		 * validity, the sequence starting with this lead byte
+		 * will never include the fallback location, so we must
+		 * return the fallback location. See test utf-7.17
+		 */
+		return fallback;
 	    }
-	    return look;
+
+	    /*
+	     * trailBytesSeen > 0, so we can examine look[1] safely.
+	     * Use that capability to screen out overlong sequences.
+	     */
+
+	    if (Invalid(look)) {
+		/* Reject */
+		return fallback;
+	    }
+	    return (CONST char *)look;
+	}
+
+	/* We saw a trail byte. */
+	trailBytesSeen++;
+
+	if ((CONST char *)look == start) {
+	    /*
+	     * Do not read before the start of the string
+	     *
+	     * If we get here, we've examined bytes at every location
+	     * >= start and < src and all of them are trail bytes,
+	     * including (*start).  We need to return our fallback
+	     * and exit this loop before we run past the start of the string.
+	     */
+	    return fallback;
 	}
+
+	/* Continue the search backwards... */
 	look--;
-    }
-    return src;
+    } while (trailBytesSeen < 4);
+
+    /*
+     * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
+     * properly formed byte sequence to find, and we can stop looking,
+     * accepting the fallback (for TCL_UTF_MAX > 3) or just go back as
+     * far as we can.
+     */
+    return fallback;
 }
 
 /*
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index b48ce30..dd527dc 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -1707,7 +1707,7 @@ TclTrimRight(
 	const char *q = trim;
 	int pInc = 0, bytesLeft = numTrim;
 
-	pp = Tcl_UtfPrev(p, bytes);
+	pp = TclUtfPrev(p, bytes);
 	do {
 	    pp += pInc;
  	    pInc = TclUtfToUCS4(pp, &ch1);
@@ -1858,7 +1858,7 @@ TclTrim(
 	 * that we will not trim. Skip over it. */
 	if (numBytes > 0) {
 	    const char *first = bytes + trimLeft;
-	    bytes = Tcl_UtfNext(first);
+	    bytes = TclUtfNext(first);
 	    numBytes -= (bytes - first);
 
 	    if (numBytes > 0) {
diff --git a/tests/utf.test b/tests/utf.test
index 79f866a..885b057 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -600,19 +600,19 @@ test utf-7.23 {Tcl_UtfPrev} testutfprev {
 } 4
 test utf-7.24 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xC0\x81
-} 1
+} 2
 test utf-7.25 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xC0\x81 2
 } 1
 test utf-7.26 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xE0\x80\x80
-} 1
+} 3
 test utf-7.27 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xE0\x80
-} 1
+} 2
 test utf-7.27.1 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xE0\x80\x80 3
-} 1
+} 2
 test utf-7.28 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xE0
 } 1
@@ -621,13 +621,13 @@ test utf-7.28.1 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
 } 1
 test utf-7.29 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xF0\x80\x80\x80
-} 1
+} 4
 test utf-7.30 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xF0\x80\x80\x80 4
-} 1
+} 3
 test utf-7.31 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xF0\x80\x80\x80 3
-} 1
+} 2
 test utf-7.32 {Tcl_UtfPrev -- overlong sequence}  testutfprev {
     testutfprev A\xF0\x80\x80\x80 2
 } 1
@@ -696,13 +696,13 @@ test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
 } 1
 test utf-7.49 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
     testutfprev A\xF4\x90\x80\x80
-} 1
+} 4
 test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
     testutfprev A\xF4\x90\x80\x80 4
-} 1
+} 3
 test utf-7.49.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
     testutfprev A\xF4\x90\x80\x80 3
-} 1
+} 2
 test utf-7.49.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
     testutfprev A\xF4\x90\x80\x80 2
 } 1