Merge 8.6. This mainly introduces the overlong check into Tcl_UtfPrev(). The results of 10 test cases changed, all of them due to the Tcl_UtfPrev() improvement. Tcl_UtfNext() is not affected: the previous implementation was based on Tcl_UtfToUniChar(), which already did this check.

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-04-24 10:23:33 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-04-24 10:23:33 (GMT)
commit: da116b95a31241dc3abb6eed7d8f8eb3a9487232 (patch)
tree: 3b98b64461c97646bdbd3e8235aad6b7bf1bb2c3 /generic/tclUtf.c
parent: b48319b304980c06ca5dd093770f8234eb8dbec5 (diff)
parent: 2ca7ab9af0d59c9907dde3d844e1785d33df4812 (diff)
download: tcl-da116b95a31241dc3abb6eed7d8f8eb3a9487232.zip
tcl-da116b95a31241dc3abb6eed7d8f8eb3a9487232.tar.gz
tcl-da116b95a31241dc3abb6eed7d8f8eb3a9487232.tar.bz2
1 files changed, 152 insertions, 27 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6adfba9..20b0ebe 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -90,6 +90,12 @@ static const unsigned char complete[256] = {
 #endif
     1,1,1,1,1,1,1,1,1,1,1
 };
+
+/*
+ * Functions used only in this module.
+ */
+
+static int		Invalid(unsigned char *src);
 
 /*
  *---------------------------------------------------------------------------
@@ -122,7 +128,56 @@ TclUtfCount(
     }
     return 3;
 }
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Invalid --
+ *
+ *	Utility routine to report whether /src/ points to the start of an
+ *	invalid byte sequence that should be rejected. This might be because
+ *	it is an overlong encoding, or because it encodes something out of
+ *	the proper range. Caller guarantees that src[0] and src[1] are
+ *	readable, and
+ *
+ *	(src[0] >= 0xC0) && (src[0] != 0xC1)
+ * 	(src[1] >= 0x80) && (src[1] < 0xC0)
+ *	(src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0))
+ *
+ * Results:
+ *	A boolean.
+ *---------------------------------------------------------------------------
+ */
+
+static const unsigned char bounds[28] = {
+    0x80, 0x80,		/* \xC0 accepts \x80 only */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
+    0x80, 0xBF,		/* (\xC4 - \xDC) -- all sequences valid */
+    0xA0, 0xBF,	/* \xE0\x80 through \xE0\x9F are invalid prefixes */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */
+    0x90, 0xBF,	/* \xF0\x80 through \xF0\x8F are invalid prefixes */
+    0x80, 0x8F  /* \xF4\x90 and higher are invalid prefixes */
+};
 
+INLINE static int
+Invalid(
+    unsigned char *src)	/* Points to lead byte of a UTF-8 byte sequence */
+{
+    unsigned char byte = *src;
+    int index;
+
+    /*
+     * bounds[] only has entries for the lead bytes 0xC0, 0xC4, ... 0xF4.
+     * Any other byte must never index the table (Tcl_UtfNext() calls this
+     * without checking the lead byte's range), so report it as not invalid.
+     */
+    if (byte < 0xC0 || byte > 0xF4 || (byte & 0x03)) {
+	return 0;
+    }
+    index = (byte - 0xC0) >> 1;
+    return (src[1] < bounds[index]) || (src[1] > bounds[index+1]);
+}
+
 /*
  *---------------------------------------------------------------------------
  *
@@ -860,7 +915,7 @@ Tcl_UtfFindLast(
  *
  * Tcl_UtfNext --
  *
- * 	Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
+ *	Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
  *	returns a pointer to the next UTF-8 character in the string.
  *	The caller must not ask for the next character after the last
  *	character in the string if the string is not terminated by a null
@@ -880,8 +935,8 @@ const char *
 Tcl_UtfNext(
     const char *src)		/* The current location in the string. */
 {
-    Tcl_UniChar ch = 0;
-    int len;
+    int left = totalBytes[UCHAR(*src)];
+    const char *next = src + 1;
 
     if (((*src) & 0xC0) == 0x80) {
 	if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
@@ -889,14 +944,22 @@ Tcl_UtfNext(
 	}
 	return src;
     }
-    len = TclUtfToUniChar(src, &ch);
 
-#if TCL_UTF_MAX <= 3
-    if ((ch >= 0xD800) && (len < 3)) {
-	len += TclUtfToUniChar(src + len, &ch);
+    while (--left) {
+	if ((*next & 0xC0) != 0x80) {
+	    /*
+	     * next points to non-trail byte; We ran out of trail bytes
+	     * before the needs of the lead byte were satisfied.
+	     * Let the (malformed) lead byte alone be a character
+	     */
+	    return src + 1;
+	}
+	next++;
     }
-#endif
-    return src + len;
+    if (Invalid((unsigned char *)src)) {
+	return src + 1;
+    }
+    return next;
 }
 
 /*
@@ -953,30 +1016,92 @@ Tcl_UtfPrev(
     const char *src,		/* A location in a UTF-8 string. */
     const char *start)		/* Pointer to the beginning of the string */
 {
-    const char *look;
-    int i, byte;
-
-    look = --src;
-    for (i = 0; i < 4; i++) {
-	if (look < start) {
-	    if (src < start) {
-		src = start;
-	    }
-	    break;
-	}
-	byte = *((unsigned char *) look);
+    int trailBytesSeen = 0;	/* How many trail bytes have been verified? */
+    CONST char *fallback = src - 1;
+				/* If we cannot find a lead byte that might
+				 * start a prefix of a valid UTF byte sequence,
+				 * we will fallback to a one-byte back step */
+    unsigned char *look = (unsigned char *)fallback;
+				/* Start search at the fallback position */
+
+    /* Quick boundary case exit. */
+    if (fallback <= start) {
+	return start;
+    }
+
+    do {
+	unsigned char byte = look[0];
+
 	if (byte < 0x80) {
-	    break;
+	    /*
+	     * Single byte character. Either this is a correct previous
+	     * character, or it is followed by at least one trail byte
+	     * which indicates a malformed sequence. In either case the
+	     * correct result is to return the fallback.
+	     */
+	    return fallback;
 	}
 	if (byte >= 0xC0) {
-	    if (totalBytes[byte] <= i) {
-		break;
+	    /* Non-trail byte; May be multibyte lead. */
+
+	    if ((trailBytesSeen == 0)
+		/*
+		 * We've seen no trailing context to use to check
+		 * anything. From what we know, this non-trail byte
+		 * is a prefix of a previous character, and accepting
+		 * it (the fallback) is correct.
+		 */
+
+		    || (trailBytesSeen >= totalBytes[byte])) {
+		/*
+		 * That is, (1 + trailBytesSeen > needed).
+		 * We've examined more bytes than needed to complete
+		 * this lead byte. No matter about well-formedness or
+		 * validity, the sequence starting with this lead byte
+		 * will never include the fallback location, so we must
+		 * return the fallback location. See test utf-7.17
+		 */
+		return fallback;
 	    }
-	    return look;
+
+	    /*
+	     * trailBytesSeen > 0, so we can examine look[1] safely.
+	     * Use that capability to screen out overlong sequences.
+	     */
+
+	    if (Invalid(look)) {
+		/* Reject */
+		return fallback;
+	    }
+	    return (CONST char *)look;
+	}
+
+	/* We saw a trail byte. */
+	trailBytesSeen++;
+
+	if ((CONST char *)look == start) {
+	    /*
+	     * Do not read before the start of the string
+	     *
+	     * If we get here, we've examined bytes at every location
+	     * >= start and < src and all of them are trail bytes,
+	     * including (*start).  We need to return our fallback
+	     * and exit this loop before we run past the start of the string.
+	     */
+	    return fallback;
 	}
+
+	/* Continue the search backwards... */
 	look--;
-    }
-    return src;
+    } while (trailBytesSeen < 4);
+
+    /*
+     * We've seen 4 consecutive trail bytes (the loop limit above) without
+     * finding a lead byte. No well-formed byte sequence has that many
+     * trail bytes, so there is nothing more to find and we stop looking,
+     * accepting the fallback.
+     */
+    return fallback;
 }
 
 /*
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-04-24 10:23:33 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-04-24 10:23:33 (GMT)
commit	da116b95a31241dc3abb6eed7d8f8eb3a9487232 (patch)
tree	3b98b64461c97646bdbd3e8235aad6b7bf1bb2c3 /generic/tclUtf.c
parent	b48319b304980c06ca5dd093770f8234eb8dbec5 (diff)
parent	2ca7ab9af0d59c9907dde3d844e1785d33df4812 (diff)
download	tcl-da116b95a31241dc3abb6eed7d8f8eb3a9487232.zip tcl-da116b95a31241dc3abb6eed7d8f8eb3a9487232.tar.gz tcl-da116b95a31241dc3abb6eed7d8f8eb3a9487232.tar.bz2