summary | refs | log | tree | commit | diff | stats
path: root/generic
diff options
context:
space:
mode:
author dgp <dgp@users.sourceforge.net> 2020-05-06 19:48:34 (GMT)
committer dgp <dgp@users.sourceforge.net> 2020-05-06 19:48:34 (GMT)
commit 289b43aefdea045540bf95afbd07b55ec9c8a16e (patch)
tree a71231c3acbb0fc376aa0e2344dd0ec00d944ea5 /generic
parent 1a11b74770f279cecda0b4a31214558b9a78c229 (diff)
parent 50fa02d30ebf14022416a55dc235e7e40d3c146b (diff)
download tcl-289b43aefdea045540bf95afbd07b55ec9c8a16e.zip
tcl-289b43aefdea045540bf95afbd07b55ec9c8a16e.tar.gz
tcl-289b43aefdea045540bf95afbd07b55ec9c8a16e.tar.bz2
merge 8.6
Diffstat (limited to 'generic')
-rw-r--r-- generic/tclUtf.c 37
1 files changed, 26 insertions, 11 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ac6f0d8..0d78f03 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -134,15 +134,23 @@ TclUtfCount(
*
* Invalid --
*
- * Utility routine to report whether /src/ points to the start of an
- * invald byte sequence that should be rejected. This might be because
- * it is an overlong encoding, or because it encodes something out of
- * the proper range. Caller guarantees that src[0] and src[1] are
- * readable, and
+ * Given a pointer to a two-byte prefix of a well-formed UTF-8 byte
+ * sequence (a lead byte followed by a trail byte) this routine
+ * examines those two bytes to determine whether the sequence is
+ * invalid in UTF-8. This might be because it is an overlong
+ * encoding, or because it encodes something out of the proper range.
*
- * (src[0] >= 0xC0) && (src[0] != 0xC1)
- * (src[1] >= 0x80) && (src[1] < 0xC0)
- * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0))
+ * Given a pointer to the bytes \xF8 or \xFC, this routine will
+ * try to read beyond the end of the "bounds" table. Callers must
+ * prevent this.
+ *
+ * Given a pointer to something else (an ASCII byte, a trail byte,
+ * or another byte that can never begin a valid byte sequence such
+ * as \xF5) this routine returns false. That makes the routine poorly
+ * named, as it does not detect and report all invalid sequences.
+ *
+ * Callers have to take care that this routine does something useful
+ * for their needs.
*
* Results:
* A boolean.
@@ -945,9 +953,6 @@ Tcl_UtfNext(
return src;
}
- if (Invalid(src)) {
- return src + 1;
- }
left = totalBytes[UCHAR(*src)];
next = src + 1;
while (--left) {
@@ -961,6 +966,16 @@ Tcl_UtfNext(
}
next++;
}
+ /*
+ * Call Invalid() here only if the required conditions are met:
+ * src[0] is known to be a lead byte.
+ * src[1] is known to be a trail byte.
+ * Especially important to prevent calls when src[0] == '\xF8' or '\xFC'.
+ * See tests utf-6.37 through utf-6.43; run them under valgrind or a similar tool.
+ */
+ if ((next == src + 1) || Invalid(src)) {
+ return src + 1;
+ }
return next;
}