From 30f444a254b28780964a86da83bd6a3d1ca60271 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 6 May 2020 16:58:51 +0000 Subject: The routine Invalid() has been revised to do something different. Update the comments to describe what it does now, and the cautions that callers must take into account. --- generic/tclUtf.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 60e475a..199ff83 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -122,15 +122,23 @@ UtfCount( * * Invalid -- * - * Utility routine to report whether /src/ points to the start of an - * invald byte sequence that should be rejected. This might be because - * it is an overlong encoding, or because it encodes something out of - * the proper range. Caller guarantees that src[0] and src[1] are - * readable, and - * - * (src[0] >= 0xC0) && (src[0] != 0xC1) - * (src[1] >= 0x80) && (src[1] < 0xC0) - * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0)) + * Given a pointer to a two-byte prefix of a well-formed UTF-8 byte + * sequence (a lead byte followed by a trail byte) this routine + * examines those two bytes to determine whether the sequence is + * invalid in UTF-8. This might be because it is an overlong + * encoding, or because it encodes something out of the proper range. + * + * Given a pointer to the bytes \xF8 or \xFC, this routine will + * try to read beyond the end of the "bounds" table. Callers must + * prevent this. + * + * Given a pointer to something else (an ASCII byte, a trail byte, + * or another byte that can never begin a valid byte sequence such + * as \xF5) this routine returns false. That makes the routine poorly + * named, as it does not detect and report all invalid sequences. + * + * Callers have to take care that this routine does something useful + * for their needs. * * Results: * A boolean. 
-- cgit v0.12 From 391bf996873721fdcb8d68003d96121b378d2654 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 6 May 2020 19:22:25 +0000 Subject: Restore safe calls of Invalid(). --- generic/tclUtf.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 199ff83..24ec3d2 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -659,9 +659,6 @@ Tcl_UtfNext( int left; const char *next; - if (Invalid(src)) { - return src + 1; - } left = totalBytes[UCHAR(*src)]; next = src + 1; while (--left) { @@ -675,6 +672,16 @@ Tcl_UtfNext( } next++; } + /* + * Call Invalid() here only if the required conditions are met: + * src[0] is known to be a lead byte. + * src[1] is known to be a trail byte. + * Especially important to prevent calls when src[0] == '\xF8' or '\xFC'. + * Run tests utf-6.37 through utf-6.43 under valgrind or a similar tool. + */ + if ((next == src + 1) || Invalid(src)) { + return src + 1; + } return next; } -- cgit v0.12