diff options
author | dgp <dgp@users.sourceforge.net> | 2020-04-17 21:07:09 (GMT) |
---|---|---|
committer | dgp <dgp@users.sourceforge.net> | 2020-04-17 21:07:09 (GMT) |
commit | aa9bb7f9e401573bc8c79e8336fdb74636b2702f (patch) | |
tree | ac4693765a447b4d7ae2691194c8977683897119 | |
parent | 4520aa1ca30a7b09dc9cfc4bc9007aa262793711 (diff) | |
download | tcl-aa9bb7f9e401573bc8c79e8336fdb74636b2702f.zip tcl-aa9bb7f9e401573bc8c79e8336fdb74636b2702f.tar.gz tcl-aa9bb7f9e401573bc8c79e8336fdb74636b2702f.tar.bz2 |
[493dccc2de] Revise sequence validity check to reject out of range decodes too.
-rw-r--r-- | generic/tclUtf.c | 53 |
1 files changed, 27 insertions, 26 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b5c430b..1883804 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -81,7 +81,7 @@ static CONST unsigned char totalBytes[256] = { */ static int UtfCount(int ch); -static int Overlong(unsigned char *src); +static int Invalid(unsigned char *src); /* *--------------------------------------------------------------------------- @@ -120,51 +120,52 @@ UtfCount( /* *--------------------------------------------------------------------------- * - * Overlong -- + * Invalid -- * * Utility routine to report whether /src/ points to the start of an - * overlong byte sequence that should be rejected. Caller guarantees - * that src[0] and src[1] are readable, and + * invald byte sequence that should be rejected. This might be because + * it is an overlong encoding, or because it encodes something out of + * the proper range. Caller guarantees that src[0] and src[1] are + * readable, and * * (src[0] >= 0xC0) && (src[0] != 0xC1) * (src[1] >= 0x80) && (src[1] < 0xC0) - * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF8 : 0xF0)) + * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0)) * * Results: * A boolean. *--------------------------------------------------------------------------- */ -static CONST unsigned char overlong[3] = { - 0x80, /* \xD0 -- all sequences valid */ - 0xA0, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ +static CONST unsigned char bounds[28] = { + 0x80, 0x80, /* \xC0 accepts \x80 only */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, + 0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */ + 0xA0, 0xBF, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */ #if TCL_UTF_MAX > 3 - 0x90 /* \xF0\x80 through \xF0\x8F are invalid prefixes */ + 0x90, 0xBF, /* \xF0\x80 through \xF0\x8F are invalid prefixes */ + 0x80, 0x8F /* \xF4\x90 and higher are invalid prefixes */ #else - 0xC0 /* Not used, but reject all again for safety. */ + 0xC0, 0xBF, /* Not used, but reject all again for safety. */ + 0xC0, 0xBF /* Not used, but reject all again for safety. */ #endif }; INLINE static int -Overlong( +Invalid( unsigned char *src) /* Points to lead byte of a UTF-8 byte sequence */ { unsigned char byte = *src; + int index; - if (byte % 0x10) { - /* Only lead bytes 0xC0, 0xE0, 0xF0 need examination */ + if (byte % 0x04) { + /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ return 0; } - if (byte == 0xC0) { - if (src[1] == 0x80) { - /* Valid sequence: \xC0\x80 for \u0000 */ - return 0; - } - /* Reject overlong: \xC0\x81 - \xC0\xBF */ - return 1; - } - if (src[1] < overlong[(byte >> 4) - 0x0D]) { - /* Reject overlong */ + index = (byte - 0xC0) >> 1; + if (src[1] < bounds[index] || src[1] > bounds[index+1]) { + /* Out of bounds - report invalid. */ return 1; } return 0; @@ -733,7 +734,7 @@ Tcl_UtfNext( } next++; } - if (Overlong((unsigned char *)src)) { + if (Invalid((unsigned char *)src)) { return src + 1; } return next; @@ -843,10 +844,10 @@ Tcl_UtfPrev( /* * trailBytesSeen > 0, so we can examine look[1] safely. - * Use that capability to screen out overlong sequences. + * Use that capability to screen out invalid sequences. */ - if (Overlong(look)) { + if (Invalid(look)) { /* Reject */ return fallback; } |