From 3b2cb79ab18e9506dfb66e2786a7ee04dd30a780 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 30 Aug 2016 13:00:32 +0000 Subject: Don't ever allow UTF-8 sequences of more than 4 characters to be generated or parsed, even when TCL_UTF_MAX>4: According to current Unicode standard, a byte string of >4 characters can never form a single UTF-8 character. And a few minor micro-optimizations related to UTF-8 handling. --- generic/tclUtf.c | 68 ++++++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b878149..68119a4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -73,16 +73,7 @@ static const unsigned char totalBytes[256] = { #else 1,1,1,1,1,1,1,1, #endif -#if TCL_UTF_MAX > 4 - 5,5,5,5, -#else - 1,1,1,1, -#endif -#if TCL_UTF_MAX > 5 - 6,6,6,6 -#else - 1,1,1,1 -#endif + 1,1,1,1,1,1,1,1 }; /* @@ -111,14 +102,14 @@ INLINE static int UtfCount( int ch) /* The Tcl_UniChar whose size is returned. */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { return 1; } if (ch <= 0x7FF) { return 2; } #if TCL_UTF_MAX > 3 - if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) { + if (((unsigned)(ch - 0x10000) <= 0xfffff)) { return 4; } #endif @@ -152,7 +143,7 @@ Tcl_UniCharToUtf( * large enough to hold the UTF-8 character * (at most TCL_UTF_MAX bytes). */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; } @@ -180,11 +171,7 @@ Tcl_UniCharToUtf( } } #endif - three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); - return 3; + goto three; } #if TCL_UTF_MAX > 3 @@ -199,7 +186,11 @@ Tcl_UniCharToUtf( } ch = 0xFFFD; - goto three; +three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; } /* @@ -314,9 +305,6 @@ Tcl_UtfToUniChar( * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* @@ -332,31 +320,23 @@ Tcl_UtfToUniChar( * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } #if TCL_UTF_MAX > 3 - { - int ch, total, trail; - - total = totalBytes[byte]; - trail = total - 1; - if (trail > 0) { - ch = byte & (0x3F >> trail); - do { - src++; - if ((*src & 0xC0) != 0x80) { - *chPtr = byte; - return 1; - } - ch <<= 6; - ch |= (*src & 0x3F); - trail--; - } while (trail > 0); - *chPtr = ch; - return total; + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + return 4; } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ } #endif -- cgit v0.12