diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2016-08-30 13:07:10 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2016-08-30 13:07:10 (GMT) |
commit | 5eb346e0d25b2ae446797b610058911dc3b65a63 (patch) | |
tree | 4c13a17c50a6283243312fe0ed36e17d16d4dfc9 /generic/tclUtf.c | |
parent | 6eb9e4367cece63b5ba975098c2d3ad41915a0fa (diff) | |
parent | d84492f3906d20d05b547a4fa90286fe0a59bb37 (diff) | |
download | tcl-5eb346e0d25b2ae446797b610058911dc3b65a63.zip tcl-5eb346e0d25b2ae446797b610058911dc3b65a63.tar.gz tcl-5eb346e0d25b2ae446797b610058911dc3b65a63.tar.bz2 |
Don't ever allow UTF-8 sequences of more than 4 characters to be generated or parsed, even when TCL_UTF_MAX>4: According to current Unicode standard, a byte string of >4 characters can never form a single UTF-8 character.
And a few minor micro-optimizations related to UTF-8 handling.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 68 |
1 files changed, 24 insertions, 44 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6c4cb7f..b33bf6a 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -73,16 +73,7 @@ static const unsigned char totalBytes[256] = { #else 1,1,1,1,1,1,1,1, #endif -#if TCL_UTF_MAX > 4 - 5,5,5,5, -#else - 1,1,1,1, -#endif -#if TCL_UTF_MAX > 5 - 6,6,6,6 -#else - 1,1,1,1 -#endif + 1,1,1,1,1,1,1,1 }; /* @@ -105,14 +96,14 @@ int TclUtfCount( int ch) /* The Tcl_UniChar whose size is returned. */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { return 1; } if (ch <= 0x7FF) { return 2; } #if TCL_UTF_MAX > 3 - if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) { + if (((unsigned)(ch - 0x10000) <= 0xfffff)) { return 4; } #endif @@ -146,7 +137,7 @@ Tcl_UniCharToUtf( * large enough to hold the UTF-8 character * (at most TCL_UTF_MAX bytes). */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; } @@ -174,11 +165,7 @@ Tcl_UniCharToUtf( } } #endif - three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); - return 3; + goto three; } #if TCL_UTF_MAX > 3 @@ -193,7 +180,11 @@ Tcl_UniCharToUtf( } ch = 0xFFFD; - goto three; +three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; } /* @@ -308,9 +299,6 @@ Tcl_UtfToUniChar( * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* @@ -326,31 +314,23 @@ Tcl_UtfToUniChar( * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } #if TCL_UTF_MAX > 3 - { - int ch, total, trail; - - total = totalBytes[byte]; - trail = total - 1; - if (trail > 0) { - ch = byte & (0x3F >> trail); - do { - src++; - if ((*src & 0xC0) != 0x80) { - *chPtr = byte; - return 1; - } - ch <<= 6; - ch |= (*src & 0x3F); - trail--; - } while (trail > 0); - *chPtr = ch; - return total; + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + return 4; } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ } #endif |