From 829350cc1a1b9d065ccda17583ee55c731598f13 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 6 Jun 2017 09:02:28 +0000 Subject: [67aa9a2070] Tcl_UtfToUniChar returns single byte for invalid UTF-8 input as documented. --- generic/tclUtf.c | 127 +++++++++++++++++++++------------------------------- tests/encoding.test | 25 +++++++++++ 2 files changed, 77 insertions(+), 75 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e5497a4..3937141 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -73,16 +73,7 @@ static CONST unsigned char totalBytes[256] = { #else 1,1,1,1,1,1,1,1, #endif -#if TCL_UTF_MAX > 4 - 5,5,5,5, -#else - 1,1,1,1, -#endif -#if TCL_UTF_MAX > 5 - 6,6,6,6 -#else - 1,1,1,1 -#endif + 1,1,1,1,1,1,1,1 }; /* @@ -111,25 +102,16 @@ INLINE static int UtfCount( int ch) /* The Tcl_UniChar whose size is returned. */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { return 1; } if (ch <= 0x7FF) { return 2; } - if (ch <= 0xFFFF) { - return 3; - } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { return 4; } - if (ch <= 0x3FFFFFF) { - return 5; - } - if (ch <= 0x7FFFFFFF) { - return 6; - } #endif return 3; } @@ -161,7 +143,7 @@ Tcl_UniCharToUtf( * large enough to hold the UTF-8 character * (at most TCL_UTF_MAX bytes). */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; } @@ -172,43 +154,43 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { - three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); - return 3; +#if TCL_UTF_MAX == 4 + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x0400) { + /* Low surrogate */ + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); + return 4; + } else { + /* High surrogate */ + ch += 0x40; + buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); + buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); + buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + return 0; + } + } +#endif + goto three; } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } - if (ch <= 0x3FFFFFF) { - buf[4] = (char) ((ch | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 24) | 0xF8); - return 5; - } - if (ch <= 0x7FFFFFFF) { - buf[5] = (char) ((ch | 0x80) & 0xBF); - buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 30) | 0xFC); - return 6; - } #endif } ch = 0xFFFD; - goto three; +three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; } /* @@ -316,16 +298,15 @@ Tcl_UtfToUniChar( */ *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); - return 2; + if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { + return 2; + } } /* * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* @@ -334,38 +315,34 @@ Tcl_UtfToUniChar( *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); - return 3; + if (*chPtr > 0x7ff) { + return 3; + } } /* * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } #if TCL_UTF_MAX > 3 - { - int ch, total, trail; - - total = totalBytes[byte]; - trail = total - 1; - if (trail > 0) { - ch = byte & (0x3F >> trail); - do { - src++; - if ((*src & 0xC0) != 0x80) { - *chPtr = byte; - return 1; - } - ch <<= 6; - ch |= (*src & 0x3F); - trail--; - } while (trail > 0); - *chPtr = ch; - return total; + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { + return 4; + } } + + /* + * A four-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ } #endif @@ -1038,7 +1015,7 @@ Tcl_UtfNcmp( /* * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the - * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 + * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001 * (the byte 0x01.) */ @@ -1555,7 +1532,7 @@ Tcl_UniCharIsSpace( if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { return TclIsSpaceProc((char) ch); - } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) { + } else if ((Tcl_UniChar) ch == 0x180E || (Tcl_UniChar) ch == 0x202F) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); diff --git a/tests/encoding.test b/tests/encoding.test index aa50360..85deb33 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -439,6 +439,31 @@ test encoding-24.3 {EscapeFreeProc on open channels} {stdio} { list $count [viewable $line] } [list 3 "\u4e4e\u4e5e\u4e5f (\\u4e4e\\u4e5e\\u4e5f)"] +test encoding-24.4 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xc0\x80"] +} 1 +test encoding-24.5 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xc0\x81"] +} 2 +test encoding-24.6 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xc1\xbf"] +} 2 +test encoding-24.7 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xc2\x80"] +} 1 +test encoding-24.8 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xe0\x80\x80"] +} 3 +test encoding-24.9 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xe0\x9f\xbf"] +} 3 +test encoding-24.10 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xe0\xa0\x80"] +} 1 +test encoding-24.11 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xef\xbf\xbf"] +} 1 + file delete [file join [temporaryDirectory] iso2022.txt] # -- cgit v0.12