diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-02 16:04:59 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-02 16:04:59 (GMT) |
commit | bfaca509637e46e0ffd48c20a60b78c617c7bf44 (patch) | |
tree | 4a8907259172a0e3fb26c1592c5f5a9c9169d90c /generic/tclUtf.c | |
parent | cc3041cee0463eae2d11f0125f3921b66f67497a (diff) | |
download | tcl-bfaca509637e46e0ffd48c20a60b78c617c7bf44.zip tcl-bfaca509637e46e0ffd48c20a60b78c617c7bf44.tar.gz tcl-bfaca509637e46e0ffd48c20a60b78c617c7bf44.tar.bz2 |
Backport [bd94500678e837d7] from 8.7, preventing endless loops in UTF-8 conversions when handling surrogates. Only effective when compiling with -DTCL_UTF_MAX=4 or -DTCL_UTF_MAX=6 (default: 3). Meant for the benefit of AndroWish.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 141 |
1 files changed, 75 insertions, 66 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b33bf5f..40dc29f 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -158,23 +158,22 @@ Tcl_UniCharToUtf( if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) - && ((buf[2] & 0xCF) == 0)) { - /* Previous Tcl_UniChar was a High surrogate, so combine */ - buf[3] = (char) ((ch & 0x3F) | 0x80); - buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); - return 4; + if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a high surrogate, so combine */ + buf[2] = (char) ((ch & 0x3F) | 0x80); + buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 3; } - /* Previous Tcl_UniChar was not a High surrogate, so just output */ + /* Previous Tcl_UniChar was not a high surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; /* Fill buffer with specific 3-byte (invalid) byte combination, - so following Low surrogate can recognize it and combine */ + so following low surrogate can recognize it and combine */ buf[2] = (char) ((ch << 4) & 0x30); buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); - return 0; + return 1; } } #endif @@ -190,11 +189,14 @@ Tcl_UniCharToUtf( return 4; } } else if (ch == -1) { - if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) - && ((buf[2] & 0xCF) == 0)) { - ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2) - + ((buf[2] & 0x30) >> 4); - goto three; + if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0) + && ((buf[-1] & 0xF8) == 0xF0)) { + ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2) + + ((buf[1] & 0x30) >> 4); + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[-1] = (char) ((ch >> 12) | 0xE0); + return 2; } #endif } @@ -298,7 +300,7 @@ Tcl_UtfToUniChar( register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. 
*/ { - register int byte; + Tcl_UniChar byte; /* * Unroll 1 to 3 (or 4) byte UTF-8 sequences. @@ -312,7 +314,21 @@ Tcl_UtfToUniChar( * characters representing themselves. */ - *chPtr = (Tcl_UniChar) byte; +#if TCL_UTF_MAX == 4 + /* If *chPtr contains a high surrogate (produced by a previous + * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation + * bytes, then we must produce a follow-up low surrogate. We only + * do that if the high surrogate matches the bits we encounter. + */ + if ((byte >= 0x80) + && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) + && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) + && ((src[2] & 0xC0) == 0x80)) { + *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; + return 3; + } +#endif + *chPtr = byte; return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { @@ -320,7 +336,7 @@ Tcl_UtfToUniChar( * Two-byte-character lead-byte followed by a trail-byte. */ - *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { return 2; } @@ -336,7 +352,7 @@ Tcl_UtfToUniChar( * Three-byte-character lead byte followed by two trail bytes. */ - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + *chPtr = (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); if (*chPtr > 0x7FF) { return 3; @@ -355,26 +371,19 @@ Tcl_UtfToUniChar( * Four-byte-character lead byte followed by three trail bytes. 
*/ #if TCL_UTF_MAX == 4 - Tcl_UniChar surrogate; - - byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; - surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); - if (byte & 0x100000) { + Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + | ((src[2] & 0x3F) >> 4)) - 0x40; + if (high >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ - } else if (*chPtr != surrogate) { - /* produce high surrogate, but don't advance source pointer */ - *chPtr = surrogate; - return 0; } else { - /* produce low surrogate, and advance source pointer */ - *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); - return 4; + /* produce high surrogate, advance source pointer */ + *chPtr = 0xD800 + high; + return 1; } #else - *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) + *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { + if ((*chPtr - 0x10000) <= 0xFFFFF) { return 4; } #endif @@ -387,7 +396,7 @@ Tcl_UtfToUniChar( } #endif - *chPtr = (Tcl_UniChar) byte; + *chPtr = byte; return 1; } @@ -578,8 +587,8 @@ Tcl_UtfFindFirst( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX == 4 - if (!len) { - len += TclUtfToUniChar(src, &find); + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif @@ -626,8 +635,8 @@ Tcl_UtfFindLast( len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX == 4 - if (!len) { - len += TclUtfToUniChar(src, &find); + if ((ch >= 0xD800) && (len < 3)) { + len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif @@ -669,8 +678,8 @@ Tcl_UtfNext( int len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX == 4 - if (len == 0) { - len = TclUtfToUniChar(src, &ch); + if ((ch >= 0xD800) && (len < 3)) { + 
len += TclUtfToUniChar(src + len, &ch); } #endif return src + len; @@ -779,15 +788,15 @@ Tcl_UtfAtIndex( register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; - int len = 1; + int len = 0; while (index-- > 0) { len = TclUtfToUniChar(src, &ch); src += len; } #if TCL_UTF_MAX == 4 - if (!len) { - /* Index points at character following High Surrogate */ + if ((ch >= 0xD800) && (len < 3)) { + /* Index points at character following high Surrogate */ src += TclUtfToUniChar(src, &ch); } #endif @@ -871,7 +880,7 @@ Tcl_UtfToUpper( { Tcl_UniChar ch = 0, upChar; char *src, *dst; - int bytes; + int len; /* * Iterate over the string until we hit the terminating null. @@ -879,7 +888,7 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); upChar = Tcl_UniCharToUpper(ch); /* @@ -888,13 +897,13 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if (bytes < UtfCount(upChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if (len < UtfCount(upChar)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(upChar, dst); } - src += bytes; + src += len; } *dst = '\0'; return (dst - str); @@ -924,7 +933,7 @@ Tcl_UtfToLower( { Tcl_UniChar ch = 0, lowChar; char *src, *dst; - int bytes; + int len; /* * Iterate over the string until we hit the terminating null. @@ -932,7 +941,7 @@ Tcl_UtfToLower( src = dst = str; while (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); /* @@ -941,13 +950,13 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. 
*/ - if (bytes < UtfCount(lowChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if (len < UtfCount(lowChar)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } - src += bytes; + src += len; } *dst = '\0'; return (dst - str); @@ -978,7 +987,7 @@ Tcl_UtfToTitle( { Tcl_UniChar ch = 0, titleChar, lowChar; char *src, *dst; - int bytes; + int len; /* * Capitalize the first character and then lowercase the rest of the @@ -988,32 +997,32 @@ Tcl_UtfToTitle( src = dst = str; if (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); - if (bytes < UtfCount(titleChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if (len < UtfCount(titleChar)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(titleChar, dst); } - src += bytes; + src += len; } while (*src) { - bytes = TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); lowChar = ch; /* Special exception for Georgian Asomtavruli chars, no titlecase. */ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { lowChar = Tcl_UniCharToLower(lowChar); } - if (bytes < UtfCount(lowChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; + if (len < UtfCount(lowChar)) { + memcpy(dst, src, len); + dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } - src += bytes; + src += len; } *dst = '\0'; return (dst - str); |