diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-01 07:32:14 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-01 07:32:14 (GMT) |
commit | 537c8e77ba967fbb6f2ef1d7b2134420a3117bad (patch) | |
tree | 78e317741ddda739050efc0295f85fc3b0c413f1 | |
parent | 596e33bb9ec8c0083b2d6234c84afc293e525d24 (diff) | |
parent | 5866ef6d2acf4db24499c820df08a8feb88ea865 (diff) | |
download | tcl-537c8e77ba967fbb6f2ef1d7b2134420a3117bad.zip tcl-537c8e77ba967fbb6f2ef1d7b2134420a3117bad.tar.gz tcl-537c8e77ba967fbb6f2ef1d7b2134420a3117bad.tar.bz2 |
(cherry-pick) Make Tcl_UniCharToUtf more readable and add test to exercise surrogate handling.
-rw-r--r-- | generic/tclEncoding.c | 6 | ||||
-rw-r--r-- | generic/tclUtf.c | 57 |
2 files changed, 33 insertions, 30 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 2f7d803..288b07c 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2302,9 +2302,9 @@ BinaryProc( * * UtfToUtfProc -- * - * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation - * is not a no-op, because it will turn a stream of improperly formed - * UTF-8 into a properly formed stream. + * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation + * is not a no-op, because it turns a stream of improperly formed + * UTF-8 into a properly-formed stream. * * Results: * Returns TCL_OK if conversion was successful. diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 87216c2..db2be84 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -182,7 +182,7 @@ Invalid( * * Tcl_UniCharToUtf -- * - * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the + * Stores the given Tcl_UniChar as a sequence of UTF-8 bytes in the * provided buffer. Equivalent to Plan 9 runetochar(). * * Special handling of Surrogate pairs is handled as follows: @@ -199,8 +199,7 @@ Invalid( * representing the high surrogate. * * Results: - * The return values is the number of bytes in the buffer that were - * consumed. + * Returns the number of bytes stored into the buffer. * * Side effects: * None. @@ -223,18 +222,19 @@ Tcl_UniCharToUtf( } if (ch >= 0) { if (ch <= 0x7FF) { - buf[1] = (char) ((ch | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 6) | 0xC0); + buf[1] = (char) (0x80 | (0x3F & ch)); + buf[0] = (char) (0xC0 | (ch >> 6)); return 2; } if (ch <= 0xFFFF) { if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { + if ( (0x80 == (0xC0 & buf[0])) + && (0 == (0xCF & buf[1]))) { /* Previous Tcl_UniChar was a high surrogate, so combine */ - buf[2] = (char) ((ch & 0x3F) | 0x80); - buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80); + buf[2] = (char) (0x80 | (0x3F & ch)); + buf[1] |= (char) (0x80 | (0x0F & (ch >> 6))); return 3; } /* Previous Tcl_UniChar was not a high surrogate, so just output */ @@ -244,37 +244,40 @@ Tcl_UniCharToUtf( /* Fill buffer with specific 3-byte (invalid) byte combination, so following low surrogate can recognize it and combine */ buf[2] = (char) ((ch << 4) & 0x30); - buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); - buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); + buf[1] = (char) (0x80 | (0x3F & (ch >> 2))); + buf[0] = (char) (0xF0 | (0x07 & (ch >> 8))); return 1; } } goto three; } if (ch <= 0x10FFFF) { - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 18) | 0xF0); + buf[3] = (char) (0x80 | (0x3F & ch)); + buf[2] = (char) (0x80 | (0x3F & (ch >> 6))); + buf[1] = (char) (0x80 | (0x3F & (ch >> 12))); + buf[0] = (char) (0xF0 | (ch >> 18)); return 4; } } else if (ch == -1) { - if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0) - && ((buf[-1] & 0xF8) == 0xF0)) { - ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2) - + ((buf[1] & 0x30) >> 4); - buf[1] = (char) ((ch | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[-1] = (char) ((ch >> 12) | 0xE0); + if ( (0x80 == (0xC0 & buf[0])) + && (0 == (0xCF & buf[1])) + && (0xF0 == (0xF8 & buf[-1]))) { + ch = 0xD7C0 + + ((0x07 & buf[-1]) << 8) + + ((0x3F & buf[0]) << 2) + + ((0x30 & buf[1]) >> 4); + buf[1] = (char) (0x80 | (0x3F & ch)); + buf[0] = (char) (0x80 | (0x3F & (ch >> 6))); + buf[-1] = (char) (0xE0 | (ch >> 12)); return 2; } } ch = 0xFFFD; three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); + buf[2] = (char) (0x80 | (0x3F & ch)); + buf[1] = (char) (0x80 | (0x3F & (ch >> 6))); + buf[0] = (char) (0xE0 | (ch >> 12)); return 3; } @@ -2948,7 +2951,7 @@ TclUniCharMatch( * * TclUtfToUCS4 -- * - * Extract the 4-byte codepoint from the leading bytes of the + * Extracts the 4-byte codepoint from the leading bytes of the * Modified UTF-8 string "src". This is a utility routine to * contain the surrogate gymnastics in one place. * @@ -2960,8 +2963,8 @@ TclUniCharMatch( * enough bytes remain in the string. * * Results: - * *usc4Ptr is filled with the UCS4 code point, and the return value is - * the number of bytes from the UTF-8 string that were consumed. + * Fills *usc4Ptr with the UCS4 code point and returns the number of bytes + * consumed from the source string. * * Side effects: * None. |