diff options
-rw-r--r-- generic/tclEncoding.c | 6
-rw-r--r-- generic/tclUtf.c | 57
2 files changed, 33 insertions, 30 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index fe2b55b..dfa7907 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2244,9 +2244,9 @@ UtfExtToUtfIntProc( * * UtfToUtfProc -- * - * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation - * is not a no-op, because it will turn a stream of improperly formed - * UTF-8 into a properly formed stream. + * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation + * is not a no-op, because it turns a stream of improperly formed + * UTF-8 into a properly-formed stream. * * Results: * Returns TCL_OK if conversion was successful. diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8931b39..e4d0fc8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -206,12 +206,11 @@ Invalid( * * Tcl_UniCharToUtf -- * - * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the + * Stores the given Tcl_UniChar as a sequence of UTF-8 bytes in the * provided buffer. Equivalent to Plan 9 runetochar(). * * Results: - * The return values is the number of bytes in the buffer that were - * consumed. + * Returns the number of bytes stored into the buffer. * * Side effects: * None. 
@@ -234,8 +233,8 @@ Tcl_UniCharToUtf( } if (ch >= 0) { if (ch <= 0x7FF) { - buf[1] = (char) ((ch | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 6) | 0xC0); + buf[1] = (char) (0x80 | (0x3F & ch)); + buf[0] = (char) (0xC0 | (ch >> 6)); return 2; } if (ch <= 0xFFFF) { @@ -243,10 +242,11 @@ Tcl_UniCharToUtf( if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { + if ( (0x80 == (0xC0 & buf[0])) + && (0 == (0xCF & buf[1]))) { /* Previous Tcl_UniChar was a high surrogate, so combine */ - buf[2] = (char) ((ch & 0x3F) | 0x80); - buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80); + buf[2] = (char) (0x80 | (0x3F & ch)); + buf[1] |= (char) (0x80 | (0x0F & (ch >> 6))); return 3; } /* Previous Tcl_UniChar was not a high surrogate, so just output */ @@ -256,8 +256,8 @@ Tcl_UniCharToUtf( /* Fill buffer with specific 3-byte (invalid) byte combination, so following low surrogate can recognize it and combine */ buf[2] = (char) ((ch << 4) & 0x30); - buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); - buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); + buf[1] = (char) (0x80 | (0x3F & (ch >> 2))); + buf[0] = (char) (0xF0 | (0x07 & (ch >> 8))); return 1; } } @@ -267,20 +267,23 @@ Tcl_UniCharToUtf( #if TCL_UTF_MAX > 3 if (ch <= 0x10FFFF) { - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 18) | 0xF0); + buf[3] = (char) (0x80 | (0x3F & ch)); + buf[2] = (char) (0x80 | (0x3F & (ch >> 6))); + buf[1] = (char) (0x80 | (0x3F & (ch >> 12))); + buf[0] = (char) (0xF0 | (ch >> 18)); return 4; } } else if (ch == -1) { - if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0) - && ((buf[-1] & 0xF8) == 0xF0)) { - ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2) - + ((buf[1] & 0x30) >> 4); - buf[1] = (char) ((ch | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[-1] = (char) ((ch >> 12) | 0xE0); + if ( 
(0x80 == (0xC0 & buf[0])) + && (0 == (0xCF & buf[1])) + && (0xF0 == (0xF8 & buf[-1]))) { + ch = 0xD7C0 + + ((0x07 & buf[-1]) << 8) + + ((0x3F & buf[0]) << 2) + + ((0x30 & buf[1]) >> 4); + buf[1] = (char) (0x80 | (0x3F & ch)); + buf[0] = (char) (0x80 | (0x3F & (ch >> 6))); + buf[-1] = (char) (0xE0 | (ch >> 12)); return 2; } #endif @@ -288,9 +291,9 @@ Tcl_UniCharToUtf( ch = 0xFFFD; three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); + buf[2] = (char) (0x80 | (0x3F & ch)); + buf[1] = (char) (0x80 | (0x3F & (ch >> 6))); + buf[0] = (char) (0xE0 | (ch >> 12)); return 3; } @@ -2386,7 +2389,7 @@ TclUniCharMatch( * * TclUtfToUCS4 -- * - * Extract the 4-byte codepoint from the leading bytes of the + * Extracts the 4-byte codepoint from the leading bytes of the * Modified UTF-8 string "src". This is a utility routine to * contain the surrogate gymnastics in one place. * @@ -2398,8 +2401,8 @@ TclUniCharMatch( * enough bytes remain in the string. * * Results: - * *usc4Ptr is filled with the UCS4 code point, and the return value is - * the number of bytes from the UTF-8 string that were consumed. + * Fills *usc4Ptr with the UCS4 code point and returns the number of bytes + * consumed from the source string. * * Side effects: * None. |