diff options
Diffstat (limited to 'generic/tkUtil.c')
-rw-r--r-- | generic/tkUtil.c | 99 |
1 files changed, 22 insertions, 77 deletions
diff --git a/generic/tkUtil.c b/generic/tkUtil.c index fb796fd..a266cb3 100644 --- a/generic/tkUtil.c +++ b/generic/tkUtil.c @@ -1193,24 +1193,15 @@ TkSendVirtualEvent( Tk_QueueWindowEvent(&event.general, TCL_QUEUE_TAIL); } -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* *--------------------------------------------------------------------------- * - * TkUtfToUniChar32 -- + * TkUtfToUniChar2 -- * - * Copied from Tcl_UtfToUniChar but using int instead of Tcl_UniChar! - * - * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 - * sequences are converted to valid Tcl_UniChars and processing - * continues. Equivalent to Plan 9 chartorune(). - * - * The caller must ensure that the source buffer is long enough that this - * routine does not run off the end and dereference non-existent memory - * looking for trail bytes. If the source buffer is known to be '\0' - * terminated, this cannot happen. Otherwise, the caller should call - * Tcl_UtfCharComplete() before calling this routine to ensure that - * enough bytes remain in the string. + * Almost the same as Tcl_UtfToUniChar but using int instead of Tcl_UniChar. + * This function is capable of collapsing a upper/lower pair to a single + * unicode character. So, up to 6 bytes (two UTF-8 characters) might be read. * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the @@ -1223,75 +1214,29 @@ TkSendVirtualEvent( */ int -TkUtfToUniChar32( +TkUtfToUniChar2( const char *src, /* The UTF-8 string. */ int *chPtr) /* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { - int byte; - - /* - * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. - */ - - byte = *((unsigned char *) src); - if (byte < 0xC0) { - /* - * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid - * characters representing themselves. - */ - - *chPtr = byte; - return 1; - } else if (byte < 0xE0) { - if ((src[1] & 0xC0) == 0x80) { - /* - * Two-byte-character lead-byte followed by a trail-byte. - */ - - *chPtr = ((byte & 0x1F) << 6) | (src[1] & 0x3F); - return 2; + Tcl_UniChar uniChar = 0; + + int len = Tcl_UtfToUniChar(src, &uniChar); + if ((uniChar & 0xfc00) == 0xd800) { + Tcl_UniChar high = uniChar; + /* This can only happen when Tcl is compiled with TCL_UTF_MAX=4, + * or when a high surrogate character is detected */ + int len2 = Tcl_UtfToUniChar(src+len, &uniChar); + if ((uniChar & 0xfc00) == 0xdc00) { + *chPtr = ((high & 0x3ff) << 10) | (uniChar & 0x3ff) | 0x10000; + len += len2; + } else { + *chPtr = high; } - - /* - * A two-byte-character lead-byte not followed by trail-byte - * represents itself. - */ - } else if (byte < 0xF0) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { - /* - * Three-byte-character lead byte followed by two trail bytes. - */ - - *chPtr = ((byte & 0x0F) << 12) - | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F); - return 3; - } - - /* - * A three-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ - } else if (byte < 0xF8) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { - /* - * Four-byte-character lead byte followed by three trail bytes. - */ - - *chPtr = ((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F); - return 4; - } - - /* - * A three-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ + } else { + *chPtr = uniChar; } - - *chPtr = byte; - return 1; + return len; } #endif /* |