1 files changed, 22 insertions, 77 deletions
diff --git a/generic/tkUtil.c b/generic/tkUtil.c
index fb796fd..a266cb3 100644
--- a/generic/tkUtil.c
+++ b/generic/tkUtil.c
@@ -1193,24 +1193,15 @@ TkSendVirtualEvent(
     Tk_QueueWindowEvent(&event.general, TCL_QUEUE_TAIL);
 }
 
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 /*
  *---------------------------------------------------------------------------
  *
- * TkUtfToUniChar32 --
+ * TkUtfToUniChar2 --
  *
- *	Copied from Tcl_UtfToUniChar but using int instead of Tcl_UniChar!
- *
- *	Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
- *	sequences are converted to valid Tcl_UniChars and processing
- *	continues. Equivalent to Plan 9 chartorune().
- *
- *	The caller must ensure that the source buffer is long enough that this
- *	routine does not run off the end and dereference non-existent memory
- *	looking for trail bytes. If the source buffer is known to be '\0'
- *	terminated, this cannot happen. Otherwise, the caller should call
- *	Tcl_UtfCharComplete() before calling this routine to ensure that
- *	enough bytes remain in the string.
+ *	Almost the same as Tcl_UtfToUniChar but using int instead of Tcl_UniChar.
+ *	This function is capable of collapsing a high/low surrogate pair to a
+ *	single Unicode character. So, up to 6 bytes (two UTF-8 characters) might be read.
  *
  * Results:
  *	*chPtr is filled with the Tcl_UniChar, and the return value is the
@@ -1223,75 +1214,29 @@ TkSendVirtualEvent(
  */
 
 int
-TkUtfToUniChar32(
+TkUtfToUniChar2(
     const char *src,	/* The UTF-8 string. */
     int *chPtr)		/* Filled with the Tcl_UniChar represented by
 			 * the UTF-8 string. */
 {
-    int byte;
-
-    /*
-     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
-     */
-
-    byte = *((unsigned char *) src);
-    if (byte < 0xC0) {
-	/*
-	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
-	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
-	 * characters representing themselves.
-	 */
-
-	*chPtr = byte;
-	return 1;
-    } else if (byte < 0xE0) {
-	if ((src[1] & 0xC0) == 0x80) {
-	    /*
-	     * Two-byte-character lead-byte followed by a trail-byte.
-	     */
-
-	    *chPtr = ((byte & 0x1F) << 6) | (src[1] & 0x3F);
-	    return 2;
+    Tcl_UniChar uniChar = 0;
+
+    int len = Tcl_UtfToUniChar(src, &uniChar);
+    if ((uniChar & 0xfc00) == 0xd800) {
+	Tcl_UniChar high = uniChar;
+	/* High surrogate: decode the next character and, if it is a low
+	 * surrogate, merge the pair into one code point (offset 0x10000). */
+	int len2 = Tcl_UtfToUniChar(src+len, &uniChar);
+	if ((uniChar & 0xfc00) == 0xdc00) {
+	    *chPtr = (((high & 0x3ff) << 10) | (uniChar & 0x3ff)) + 0x10000;
+	    len += len2;
+	} else {
+	    *chPtr = high;
 	}
-
-	/*
-	 * A two-byte-character lead-byte not followed by trail-byte
-	 * represents itself.
-	 */
-    } else if (byte < 0xF0) {
-	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
-	    /*
-	     * Three-byte-character lead byte followed by two trail bytes.
-	     */
-
-	    *chPtr = ((byte & 0x0F) << 12)
-		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F);
-	    return 3;
-	}
-
-	/*
-	 * A three-byte-character lead-byte not followed by two trail-bytes
-	 * represents itself.
-	 */
-    } else if (byte < 0xF8) {
-	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
-	    /*
-	     * Four-byte-character lead byte followed by three trail bytes.
-	     */
-
-	    *chPtr = ((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
-		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F);
-	    return 4;
-	}
-
-	/*
-	 * A three-byte-character lead-byte not followed by two trail-bytes
-	 * represents itself.
-	 */
+    } else {
+	*chPtr = uniChar;
     }
-
-    *chPtr = byte;
-    return 1;
+    return len;
 }
 #endif
 /*