Better UTF-8 surrogate handling, only functional when TCL_UTF_MAX>3

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-06-08 08:26:58 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-06-08 08:26:58 (GMT)
commit: 16f3f234e8500f5f71e4d9321689a8bdf9efc809 (patch)
tree: 5d0ad393849e7c2d6b1b88d3f6d413ff2b505f14 /generic/tclUtf.c
parent: 73a3dfdeeabb1a43c73101b4b6a9826f83866b32 (diff)
download: tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.zip
tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.gz
tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.bz2
1 files changed, 49 insertions, 19 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 52b4291..db941e2 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -134,7 +134,7 @@ UtfCount(
  *---------------------------------------------------------------------------
  */
 
-INLINE int
+int
 Tcl_UniCharToUtf(
     int ch,			/* The Tcl_UniChar to be stored in the
 				 * buffer. */
@@ -259,6 +259,15 @@ Tcl_UniCharToUtfDString(
  *	Tcl_UtfCharComplete() before calling this routine to ensure that
  *	enough bytes remain in the string.
  *
+ *	If TCL_UTF_MAX == 4, surrogate pairs get special handling: for any
+ *	UTF-8 sequence encoding a character outside the BMP, the first call
+ *	to this function fills *chPtr with the high surrogate and returns 0.
+ *	Calling Tcl_UtfToUniChar again at the same position then produces
+ *	the low surrogate and returns 4. Because *chPtr is used to remember
+ *	whether the high surrogate has already been produced, it is
+ *	recommended to initialize the pointed-to variable to 0 before the
+ *	first call to Tcl_UtfToUniChar.
+ *
  * Results:
  *	*chPtr is filled with the Tcl_UniChar, and the return value is the
  *	number of bytes from the UTF-8 string that were consumed.
@@ -278,7 +287,7 @@ Tcl_UtfToUniChar(
     register int byte;
 
     /*
-     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+     * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
      */
 
     byte = *((unsigned char *) src);
@@ -331,12 +340,30 @@ Tcl_UtfToUniChar(
 	    /*
 	     * Four-byte-character lead byte followed by three trail bytes.
 	     */
-
+#if TCL_UTF_MAX == 4
+	    Tcl_UniChar surrogate;
+
+	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
+	    surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
+	    if (byte & 0x100000) {
+		/* out of range, < 0x10000 or > 0x10ffff */
+	    } else if (*chPtr != surrogate) {
+		/* produce high surrogate, but don't advance source pointer */
+		*chPtr = surrogate;
+		return 0;
+	    } else {
+		/* produce low surrogate, and advance source pointer */
+		*chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
+		return 4;
+	    }
+#else
 	    *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
 		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
 	    if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
 		return 4;
 	    }
+#endif
 	}
 
 	/*
@@ -377,7 +404,7 @@ Tcl_UtfToUniCharDString(
 				 * appended to this previously initialized
 				 * DString. */
 {
-    Tcl_UniChar *w, *wString;
+    Tcl_UniChar ch, *w, *wString;
     const char *p, *end;
     int oldLength;
 
@@ -399,8 +426,8 @@ Tcl_UtfToUniCharDString(
     w = wString;
     end = src + length;
     for (p = src; p < end; ) {
-	p += TclUtfToUniChar(p, w);
-	w++;
+	p += TclUtfToUniChar(p, &ch);
+	*w++ = ch;
     }
     *w = '\0';
     Tcl_DStringSetLength(dsPtr,
@@ -434,9 +461,8 @@ Tcl_UtfCharComplete(
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
-    int ch;
+    int ch = *((unsigned char *) src);
 
-    ch = *((unsigned char *) src);
     return length >= totalBytes[ch];
 }
 
@@ -464,8 +490,7 @@ Tcl_NumUtfChars(
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
-    Tcl_UniChar ch;
-    register Tcl_UniChar *chPtr = &ch;
+    Tcl_UniChar ch = 0;
     register int i;
 
     /*
@@ -478,7 +503,7 @@ Tcl_NumUtfChars(
     i = 0;
     if (length < 0) {
 	while (*src != '\0') {
-	    src += TclUtfToUniChar(src, chPtr);
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
     } else {
@@ -489,7 +514,7 @@ Tcl_NumUtfChars(
 		length--;
 		src++;
 	    } else {
-		n = Tcl_UtfToUniChar(src, chPtr);
+		n = Tcl_UtfToUniChar(src, &ch);
 		length -= n;
 		src += n;
 	    }
@@ -524,7 +549,7 @@ Tcl_UtfFindFirst(
     int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
-    Tcl_UniChar find;
+    Tcl_UniChar find = 0;
 
     while (1) {
 	len = TclUtfToUniChar(src, &find);
@@ -563,7 +588,7 @@ Tcl_UtfFindLast(
     int ch)			/* The Tcl_UniChar to search for. */
 {
     int len;
-    Tcl_UniChar find;
+    Tcl_UniChar find = 0;
     const char *last;
 
     last = NULL;
@@ -603,9 +628,15 @@ const char *
 Tcl_UtfNext(
     const char *src)		/* The current location in the string. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch = 0;
+    int len = TclUtfToUniChar(src, &ch);
 
-    return src + TclUtfToUniChar(src, &ch);
+#if TCL_UTF_MAX == 4
+    if (len == 0) {
+      len = TclUtfToUniChar(src, &ch);
+    }
+#endif
+    return src + len;
 }
 
 /*
@@ -638,8 +669,7 @@ Tcl_UtfPrev(
     const char *look;
     int i, byte;
 
-    src--;
-    look = src;
+    look = --src;
     for (i = 0; i < TCL_UTF_MAX; i++) {
 	if (look < start) {
 	    if (src < start) {
@@ -712,7 +742,7 @@ Tcl_UtfAtIndex(
     register const char *src,	/* The UTF-8 string. */
     register int index)		/* The position of the desired character. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch = 0;
 
     while (index > 0) {
 	index--;
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2017-06-08 08:26:58 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2017-06-08 08:26:58 (GMT)
commit	16f3f234e8500f5f71e4d9321689a8bdf9efc809 (patch)
tree	5d0ad393849e7c2d6b1b88d3f6d413ff2b505f14 /generic/tclUtf.c
parent	73a3dfdeeabb1a43c73101b4b6a9826f83866b32 (diff)
download	tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.zip tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.gz tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.bz2