Don't ever allow UTF-8 sequences of more than 4 bytes to be generated or parsed, even when TCL_UTF_MAX>4: According to the current Unicode standard, a byte sequence of more than 4 bytes can never form a single UTF-8 character.

And a few minor micro-optimizations related to UTF-8 handling.
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2016-08-30 13:00:32 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2016-08-30 13:00:32 (GMT)
commit: d84492f3906d20d05b547a4fa90286fe0a59bb37 (patch)
tree: cfec5955da6f587fe00bd8471120c07add91560c
parent: a6063330c474dde9b388bfeda1b1bb746aebf23a (diff)
download: tcl-d84492f3906d20d05b547a4fa90286fe0a59bb37.zip
tcl-d84492f3906d20d05b547a4fa90286fe0a59bb37.tar.gz
tcl-d84492f3906d20d05b547a4fa90286fe0a59bb37.tar.bz2
1 files changed, 24 insertions, 44 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b878149..68119a4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -73,16 +73,7 @@ static const unsigned char totalBytes[256] = {
 #else
     1,1,1,1,1,1,1,1,
 #endif
-#if TCL_UTF_MAX > 4
-    5,5,5,5,
-#else
-    1,1,1,1,
-#endif
-#if TCL_UTF_MAX > 5
-    6,6,6,6
-#else
-    1,1,1,1
-#endif
+    1,1,1,1,1,1,1,1
 };
 
 /*
@@ -111,14 +102,14 @@ INLINE static int
 UtfCount(
     int ch)			/* The Tcl_UniChar whose size is returned. */
 {
-    if ((ch > 0) && (ch < UNICODE_SELF)) {
+    if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
 	return 1;
     }
     if (ch <= 0x7FF) {
 	return 2;
     }
 #if TCL_UTF_MAX > 3
-    if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) {
+    if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
 	return 4;
     }
 #endif
@@ -152,7 +143,7 @@ Tcl_UniCharToUtf(
 				 * large enough to hold the UTF-8 character
 				 * (at most TCL_UTF_MAX bytes). */
 {
-    if ((ch > 0) && (ch < UNICODE_SELF)) {
+    if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
 	buf[0] = (char) ch;
 	return 1;
     }
@@ -180,11 +171,7 @@ Tcl_UniCharToUtf(
 		}
 	    }
 #endif
-	three:
-	    buf[2] = (char) ((ch | 0x80) & 0xBF);
-	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
-	    buf[0] = (char) ((ch >> 12) | 0xE0);
-	    return 3;
+	    goto three;
 	}
 
 #if TCL_UTF_MAX > 3
@@ -199,7 +186,11 @@ Tcl_UniCharToUtf(
     }
 
     ch = 0xFFFD;
-    goto three;
+three:
+    buf[2] = (char) ((ch | 0x80) & 0xBF);
+    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+    buf[0] = (char) ((ch >> 12) | 0xE0);
+    return 3;
 }
 
 /*
@@ -314,9 +305,6 @@ Tcl_UtfToUniChar(
 	 * A two-byte-character lead-byte not followed by trail-byte
 	 * represents itself.
 	 */
-
-	*chPtr = (Tcl_UniChar) byte;
-	return 1;
     } else if (byte < 0xF0) {
 	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
 	    /*
@@ -332,31 +320,23 @@ Tcl_UtfToUniChar(
 	 * A three-byte-character lead-byte not followed by two trail-bytes
 	 * represents itself.
 	 */
-
-	*chPtr = (Tcl_UniChar) byte;
-	return 1;
     }
 #if TCL_UTF_MAX > 3
-    {
-	int ch, total, trail;
-
-	total = totalBytes[byte];
-	trail = total - 1;
-	if (trail > 0) {
-	    ch = byte & (0x3F >> trail);
-	    do {
-		src++;
-		if ((*src & 0xC0) != 0x80) {
-		    *chPtr = byte;
-		    return 1;
-		}
-		ch <<= 6;
-		ch |= (*src & 0x3F);
-		trail--;
-	    } while (trail > 0);
-	    *chPtr = ch;
-	    return total;
+    else if (byte < 0xF8) {
+	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+	    /*
+	     * Four-byte-character lead byte followed by three trail bytes.
+	     */
+
+	    *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
+		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+	    return 4;
 	}
+
+	/*
+	 * A four-byte-character lead-byte not followed by three trail-bytes
+	 * represents itself.
+	 */
     }
 #endif
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2016-08-30 13:00:32 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2016-08-30 13:00:32 (GMT)
commit	d84492f3906d20d05b547a4fa90286fe0a59bb37 (patch)
tree	cfec5955da6f587fe00bd8471120c07add91560c
parent	a6063330c474dde9b388bfeda1b1bb746aebf23a (diff)
download	tcl-d84492f3906d20d05b547a4fa90286fe0a59bb37.zip tcl-d84492f3906d20d05b547a4fa90286fe0a59bb37.tar.gz tcl-d84492f3906d20d05b547a4fa90286fe0a59bb37.tar.bz2