Rebase to latest core-8-6-branch.

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-08 15:19:43 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-08 15:19:43 (GMT)
commit: dc4018e40d89363bcceca8c6ef20fc315ac734d9 (patch)
tree: 9ef0bded6ed11b50b26f73112b91a1a71a05045d
parent: 07e0e0d06a58acfb653f6f760c4a7f180c6ac456 (diff)
parent: 752f99d5762c8c78c12d48f81c36d7f2a4914e37 (diff)
download: tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.zip
tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.tar.gz
tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.tar.bz2
3 files changed, 21 insertions, 39 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 03998de..c0de80a 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -64,7 +64,7 @@ static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-#if TCL_UTF_MAX != 4
+#if TCL_UTF_MAX < 4
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 #else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
@@ -384,7 +384,7 @@ Tcl_UtfToUniChar(
 	 * characters representing themselves.
 	 */
 
-#if TCL_UTF_MAX <= 4
+#if TCL_UTF_MAX == 4
 	/* If *chPtr contains a high surrogate (produced by a previous
 	 * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
 	 * bytes, then we must produce a follow-up low surrogate. We only
@@ -440,7 +440,7 @@ Tcl_UtfToUniChar(
 	     * Four-byte-character lead byte followed by at least two trail bytes.
 	     * We don't test the validity of 3th trail byte, see [ed29806ba]
 	     */
-#if TCL_UTF_MAX <= 4
+#if TCL_UTF_MAX == 4
 	    Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
 		    | ((src[2] & 0x3F) >> 4)) - 0x40;
 	    if (high < 0x400) {
@@ -449,7 +449,7 @@ Tcl_UtfToUniChar(
 		return 1;
 	    }
 	    /* out of range, < 0x10000 or > 0x10FFFF */
-#else
+#elif TCL_UTF_MAX > 4
 	    if ((src[3] & 0xC0) == 0x80) {
 		*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
 			| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
@@ -621,26 +621,12 @@ Tcl_NumUtfChars(
 	 */
 	while (src <= optPtr
 		/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
-#if TCL_UTF_MAX < 4
-	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
-		/* treat F0 - F4 as single character */
-		ch = 0;
-		src++;
-	    } else
-#endif
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
 	/* Loop over the remaining string where call must happen */
 	while (src < endPtr) {
 	    if (Tcl_UtfCharComplete(src, endPtr - src)) {
-#if TCL_UTF_MAX < 4
-		if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
-		    /* treat F0 - F4 as single character */
-		    ch = 0;
-		    src++;
-		} else
-#endif
 		src += TclUtfToUniChar(src, &ch);
 	    } else {
 		/*
@@ -1064,11 +1050,11 @@ Tcl_UtfToUpper(
 	 * char to dst if its size is <= the original char.
 	 */
 
-	if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) {
+	if (len < UtfCount(upChar)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(upChar, dst);
+	    dst += TclUCS4ToUtf(upChar, dst);
 	}
 	src += len;
     }
@@ -1117,11 +1103,11 @@ Tcl_UtfToLower(
 	 * char to dst if its size is <= the original char.
 	 */
 
-	if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
+	if (len < UtfCount(lowChar)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(lowChar, dst);
+	    dst += TclUCS4ToUtf(lowChar, dst);
 	}
 	src += len;
     }
@@ -1167,11 +1153,11 @@ Tcl_UtfToTitle(
 	len = TclUtfToUCS4(src, &ch);
 	titleChar = UCS4ToTitle(ch);
 
-	if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) {
+	if (len < UtfCount(titleChar)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(titleChar, dst);
+	    dst += TclUCS4ToUtf(titleChar, dst);
 	}
 	src += len;
     }
@@ -1183,11 +1169,11 @@ Tcl_UtfToTitle(
 	    lowChar = TclUCS4ToLower(lowChar);
 	}
 
-	if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
+	if (len < UtfCount(lowChar)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(lowChar, dst);
+	    dst += TclUCS4ToUtf(lowChar, dst);
 	}
 	src += len;
     }
diff --git a/tests/encoding.test b/tests/encoding.test
index 552c97f..84f9ae1 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -335,7 +335,12 @@ test encoding-15.4 {UtfToUtfProc emoji character input} -body {
     set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82]
     list [string length $x] $y
 } -result "6 \uD83D\uDE02"
-test encoding-15.5 {UtfToUtfProc emoji character input} {
+test encoding-15.5.0 {UtfToUtfProc emoji character input} ucs2 {
+    set x \xF0\x9F\x98\x82
+    set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
+    list [string length $x] $y
+} "4 \xF0\x9F\x98\x82"
+test encoding-15.5.1 {UtfToUtfProc emoji character input} fullutf {
     set x \xF0\x9F\x98\x82
     set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
     list [string length $x] $y
diff --git a/tests/utf.test b/tests/utf.test
index 7b5cbf6..1a4b157 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -219,12 +219,9 @@ test utf-6.8 {Tcl_UtfNext} {testutfnext testbytestring} {
 test utf-6.9.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0]
 } 1
-test utf-6.9.1 {Tcl_UtfNext} {testutfnext testbytestring utf16} {
+test utf-6.9.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} {
     testutfnext [testbytestring \xA0]
 } -1
-test utf-6.9.2 {Tcl_UtfNext} {testutfnext testbytestring ucs4} {
-    testutfnext [testbytestring \xA0]
-} 1
 test utf-6.10 {Tcl_UtfNext} {testutfnext testbytestring} {
     testutfnext [testbytestring \xA0]G
 } 1
@@ -516,21 +513,15 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbyte
 test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0]
 } 1
-test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring utf16} {
+test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring fullutf} {
     testutfnext [testbytestring \xA0\xA0\xA0]
 } 3
-test utf-6.92.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs4} {
-    testutfnext [testbytestring \xA0\xA0\xA0]
-} 1
 test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \x80\x80\x80]
 } 1
-test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring utf16} {
+test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring fullutf} {
     testutfnext [testbytestring \x80\x80\x80]
 } 3
-test utf-6.93.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs4} {
-    testutfnext [testbytestring \x80\x80\x80]
-} 1
 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0\xA0]
 } 1
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-05-08 15:19:43 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-05-08 15:19:43 (GMT)
commit	dc4018e40d89363bcceca8c6ef20fc315ac734d9 (patch)
tree	9ef0bded6ed11b50b26f73112b91a1a71a05045d
parent	07e0e0d06a58acfb653f6f760c4a7f180c6ac456 (diff)
parent	752f99d5762c8c78c12d48f81c36d7f2a4914e37 (diff)
download	tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.zip tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.tar.gz tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.tar.bz2