Backport some UTF-8-related changes from 8.7 to 8.6, only for TCL_UTF_MAX > 3. No change for TCL_UTF_MAX=3.

Also adapt the test cases accordingly, and add comments explaining why the changes were made.
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2021-03-02 10:10:54 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2021-03-02 10:10:54 (GMT)
commit: b697c844add273719e0d724f0e7680d215a0b163 (patch)
tree: aec144b25c92ffc5b85625471de5b49fdd28558a
parent: 1ac55b7d3fca0fa53e5ebbd6f7107d4dc3b79c9b (diff)
download: tcl-b697c844add273719e0d724f0e7680d215a0b163.zip
tcl-b697c844add273719e0d724f0e7680d215a0b163.tar.gz
tcl-b697c844add273719e0d724f0e7680d215a0b163.tar.bz2
2 files changed, 68 insertions, 17 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index f99c497..65a3f41 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -90,6 +90,8 @@ static const unsigned char complete[256] = {
 #if TCL_UTF_MAX > 3
     4,4,4,4,4,
 #else
+    /* Tcl_UtfToUniChar() accesses src[1] and src[2] to check whether
+     * the UTF-8 sequence is valid, so we cannot use 1 here. */
     3,3,3,3,3,
 #endif
     1,1,1,1,1,1,1,1,1,1,1
@@ -536,7 +538,7 @@ Tcl_UtfToUniCharDString(
     w = wString;
     p = src;
     endPtr = src + length;
-    optPtr = endPtr - TCL_UTF_MAX;
+    optPtr = endPtr - ((TCL_UTF_MAX > 3) ? 4 : 3) ;
     while (p <= optPtr) {
 	p += TclUtfToUniChar(p, &ch);
 	*w++ = ch;
@@ -623,7 +625,7 @@ Tcl_NumUtfChars(
 	/* Pointer to the end of string. Never read endPtr[0] */
 	const char *endPtr = src + length;
 	/* Pointer to last byte where optimization still can be used */
-	const char *optPtr = endPtr - TCL_UTF_MAX;
+	const char *optPtr = endPtr - ((TCL_UTF_MAX > 3) ? 4 : 3);
 
 	/*
 	 * Optimize away the call in this loop. Justified because...
@@ -759,6 +761,19 @@ Tcl_UtfNext(
     int left;
     const char *next;
 
+#if TCL_UTF_MAX > 3
+    if (((*src) & 0xC0) == 0x80) {
+	/* Continuation byte, so we start 'inside' a (possible valid) UTF-8
+	 * sequence. Since we are not allowed to access src[-1], we cannot
+	 * check if the sequence is actually valid, the best we can do is
+	 * just assume it is valid and locate the end. */
+	if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
+	    ++src;
+	}
+	return src;
+    }
+#endif
+
     left = totalBytes[UCHAR(*src)];
     next = src + 1;
     while (--left) {
@@ -895,7 +910,6 @@ Tcl_UtfPrev(
      * properly formed byte sequence to find, and we can stop looking,
      * accepting the fallback.
      */
-
     return fallback;
 }
 
diff --git a/tests/utf.test b/tests/utf.test
index 8e886ae..cd8feb6 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -3,7 +3,7 @@
 # errors.  No output means no errors were found.
 #
 # Copyright (c) 1997 Sun Microsystems, Inc.
-# Copyright (c) 1998-1999 by Scriptics Corporation.
+# Copyright (c) 1998-1999 Scriptics Corporation.
 #
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
@@ -21,6 +21,7 @@ testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}]
 testConstraint utf16 [expr {[string length [format %c 0x10000]] == 2}]
 testConstraint ucs4 [expr {[testConstraint fullutf]
 		&& [string length [format %c 0x10000]] == 1}]
+testConstraint ucs2_utf16 [expr {![testConstraint ucs4]}]
 
 testConstraint Uesc [expr {"\U0041" eq "A"}]
 testConstraint pre388 [expr {"\x741" eq "A"}]
@@ -78,9 +79,12 @@ test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring
 test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} {
     expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]}
 } 1
-test utf-1.13 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} {
+test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} {
     expr {"\UD842" eq "\uD842"}
 } 1
+test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc testbytestring fullutf} {
+    expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]}
+} 1
 
 test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
     string length "abc"
@@ -103,7 +107,7 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin
 test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring {
     string length [testbytestring \xE4\xB9\x8E]
 } 1
-test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs2} {
+test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs2_utf16} {
     string length [testbytestring \xF0\x90\x80\x80]
 } 2
 test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs4} {
@@ -215,9 +219,12 @@ test utf-6.9 {Tcl_UtfNext} {testutfnext testbytestring} {
 test utf-6.10 {Tcl_UtfNext} {testutfnext testbytestring} {
     testutfnext [testbytestring \xA0]G
 } 1
-test utf-6.11 {Tcl_UtfNext} {testutfnext testbytestring} {
+test utf-6.11.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\x00]
 } 1
+test utf-6.11.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\x00]
+} 2
 test utf-6.12 {Tcl_UtfNext} {testutfnext testbytestring} {
     testutfnext [testbytestring \xA0\xD0]
 } 1
@@ -476,12 +483,18 @@ test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring u
 test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring fullutf} {
     testutfnext [testbytestring \xF0\x90\x80\x80]
 } 4
-test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring} {
+test utf-6.88.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\x00]
 } 1
-test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\x00]
+} 2
+test utf-6.89.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \x80\x80\x00]
 } 1
+test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \x80\x80\x00]
+} 2
 test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xF4\x8F\xBF\xBF]
 } 1
@@ -491,18 +504,30 @@ test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbyte
 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring} {
     testutfnext [testbytestring \xF4\x90\x80\x80]
 } 1
-test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring} {
+test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0]
 } 1
-test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\xA0]
+} 3
+test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \x80\x80\x80]
 } 1
-test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \x80\x80\x80]
+} 3
+test utf-6.94.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0\xA0]
 } 1
-test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} {
+test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\xA0\xA0]
+} 3
+test utf-6.95.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \x80\x80\x80\x80]
 } 1
+test utf-6.95.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \x80\x80\x80\x80]
+} 3
 test utf-6.96 {Tcl_UtfNext, read limits} testutfnext {
     testutfnext G 0
 } 0
@@ -600,18 +625,30 @@ test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
 test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
     testutfnext [testbytestring \xA0\xA0\xA0] 2
 } 0
-test utf-6.123 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.123.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0]G 3
 } 1
-test utf-6.124 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.123.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\xA0]G 3
+} 3
+test utf-6.124.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0\xA0] 3
 } 1
-test utf-6.125 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.124.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\xA0\xA0] 3
+} 3
+test utf-6.125.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0\xA0]G 4
 } 1
-test utf-6.126 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.125.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\xA0\xA0]G 4
+} 3
+test utf-6.126.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0\xA0\xA0\xA0\xA0] 4
 } 1
+test utf-6.126.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} {
+    testutfnext [testbytestring \xA0\xA0\xA0\xA0\xA0] 4
+} 3
 
 test utf-7.1 {Tcl_UtfPrev} testutfprev {
     testutfprev {}
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2021-03-02 10:10:54 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2021-03-02 10:10:54 (GMT)
commit	b697c844add273719e0d724f0e7680d215a0b163 (patch)
tree	aec144b25c92ffc5b85625471de5b49fdd28558a
parent	1ac55b7d3fca0fa53e5ebbd6f7107d4dc3b79c9b (diff)
download	tcl-b697c844add273719e0d724f0e7680d215a0b163.zip tcl-b697c844add273719e0d724f0e7680d215a0b163.tar.gz tcl-b697c844add273719e0d724f0e7680d215a0b163.tar.bz2