From 6860390243ff910bf7175af41567d588c930a45d Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 8 Apr 2020 07:27:50 +0000
Subject: Teach "split", "string wordstart" and "string wordend" how to handle
 Unicode characters >\uFFFF, using the new utility function TclUtfToUCS4().
 (Thanks, Don!). Add test-cases to prove it.

---
 generic/tclCmdMZ.c | 29 ++++++++++++-----------------
 tests/split.test   |  7 +++++--
 tests/string.test  | 12 ++++++++++++
 3 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 3e4f1d5..1ddae0f 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1174,7 +1174,7 @@ Tcl_SplitObjCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    Tcl_UniChar ch = 0;
+    int ch = 0;
     int len;
     const char *splitChars;
     const char *stringPtr;
@@ -1217,10 +1217,8 @@ Tcl_SplitObjCmd(
 	Tcl_InitHashTable(&charReuseTable, TCL_ONE_WORD_KEYS);
 
 	for ( ; stringPtr < end; stringPtr += len) {
-	    int ucs4;
-
-	    len = TclUtfToUCS4(stringPtr, &ucs4);
-	    hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ucs4), &isNew);
+	    len = TclUtfToUCS4(stringPtr, &ch);
+	    hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ch), &isNew);
 	    if (isNew) {
 		TclNewStringObj(objPtr, stringPtr, len);
 
@@ -1255,7 +1253,7 @@ Tcl_SplitObjCmd(
     } else {
 	const char *element, *p, *splitEnd;
 	int splitLen;
-	Tcl_UniChar splitChar = 0;
+	int splitChar;
 
 	/*
 	 * Normal case: split on any of a given set of characters. Discard
@@ -1265,9 +1263,9 @@ Tcl_SplitObjCmd(
 	splitEnd = splitChars + splitCharLen;
 
 	for (element = stringPtr; stringPtr < end; stringPtr += len) {
-	    len = TclUtfToUniChar(stringPtr, &ch);
+	    len = TclUtfToUCS4(stringPtr, &ch);
 	    for (p = splitChars; p < splitEnd; p += splitLen) {
-		splitLen = TclUtfToUniChar(p, &splitChar);
+		splitLen = TclUtfToUCS4(p, &splitChar);
 		if (ch == splitChar) {
 		    TclNewStringObj(objPtr, element, stringPtr - element);
 		    Tcl_ListObjAppendElement(NULL, listPtr, objPtr);
@@ -2482,9 +2480,7 @@ StringRevCmd(
  * StringStartCmd --
  *
  *	This procedure is invoked to process the "string wordstart" Tcl
- *	command. See the user documentation for details on what it does. Note
- *	that this command only functions correctly on properly formed Tcl UTF
- *	strings.
+ *	command. See the user documentation for details on what it does.
  *
  * Results:
  *	A standard Tcl result.
@@ -2502,7 +2498,7 @@ StringStartCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    Tcl_UniChar ch = 0;
+    int ch;
     const char *p, *string;
     int cur, index, length, numChars;
 
@@ -2524,7 +2520,7 @@ StringStartCmd(
     if (index > 0) {
 	p = Tcl_UtfAtIndex(string, index);
 	for (cur = index; cur >= 0; cur--) {
-	    TclUtfToUniChar(p, &ch);
+	    TclUtfToUCS4(p, &ch);
 	    if (!Tcl_UniCharIsWordChar(ch)) {
 		break;
 	    }
@@ -2544,8 +2540,7 @@ StringStartCmd(
  * StringEndCmd --
  *
  *	This procedure is invoked to process the "string wordend" Tcl command.
- *	See the user documentation for details on what it does. Note that this
- *	command only functions correctly on properly formed Tcl UTF strings.
+ *	See the user documentation for details on what it does.
  *
  * Results:
  *	A standard Tcl result.
@@ -2563,7 +2558,7 @@ StringEndCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    Tcl_UniChar ch = 0;
+    int ch;
     const char *p, *end, *string;
     int cur, index, length, numChars;
 
@@ -2585,7 +2580,7 @@ StringEndCmd(
 	p = Tcl_UtfAtIndex(string, index);
 	end = string+length;
 	for (cur = index; p < end; cur++) {
-	    p += TclUtfToUniChar(p, &ch);
+	    p += TclUtfToUCS4(p, &ch);
 	    if (!Tcl_UniCharIsWordChar(ch)) {
 		break;
 	    }
diff --git a/tests/split.test b/tests/split.test
index 2d180e0..d00c452 100644
--- a/tests/split.test
+++ b/tests/split.test
@@ -71,8 +71,11 @@ test split-1.14 {basic split commands} {
     split ",12,,,34,56," {,}
 } {{} 12 {} {} 34 56 {}}
 test split-1.15 {basic split commands} -body {
-    split "a\U01f4a9b" {}
-} -result "a \U01f4a9 b"
+    split "a\U1F4A9b" {}
+} -result "a \U1F4A9 b"
+test split-1.16 {basic split commands} -body {
+    split "a\U1F4A9b" \U1F4A9
+} -result "a b"
 
 test split-2.1 {split errors} {
     list [catch split msg] $msg $errorCode
diff --git a/tests/string.test b/tests/string.test
index 7117d4d..d358fce 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -1856,6 +1856,12 @@ test string-21.13.$noComp {string wordend, unicode} -body {
 test string-21.14.$noComp {string wordend, unicode} -body {
     run {string wordend "\uC700\uC700 abc" 8}
 } -result 6
+test string-21.15.$noComp {string wordend, unicode} -body {
+    run {string wordend "\U1D7CA\U1D7CA abc" 0}
+} -result 2
+test string-21.16.$noComp {string wordend, unicode} -body {
+    run {string wordend "\U1D7CA\U1D7CA abc" 10}
+} -result 8
 
 test string-22.1.$noComp {string wordstart} -body {
     list [catch {run {string word a}} msg] $msg
@@ -1896,6 +1902,12 @@ test string-22.12.$noComp {string wordstart, unicode} -body {
 test string-22.13.$noComp {string wordstart, unicode} -body {
     run {string wordstart "\uC700\uC700 abc" 8}
 } -result 3
+test string-22.14.$noComp {string wordstart, unicode} -body {
+    run {string wordstart "\U1D7CA\U1D7CA abc" 0}
+} -result 0
+test string-22.15.$noComp {string wordstart, unicode} -body {
+    run {string wordstart "\U1D7CA\U1D7CA abc" 10}
+} -result 5
 
 test string-23.0.$noComp {string is boolean, Bug 1187123} testindexobj {
     set x 5
-- 
cgit v0.12


From c7de8a1d22c60626b19b0bb57ac58fa7d9d5cc16 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 8 Apr 2020 07:57:03 +0000
Subject: In TCL_UTF_MAX=4 mode, string lengths are quitely different.

---
 tests/string.test | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/string.test b/tests/string.test
index d358fce..aca3570 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -1859,7 +1859,7 @@ test string-21.14.$noComp {string wordend, unicode} -body {
 test string-21.15.$noComp {string wordend, unicode} -body {
     run {string wordend "\U1D7CA\U1D7CA abc" 0}
 } -result 2
-test string-21.16.$noComp {string wordend, unicode} -body {
+test string-21.16.$noComp {string wordend, unicode} -constraints tip389 -body {
     run {string wordend "\U1D7CA\U1D7CA abc" 10}
 } -result 8
 
@@ -1905,7 +1905,7 @@ test string-22.13.$noComp {string wordstart, unicode} -body {
 test string-22.14.$noComp {string wordstart, unicode} -body {
     run {string wordstart "\U1D7CA\U1D7CA abc" 0}
 } -result 0
-test string-22.15.$noComp {string wordstart, unicode} -body {
+test string-22.15.$noComp {string wordstart, unicode} -constraints tip389 -body {
     run {string wordstart "\U1D7CA\U1D7CA abc" 10}
 } -result 5
 
-- 
cgit v0.12