From a09671a0a00f2d3e4abf4747a072da94b0320459 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 29 May 2020 13:36:58 +0000
Subject: Change implementation "charstart", not behaving as "prevchar" any
 more. Also optimize charend/charstart for TCL_UTF_MAX>3 (not need to do
 actual conversion then).

---
 generic/tclCmdMZ.c | 59 +++++++++++++++++++++++++++++++++++-------------------
 tests/string.test  | 59 +++++++++++++++++++++++++++---------------------------
 2 files changed, 67 insertions(+), 51 deletions(-)

diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index e15e5c8..63268a4 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -2518,7 +2518,7 @@ StringStartCmd(
     if (index > 0) {
 	p = &string[index];
 
-	TclUniCharToUCS4(p, &ch);
+	(void)TclUniCharToUCS4(p, &ch);
 	for (cur = index; cur >= 0; cur--) {
 	    int delta = 0;
 	    const Tcl_UniChar *next;
@@ -2537,8 +2537,6 @@ StringStartCmd(
 	if (cur != index) {
 	    cur += 1;
 	}
-    } else {
-	cur = -1;
     }
     Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
     return TCL_OK;
@@ -2568,7 +2566,11 @@ StringCharStartCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    const Tcl_UniChar *p, *string;
+#if TCL_UTF_MAX <= 3
+    const Tcl_UniChar *src;
+#else
+    const char *src;
+#endif
     int index, length;
 
     if (objc != 3) {
@@ -2576,18 +2578,23 @@ StringCharStartCmd(
 	return TCL_ERROR;
     }
 
-    string = Tcl_GetUnicodeFromObj(objv[1], &length);
+#if TCL_UTF_MAX <= 3
+    src = Tcl_GetUnicodeFromObj(objv[1], &length);
+#else
+    src = Tcl_GetStringFromObj(objv[1], &length);
+    length = Tcl_NumUtfChars(src, length);
+#endif
     if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
 	return TCL_ERROR;
     }
-    if (index > length) {
+    if (index >= length) {
 	index = length;
-    }
-    if (index > 0) {
-	p = &string[index];
-	index = TclUCS4Prev(p, string) - string;
-    } else {
-	index = 0;
+    } else if (index < 0) {
+	index = -1;
+#if TCL_UTF_MAX <= 3
+    } else if ((index > 0) && ((src[index-1] & 0xFC00) == 0xD800) && ((src[index] & 0xFC00) == 0xDC00)) {
+	index--;
+#endif
     }
     Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
     return TCL_OK;
@@ -2646,7 +2653,7 @@ StringEndCmd(
 	    cur++;
 	}
     } else {
-	cur = -1;
+	cur = length;
     }
     Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
     return TCL_OK;
@@ -2676,8 +2683,11 @@ StringCharEndCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    int ch;
-    const Tcl_UniChar *string;
+#if TCL_UTF_MAX <= 3
+    const Tcl_UniChar *src;
+#else
+    const char *src;
+#endif
     int index, length;
 
     if (objc != 3) {
@@ -2685,17 +2695,24 @@ StringCharEndCmd(
 	return TCL_ERROR;
     }
 
-    string = Tcl_GetUnicodeFromObj(objv[1], &length);
+#if TCL_UTF_MAX <= 3
+    src = Tcl_GetUnicodeFromObj(objv[1], &length);
+#else
+    src = Tcl_GetStringFromObj(objv[1], &length);
+    length = Tcl_NumUtfChars(src, length);
+#endif
     if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
 	return TCL_ERROR;
     }
-    if (index < 0) {
+    if (++index < 0) {
 	index = 0;
     }
-    if (index < length) {
-	    index += TclUniCharToUCS4(&string[index], &ch);
-    } else {
-	    index = -1;
+    if (index >= length) {
+	index = length;
+#if TCL_UTF_MAX <= 3
+    } else if ((index > 0) && ((src[index-1] & 0xFC00) == 0xD800) && ((src[index] & 0xFC00) == 0xDC00)) {
+	index++;
+#endif
     }
     Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
     return TCL_OK;
diff --git a/tests/string.test b/tests/string.test
index d868610..cddd506 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -33,7 +33,6 @@ testConstraint testindexobj [expr {[info commands testindexobj] ne {}}]
 testConstraint testevalex [expr {[info commands testevalex] ne {}}]
 testConstraint utf16 [expr {[string length \U010000] == 2}]
 testConstraint testbytestring   [llength [info commands testbytestring]]
-testConstraint nodep [info exists tcl_precision]
 
 # Used for constraining memory leak tests
 testConstraint memory [llength [info commands memory]]
@@ -73,9 +72,9 @@ if {$noComp} {
 }
 
 
-test string-1.1.$noComp {error conditions} -body {
+test string-1.1.$noComp {error conditions} {
     list [catch {run {string gorp a b}} msg] $msg
-} -match regexp -result {1 {unknown or ambiguous subcommand "gorp": must be (bytelength, |)cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
 test string-1.2.$noComp {error conditions} {
     list [catch {run {string}} msg] $msg
 } {1 {wrong # args: should be "string subcommand ?arg ...?"}}
@@ -1025,16 +1024,16 @@ test string-7.16.$noComp {string last, start index} {
     run {string last \334a \334ad\334ad end-1}
 } 3
 
-test string-8.1.$noComp {string bytelength} nodep {
+test string-8.1.$noComp {string bytelength} {
     list [catch {run {string bytelength}} msg] $msg
 } {1 {wrong # args: should be "string bytelength string"}}
-test string-8.2.$noComp {string bytelength} nodep {
+test string-8.2.$noComp {string bytelength} {
     list [catch {run {string bytelength a b}} msg] $msg
 } {1 {wrong # args: should be "string bytelength string"}}
-test string-8.3.$noComp {string bytelength} nodep {
+test string-8.3.$noComp {string bytelength} {
     run {string bytelength "\xC7"}
 } 2
-test string-8.4.$noComp {string bytelength} nodep {
+test string-8.4.$noComp {string bytelength} {
     run {string b ""}
 } 0
 
@@ -1800,9 +1799,9 @@ test string-19.3.$noComp {string trimleft, unicode default} {
 test string-20.1.$noComp {string trimright errors} {
     list [catch {run {string trimright}} msg] $msg
 } {1 {wrong # args: should be "string trimright string ?chars?"}}
-test string-20.2.$noComp {string trimright errors} -body {
+test string-20.2.$noComp {string trimright errors} {
     list [catch {run {string trimg a}} msg] $msg
-} -match regexp -result {1 {unknown or ambiguous subcommand "trimg": must be (bytelength, |)cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+} {1 {unknown or ambiguous subcommand "trimg": must be bytelength, cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
 test string-20.3.$noComp {string trimright} {
     run {string trimright "    XYZ      "}
 } {    XYZ}
@@ -1858,7 +1857,7 @@ test string-21.4.$noComp {string wordend} -body {
 } -result 3
 test string-21.5.$noComp {string wordend} -body {
     run {string wordend abc. 100}
-} -result -1
+} -result 4
 test string-21.6.$noComp {string wordend} -body {
     run {string wordend "word_one two three" 2}
 } -result 8
@@ -1885,17 +1884,17 @@ test string-21.13.$noComp {string wordend, unicode} -body {
 } -result 3
 test string-21.14.$noComp {string wordend, unicode} -body {
     run {string wordend "\uC700\uC700 abc" 8}
-} -result -1
+} -result 6
 test string-21.15.$noComp {string wordend, unicode} -body {
     run {string wordend "\U1D7CA\U1D7CA abc" 0}
 } -result 2
 test string-21.16.$noComp {string wordend, unicode} -constraints utf16 -body {
     run {string wordend "\U1D7CA\U1D7CA abc" 10}
-} -result -1
+} -result 8
 
 test string-22.1.$noComp {string wordstart} -body {
     list [catch {run {string word a}} msg] $msg
-} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+} -result {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
 test string-22.2.$noComp {string wordstart} -body {
     list [catch {run {string wordstart a}} msg] $msg
 } -result {1 {wrong # args: should be "string wordstart string index"}}
@@ -1913,7 +1912,7 @@ test string-22.6.$noComp {string wordstart} -body {
 } -result 0
 test string-22.7.$noComp {string wordstart} -body {
     run {string wordstart "one two three_words" -2}
-} -result -1
+} -result 0
 test string-22.8.$noComp {string wordstart} -body {
     run {string wordstart "one .*&^ three" 6}
 } -result 6
@@ -1939,7 +1938,7 @@ test string-22.14.$noComp {string wordstart, invalid UTF-8} -constraints testbyt
 } -result g
 test string-22.15.$noComp {string wordstart, unicode} -body {
     run {string wordstart "\U1D7CA\U1D7CA abc" 0}
-} -result -1
+} -result 0
 test string-22.16.$noComp {string wordstart, unicode} -constraints utf16 -body {
     run {string wordstart "\U1D7CA\U1D7CA abc" 10}
 } -result 5
@@ -2545,10 +2544,10 @@ test string-33.3.$noComp {string charend} -body {
 } -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
 test string-33.4.$noComp {string charend} -body {
     run {string charend abc. -1}
-} -result 1
+} -result 0
 test string-33.5.$noComp {string charend} -body {
     run {string charend abc. 100}
-} -result -1
+} -result 4
 test string-33.6.$noComp {string charend} -body {
     run {string charend "word_one two three" 2}
 } -result 3
@@ -2575,13 +2574,13 @@ test string-33.13.$noComp {string charend, unicode} -body {
 } -result 1
 test string-33.14.$noComp {string charend, unicode} -body {
     run {string charend "\uC700\uC700 abc" 8}
-} -result -1
+} -result 6
 test string-33.15.$noComp {string charend, unicode} -constraints utf16 -body {
     run {string charend "\U1D7CA\U1D7CA abc" 0}
 } -result 2
 test string-33.16.$noComp {string charend, unicode} -constraints utf16 -body {
     run {string charend "\U1D7CA\U1D7CA abc" 10}
-} -result -1
+} -result 8
 
 test string-34.1.$noComp {string charstart} -body {
     list [catch {run {string word a}} msg] $msg
@@ -2597,42 +2596,42 @@ test string-34.4.$noComp {string charstart} -body {
 } -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
 test string-34.5.$noComp {string charstart} -body {
     run {string charstart "one two three_words" 400}
-} -result 18
+} -result 19
 test string-34.6.$noComp {string charstart} -body {
     run {string charstart "one two three_words" 2}
-} -result 1
+} -result 2
 test string-34.7.$noComp {string charstart} -body {
     run {string charstart "one two three_words" -2}
-} -result 0
+} -result -1
 test string-34.8.$noComp {string charstart} -body {
     run {string charstart "one .*&^ three" 6}
-} -result 5
+} -result 6
 test string-34.9.$noComp {string charstart} -body {
     run {string charstart "one two three" 4}
-} -result 3
+} -result 4
 test string-34.10.$noComp {string charstart} -body {
     run {string charstart "one two three" end-5}
-} -result 6
+} -result 7
 test string-34.11.$noComp {string charstart, unicode} -body {
     run {string charstart "one tw\xC7o three" 7}
-} -result 6
+} -result 7
 test string-34.12.$noComp {string charstart, unicode} -body {
     run {string charstart "ab\uC700\uC700 cdef ghi" 12}
-} -result 11
+} -result 12
 test string-34.13.$noComp {string charstart, unicode} -body {
     run {string charstart "\uC700\uC700 abc" 8}
-} -result 5
+} -result 6
 test string-34.14.$noComp {string charstart, invalid UTF-8} -constraints testbytestring -body {
     # See Bug c61818e4c9
     set demo [testbytestring "abc def\xE0\xA9ghi"]
     run {string index $demo [string charstart $demo 10]}
-} -result g
+} -result h
 test string-34.15.$noComp {string charstart, unicode} -body {
     run {string charstart "\U1D7CA\U1D7CA abc" 0}
 } -result 0
 test string-34.16.$noComp {string charstart, unicode} -constraints utf16 -body {
     run {string charstart "\U1D7CA\U1D7CA abc" 10}
-} -result 7
+} -result 8
 
 };				# foreach noComp {0 1}
 
-- 
cgit v0.12