From 0fce9600ee90c152b14295a28da571c3a4db622b Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Mon, 25 May 2020 11:53:11 +0000
Subject: Finish implementation of "string
 nextchar|nextword|prevchar|prevword". Not thourougly test yet, but seems OK
 at first sight.

---
 generic/tclCmdMZ.c   | 163 ++++++++++++++++++-----------------------
 generic/tclExecute.c |   2 +-
 generic/tclInt.h     |   6 +-
 generic/tclUtf.c     |  15 +++-
 tests/string.test    | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 290 insertions(+), 96 deletions(-)

diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index bbd03d8..36b2443 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -2499,40 +2499,38 @@ StringStartCmd(
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
     int ch;
-    const char *p, *string;
-    int cur, index, length, numChars;
+    const Tcl_UniChar *p, *string;
+    int cur, index, length;
 
     if (objc != 3) {
 	Tcl_WrongNumArgs(interp, 1, objv, "string index");
 	return TCL_ERROR;
     }
 
-    string = TclGetStringFromObj(objv[1], &length);
-    numChars = Tcl_NumUtfChars(string, length);
-    if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+    string = Tcl_GetUnicodeFromObj(objv[1], &length);
+    if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
 	return TCL_ERROR;
     }
-    string = TclGetStringFromObj(objv[1], &length);
-    if (index >= numChars) {
-	index = numChars - 1;
+    if (index >= length) {
+	index = length - 1;
     }
     cur = 0;
     if (index > 0) {
-	p = Tcl_UtfAtIndex(string, index);
+	p = &string[index];
 
-	TclUtfToUCS4(p, &ch);
+	TclUniCharToUCS4(p, &ch);
 	for (cur = index; cur >= 0; cur--) {
 	    int delta = 0;
-	    const char *next;
+	    const Tcl_UniChar *next;
 
 	    if (!Tcl_UniCharIsWordChar(ch)) {
 		break;
 	    }
 
-	    next = Tcl_UtfPrev(p, string);
+	    next = TclUCS4Prev(p, string);
 	    do {
 		next += delta;
-		delta = TclUtfToUCS4(next, &ch);
+		delta = TclUniCharToUCS4(next, &ch);
 	    } while (next + delta < p);
 	    p = next;
 	}
@@ -2568,49 +2566,28 @@ StringPrevCharCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    int ch;
-    const char *p, *string;
-    int cur, index, length, numChars;
+    const Tcl_UniChar *p, *string;
+    int index, length;
 
     if (objc != 3) {
 	Tcl_WrongNumArgs(interp, 1, objv, "string index");
 	return TCL_ERROR;
     }
 
-    string = TclGetStringFromObj(objv[1], &length);
-    numChars = Tcl_NumUtfChars(string, length);
-    if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+    string = Tcl_GetUnicodeFromObj(objv[1], &length);
+    if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
 	return TCL_ERROR;
     }
-    string = TclGetStringFromObj(objv[1], &length);
-    if (index >= numChars) {
-	index = numChars - 1;
+    if (index > length) {
+	index = length;
     }
-    cur = 0;
     if (index > 0) {
-	p = Tcl_UtfAtIndex(string, index);
-
-	TclUtfToUCS4(p, &ch);
-	for (cur = index; cur >= 0; cur--) {
-	    int delta = 0;
-	    const char *next;
-
-	    if (!Tcl_UniCharIsWordChar(ch)) {
-		break;
-	    }
-
-	    next = Tcl_UtfPrev(p, string);
-	    do {
-		next += delta;
-		delta = TclUtfToUCS4(next, &ch);
-	    } while (next + delta < p);
-	    p = next;
-	}
-	if (cur != index) {
-	    cur += 1;
-	}
+	p = &string[index];
+	index = TclUCS4Prev(p, string) - string;
+    } else {
+	index = 0;
     }
-    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
+    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
     return TCL_OK;
 }
 
@@ -2639,40 +2616,53 @@ StringPrevWordCmd(
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
     int ch;
-    const char *p, *string;
-    int cur, index, length, numChars;
+    const Tcl_UniChar *p, *string;
+    int cur, index, length;
 
     if (objc != 3) {
 	Tcl_WrongNumArgs(interp, 1, objv, "string index");
 	return TCL_ERROR;
     }
 
-    string = TclGetStringFromObj(objv[1], &length);
-    numChars = Tcl_NumUtfChars(string, length);
-    if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+    string = Tcl_GetUnicodeFromObj(objv[1], &length);
+    if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
 	return TCL_ERROR;
     }
-    string = TclGetStringFromObj(objv[1], &length);
-    if (index >= numChars) {
-	index = numChars - 1;
+    if (index >= length) {
+	index = length - 1;
     }
     cur = 0;
     if (index > 0) {
-	p = Tcl_UtfAtIndex(string, index);
+	p = &string[index];
 
-	TclUtfToUCS4(p, &ch);
+	TclUniCharToUCS4(p, &ch);
 	for (cur = index; cur >= 0; cur--) {
 	    int delta = 0;
-	    const char *next;
+	    const Tcl_UniChar *next;
 
 	    if (!Tcl_UniCharIsWordChar(ch)) {
 		break;
 	    }
 
-	    next = Tcl_UtfPrev(p, string);
+	    next = TclUCS4Prev(p, string);
 	    do {
 		next += delta;
-		delta = TclUtfToUCS4(next, &ch);
+		delta = TclUniCharToUCS4(next, &ch);
+	    } while (next + delta < p);
+	    p = next;
+	}
+	for (; cur >= 0; cur--) {
+	    int delta = 0;
+	    const Tcl_UniChar *next;
+
+	    if (Tcl_UniCharIsWordChar(ch)) {
+		break;
+	    }
+
+	    next = TclUCS4Prev(p, string);
+	    do {
+		next += delta;
+		delta = TclUniCharToUCS4(next, &ch);
 	    } while (next + delta < p);
 	    p = next;
 	}
@@ -2769,39 +2759,27 @@ StringNextCharCmd(
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
     int ch;
-    const char *p, *end, *string;
-    int cur, index, length, numChars;
+    const Tcl_UniChar *string;
+    int index, length;
 
     if (objc != 3) {
 	Tcl_WrongNumArgs(interp, 1, objv, "string index");
 	return TCL_ERROR;
     }
 
-    string = TclGetStringFromObj(objv[1], &length);
-    numChars = Tcl_NumUtfChars(string, length);
-    if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+    string = Tcl_GetUnicodeFromObj(objv[1], &length);
+    if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
 	return TCL_ERROR;
     }
-    string = TclGetStringFromObj(objv[1], &length);
     if (index < 0) {
 	index = 0;
     }
-    if (index < numChars) {
-	p = Tcl_UtfAtIndex(string, index);
-	end = string+length;
-	for (cur = index; p < end; cur++) {
-	    p += TclUtfToUCS4(p, &ch);
-	    if (!Tcl_UniCharIsWordChar(ch)) {
-		break;
-	    }
-	}
-	if (cur == index) {
-	    cur++;
-	}
+    if (index < length) {
+	    index += TclUniCharToUCS4(&string[index], &ch);
     } else {
-	cur = numChars;
+	    index = length;
     }
-    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
+    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
     return TCL_OK;
 }
 
@@ -2831,39 +2809,40 @@ StringNextWordCmd(
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
     int ch;
-    const char *p, *end, *string;
-    int cur, index, length, numChars;
+    const Tcl_UniChar *p, *end, *string;
+    int index, length;
 
     if (objc != 3) {
 	Tcl_WrongNumArgs(interp, 1, objv, "string index");
 	return TCL_ERROR;
     }
 
-    string = TclGetStringFromObj(objv[1], &length);
-    numChars = Tcl_NumUtfChars(string, length);
-    if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+    string = Tcl_GetUnicodeFromObj(objv[1], &length);
+    if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
 	return TCL_ERROR;
     }
-    string = TclGetStringFromObj(objv[1], &length);
     if (index < 0) {
 	index = 0;
     }
-    if (index < numChars) {
-	p = Tcl_UtfAtIndex(string, index);
+    if (index < length) {
+	p = &string[index];
 	end = string+length;
-	for (cur = index; p < end; cur++) {
-	    p += TclUtfToUCS4(p, &ch);
+	while (index++, p < end) {
+	    p += TclUniCharToUCS4(p, &ch);
 	    if (!Tcl_UniCharIsWordChar(ch)) {
 		break;
 	    }
 	}
-	if (cur == index) {
-	    cur++;
+	for (; p < end; index++) {
+	    p += TclUniCharToUCS4(p, &ch);
+	    if (Tcl_UniCharIsWordChar(ch)) {
+		break;
+	    }
 	}
     } else {
-	cur = numChars;
+	index = length;
     }
-    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
+    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
     return TCL_OK;
 }
 
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index cc366e7..c80e12b 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -5292,7 +5292,7 @@ TEBCresume(
 	} else if (TclIsPureByteArray(valuePtr)) {
 	    objResultPtr = Tcl_NewByteArrayObj(
 		    Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1);
-	} else if (valuePtr->bytes && length == valuePtr->length) {
+	} else if (valuePtr->bytes && length == valuePtr->length && !(valuePtr->bytes[index] & 0x80)) {
 	    objResultPtr = Tcl_NewStringObj((const char *)
 		    valuePtr->bytes+index, 1);
 	} else {
diff --git a/generic/tclInt.h b/generic/tclInt.h
index ef7411a..e1dedda 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3253,9 +3253,11 @@ MODULE_SCOPE int	TclUtfCount(int ch);
 #if TCL_UTF_MAX > 3
 #   define TclUtfToUCS4 Tcl_UtfToUniChar
 #   define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1)
+#   define TclUCS4Prev(src, ptr) (((src) > (ptr)) ? ((src) - 1) : (src))
 #else
-    MODULE_SCOPE int	TclUtfToUCS4(const char *src, int *ucs4Ptr);
-    MODULE_SCOPE int	TclUniCharToUCS4(const Tcl_UniChar *src, int *ucs4Ptr);
+    MODULE_SCOPE int	TclUtfToUCS4(const char *, int *);
+    MODULE_SCOPE int	TclUniCharToUCS4(const Tcl_UniChar *, int *);
+    MODULE_SCOPE const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *, const Tcl_UniChar *);
 #endif
 MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
 MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index db2fc02..807e087 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2642,12 +2642,25 @@ TclUniCharToUCS4(
 			 * by the Tcl_UniChar string. */
 {
     if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) {
-	*ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[01] & 0x3FF)) + 0x10000;
+	*ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
 	return 2;
     }
     *ucs4Ptr = src[0];
     return 1;
 }
+
+const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *src, const Tcl_UniChar *ptr) {
+    if (src <= ptr + 1) {
+    	return ptr;
+    }
+    if (((src[-1] & 0xFC00) == 0xDC00) && ((src[-2] & 0xFC00) == 0xD800)) {
+	return src - 2;
+    }
+    return src - 1;
+}
+
+
+
 #endif
 
 /*
diff --git a/tests/string.test b/tests/string.test
index 184a555..17a6d3c 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -2534,6 +2534,206 @@ test string-32.17.$noComp {string is dict, valid dict packed in invalid dict} {
     string is dict {{a b c d e f g h}}
 } 0
 
+test string-33.1.$noComp {string nextchar} -body {
+    list [catch {run {string nextchar a}} msg] $msg
+} -result {1 {wrong # args: should be "string nextchar string index"}}
+test string-33.2.$noComp {string nextchar} -body {
+    list [catch {run {string nextchar a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string nextchar string index"}}
+test string-33.3.$noComp {string nextchar} -body {
+    list [catch {run {string nextchar a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-33.4.$noComp {string nextchar} -body {
+    run {string nextchar abc. -1}
+} -result 1
+test string-33.5.$noComp {string nextchar} -body {
+    run {string nextchar abc. 100}
+} -result 4
+test string-33.6.$noComp {string nextchar} -body {
+    run {string nextchar "word_one two three" 2}
+} -result 3
+test string-33.7.$noComp {string nextchar} -body {
+    run {string nextchar "one .&# three" 5}
+} -result 6
+test string-33.8.$noComp {string nextchar} -body {
+    run {string worde "x.y" 0}
+} -result 1
+test string-33.9.$noComp {string nextchar} -body {
+    run {string worde "x.y" end-1}
+} -result 2
+test string-33.10.$noComp {string nextchar, unicode} -body {
+    run {string nextchar "xyz\xC7de fg" 0}
+} -result 1
+test string-33.11.$noComp {string nextchar, unicode} -body {
+    run {string nextchar "xyz\uC700de fg" 0}
+} -result 1
+test string-33.12.$noComp {string nextchar, unicode} -body {
+    run {string nextchar "xyz\u203Fde fg" 0}
+} -result 1
+test string-33.13.$noComp {string nextchar, unicode} -body {
+    run {string nextchar "xyz\u2045de fg" 0}
+} -result 1
+test string-33.14.$noComp {string nextchar, unicode} -body {
+    run {string nextchar "\uC700\uC700 abc" 8}
+} -result 6
+test string-33.15.$noComp {string nextchar, unicode} -constraints utf16 -body {
+    run {string nextchar "\U1D7CA\U1D7CA abc" 0}
+} -result 2
+test string-33.16.$noComp {string nextchar, unicode} -constraints utf16 -body {
+    run {string nextchar "\U1D7CA\U1D7CA abc" 10}
+} -result 8
+
+test string-34.1.$noComp {string nextword} -body {
+    list [catch {run {string nextword a}} msg] $msg
+} -result {1 {wrong # args: should be "string nextword string index"}}
+test string-34.2.$noComp {string nextword} -body {
+    list [catch {run {string nextword a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string nextword string index"}}
+test string-34.3.$noComp {string nextword} -body {
+    list [catch {run {string nextword a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-34.4.$noComp {string nextword} -body {
+    run {string nextword abc. -1}
+} -result 4
+test string-34.5.$noComp {string nextword} -body {
+    run {string nextword abc. 100}
+} -result 4
+test string-34.6.$noComp {string nextword} -body {
+    run {string nextword "word_one two three" 2}
+} -result 9
+test string-34.7.$noComp {string nextword} -body {
+    run {string nextword "one .&# three" 5}
+} -result 8
+test string-34.8.$noComp {string nextword} -body {
+    run {string worde "x.y" 0}
+} -result 1
+test string-34.9.$noComp {string nextword} -body {
+    run {string worde "x.y" end-1}
+} -result 2
+test string-34.10.$noComp {string nextword, unicode} -body {
+    run {string nextword "xyz\xC7de fg" 0}
+} -result 7
+test string-34.11.$noComp {string nextword, unicode} -body {
+    run {string nextword "xyz\uC700de fg" 0}
+} -result 7
+test string-34.12.$noComp {string nextword, unicode} -body {
+    run {string nextword "xyz\u203Fde fg" 0}
+} -result 7
+test string-34.13.$noComp {string nextword, unicode} -body {
+    run {string nextword "xyz\u2045\u2045de fg" 0}
+} -result 5
+test string-34.14.$noComp {string nextword, unicode} -body {
+    run {string nextword "\uC700\uC700 abc" 8}
+} -result 6
+test string-34.15.$noComp {string nextword, unicode} -body {
+    run {string nextword "\U1D7CA\U1D7CA abc" 0}
+} -result 3
+test string-34.16.$noComp {string nextword, unicode} -constraints utf16 -body {
+    run {string nextword "\U1D7CA\U1D7CA abc" 10}
+} -result 8
+
+test string-35.1.$noComp {string prevchar} -body {
+    list [catch {run {string word a}} msg] $msg
+} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+test string-35.2.$noComp {string prevchar} -body {
+    list [catch {run {string prevchar a}} msg] $msg
+} -result {1 {wrong # args: should be "string prevchar string index"}}
+test string-35.3.$noComp {string prevchar} -body {
+    list [catch {run {string prevchar a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string prevchar string index"}}
+test string-35.4.$noComp {string prevchar} -body {
+    list [catch {run {string prevchar a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-35.5.$noComp {string prevchar} -body {
+    run {string prevchar "one two three_words" 400}
+} -result 18
+test string-35.6.$noComp {string prevchar} -body {
+    run {string prevchar "one two three_words" 2}
+} -result 1
+test string-35.7.$noComp {string prevchar} -body {
+    run {string prevchar "one two three_words" -2}
+} -result 0
+test string-35.8.$noComp {string prevchar} -body {
+    run {string prevchar "one .*&^ three" 6}
+} -result 5
+test string-35.9.$noComp {string prevchar} -body {
+    run {string prevchar "one two three" 4}
+} -result 3
+test string-35.10.$noComp {string prevchar} -body {
+    run {string prevchar "one two three" end-5}
+} -result 6
+test string-35.11.$noComp {string prevchar, unicode} -body {
+    run {string prevchar "one tw\xC7o three" 7}
+} -result 6
+test string-35.12.$noComp {string prevchar, unicode} -body {
+    run {string prevchar "ab\uC700\uC700 cdef ghi" 12}
+} -result 11
+test string-35.13.$noComp {string prevchar, unicode} -body {
+    run {string prevchar "\uC700\uC700 abc" 8}
+} -result 5
+test string-35.14.$noComp {string prevchar, invalid UTF-8} -constraints testbytestring -body {
+    # See Bug c61818e4c9
+    set demo [testbytestring "abc def\xE0\xA9ghi"]
+    run {string index $demo [string prevchar $demo 10]}
+} -result g
+test string-35.15.$noComp {string prevchar, unicode} -body {
+    run {string prevchar "\U1D7CA\U1D7CA abc" 0}
+} -result 0
+test string-35.16.$noComp {string prevchar, unicode} -constraints utf16 -body {
+    run {string prevchar "\U1D7CA\U1D7CA abc" 10}
+} -result 7
+
+test string-36.1.$noComp {string prevword} -body {
+    list [catch {run {string word a}} msg] $msg
+} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+test string-36.2.$noComp {string prevword} -body {
+    list [catch {run {string prevword a}} msg] $msg
+} -result {1 {wrong # args: should be "string prevword string index"}}
+test string-36.3.$noComp {string prevword} -body {
+    list [catch {run {string prevword a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string prevword string index"}}
+test string-36.4.$noComp {string prevword} -body {
+    list [catch {run {string prevword a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-36.5.$noComp {string prevword} -body {
+    run {string prevword "one two three_words" 400}
+} -result 7
+test string-36.6.$noComp {string prevword} -body {
+    run {string prevword "one two three_words" 2}
+} -result 0
+test string-36.7.$noComp {string prevword} -body {
+    run {string prevword "one two three_words" -2}
+} -result 0
+test string-36.8.$noComp {string prevword} -body {
+    run {string prevword "one .*&^ three" 6}
+} -result 3
+test string-36.9.$noComp {string prevword} -body {
+    run {string prevword "one two three" 4}
+} -result 3
+test string-36.10.$noComp {string prevword} -body {
+    run {string prevword "one two three" end-5}
+} -result 7
+test string-36.11.$noComp {string prevword, unicode} -body {
+    run {string prevword "one tw\xC7o three" 7}
+} -result 3
+test string-36.12.$noComp {string prevword, unicode} -body {
+    run {string prevword "ab\uC700\uC700 cdef ghi" 12}
+} -result 9
+test string-36.13.$noComp {string prevword, unicode} -body {
+    run {string prevword "\uC700\uC700 abc" 8}
+} -result 2
+test string-36.14.$noComp {string prevword, invalid UTF-8} -constraints testbytestring -body {
+    # See Bug c61818e4c9
+    set demo [testbytestring "abc def\xE0\xA9ghi"]
+    run {string index $demo [string prevword $demo 10]}
+} -result \xA9
+test string-36.15.$noComp {string prevword, unicode} -body {
+    run {string prevword "\U1D7CA\U1D7CA abc" 0}
+} -result 0
+test string-36.16.$noComp {string prevword, unicode} -constraints utf16 -body {
+    run {string prevword "\U1D7CA\U1D7CA abc" 10}
+} -result 4
+
 };				# foreach noComp {0 1}
 
 # cleanup
-- 
cgit v0.12