From e9b1c88a4142a059a821fdd2f2b02272a53c9151 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Tue, 14 Apr 2020 20:03:33 +0000
Subject: The function of Tcl_UtfNext() is to advance a pointer. There's
 nothing inherent in that task that requires decoding of the characters, but
 the implementation does that. Let's try a simpler solution for callers that
 do not need the content decoded.

---
 generic/tclUtf.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index fbdba4c..a03fa30 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -644,9 +644,19 @@ CONST char *
 Tcl_UtfNext(
     CONST char *src)		/* The current location in the string. */
 {
-    Tcl_UniChar ch;
-
-    return src + TclUtfToUniChar(src, &ch);
+    int byte = *((unsigned char *) src);
+    int left = totalBytes[byte];
+
+    src++;
+    while (--left) {
+	byte = *((unsigned char *) src);
+	if ((byte & 0xC0) != 0x80) {
+	    /* src points to non-trail byte; return it */
+	    return src;
+	}
+	src++;
+    }
+    return src;
 }
 
 /*
-- 
cgit v0.12


From 279c54dbff724a62d6739a9cc71ba31a83325c98 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Tue, 14 Apr 2020 20:18:55 +0000
Subject: Create and use an optimized macro TclUtfNext() for Tcl_UtfNext().

---
 generic/tclInt.h  | 3 +++
 generic/tclUtil.c | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/generic/tclInt.h b/generic/tclInt.h
index 15bc000..e92cd18 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3691,6 +3691,9 @@ MODULE_SCOPE void	TclDbInitNewObj(Tcl_Obj *objPtr, CONST char *file,
 	    ((*(chPtr) = (unsigned char) *(str)), 1)	\
 	    : Tcl_UtfToUniChar(str, chPtr))
 
+#define TclUtfNext(src)	\
+	((((unsigned char) *(src)) < 0xC0) ? (src) + 1 : Tcl_UtfNext(src))
+
 /*
  *----------------------------------------------------------------
  * Macro that encapsulates the logic that determines when it is safe to
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 3dd9a32..e87cf83 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -1691,7 +1691,7 @@ TclTrim(
 	 * that we will not trim. Skip over it. */
 	if (numBytes > 0) {
 	    const char *first = bytes + trimLeft;
-	    bytes = Tcl_UtfNext(first);
+	    bytes = TclUtfNext(first);
 	    numBytes -= (bytes - first);
 
 	    if (numBytes > 0) {
-- 
cgit v0.12


From 6df0ccf3997e397b860c47c770ba0fc31a2a9961 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Tue, 14 Apr 2020 21:16:43 +0000
Subject: Replace calls of TclUtfToUniChar() with TclUtfNext() when caller has
 no decoding need. Failing test string-22.14 indicates something is still not
 quite right. Now that Tcl_NumUtfChars() is not paying decoding prices, we let
 it spend to properly protect against overflow. [2738427]

---
 generic/tclCompExpr.c |  5 ++---
 generic/tclUtf.c      | 19 ++++++-------------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c
index 27d7503..42321af 100644
--- a/generic/tclCompExpr.c
+++ b/generic/tclCompExpr.c
@@ -1801,7 +1801,6 @@ ParseLexeme(
 {
     const char *end;
     int scanned;
-    Tcl_UniChar ch;
     Tcl_Obj *literal = NULL;
     unsigned char byte;
 
@@ -1979,12 +1978,12 @@ ParseLexeme(
 
     if (!TclIsBareword(*start) || *start == '_') {
 	if (Tcl_UtfCharComplete(start, numBytes)) {
-	    scanned = Tcl_UtfToUniChar(start, &ch);
+	    scanned = TclUtfNext(start) - start;
 	} else {
 	    char utfBytes[TCL_UTF_MAX];
 	    memcpy(utfBytes, start, (size_t) numBytes);
 	    utfBytes[numBytes] = '\0';
-	    scanned = Tcl_UtfToUniChar(utfBytes, &ch);
+	    scanned = TclUtfNext(utfBytes) - utfBytes;
 	}
 	*lexemePtr = INVALID;
 	Tcl_DecrRefCount(literal);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index a03fa30..25d52d0 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -504,7 +504,6 @@ Tcl_NumUtfChars(
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
-    Tcl_UniChar ch;
     register int i;
 
     /*
@@ -516,21 +515,20 @@ Tcl_NumUtfChars(
 
     i = 0;
     if (length < 0) {
-	while (*src != '\0') {
-	    src += TclUtfToUniChar(src, &ch);
+	while ((*src != '\0') && (i < INT_MAX)) {
+	    src = TclUtfNext(src);
 	    i++;
 	}
-	if (i < 0) i = INT_MAX; /* Bug [2738427] */
     } else {
 	register const char *endPtr = src + length - TCL_UTF_MAX;
 
 	while (src < endPtr) {
-	    src += TclUtfToUniChar(src, &ch);
+	    src = TclUtfNext(src);
 	    i++;
 	}
 	endPtr += TCL_UTF_MAX;
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
-	    src += TclUtfToUniChar(src, &ch);
+	    src = TclUtfNext(src);
 	    i++;
 	}
 	if (src < endPtr) {
@@ -764,10 +762,7 @@ Tcl_UniCharAtIndex(
 {
     Tcl_UniChar ch;
 
-    while (index >= 0) {
-	index--;
-	src += TclUtfToUniChar(src, &ch);
-    }
+    TclUtfToUniChar(Tcl_UtfAtIndex(src, index), &ch);
     return ch;
 }
 
@@ -793,11 +788,9 @@ Tcl_UtfAtIndex(
     register CONST char *src,	/* The UTF-8 string. */
     register int index)		/* The position of the desired character. */
 {
-    Tcl_UniChar ch;
-
     while (index > 0) {
 	index--;
-	src += TclUtfToUniChar(src, &ch);
+	src = TclUtfNext(src);
     }
     return src;
 }
-- 
cgit v0.12


From e2b7ad1627665c99b128ff5a023e9f772fe467b4 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Tue, 14 Apr 2020 21:25:33 +0000
Subject: Fix the bad logic in Tcl_UtfNext().

---
 generic/tclUtf.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 25d52d0..7dd8598 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -644,17 +644,21 @@ Tcl_UtfNext(
 {
     int byte = *((unsigned char *) src);
     int left = totalBytes[byte];
+    const char *next = src + 1;
 
-    src++;
     while (--left) {
-	byte = *((unsigned char *) src);
+	byte = *((unsigned char *) next);
 	if ((byte & 0xC0) != 0x80) {
-	    /* src points to non-trail byte; return it */
-	    return src;
+	    /*
+	     * src points to non-trail byte; We ran out of trail bytes
+	     * before the needs of the lead bytes were satisfied.
+	     * Let the (malformed) lead byte alone be a character
+	     */
+	    return src + 1;
 	}
-	src++;
+	next++;
     }
-    return src;
+    return next;
 }
 
 /*
-- 
cgit v0.12


From 7aff227882dd8bfaa8972ecaf1e129bb9ef1e6e3 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Tue, 14 Apr 2020 21:32:22 +0000
Subject: typo

---
 generic/tclUtf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 7dd8598..078ecf4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -651,7 +651,7 @@ Tcl_UtfNext(
 	if ((byte & 0xC0) != 0x80) {
 	    /*
 	     * src points to non-trail byte; We ran out of trail bytes
-	     * before the needs of the lead bytes were satisfied.
+	     * before the needs of the lead byte were satisfied.
 	     * Let the (malformed) lead byte alone be a character
 	     */
 	    return src + 1;
-- 
cgit v0.12


From f0f59ae8a31a818d78cb449dc4532762cfb2bb00 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Tue, 14 Apr 2020 21:39:53 +0000
Subject: New testing command [testutfnext].

---
 generic/tclTest.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/generic/tclTest.c b/generic/tclTest.c
index 31d3a7f..782b9a2 100644
--- a/generic/tclTest.c
+++ b/generic/tclTest.c
@@ -433,6 +433,7 @@ static int		SimpleMatchInDirectory(
 			    Tcl_Interp *interp, Tcl_Obj *resultPtr,
 			    Tcl_Obj *dirPtr, const char *pattern,
 			    Tcl_GlobTypeData *types);
+static Tcl_ObjCmdProc	TestUtfNextCmd;
 static Tcl_ObjCmdProc	TestUtfPrevCmd;
 static int		TestNumUtfCharsCmd(ClientData clientData,
 			    Tcl_Interp *interp, int objc,
@@ -697,8 +698,10 @@ Tcltest_Init(
 	    (ClientData) 0, NULL);
     Tcl_CreateObjCommand(interp, "testsetobjerrorcode",
 	    TestsetobjerrorcodeCmd, NULL, NULL);
+    Tcl_CreateObjCommand(interp, "testutfnext",
+	    TestUtfNextCmd, NULL, NULL);
     Tcl_CreateObjCommand(interp, "testutfprev",
-	    TestUtfPrevCmd, (ClientData) 0, NULL);
+	    TestUtfPrevCmd, NULL, NULL);
     Tcl_CreateObjCommand(interp, "testnumutfchars",
 	    TestNumUtfCharsCmd, NULL, NULL);
     Tcl_CreateObjCommand(interp, "testfindfirst",
@@ -7107,6 +7110,52 @@ SimpleListVolumes(void)
 }
 
 /*
+ * Used to check operations of Tcl_UtfNext.
+ *
+ * Usage: testutfnext $bytes ?$offset?
+ */
+
+static int
+TestUtfNextCmd(
+    ClientData clientData,
+    Tcl_Interp *interp,
+    int objc,
+    Tcl_Obj *const objv[])
+{
+    int numBytes, offset = 0;
+    char *bytes;
+    const char *result;
+    Tcl_Obj *copy;
+    
+    if (objc < 2 || objc > 3) {
+	Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?");
+	return TCL_ERROR;
+    }
+
+    bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes);
+    
+    if (objc == 3) {
+	if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) {
+	    return TCL_ERROR;
+	}
+	if (offset < 0) {
+	    offset = 0;
+	}
+	if (offset > numBytes) {
+	    offset = numBytes;
+	}
+    }
+    copy = Tcl_DuplicateObj(objv[1]);
+    bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1);
+    bytes[numBytes] = '\0';
+
+    result = Tcl_UtfNext(bytes + offset);
+    Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes));
+
+    Tcl_DecrRefCount(copy);
+    return TCL_OK;
+}
+/*
  * Used to check operations of Tcl_UtfPrev.
  *
  * Usage: testutfprev $bytes $offset
@@ -7149,9 +7198,9 @@ TestUtfPrevCmd(
     bytes[numBytes] = '\0';
 
     result = Tcl_UtfPrev(bytes + offset, bytes);
+    Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes));
 
     Tcl_DecrRefCount(copy);
-    Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes));
     return TCL_OK;
 }
 
-- 
cgit v0.12


From 532ec4fa923534f592e04cc3c5679ce5771c684c Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Wed, 15 Apr 2020 14:42:41 +0000
Subject: Collection of coverage tests for Tcl_UtfNext.

---
 tests/utf.test | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 244 insertions(+), 2 deletions(-)

diff --git a/tests/utf.test b/tests/utf.test
index 56ca1b9..b5358cc 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -111,8 +111,250 @@ test utf-5.2 {Tcl_UtfFindLast} testfindlast {
     testfindlast [bytestring "abcbc"] 98
 } {bc}
 
-test utf-6.1 {Tcl_UtfNext} {
-} {}
+testConstraint testutfnext [llength [info commands testutfnext]]
+
+test utf-6.1 {Tcl_UtfNext} testutfnext {
+    # This takes the pointer one past the terminating NUL.
+    # This is really an invalid call.
+    testutfnext {}
+} 1
+test utf-6.2 {Tcl_UtfNext} testutfnext {
+    testutfnext A
+} 1
+test utf-6.3 {Tcl_UtfNext} testutfnext {
+    testutfnext AA
+} 1
+test utf-6.4 {Tcl_UtfNext} testutfnext {
+    testutfnext A\xA0
+} 1
+test utf-6.5 {Tcl_UtfNext} testutfnext {
+    testutfnext A\xD0
+} 1
+test utf-6.6 {Tcl_UtfNext} testutfnext {
+    testutfnext A\xE8
+} 1
+test utf-6.7 {Tcl_UtfNext} testutfnext {
+    testutfnext A\xF4
+} 1
+test utf-6.8 {Tcl_UtfNext} testutfnext {
+    testutfnext A\xF8
+} 1
+test utf-6.9 {Tcl_UtfNext} testutfnext {
+    testutfnext \xA0
+} 1
+test utf-6.10 {Tcl_UtfNext} testutfnext {
+    testutfnext \xA0G
+} 1
+test utf-6.11 {Tcl_UtfNext} testutfnext {
+    testutfnext \xA0\xA0
+} 1
+test utf-6.12 {Tcl_UtfNext} testutfnext {
+    testutfnext \xA0\xD0
+} 1
+test utf-6.13 {Tcl_UtfNext} testutfnext {
+    testutfnext \xA0\xE8
+} 1
+test utf-6.14 {Tcl_UtfNext} testutfnext {
+    testutfnext \xA0\xF4
+} 1
+test utf-6.15 {Tcl_UtfNext} testutfnext {
+    testutfnext \xA0\xF8
+} 1
+test utf-6.16 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0
+} 1
+test utf-6.17 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0A
+} 1
+test utf-6.18 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xA0
+} 2
+test utf-6.19 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xD0
+} 1
+test utf-6.20 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xE8
+} 1
+test utf-6.21 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xF4
+} 1
+test utf-6.22 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xF8
+} 1
+test utf-6.23 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8
+} 1
+test utf-6.24 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8A
+} 1
+test utf-6.25 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0
+} 1
+test utf-6.26 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xD0
+} 1
+test utf-6.27 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xE8
+} 1
+test utf-6.28 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xF4
+} 1
+test utf-6.29 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xF8
+} 1
+test utf-6.30 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4
+} 1
+test utf-6.31 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4A
+} 1
+test utf-6.32 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0
+} 1
+test utf-6.33 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xD0
+} 1
+test utf-6.34 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xE8
+} 1
+test utf-6.35 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xF4
+} 1
+test utf-6.36 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xF8
+} 1
+test utf-6.37 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF8
+} 1
+test utf-6.38 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF8A
+} 1
+test utf-6.39 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF8\xA0
+} 1
+test utf-6.40 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF8\xD0
+} 1
+test utf-6.41 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF8\xE8
+} 1
+test utf-6.42 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF8\xF4
+} 1
+test utf-6.43 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF8\xF8
+} 1
+test utf-6.44 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xA0G
+} 2
+test utf-6.45 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xA0\xA0
+} 2
+test utf-6.46 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xA0\xD0
+} 2
+test utf-6.47 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xA0\xE8
+} 2
+test utf-6.48 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xA0\xF4
+} 2
+test utf-6.49 {Tcl_UtfNext} testutfnext {
+    testutfnext \xD0\xA0\xF8
+} 2
+test utf-6.50 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0G
+} 1
+test utf-6.51 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xA0
+} 3
+test utf-6.52 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xD0
+} 1
+test utf-6.53 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xE8
+} 1
+test utf-6.54 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xF4
+} 1
+test utf-6.55 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xF8
+} 1
+test utf-6.56 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0G
+} 1
+test utf-6.57 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0
+} 1
+test utf-6.58 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xD0
+} 1
+test utf-6.59 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xE8
+} 1
+test utf-6.60 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xF4
+} 1
+test utf-6.61 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xF8
+} 1
+test utf-6.62 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xA0G
+} 3
+test utf-6.63 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xA0\xA0
+} 3
+test utf-6.64 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xA0\xD0
+} 3
+test utf-6.65 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xA0\xE8
+} 3
+test utf-6.66 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xA0\xF4
+} 3
+test utf-6.67 {Tcl_UtfNext} testutfnext {
+    testutfnext \xE8\xA0\xA0\xF8
+} 3
+test utf-6.68 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0G
+} 1
+test utf-6.69 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xA0
+} 1
+test utf-6.70 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xD0
+} 1
+test utf-6.71 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xE8
+} 1
+test utf-6.72 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xF4
+} 1
+test utf-6.73 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xF8
+} 1
+test utf-6.74 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xA0G
+} 1
+test utf-6.75 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xA0\xA0
+} 1
+test utf-6.76 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xA0\xD0
+} 1
+test utf-6.77 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xA0\xE8
+} 1
+test utf-6.78 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xA0\xF4
+} 1
+test utf-6.79 {Tcl_UtfNext} testutfnext {
+    testutfnext \xF4\xA0\xA0\xA0G\xF8
+} 1
+
+
+
 
 testConstraint testutfprev [llength [info commands testutfprev]]
 
-- 
cgit v0.12


From e2a3a358c95196a7bf142d591fa5ef729b3b0d69 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Wed, 15 Apr 2020 16:42:32 +0000
Subject: Add test demonstrating that Tcl_UtfNext accepts overlong byte
 sequences, which is in conflict with what Tcl_UtfToUniChar does.

---
 tests/utf.test | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/utf.test b/tests/utf.test
index b5358cc..a930aae 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -352,9 +352,9 @@ test utf-6.78 {Tcl_UtfNext} testutfnext {
 test utf-6.79 {Tcl_UtfNext} testutfnext {
     testutfnext \xF4\xA0\xA0\xA0G\xF8
 } 1
-
-
-
+test utf-6.80 {Tcl_UtfNext - overlong sequences} {
+    testutfnext \xC0\x81
+} 1
 
 testConstraint testutfprev [llength [info commands testutfprev]]
 
-- 
cgit v0.12


From e6faa58e6df3292b2c0735ba4921af4be0e215fa Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 16 Apr 2020 18:40:40 +0000
Subject: More tests and fix for overlong handling in revised Tcl_UtfNext.

---
 generic/tclUtf.c |  3 +++
 tests/utf.test   | 23 ++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e41e7a5..00ca94e 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -712,6 +712,9 @@ Tcl_UtfNext(
 	}
 	next++;
     }
+    if (Overlong(src)) {
+	return src + 1;
+    }
     return next;
 }
 
diff --git a/tests/utf.test b/tests/utf.test
index 72165f9..02b7002 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -352,9 +352,30 @@ test utf-6.78 {Tcl_UtfNext} testutfnext {
 test utf-6.79 {Tcl_UtfNext} testutfnext {
     testutfnext \xF4\xA0\xA0\xA0G\xF8
 } 1
-test utf-6.80 {Tcl_UtfNext - overlong sequences} {
+test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext {
+    testutfnext \xC0\x80
+} 2
+test utf-6.81 {Tcl_UtfNext - overlong sequences} testutfnext {
     testutfnext \xC0\x81
 } 1
+test utf-6.82 {Tcl_UtfNext - overlong sequences} testutfnext {
+    testutfnext \xC1\x80
+} 1
+test utf-6.83 {Tcl_UtfNext - overlong sequences} testutfnext {
+    testutfnext \xC2\x80
+} 2
+test utf-6.84 {Tcl_UtfNext - overlong sequences} testutfnext {
+    testutfnext \xE0\x80\x80
+} 1
+test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext {
+    testutfnext \xE0\xA0\x80
+} 3
+test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext {
+    testutfnext \xF0\x80\x80\x80
+} 1
+test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext {
+    testutfnext \xF0\x90\x80\x80
+} 1
 
 testConstraint testutfprev [llength [info commands testutfprev]]
 
-- 
cgit v0.12


From 443928c10f1ac94e6a6adfafb478eb9fa09ac39a Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 16 Apr 2020 18:42:37 +0000
Subject: compiler warning

---
 generic/tclUtf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 00ca94e..91e9c73 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -712,7 +712,7 @@ Tcl_UtfNext(
 	}
 	next++;
     }
-    if (Overlong(src)) {
+    if (Overlong((unsigned char *)src)) {
 	return src + 1;
     }
     return next;
-- 
cgit v0.12


From 60d5424069845124e51d4032f295d913a17454a1 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 16 Apr 2020 19:02:40 +0000
Subject: More detailed comments.

---
 generic/tclUtf.c | 70 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 91e9c73..67603af 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -678,13 +678,35 @@ Tcl_UtfFindLast(
  *
  * Tcl_UtfNext --
  *
- *	Given a pointer to some current location in a UTF-8 string, move
- *	forward one character. The caller must ensure that they are not asking
- *	for the next character after the last character in the string.
+ *	The aim of this routine is to provide a way to iterate forward
+ *	through a UTF-8 string. The caller is expected to pass a non-NULL
+ *	pointer argument /src/ which points to a location within a string.
+ *	(*src) will be read, so /src/ must not point to an unreadable
+ *	location past the end of the string. If /src/ points to the
+ *	beginning of a complete, well-formed and valid UTF-8 byte sequence
+ *	of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer
+ *	just past the end of that sequence. In any other circumstance,
+ *	Tcl_UtfNext returns /src/+1.
+ *
+ *	Because this routine always returns a value > /src/, it is useful
+ *	as a forward iterator that will always make progress. If the string
+ *	is NUL-terminated, Tcl_UtfNext will not read beyond the terminating
+ *	NUL character. If it is not NUL-terminated, the caller must make
+ *	use of the companion routine Tcl_UtfCharComplete to test whether
+ *	there is risk that Tcl_UtfNext will read beyond the end of the string.
+ *	Tcl_UtfNext will never read more than TCL_UTF_MAX bytes.
+ *
+ *	In a string where all characters are complete and properly formed,
+ *	and /src/ points to the first byte of a character, repeated
+ *	Tcl_UtfNext calls will step to the starting bytes of characters, one
+ *	character at a time. Within those limitations, Tcl_UtfPrev and
+ *	Tcl_UtfNext are inverses. If either condition cannot be met,
+ *	Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the
+ *	caller will have to take greater care.
  *
  * Results:
- *	The return value is the pointer to the next character in the UTF-8
- *	string.
+ *	A pointer to the start of the next character in the string (or to
+ *	the end of the string) as described above.
  *
  * Side effects:
  *	None.
@@ -725,37 +747,37 @@ Tcl_UtfNext(
  *
  *	The aim of this routine is to provide a way to move backward
  *	through a UTF-8 string. The caller is expected to pass non-NULL
- *	pointer arguments start and src. start points to the beginning
- *	of a string, and src >= start points to a location within (or just
- *	past the end) of the string. This routine always returns a
- *	pointer within the string (>= start).  When (src == start), it
- *	returns start. When (src > start), it returns a pointer (< src)
- *	and (>= src - TCL_UTF_MAX).  Subject to these constraints, the
- *	routine returns a pointer to the earliest byte in the string that
- *	starts a character when characters are read starting at start and
+ *	pointer arguments /start/ and /src/. /start/ points to the beginning
+ *	of a string, and /src/ (>= /start/) points to a location within (or
+ *	just past the end) of the string. This routine always returns a
+ *	pointer within the string (>= /start/).  When (/src/ == /start/),
+ *	it returns /start/. When (/src/ > /start/), it returns a pointer
+ *	(< /src/) and (>= /src/ - TCL_UTF_MAX).  Subject to these constraints,
+ *	the routine returns a pointer to the earliest byte in the string that
+ *	starts a character when characters are read starting at /start/ and
  *	that character might include the byte src[-1]. The routine will
  *	examine only those bytes in the range that might be returned.
- *	It will not examine the byte *src, and because of that cannot
+ *	It will not examine the byte (*src), and because of that cannot
  *	determine for certain in all circumstances whether the character
  *	that begins with the returned pointer will or will not include
- *	the byte src[-1]. In the scenario, where src points to the end of
- *	a buffer being filled, the returned pointer point to either the
+ *	the byte src[-1]. In the scenario where /src/ points to the end of
+ *	a buffer being filled, the returned pointer points to either the
  *	final complete character in the string or to the earliest byte
  *	that might start an incomplete character waiting for more bytes to
  *	complete.
  *
- *	Because this routine always returns a value < src until the point
- *	it is forced to return start, it is useful as a backward iterator
+ *	Because this routine always returns a value < /src/ until the point
+ *	it is forced to return /start/, it is useful as a backward iterator
  *	through a string that will always make progress and always be
  *	prevented from running past the beginning of the string.
  *
  *	In a string where all characters are complete and properly formed,
- *	and the value of src points to the first byte of a character,
- *	repeated Tcl_UtfPrev calls will step to the starting bytes of
- *	characters, one character at a time. Within those limitations,
- *	Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot
- *	be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and
- *	the caller will have to take greater care.
+ *	and /src/ points to the first byte of a character, repeated
+ *	Tcl_UtfPrev calls will step to the starting bytes of characters, one
+ *	character at a time. Within those limitations, Tcl_UtfPrev and
+ *	Tcl_UtfNext are inverses. If either condition cannot be met,
+ *	Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the
+ *	caller will have to take greater care.
  *
  * Results:
  *	A pointer to the start of a character in the string as described
-- 
cgit v0.12


From c2479a465e6bef08275d47d0277deda87e6e014e Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 16 Apr 2020 19:04:47 +0000
Subject: delete merge litter

---
 generic/tclTest.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/generic/tclTest.c b/generic/tclTest.c
index b8507bf..6e0fbed 100644
--- a/generic/tclTest.c
+++ b/generic/tclTest.c
@@ -325,7 +325,6 @@ static Tcl_FSPathInFilesystemProc SimplePathInFilesystem;
 static Tcl_Obj *	SimpleRedirect(Tcl_Obj *pathPtr);
 static Tcl_FSMatchInDirectoryProc SimpleMatchInDirectory;
 static Tcl_ObjCmdProc	TestUtfNextCmd;
-static Tcl_ObjCmdProc	TestUtfNextCmd;
 static Tcl_ObjCmdProc	TestUtfPrevCmd;
 static Tcl_ObjCmdProc	TestNumUtfCharsCmd;
 static Tcl_ObjCmdProc	TestFindFirstCmd;
-- 
cgit v0.12


From 66197fff215f60690a444b3f2af67a0c3c87c8af Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 16 Apr 2020 19:39:36 +0000
Subject: Improve the docs for Tcl_UtfNext.

---
 doc/Utf.3 | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/doc/Utf.3 b/doc/Utf.3
index 87d1318..cb82699 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -217,11 +217,20 @@ returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
 in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
 considered part of the UTF-8 string.  
 .PP
-Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
-\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
-string.  The caller must not ask for the next character after the last
-character in the string if the string is not terminated by a null
-character.
+\fBTcl_UtfNext\fR is used to step forward through a UTF-8 string.
+If the UTF-8 string is made up entirely of complete, well-formed, and
+valid character byte sequences, and \fIsrc\fR points to the lead byte
+of one of those sequences, then repeated calls of \fBTcl_UtfNext\fR will
+return pointers to the lead bytes of each character in the string, one
+character at a time. In any other circumstance, \fBTcl_UtfNext\fR
+returns \fIsrc\fR+1.  \fBTcl_UtfNext\fR will always read \fIsrc[0]\fR
+and may read as many following bytes (up to a total of \fBTCL_UTF_MAX\fR)
+as needed to find the end of the byte sequence. If the string is
+\fBNUL\fR-terminated, \fBTcl_UtfNext\fR will not read beyond the terminating
+\fBNUL\fR byte. If not, the caller must use the companion routine
+\fBTcl_UtfCharComplete\fR to determine whether there is any risk
+\fBTcl_UtfNext\fR might read beyond the readable memory occupied
+by the string.
 .PP
 \fBTcl_UtfPrev\fR is used to step backward through but not beyond the
 UTF-8 string that begins at \fIstart\fR.  If the UTF-8 string is made
-- 
cgit v0.12