From 2ea2ef0609d7e306bf981672cda2e66782ed4db3 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Thu, 11 Mar 2021 12:19:14 +0000
Subject: Backport Tcl_UtfCharComplete() functionality from 8.6 for
 TCL_UTF_MAX>3. This makes Tcl_UtfCharComplete() usable to protect
 Tcl_UtfNext() calls against overflow. No change for TCL_UTF_MAX=3 (default build)

---
 generic/tclUtf.c | 19 ++++++++++++++++++-
 tests/utf.test   |  6 +++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 03d0f3a..efbd383 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -76,6 +76,23 @@ static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1
 };
 
+#if TCL_UTF_MAX > 3
+static const unsigned char complete[256] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+/* 0x80-0xBF: src might point to 2nd byte of valid 4-byte sequence; need 3 */
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/* End of "continuation byte section"; entries for 0xC0-0xFF follow */
+    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1
+};
+#else
+#   define complete totalBytes
+#endif
+
 /*
  * Functions used only in this module.
  */
@@ -492,7 +509,7 @@ Tcl_UtfCharComplete(
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
-    return length >= totalBytes[UCHAR(*src)];
+    return length >= complete[UCHAR(*src)];
 }
 
 /*
diff --git a/tests/utf.test b/tests/utf.test
index 06ac329..76b6847 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -8,8 +8,8 @@
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 
-if {[lsearch [namespace children] ::tcltest] == -1} {
-    package require tcltest 2
+if {"::tcltest" ni [namespace children]} {
+    package require tcltest 2.5
     namespace import -force ::tcltest::*
 }
 
@@ -614,7 +614,7 @@ test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf}
 test utf-6.118 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
     testutfnext [testbytestring \xA0]G 0
 } 0
-test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext testbytestring} {
+test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
     testutfnext [testbytestring \xA0]G 1
 } 1
 test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} {
-- 
cgit v0.12


From b0e0d4b618d58c962735cb62982229a8f67fb632 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Sun, 14 Mar 2021 16:12:25 +0000
Subject: Document that Tcl_UtfCharComplete() can (now) be used to protect
 Tcl_UtfNext() calls against overflow, if the string being handled is not
 null-terminated.

---
 doc/Utf.3 | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/doc/Utf.3 b/doc/Utf.3
index cca6498..9687eb6 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -141,8 +141,8 @@ source buffer is long enough such that this routine does not run off the
 end and dereference non-existent or random memory; if the source buffer
 is known to be null-terminated, this will not happen.  If the input is
 not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
-byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0080 and
-0x00FF and return 1.
+byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x80 and
+0xFF and return 1.
 .PP
 \fBTcl_UniCharToUtfDString\fR converts the given Unicode string
 to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
@@ -197,10 +197,10 @@ characters.
 .PP
 \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR
 of \fIlength\fR bytes is long enough to be decoded by
-\fBTcl_UtfToUniChar\fR, or 0 otherwise.  This function does not guarantee
-that the UTF-8 string is properly formed.  This routine is used by
-procedures that are operating on a byte at a time and need to know if a
-full Tcl_UniChar has been seen.
+\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise.  This function
+does not guarantee that the UTF-8 string is properly formed.  This routine
+is used by procedures that are operating on a byte at a time and need to
+know if a full Tcl_UniChar has been seen.
 .PP
 \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings.  It
 returns the number of Tcl_UniChars that are represented by the UTF-8 string
@@ -221,7 +221,8 @@ Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
 \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
 string.  The caller must not ask for the next character after the last
 character in the string if the string is not terminated by a null
-character.
+character. \fBTcl_UtfCharComplete\fR can be used in that case to
+make sure enough bytes are available before calling \fBTcl_UtfNext\fR.
 .PP
 \fBTcl_UtfPrev\fR is used to step backward through but not beyond the
 UTF-8 string that begins at \fIstart\fR.  If the UTF-8 string is made
-- 
cgit v0.12