From b583ea7360808cb502d1ea65954ab0387ebdd823 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 3 May 2023 15:51:10 +0000
Subject: Backport "Comments only. Fix blatantly obsolete ones". And a few more
 improvements from the same files.

---
 generic/tclStringObj.c | 95 +++++++++++++++++++++++++++++---------------------
 generic/tclStringRep.h | 44 ++++++++---------------
 2 files changed, 70 insertions(+), 69 deletions(-)

diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index d3a17d1..7fbf77a 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -1,29 +1,27 @@
 /*
  * tclStringObj.c --
  *
- *	This file contains functions that implement string operations on Tcl
- *	objects. Some string operations work with UTF strings and others
- *	require Unicode format. Functions that require knowledge of the width
- *	of each character, such as indexing, operate on Unicode data.
- *
- *	A Unicode string is an internationalized string. Conceptually, a
- *	Unicode string is an array of 16-bit quantities organized as a
- *	sequence of properly formed UTF-8 characters. There is a one-to-one
- *	map between Unicode and UTF characters. Because Unicode characters
- *	have a fixed width, operations such as indexing operate on Unicode
- *	data. The String object is optimized for the case where each UTF char
+ *      This file contains functions that implement string operations on Tcl
+ *      objects. Some string operations work with UTF-8 encoding forms.
+ *      Functions that require knowledge of the width of each character,
+ * 	such as indexing, operate on fixed-width encoding forms such as UTF-32.
+ *
+ * 	Conceptually, a string is a sequence of Unicode code points. Internally
+ * 	it may be stored in an encoding form such as a modified version of
+ * 	UTF-8 or UTF-16 (when TCL_UTF_MAX=3) or UTF-32.
+ *
+ *	The String object is optimized for the case where each UTF char
  *	in a string is only one byte. In this case, we store the value of
- *	numChars, but we don't store the Unicode data (unless Tcl_GetUnicode
- *	is explicitly called).
+ *	numChars, but we don't store the fixed-width encoding form (unless
+ *	Tcl_GetUnicode is explicitly called).
  *
- *	The String object type stores one or both formats. The default
- *	behavior is to store UTF. Once Unicode is calculated by a function, it
- *	is stored in the internal rep for future access (without an additional
- *	O(n) cost).
+ *      The String object type stores one or both formats. The default
+ *      behavior is to store UTF-8. Once UTF-16/UTF-32 is calculated, it is
+ *      stored in the internal rep for future access (without an additional
+ *      O(n) cost).
  *
  *	To allow many appends to be done to an object without constantly
- *	reallocating the space for the string or Unicode representation, we
- *	allocate double the space for the string or Unicode and use the
+ *	reallocating space, we allocate double the space and use the
  *	internal representation to keep track of how much space is used vs.
  *	allocated.
  *
@@ -37,7 +35,6 @@
 #include "tclInt.h"
 #include "tclTomMath.h"
 #include "tclStringRep.h"
-
 #include "assert.h"
 /*
  * Prototypes for functions defined later in this file:
@@ -631,10 +628,8 @@ TclGetCharLength(
      */
 
     if (TclIsPureByteArray(objPtr)) {
-	int length;
-
-	(void) Tcl_GetByteArrayFromObj(objPtr, &length);
-	return length;
+	(void) Tcl_GetByteArrayFromObj(objPtr, &numChars);
+	return numChars;
     }
 
     /*
@@ -675,10 +670,10 @@ Tcl_GetCharLength(
     }
 
     /*
-     * Optimize BytArray case: No need to convert to a string to perform the
-     * get-length operation.
+     * Optimize the case where we're really dealing with a bytearray object;
+     * we don't need to convert to a string to perform the get-length operation.
      *
-     * Starting in Tcl 8.7, check for a "pure" bytearray, because the
+     * Starting in Tcl 8.7, we check for a "pure" bytearray, because the
      * machinery behind that test is using a proper bytearray ObjType.  We
      * could also compute length of an improper bytearray without shimmering
      * but there's no value in that. We *want* to shimmer an improper bytearray
@@ -686,16 +681,17 @@ Tcl_GetCharLength(
      */
 
     if (TclIsPureByteArray(objPtr)) {
-
 	(void) Tcl_GetByteArrayFromObj(objPtr, &numChars);
     } else {
 	Tcl_GetString(objPtr);
 	numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length);
     }
+
     return numChars;
 }
 #endif
 
+
 /*
  *----------------------------------------------------------------------
  *
@@ -722,6 +718,11 @@ TclCheckEmptyString(
 	return TCL_EMPTYSTRING_YES;
     }
 
+    if (TclIsPureByteArray(objPtr)
+	&& Tcl_GetCharLength(objPtr) == 0) {
+	return TCL_EMPTYSTRING_YES;
+    }
+
     if (TclListObjIsCanonical(objPtr)) {
 	TclListObjLengthM(NULL, objPtr, &length);
 	return length == 0;
@@ -2383,12 +2384,16 @@ Tcl_AppendFormatToObj(
 
 	width = 0;
 	if (isdigit(UCHAR(ch))) {
-	    width = strtoul(format, &end, 10);
-	    if (width < 0) {
+	    /* No sign is possible (isdigit check above), so ull can't wrap */
+	    unsigned long long ull;
+	    ull = strtoull(format, &end, 10);
+	    /* Comparison is >=, not >, to leave room for nul */
+	    if (ull >= WIDE_MAX) {
 		msg = overflow;
 		errCode = "OVERFLOW";
 		goto errorMsg;
 	    }
+	    width = (Tcl_WideInt)ull;
 	    format = end;
 	    step = TclUtfToUniChar(format, &ch);
 	} else if (ch == '*') {
@@ -2425,7 +2430,16 @@ Tcl_AppendFormatToObj(
 	    step = TclUtfToUniChar(format, &ch);
 	}
 	if (isdigit(UCHAR(ch))) {
-	    precision = strtoul(format, &end, 10);
+	    /* No sign is possible (isdigit check above), so ull can't wrap */
+	    unsigned long long ull;
+	    ull = strtoull(format, &end, 10);
+	    /* Comparison is >=, not >, to leave room for nul */
+	    if (ull >= WIDE_MAX) {
+		msg = overflow;
+		errCode = "OVERFLOW";
+		goto errorMsg;
+	    }
+	    precision = (Tcl_WideInt)ull;
 	    format = end;
 	    step = TclUtfToUniChar(format, &ch);
 	} else if (ch == '*') {
@@ -2531,6 +2545,9 @@ Tcl_AppendFormatToObj(
 	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
 		goto error;
 	    }
+	    if ((unsigned)code > 0x10FFFF) {
+	    	code = 0xFFFD;
+	    }
 	    length = Tcl_UniCharToUtf(code, buf);
 	    if ((code >= 0xD800) && (length < 3)) {
 		/* Special case for handling high surrogates. */
@@ -3875,6 +3892,7 @@ TclStringCmp(
     if ((reqlength == 0) || (value1Ptr == value2Ptr)) {
 	/*
 	 * Always match at 0 chars of if it is the same obj.
+	 * Note: as documented, a negative reqlength means it is ignored.
 	 */
 	match = 0;
     } else {
@@ -4006,15 +4024,15 @@ TclStringCmp(
 	 * comparison function.
 	 */
 	length = (s1len < s2len) ? s1len : s2len;
-	if (reqlength > 0 && reqlength < length) {
-	    length = reqlength;
-	} else if (reqlength < 0) {
+	if (reqlength < 0) {
 	    /*
 	     * The requested length is negative, so ignore it by setting it
 	     * to length + 1 to correct the match var.
 	     */
 
 	    reqlength = length + 1;
+	} else if (reqlength > 0 && reqlength < length) {
+	    length = reqlength;
 	}
 
 	if (checkEq && reqlength < 0 && (s1len != s2len)) {
@@ -4452,18 +4470,17 @@ TclStringReplace(
     int inPlace = flags & TCL_STRING_IN_PLACE;
     Tcl_Obj *result;
 
-    /* Caller is expected to pass sensible arguments */
-    assert ( count >= 0 ) ;
-    assert ( first >= 0 ) ;
-
     /* Replace nothing with nothing */
-    if ((insertPtr == NULL) && (count == 0)) {
+    if ((insertPtr == NULL) && (count <= 0)) {
 	if (inPlace) {
 	    return objPtr;
 	} else {
 	    return Tcl_DuplicateObj(objPtr);
 	}
     }
+    if (first < 0) {
+	first = 0;
+    }
 
     /*
      * The caller very likely had to call Tcl_GetCharLength() or similar
diff --git a/generic/tclStringRep.h b/generic/tclStringRep.h
index 0219a00..ef64d6c 100644
--- a/generic/tclStringRep.h
+++ b/generic/tclStringRep.h
@@ -1,29 +1,12 @@
 /*
  * tclStringRep.h --
  *
- *	This file contains the definition of the Unicode string internal
- *	representation and macros to access it.
+ *  This file contains the definition of the internal representation of a
+ *  string and macros to access it.
  *
- *	A Unicode string is an internationalized string. Conceptually, a
- *	Unicode string is an array of 16-bit quantities organized as a
- *	sequence of properly formed UTF-8 characters. There is a one-to-one
- *	map between Unicode and UTF characters. Because Unicode characters
- *	have a fixed width, operations such as indexing operate on Unicode
- *	data. The String object is optimized for the case where each UTF char
- *	in a string is only one byte. In this case, we store the value of
- *	numChars, but we don't store the Unicode data (unless Tcl_GetUnicode
- *	is explicitly called).
- *
- *	The String object type stores one or both formats. The default
- *	behavior is to store UTF. Once Unicode is calculated by a function, it
- *	is stored in the internal rep for future access (without an additional
- *	O(n) cost).
- *
- *	To allow many appends to be done to an object without constantly
- *	reallocating the space for the string or Unicode representation, we
- *	allocate double the space for the string or Unicode and use the
- *	internal representation to keep track of how much space is used vs.
- *	allocated.
+ *  Conceptually, a string is a sequence of Unicode code points. Internally
+ *  it may be stored in an encoding form such as a modified version of UTF-8
+ *  or UTF-16 (when TCL_UTF_MAX=3) or UTF-32.
  *
  * Copyright (c) 1995-1997 Sun Microsystems, Inc.
  * Copyright (c) 1999 by Scriptics Corporation.
@@ -39,10 +22,10 @@
 /*
  * The following structure is the internal rep for a String object. It keeps
  * track of how much memory has been used and how much has been allocated for
- * the Unicode and UTF string to enable growing and shrinking of the UTF and
- * Unicode reps of the String object with fewer mallocs. To optimize string
+ * the various representations to enable growing and shrinking of the String
+ * object with fewer mallocs. To optimize string
  * length and indexing operations, this structure also stores the number of
- * characters (same of UTF and Unicode!) once that value has been computed.
+ * code points (independent of encoding form) once that value has been computed.
  */
 
 typedef struct {
@@ -52,17 +35,18 @@ typedef struct {
 				 * Unicode rep, or that the number of UTF bytes ==
 				 * the number of chars. */
     Tcl_Size allocated;		/* The amount of space actually allocated for
-				 * the UTF string (minus 1 byte for the
+				 * the UTF-8 string (minus 1 byte for the
 				 * termination char). */
     Tcl_Size maxChars;		/* Max number of chars that can fit in the
 				 * space allocated for the Unicode array. */
     int hasUnicode;		/* Boolean determining whether the string has
-				 * a Unicode representation. */
-    unsigned short unicode[TCLFLEXARRAY];	/* The array of Unicode chars. The actual size
-				 * of this field depends on the 'maxChars'
-				 * field above. */
+				 * a Tcl_UniChar representation. */
+    unsigned short unicode[TCLFLEXARRAY];	/* The array of Tcl_UniChar units.
+				 * The actual size of this field depends on
+				 * the maxChars field above. */
 } String;
 
+/* Limit on string lengths. The -1 is because the limit excludes the nul */
 #define STRING_MAXCHARS \
     (int)(((size_t)UINT_MAX - offsetof(String, unicode))/sizeof(unsigned short) - 1)
 #define STRING_SIZE(numChars) \
-- 
cgit v0.12