From b583ea7360808cb502d1ea65954ab0387ebdd823 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 3 May 2023 15:51:10 +0000 Subject: Backport "Comments only. Fix blatantly obsolete ones". And a few more improvements from the same files. --- generic/tclStringObj.c | 95 +++++++++++++++++++++++++++++--------------------- generic/tclStringRep.h | 44 ++++++++--------------- 2 files changed, 70 insertions(+), 69 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index d3a17d1..7fbf77a 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1,29 +1,27 @@ /* * tclStringObj.c -- * - * This file contains functions that implement string operations on Tcl - * objects. Some string operations work with UTF strings and others - * require Unicode format. Functions that require knowledge of the width - * of each character, such as indexing, operate on Unicode data. - * - * A Unicode string is an internationalized string. Conceptually, a - * Unicode string is an array of 16-bit quantities organized as a - * sequence of properly formed UTF-8 characters. There is a one-to-one - * map between Unicode and UTF characters. Because Unicode characters - * have a fixed width, operations such as indexing operate on Unicode - * data. The String object is optimized for the case where each UTF char + * This file contains functions that implement string operations on Tcl + * objects. Some string operations work with UTF-8 encoding forms. + * Functions that require knowledge of the width of each character, + * such as indexing, operate on fixed width encoding forms such as UTF-32. + * + * Conceptually, a string is a sequence of Unicode code points. Internally + * it may be stored in an encoding form such as a modified version of + * UTF-8 or UTF-16 (when TCL_UTF_MAX=3) or UTF-32. + * + * The String object is optimized for the case where each UTF char * in a string is only one byte. 
In this case, we store the value of - * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode - * is explicitly called). + * numChars, but we don't store the fixed-width encoding form (unless + * Tcl_GetUnicode is explicitly called). * - * The String object type stores one or both formats. The default - * behavior is to store UTF. Once Unicode is calculated by a function, it - * is stored in the internal rep for future access (without an additional - * O(n) cost). + * The String object type stores one or both formats. The default + * behavior is to store UTF-8. Once UTF-16/UTF-32 is calculated, it is + * stored in the internal rep for future access (without an additional + * O(n) cost). * * To allow many appends to be done to an object without constantly - * reallocating the space for the string or Unicode representation, we - * allocate double the space for the string or Unicode and use the + * reallocating space, we allocate double the space and use the * internal representation to keep track of how much space is used vs. * allocated. * @@ -37,7 +35,6 @@ #include "tclInt.h" #include "tclTomMath.h" #include "tclStringRep.h" - #include "assert.h" /* * Prototypes for functions defined later in this file: @@ -631,10 +628,8 @@ TclGetCharLength( */ if (TclIsPureByteArray(objPtr)) { - int length; - - (void) Tcl_GetByteArrayFromObj(objPtr, &length); - return length; + (void) Tcl_GetByteArrayFromObj(objPtr, &numChars); + return numChars; } /* @@ -675,10 +670,10 @@ Tcl_GetCharLength( } /* - * Optimize BytArray case: No need to convert to a string to perform the - * get-length operation. + * Optimize the case where we're really dealing with a bytearray object; + * we don't need to convert to a string to perform the get-length operation. * - * Starting in Tcl 8.7, check for a "pure" bytearray, because the + * Starting in Tcl 8.7, we check for a "pure" bytearray, because the * machinery behind that test is using a proper bytearray ObjType. 
We * could also compute length of an improper bytearray without shimmering * but there's no value in that. We *want* to shimmer an improper bytearray @@ -686,16 +681,17 @@ Tcl_GetCharLength( */ if (TclIsPureByteArray(objPtr)) { - (void) Tcl_GetByteArrayFromObj(objPtr, &numChars); } else { Tcl_GetString(objPtr); numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); } + return numChars; } #endif + /* *---------------------------------------------------------------------- * @@ -722,6 +718,11 @@ TclCheckEmptyString( return TCL_EMPTYSTRING_YES; } + if (TclIsPureByteArray(objPtr) + && Tcl_GetCharLength(objPtr) == 0) { + return TCL_EMPTYSTRING_YES; + } + if (TclListObjIsCanonical(objPtr)) { TclListObjLengthM(NULL, objPtr, &length); return length == 0; @@ -2383,12 +2384,16 @@ Tcl_AppendFormatToObj( width = 0; if (isdigit(UCHAR(ch))) { - width = strtoul(format, &end, 10); - if (width < 0) { + /* Note ull will be >= 0 because of isdigit check above */ + unsigned long long ull; + ull = strtoull(format, &end, 10); + /* Comparison is >=, not >, to leave room for nul */ + if (ull >= WIDE_MAX) { msg = overflow; errCode = "OVERFLOW"; goto errorMsg; } + width = (Tcl_WideInt)ull; format = end; step = TclUtfToUniChar(format, &ch); } else if (ch == '*') { @@ -2425,7 +2430,16 @@ Tcl_AppendFormatToObj( step = TclUtfToUniChar(format, &ch); } if (isdigit(UCHAR(ch))) { - precision = strtoul(format, &end, 10); + /* Note ull will be >= 0 because of isdigit check above */ + unsigned long long ull; + ull = strtoull(format, &end, 10); + /* Comparison is >=, not >, to leave room for nul */ + if (ull >= WIDE_MAX) { + msg = overflow; + errCode = "OVERFLOW"; + goto errorMsg; + } + precision = (Tcl_WideInt)ull; format = end; step = TclUtfToUniChar(format, &ch); } else if (ch == '*') { @@ -2531,6 +2545,9 @@ Tcl_AppendFormatToObj( if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { goto error; } + if ((unsigned)code > 0x10FFFF) { + code = 0xFFFD; + } length = Tcl_UniCharToUtf(code, buf); if 
((code >= 0xD800) && (length < 3)) { /* Special case for handling high surrogates. */ @@ -3875,6 +3892,7 @@ TclStringCmp( if ((reqlength == 0) || (value1Ptr == value2Ptr)) { /* * Always match at 0 chars of if it is the same obj. + * Note: as documented, a negative reqlength means it is ignored */ match = 0; } else { @@ -4006,15 +4024,15 @@ TclStringCmp( * comparison function. */ length = (s1len < s2len) ? s1len : s2len; - if (reqlength > 0 && reqlength < length) { - length = reqlength; - } else if (reqlength < 0) { + if (reqlength < 0) { /* * The requested length is negative, so ignore it by setting it * to length + 1 to correct the match var. */ reqlength = length + 1; + } else if (reqlength > 0 && reqlength < length) { + length = reqlength; } if (checkEq && reqlength < 0 && (s1len != s2len)) { @@ -4452,18 +4470,17 @@ TclStringReplace( int inPlace = flags & TCL_STRING_IN_PLACE; Tcl_Obj *result; - /* Caller is expected to pass sensible arguments */ - assert ( count >= 0 ) ; - assert ( first >= 0 ) ; - /* Replace nothing with nothing */ - if ((insertPtr == NULL) && (count == 0)) { + if ((insertPtr == NULL) && (count <= 0)) { if (inPlace) { return objPtr; } else { return Tcl_DuplicateObj(objPtr); } } + if (first < 0) { + first = 0; + } /* * The caller very likely had to call Tcl_GetCharLength() or similar diff --git a/generic/tclStringRep.h b/generic/tclStringRep.h index 0219a00..ef64d6c 100644 --- a/generic/tclStringRep.h +++ b/generic/tclStringRep.h @@ -1,29 +1,12 @@ /* * tclStringRep.h -- * - * This file contains the definition of the Unicode string internal - * representation and macros to access it. + * This file contains the definition of internal representations of a string + * and macros to access them. * - * A Unicode string is an internationalized string. Conceptually, a - * Unicode string is an array of 16-bit quantities organized as a - * sequence of properly formed UTF-8 characters. There is a one-to-one - * map between Unicode and UTF characters. 
Because Unicode characters - * have a fixed width, operations such as indexing operate on Unicode - * data. The String object is optimized for the case where each UTF char - * in a string is only one byte. In this case, we store the value of - * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode - * is explicitly called). - * - * The String object type stores one or both formats. The default - * behavior is to store UTF. Once Unicode is calculated by a function, it - * is stored in the internal rep for future access (without an additional - * O(n) cost). - * - * To allow many appends to be done to an object without constantly - * reallocating the space for the string or Unicode representation, we - * allocate double the space for the string or Unicode and use the - * internal representation to keep track of how much space is used vs. - * allocated. + * Conceptually, a string is a sequence of Unicode code points. Internally + * it may be stored in an encoding form such as a modified version of UTF-8 + * or UTF-16 (when TCL_UTF_MAX=3) or UTF-32. * * Copyright (c) 1995-1997 Sun Microsystems, Inc. * Copyright (c) 1999 by Scriptics Corporation. @@ -39,10 +22,10 @@ /* * The following structure is the internal rep for a String object. It keeps * track of how much memory has been used and how much has been allocated for - * the Unicode and UTF string to enable growing and shrinking of the UTF and - * Unicode reps of the String object with fewer mallocs. To optimize string + * the various representations to enable growing and shrinking of + * the String object with fewer mallocs. To optimize string * length and indexing operations, this structure also stores the number of - * characters (same of UTF and Unicode!) once that value has been computed. + * code points (independent of encoding form) once that value has been computed. */ typedef struct { @@ -52,17 +35,18 @@ typedef struct { * Unicode rep, or that the number of UTF bytes == * the number of chars. 
*/ Tcl_Size allocated; /* The amount of space actually allocated for - * the UTF string (minus 1 byte for the + * the UTF-8 string (minus 1 byte for the * termination char). */ Tcl_Size maxChars; /* Max number of chars that can fit in the * space allocated for the Unicode array. */ int hasUnicode; /* Boolean determining whether the string has - * a Unicode representation. */ - unsigned short unicode[TCLFLEXARRAY]; /* The array of Unicode chars. The actual size - * of this field depends on the 'maxChars' - * field above. */ + * a Tcl_UniChar representation. */ + unsigned short unicode[TCLFLEXARRAY]; /* The array of Tcl_UniChar units. + * The actual size of this field depends on + * the maxChars field above. */ } String; +/* Limit on string lengths. The -1 is because the limit does not include the terminating nul */ #define STRING_MAXCHARS \ (int)(((size_t)UINT_MAX - offsetof(String, unicode))/sizeof(unsigned short) - 1) #define STRING_SIZE(numChars) \ -- cgit v0.12