summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- generic/tclStringObj.c 95
-rw-r--r-- generic/tclStringRep.h 44
2 files changed, 70 insertions, 69 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index d3a17d1..7fbf77a 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -1,29 +1,27 @@
/*
* tclStringObj.c --
*
- * This file contains functions that implement string operations on Tcl
- * objects. Some string operations work with UTF strings and others
- * require Unicode format. Functions that require knowledge of the width
- * of each character, such as indexing, operate on Unicode data.
- *
- * A Unicode string is an internationalized string. Conceptually, a
- * Unicode string is an array of 16-bit quantities organized as a
- * sequence of properly formed UTF-8 characters. There is a one-to-one
- * map between Unicode and UTF characters. Because Unicode characters
- * have a fixed width, operations such as indexing operate on Unicode
- * data. The String object is optimized for the case where each UTF char
+ * This file contains functions that implement string operations on Tcl
+ * objects. Some string operations work with UTF-8 encoding forms.
+ * Functions that require knowledge of the width of each character,
+ * such as indexing, operate on fixed-width encoding forms such as UTF-32.
+ *
+ * Conceptually, a string is a sequence of Unicode code points. Internally
+ * it may be stored in an encoding form such as a modified version of
+ * UTF-8 or UTF-16 (when TCL_UTF_MAX=3) or UTF-32.
+ *
+ * The String object is optimized for the case where each UTF char
* in a string is only one byte. In this case, we store the value of
- * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode
- * is explicitly called).
+ * numChars, but we don't store the fixed-width encoding form (unless
+ * Tcl_GetUnicode is explicitly called).
*
- * The String object type stores one or both formats. The default
- * behavior is to store UTF. Once Unicode is calculated by a function, it
- * is stored in the internal rep for future access (without an additional
- * O(n) cost).
+ * The String object type stores one or both formats. The default
+ * behavior is to store UTF-8. Once UTF-16/UTF-32 is calculated, it is
+ * stored in the internal rep for future access (without an additional
+ * O(n) cost).
*
* To allow many appends to be done to an object without constantly
- * reallocating the space for the string or Unicode representation, we
- * allocate double the space for the string or Unicode and use the
+ * reallocating space, we allocate double the space and use the
* internal representation to keep track of how much space is used vs.
* allocated.
*
@@ -37,7 +35,6 @@
#include "tclInt.h"
#include "tclTomMath.h"
#include "tclStringRep.h"
-
#include "assert.h"
/*
* Prototypes for functions defined later in this file:
@@ -631,10 +628,8 @@ TclGetCharLength(
*/
if (TclIsPureByteArray(objPtr)) {
- int length;
-
- (void) Tcl_GetByteArrayFromObj(objPtr, &length);
- return length;
+ (void) Tcl_GetByteArrayFromObj(objPtr, &numChars);
+ return numChars;
}
/*
@@ -675,10 +670,10 @@ Tcl_GetCharLength(
}
/*
- * Optimize BytArray case: No need to convert to a string to perform the
- * get-length operation.
+ * Optimize the case where we're really dealing with a bytearray object;
+ * we don't need to convert to a string to perform the get-length operation.
*
- * Starting in Tcl 8.7, check for a "pure" bytearray, because the
+ * Starting in Tcl 8.7, we check for a "pure" bytearray, because the
* machinery behind that test is using a proper bytearray ObjType. We
* could also compute length of an improper bytearray without shimmering
* but there's no value in that. We *want* to shimmer an improper bytearray
@@ -686,16 +681,17 @@ Tcl_GetCharLength(
*/
if (TclIsPureByteArray(objPtr)) {
-
(void) Tcl_GetByteArrayFromObj(objPtr, &numChars);
} else {
Tcl_GetString(objPtr);
numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length);
}
+
return numChars;
}
#endif
+
/*
*----------------------------------------------------------------------
*
@@ -722,6 +718,11 @@ TclCheckEmptyString(
return TCL_EMPTYSTRING_YES;
}
+ if (TclIsPureByteArray(objPtr)
+ && Tcl_GetCharLength(objPtr) == 0) {
+ return TCL_EMPTYSTRING_YES;
+ }
+
if (TclListObjIsCanonical(objPtr)) {
TclListObjLengthM(NULL, objPtr, &length);
return length == 0;
@@ -2383,12 +2384,16 @@ Tcl_AppendFormatToObj(
width = 0;
if (isdigit(UCHAR(ch))) {
- width = strtoul(format, &end, 10);
- if (width < 0) {
+ /* The isdigit check above rules out a leading sign, so ull is a plain magnitude */
+ unsigned long long ull;
+ ull = strtoull(format, &end, 10);
+ /* Comparison is >=, not >, to leave room for nul */
+ if (ull >= WIDE_MAX) {
msg = overflow;
errCode = "OVERFLOW";
goto errorMsg;
}
+ width = (Tcl_WideInt)ull;
format = end;
step = TclUtfToUniChar(format, &ch);
} else if (ch == '*') {
@@ -2425,7 +2430,16 @@ Tcl_AppendFormatToObj(
step = TclUtfToUniChar(format, &ch);
}
if (isdigit(UCHAR(ch))) {
- precision = strtoul(format, &end, 10);
+ /* The isdigit check above rules out a leading sign, so ull is a plain magnitude */
+ unsigned long long ull;
+ ull = strtoull(format, &end, 10);
+ /* Comparison is >=, not >, to leave room for nul */
+ if (ull >= WIDE_MAX) {
+ msg = overflow;
+ errCode = "OVERFLOW";
+ goto errorMsg;
+ }
+ precision = (Tcl_WideInt)ull;
format = end;
step = TclUtfToUniChar(format, &ch);
} else if (ch == '*') {
@@ -2531,6 +2545,9 @@ Tcl_AppendFormatToObj(
if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
goto error;
}
+ if ((unsigned)code > 0x10FFFF) {
+ code = 0xFFFD;
+ }
length = Tcl_UniCharToUtf(code, buf);
if ((code >= 0xD800) && (length < 3)) {
/* Special case for handling high surrogates. */
@@ -3875,6 +3892,7 @@ TclStringCmp(
if ((reqlength == 0) || (value1Ptr == value2Ptr)) {
/*
* Always match at 0 chars of if it is the same obj.
+ * Note: as documented, a negative reqlength means it is ignored.
*/
match = 0;
} else {
@@ -4006,15 +4024,15 @@ TclStringCmp(
* comparison function.
*/
length = (s1len < s2len) ? s1len : s2len;
- if (reqlength > 0 && reqlength < length) {
- length = reqlength;
- } else if (reqlength < 0) {
+ if (reqlength < 0) {
/*
* The requested length is negative, so ignore it by setting it
* to length + 1 to correct the match var.
*/
reqlength = length + 1;
+ } else if (reqlength > 0 && reqlength < length) {
+ length = reqlength;
}
if (checkEq && reqlength < 0 && (s1len != s2len)) {
@@ -4452,18 +4470,17 @@ TclStringReplace(
int inPlace = flags & TCL_STRING_IN_PLACE;
Tcl_Obj *result;
- /* Caller is expected to pass sensible arguments */
- assert ( count >= 0 ) ;
- assert ( first >= 0 ) ;
-
/* Replace nothing with nothing */
- if ((insertPtr == NULL) && (count == 0)) {
+ if ((insertPtr == NULL) && (count <= 0)) {
if (inPlace) {
return objPtr;
} else {
return Tcl_DuplicateObj(objPtr);
}
}
+ if (first < 0) {
+ first = 0;
+ }
/*
* The caller very likely had to call Tcl_GetCharLength() or similar
diff --git a/generic/tclStringRep.h b/generic/tclStringRep.h
index 0219a00..ef64d6c 100644
--- a/generic/tclStringRep.h
+++ b/generic/tclStringRep.h
@@ -1,29 +1,12 @@
/*
* tclStringRep.h --
*
- * This file contains the definition of the Unicode string internal
- * representation and macros to access it.
+ * This file contains the definition of the internal representations of a
+ * string and the macros to access them.
*
- * A Unicode string is an internationalized string. Conceptually, a
- * Unicode string is an array of 16-bit quantities organized as a
- * sequence of properly formed UTF-8 characters. There is a one-to-one
- * map between Unicode and UTF characters. Because Unicode characters
- * have a fixed width, operations such as indexing operate on Unicode
- * data. The String object is optimized for the case where each UTF char
- * in a string is only one byte. In this case, we store the value of
- * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode
- * is explicitly called).
- *
- * The String object type stores one or both formats. The default
- * behavior is to store UTF. Once Unicode is calculated by a function, it
- * is stored in the internal rep for future access (without an additional
- * O(n) cost).
- *
- * To allow many appends to be done to an object without constantly
- * reallocating the space for the string or Unicode representation, we
- * allocate double the space for the string or Unicode and use the
- * internal representation to keep track of how much space is used vs.
- * allocated.
+ * Conceptually, a string is a sequence of Unicode code points. Internally
+ * it may be stored in an encoding form such as a modified version of UTF-8
+ * or UTF-16 (when TCL_UTF_MAX=3) or UTF-32.
*
* Copyright (c) 1995-1997 Sun Microsystems, Inc.
* Copyright (c) 1999 by Scriptics Corporation.
@@ -39,10 +22,10 @@
/*
* The following structure is the internal rep for a String object. It keeps
* track of how much memory has been used and how much has been allocated for
- * the Unicode and UTF string to enable growing and shrinking of the UTF and
- * Unicode reps of the String object with fewer mallocs. To optimize string
+ * the various representations to enable growing and shrinking of
+ * the String object with fewer mallocs. To optimize string
* length and indexing operations, this structure also stores the number of
- * characters (same of UTF and Unicode!) once that value has been computed.
+ * code points (independent of encoding form) once that value has been computed.
*/
typedef struct {
@@ -52,17 +35,18 @@ typedef struct {
* Unicode rep, or that the number of UTF bytes ==
* the number of chars. */
Tcl_Size allocated; /* The amount of space actually allocated for
- * the UTF string (minus 1 byte for the
+ * the UTF-8 string (minus 1 byte for the
* termination char). */
Tcl_Size maxChars; /* Max number of chars that can fit in the
* space allocated for the Unicode array. */
int hasUnicode; /* Boolean determining whether the string has
- * a Unicode representation. */
- unsigned short unicode[TCLFLEXARRAY]; /* The array of Unicode chars. The actual size
- * of this field depends on the 'maxChars'
- * field above. */
+ * a Tcl_UniChar representation. */
+ unsigned short unicode[TCLFLEXARRAY]; /* The array of Tcl_UniChar units.
+ * The actual size of this field depends on
+ * the maxChars field above. */
} String;
+/* Limit on string lengths. The -1 is because the limit does not include the nul. */
#define STRING_MAXCHARS \
(int)(((size_t)UINT_MAX - offsetof(String, unicode))/sizeof(unsigned short) - 1)
#define STRING_SIZE(numChars) \