summaryrefslogtreecommitdiffstats
path: root/generic/tclStringObj.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclStringObj.c')
-rw-r--r--generic/tclStringObj.c457
1 files changed, 381 insertions, 76 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 06e60af..720a891 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -458,10 +458,53 @@ Tcl_GetCharLength(
/*
*----------------------------------------------------------------------
*
+ * TclCheckEmptyString --
+ *
+ * Determine whether the string value of an object is or would be the
+ * empty string, without generating a string representation.
+ *
+ * Results:
+ * Returns 1 if empty, 0 if not, and -1 if unknown.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+int
+TclCheckEmptyString(
+ Tcl_Obj *objPtr)
+{
+ int length = -1;
+
+ if (objPtr->bytes == &tclEmptyString) {
+ return TCL_EMPTYSTRING_YES;
+ }
+
+ if (TclListObjIsCanonical(objPtr)) {
+ Tcl_ListObjLength(NULL, objPtr, &length);
+ return length == 0;
+ }
+
+ if (TclIsPureDict(objPtr)) {
+ Tcl_DictObjSize(NULL, objPtr, &length);
+ return length == 0;
+ }
+
+ if (objPtr->bytes == NULL) {
+ return TCL_EMPTYSTRING_UNKNOWN;
+ }
+ return objPtr->length == 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_GetUniChar --
*
- * Get the index'th Unicode character from the String object. The index
- * is assumed to be in the appropriate range.
+ * Get the index'th Unicode character from the String object. If index
+ * is out of range or it references a low surrogate preceded by a high
+ * surrogate, the result = -1;
*
* Results:
* Returns the index'th Unicode character in the Object.
@@ -472,13 +515,18 @@ Tcl_GetCharLength(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_GetUniChar(
Tcl_Obj *objPtr, /* The object to get the Unicode charater
* from. */
int index) /* Get the index'th Unicode character. */
{
String *stringPtr;
+ int ch, length;
+
+ if (index < 0) {
+ return -1;
+ }
/*
* Optimize the case where we're really dealing with a bytearray object
@@ -486,9 +534,12 @@ Tcl_GetUniChar(
*/
if (TclIsPureByteArray(objPtr)) {
- unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL);
+ unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length);
+ if (index >= length) {
+ return -1;
+ }
- return (Tcl_UniChar) bytes[index];
+ return (int) bytes[index];
}
/*
@@ -512,7 +563,28 @@ Tcl_GetUniChar(
FillUnicodeRep(objPtr);
stringPtr = GET_STRING(objPtr);
}
- return stringPtr->unicode[index];
+
+ if (index >= stringPtr->numChars) {
+ return -1;
+ }
+ ch = stringPtr->unicode[index];
+#if TCL_UTF_MAX <= 4
+ /* See: bug [11ae2be95dac9417] */
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x400) {
+ if ((index > 0)
+ && ((stringPtr->unicode[index-1] & 0xFC00) == 0xD800)) {
+ ch = -1; /* low surrogate preceded by high surrogate */
+ }
+ } else if ((++index < stringPtr->numChars)
+ && ((stringPtr->unicode[index] & 0xFC00) == 0xDC00)) {
+ /* high surrogate followed by low surrogate */
+ ch = (((ch & 0x3FF) << 10) |
+ (stringPtr->unicode[index] & 0x3FF)) + 0x10000;
+ }
+ }
+#endif
+ return ch;
}
/*
@@ -612,6 +684,11 @@ Tcl_GetRange(
{
Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */
String *stringPtr;
+ int length;
+
+ if (first < 0) {
+ first = 0;
+ }
/*
* Optimize the case where we're really dealing with a bytearray object
@@ -619,9 +696,15 @@ Tcl_GetRange(
*/
if (TclIsPureByteArray(objPtr)) {
- unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL);
+ unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length);
- return Tcl_NewByteArrayObj(bytes+first, last-first+1);
+ if (last >= length) {
+ last = length - 1;
+ }
+ if (last < first) {
+ return Tcl_NewObj();
+ }
+ return Tcl_NewByteArrayObj(bytes + first, last - first + 1);
}
/*
@@ -640,6 +723,12 @@ Tcl_GetRange(
TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length);
}
if (stringPtr->numChars == objPtr->length) {
+ if (last >= stringPtr->numChars) {
+ last = stringPtr->numChars - 1;
+ }
+ if (last < first) {
+ return Tcl_NewObj();
+ }
newObjPtr = Tcl_NewStringObj(objPtr->bytes + first, last-first+1);
/*
@@ -654,19 +743,25 @@ Tcl_GetRange(
FillUnicodeRep(objPtr);
stringPtr = GET_STRING(objPtr);
}
-
-#if TCL_UTF_MAX == 4
- /* See: bug [11ae2be95dac9417] */
- if ((first>0) && ((stringPtr->unicode[first]&0xFC00) == 0xDC00)
- && ((stringPtr->unicode[first-1]&0xFC00) == 0xD800)) {
- ++first;
- }
- if ((last+1<stringPtr->numChars) && ((stringPtr->unicode[last+1]&0xFC00) == 0xDC00)
- && ((stringPtr->unicode[last]&0xFC00) == 0xD800)) {
- ++last;
- }
+ if (last > stringPtr->numChars) {
+ last = stringPtr->numChars;
+ }
+ if (last < first) {
+ return Tcl_NewObj();
+ }
+#if TCL_UTF_MAX <= 4
+ /* See: bug [11ae2be95dac9417] */
+ if ((first > 0) && ((stringPtr->unicode[first] & 0xFC00) == 0xDC00)
+ && ((stringPtr->unicode[first-1] & 0xFC00) == 0xD800)) {
+ ++first;
+ }
+ if ((last + 1 < stringPtr->numChars)
+ && ((stringPtr->unicode[last+1] & 0xFC00) == 0xDC00)
+ && ((stringPtr->unicode[last] & 0xFC00) == 0xD800)) {
+ ++last;
+ }
#endif
- return Tcl_NewUnicodeObj(stringPtr->unicode + first, last-first+1);
+ return Tcl_NewUnicodeObj(stringPtr->unicode + first, last - first + 1);
}
/*
@@ -1208,45 +1303,51 @@ Tcl_AppendObjToObj(
/*
* Handle append of one bytearray object to another as a special case.
* Note that we only do this when the objects are pure so that the
- * bytearray faithfully represent the true value; Otherwise
- * appending the byte arrays together could lose information;
+ * bytearray faithfully represent the true value; Otherwise appending the
+ * byte arrays together could lose information;
*/
if ((TclIsPureByteArray(objPtr) || objPtr->bytes == &tclEmptyString)
&& TclIsPureByteArray(appendObjPtr)) {
-
/*
* You might expect the code here to be
*
* bytes = Tcl_GetByteArrayFromObj(appendObjPtr, &length);
* TclAppendBytesToByteArray(objPtr, bytes, length);
*
- * and essentially all of the time that would be fine. However,
- * it would run into trouble in the case where objPtr and
- * appendObjPtr point to the same thing. That may never be a
- * good idea. It seems to violate Copy On Write, and we don't
- * have any tests for the situation, since making any Tcl commands
- * that call Tcl_AppendObjToObj() do that appears impossible
- * (They honor Copy On Write!). For the sake of extensions that
- * go off into that realm, though, here's a more complex approach
- * that can handle all the cases.
+ * and essentially all of the time that would be fine. However, it
+ * would run into trouble in the case where objPtr and appendObjPtr
+ * point to the same thing. That may never be a good idea. It seems to
+ * violate Copy On Write, and we don't have any tests for the
+ * situation, since making any Tcl commands that call
+ * Tcl_AppendObjToObj() do that appears impossible (They honor Copy On
+ * Write!). For the sake of extensions that go off into that realm,
+ * though, here's a more complex approach that can handle all the
+ * cases.
+ *
+ * First, get the lengths.
*/
- /* Get lengths */
int lengthSrc;
(void) Tcl_GetByteArrayFromObj(objPtr, &length);
(void) Tcl_GetByteArrayFromObj(appendObjPtr, &lengthSrc);
- /* Grow buffer enough for the append */
+ /*
+ * Grow buffer enough for the append.
+ */
+
TclAppendBytesToByteArray(objPtr, NULL, lengthSrc);
- /* Reset objPtr back to the original value */
+ /*
+ * Reset objPtr back to the original value.
+ */
+
Tcl_SetByteArrayLength(objPtr, length);
/*
- * Now do the append knowing that buffer growth cannot cause
- * any trouble.
+ * Now do the append knowing that buffer growth cannot cause any
+ * trouble.
*/
TclAppendBytesToByteArray(objPtr,
@@ -1294,6 +1395,7 @@ Tcl_AppendObjToObj(
numChars = stringPtr->numChars;
if ((numChars >= 0) && (appendObjPtr->typePtr == &tclStringType)) {
String *appendStringPtr = GET_STRING(appendObjPtr);
+
appendNumChars = appendStringPtr->numChars;
}
@@ -1850,7 +1952,8 @@ Tcl_AppendFormatToObj(
format += step;
step = TclUtfToUniChar(format, &ch);
}
- } else if ((ch == 't') || (ch == 'z') || (ch == 'q') || (ch == 'j') || (ch == 'L')) {
+ } else if ((ch == 't') || (ch == 'z') || (ch == 'q') || (ch == 'j')
+ || (ch == 'L')) {
format += step;
step = TclUtfToUniChar(format, &ch);
useBig = 1;
@@ -1885,7 +1988,7 @@ Tcl_AppendFormatToObj(
}
break;
case 'c': {
- char buf[TCL_UTF_MAX];
+ char buf[4];
int code, length;
if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
@@ -2367,7 +2470,7 @@ Tcl_AppendFormatToObj(
/*
*---------------------------------------------------------------------------
*
- * Tcl_Format--
+ * Tcl_Format --
*
* Results:
* A refcount zero Tcl_Obj.
@@ -2758,7 +2861,10 @@ TclStringRepeat(
Tcl_GetByteArrayFromObj(objResultPtr, NULL),
(count - done) * length);
} else if (unichar) {
- /* Efficiently produce a pure Tcl_UniChar array result */
+ /*
+ * Efficiently produce a pure Tcl_UniChar array result.
+ */
+
if (!inPlace || Tcl_IsShared(objPtr)) {
objResultPtr = Tcl_NewUnicodeObj(Tcl_GetUnicode(objPtr), length);
} else {
@@ -2784,7 +2890,10 @@ TclStringRepeat(
Tcl_AppendUnicodeToObj(objResultPtr, Tcl_GetUnicode(objResultPtr),
(count - done) * length);
} else {
- /* Efficiently concatenate string reps */
+ /*
+ * Efficiently concatenate string reps.
+ */
+
if (!inPlace || Tcl_IsShared(objPtr)) {
objResultPtr = Tcl_NewStringObj(Tcl_GetString(objPtr), length);
} else {
@@ -2868,9 +2977,10 @@ TclStringCat(
/* Value has a string rep. */
if (objPtr->length) {
/*
- * Non-empty string rep. Not a pure bytearray, so we
- * won't create a pure bytearray
+ * Non-empty string rep. Not a pure bytearray, so we won't
+ * create a pure bytearray.
*/
+
binary = 0;
if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) {
/* Prevent shimmer of non-string types. */
@@ -2891,8 +3001,12 @@ TclStringCat(
} while (--oc && (binary || allowUniChar));
if (binary) {
- /* Result will be pure byte array. Pre-size it */
- ov = objv; oc = objc;
+ /*
+ * Result will be pure byte array. Pre-size it
+ */
+
+ ov = objv;
+ oc = objc;
do {
Tcl_Obj *objPtr = *ov++;
@@ -2912,8 +3026,12 @@ TclStringCat(
}
} while (--oc);
} else if (allowUniChar && requestUniChar) {
- /* Result will be pure Tcl_UniChar array. Pre-size it. */
- ov = objv; oc = objc;
+ /*
+ * Result will be pure Tcl_UniChar array. Pre-size it.
+ */
+
+ ov = objv;
+ oc = objc;
do {
Tcl_Obj *objPtr = *ov++;
@@ -2958,9 +3076,9 @@ TclStringCat(
} while (--oc && (length == 0) && (pendingPtr == NULL));
/*
- * Either we found a possibly non-empty value, and we
- * remember this index as the first and last such value so
- * far seen, or (oc == 0) and all values are known empty,
+ * Either we found a possibly non-empty value, and we remember
+ * this index as the first and last such value so far seen,
+ * or (oc == 0) and all values are known empty,
* so first = last = objc - 1 signals the right quick return.
*/
@@ -2972,10 +3090,9 @@ TclStringCat(
/* assert ( pendingPtr != NULL ) */
/*
- * There's a pending value followed by more values.
- * Loop over remaining values generating strings until
- * a non-empty value is found, or the pending value gets
- * its string generated.
+ * There's a pending value followed by more values. Loop over
+ * remaining values generating strings until a non-empty value
+ * is found, or the pending value gets its string generated.
*/
do {
@@ -3031,10 +3148,10 @@ TclStringCat(
unsigned char *dst;
/*
- * Broken interface! Byte array value routines offer no way
- * to handle failure to allocate enough space. Following
- * stanza may panic.
+ * Broken interface! Byte array value routines offer no way to handle
+ * failure to allocate enough space. Following stanza may panic.
*/
+
if (inPlace && !Tcl_IsShared(*objv)) {
int start;
@@ -3149,6 +3266,7 @@ TclStringCat(
if ((objPtr->bytes == NULL) || (objPtr->length)) {
int more;
char *src = Tcl_GetStringFromObj(objPtr, &more);
+
memcpy(dst, src, (size_t) more);
dst += more;
}
@@ -3170,6 +3288,187 @@ TclStringCat(
/*
*---------------------------------------------------------------------------
*
+ * TclStringCmp --
+ * Compare two Tcl_Obj values as strings.
+ *
+ * Results:
+ * Like memcmp, return -1, 0, or 1.
+ *
+ * Side effects:
+ * String representations may be generated. Internal representation may
+ * be changed.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int TclStringCmp(
+ Tcl_Obj *value1Ptr,
+ Tcl_Obj *value2Ptr,
+ int checkEq, /* comparison is only for equality */
+ int nocase, /* comparison is not case sensitive */
+ int reqlength) /* requested length */
+{
+ char *s1, *s2;
+ int empty, length, match, s1len, s2len;
+ memCmpFn_t memCmpFn;
+
+ if ((reqlength == 0) || (value1Ptr == value2Ptr)) {
+ /*
+ * Always match at 0 chars of if it is the same obj.
+ */
+ match = 0;
+ } else {
+
+ if (!nocase && TclIsPureByteArray(value1Ptr)
+ && TclIsPureByteArray(value2Ptr)) {
+ /*
+ * Use binary versions of comparisons since that won't cause undue
+ * type conversions and it is much faster. Only do this if we're
+ * case-sensitive (which is all that really makes sense with byte
+ * arrays anyway, and we have no memcasecmp() for some reason... :^)
+ */
+
+ s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len);
+ s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len);
+ memCmpFn = memcmp;
+ } else if ((value1Ptr->typePtr == &tclStringType)
+ && (value2Ptr->typePtr == &tclStringType)) {
+ /*
+ * Do a unicode-specific comparison if both of the args are of
+ * String type. If the char length == byte length, we can do a
+ * memcmp. In benchmark testing this proved the most efficient
+ * check between the unicode and string comparison operations.
+ */
+
+ if (nocase) {
+ s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len);
+ s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len);
+ memCmpFn = (memCmpFn_t)Tcl_UniCharNcasecmp;
+ } else {
+ s1len = Tcl_GetCharLength(value1Ptr);
+ s2len = Tcl_GetCharLength(value2Ptr);
+ if ((s1len == value1Ptr->length)
+ && (value1Ptr->bytes != NULL)
+ && (s2len == value2Ptr->length)
+ && (value2Ptr->bytes != NULL)) {
+ s1 = value1Ptr->bytes;
+ s2 = value2Ptr->bytes;
+ memCmpFn = memcmp;
+ } else {
+ s1 = (char *) Tcl_GetUnicode(value1Ptr);
+ s2 = (char *) Tcl_GetUnicode(value2Ptr);
+ if (
+#ifdef WORDS_BIGENDIAN
+ 1
+#else
+ checkEq
+#endif
+ ) {
+ memCmpFn = memcmp;
+ s1len *= sizeof(Tcl_UniChar);
+ s2len *= sizeof(Tcl_UniChar);
+ } else {
+ memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp;
+ }
+ }
+ }
+ } else {
+ if ((empty = TclCheckEmptyString(value1Ptr)) > 0) {
+ switch (TclCheckEmptyString(value2Ptr)) {
+ case -1:
+ s1 = 0;
+ s1len = 0;
+ s2 = TclGetStringFromObj(value2Ptr, &s2len);
+ break;
+ case 0:
+ match = -1;
+ goto matchdone;
+ case 1:
+ default: /* avoid warn: `s2` may be used uninitialized */
+ match = 0;
+ goto matchdone;
+ }
+ } else if (TclCheckEmptyString(value2Ptr) > 0) {
+ switch (empty) {
+ case -1:
+ s2 = 0;
+ s2len = 0;
+ s1 = TclGetStringFromObj(value1Ptr, &s1len);
+ break;
+ case 0:
+ match = 1;
+ goto matchdone;
+ case 1:
+ default: /* avoid warn: `s1` may be used uninitialized */
+ match = 0;
+ goto matchdone;
+ }
+ } else {
+ s1 = TclGetStringFromObj(value1Ptr, &s1len);
+ s2 = TclGetStringFromObj(value2Ptr, &s2len);
+ }
+ if (!nocase && checkEq) {
+ /*
+ * When we have equal-length we can check only for
+ * (in)equality. We can use memcmp in all (n)eq cases because
+ * we don't need to worry about lexical LE/BE variance.
+ */
+
+ memCmpFn = memcmp;
+ } else {
+ /*
+ * As a catch-all we will work with UTF-8. We cannot use
+ * memcmp() as that is unsafe with any string containing NUL
+ * (\xC0\x80 in Tcl's utf rep). We can use the more efficient
+ * TclpUtfNcmp2 if we are case-sensitive and no specific
+ * length was requested.
+ */
+
+ if ((reqlength < 0) && !nocase) {
+ memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
+ } else {
+ s1len = Tcl_NumUtfChars(s1, s1len);
+ s2len = Tcl_NumUtfChars(s2, s2len);
+ memCmpFn = (memCmpFn_t)
+ (nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp);
+ }
+ }
+ }
+
+ length = (s1len < s2len) ? s1len : s2len;
+ if (reqlength > 0 && reqlength < length) {
+ length = reqlength;
+ } else if (reqlength < 0) {
+ /*
+ * The requested length is negative, so we ignore it by setting it
+ * to length + 1 so we correct the match var.
+ */
+
+ reqlength = length + 1;
+ }
+
+ if (checkEq && (s1len != s2len)) {
+ match = 1; /* This will be reversed below. */
+ } else {
+ /*
+ * The comparison function should compare up to the minimum byte
+ * length only.
+ */
+
+ match = memCmpFn(s1, s2, (size_t) length);
+ }
+ if ((match == 0) && (reqlength > length)) {
+ match = s1len - s2len;
+ }
+ match = (match > 0) ? 1 : (match < 0) ? -1 : 0;
+ }
+ matchdone:
+ return match;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
* TclStringFirst --
*
* Implements the [string first] operation.
@@ -3365,14 +3664,16 @@ ReverseBytes(
/* reversing as you go. */
{
unsigned char *src = from + count;
+
if (to == from) {
/* Reversing in place */
while (--src > to) {
unsigned char c = *src;
+
*src = *to;
*to++ = c;
}
- } else {
+ } else {
while (--src >= from) {
*to++ = *src;
}
@@ -3421,7 +3722,10 @@ TclStringReverse(
*to++ = *src;
}
} else {
- /* Reversing in place */
+ /*
+ * Reversing in place.
+ */
+
while (--src > from) {
ch = *src;
*src = *from;
@@ -3445,20 +3749,22 @@ TclStringReverse(
/*
* Either numChars == -1 and we don't know how many chars are
* represented by objPtr->bytes and we need Pass 1 just in case,
- * or numChars >= 0 and we know we have fewer chars than bytes,
- * so we know there's a multibyte character needing Pass 1.
+ * or numChars >= 0 and we know we have fewer chars than bytes, so
+ * we know there's a multibyte character needing Pass 1.
*
* Pass 1. Reverse the bytes of each multi-byte character.
*/
+
size_t charCount = 0;
size_t bytesLeft = numBytes;
while (bytesLeft) {
/*
- * NOTE: We know that the from buffer is NUL-terminated.
- * It's part of the contract for objPtr->bytes values.
- * Thus, we can skip calling Tcl_UtfCharComplete() here.
+ * NOTE: We know that the from buffer is NUL-terminated. It's
+ * part of the contract for objPtr->bytes values. Thus, we can
+ * skip calling Tcl_UtfCharComplete() here.
*/
+
size_t bytesInChar = TclUtfToUniChar(from, &ch);
ReverseBytes((unsigned char *)to, (unsigned char *)from,
@@ -3488,19 +3794,18 @@ TclStringReverse(
*
* The result is a concatenation of a prefix from objPtr, characters
* 0 through first-1, the insertPtr string value, and a suffix from
- * objPtr, characters from first + count to the end. The effect is
- * as if the inner substring of characters first through first+count-1
- * are removed and replaced with insertPtr.
- * If insertPtr is NULL, it is treated as an empty string.
- * When passed the flag TCL_STRING_IN_PLACE, this routine will try
- * to do the work within objPtr, so long as no sharing forbids it.
- * Without that request, or as needed, a new Tcl value will be allocated
- * to be the result.
+ * objPtr, characters from first + count to the end. The effect is as if
+ * the inner substring of characters first through first+count-1 are
+ * removed and replaced with insertPtr. If insertPtr is NULL, it is
+ * treated as an empty string. When passed the flag TCL_STRING_IN_PLACE,
+ * this routine will try to do the work within objPtr, so long as no
+ * sharing forbids it. Without that request, or as needed, a new Tcl
+ * value will be allocated to be the result.
*
* Results:
- * A Tcl value that is the result of the substring replacement.
- * May return NULL in case of an error. When NULL is returned and
- * interp is non-NULL, error information is left in interp
+ * A Tcl value that is the result of the substring replacement. May
+ * return NULL in case of an error. When NULL is returned and interp is
+ * non-NULL, error information is left in interp
*
*---------------------------------------------------------------------------
*/