summaryrefslogtreecommitdiffstats
path: root/generic/tclStringObj.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclStringObj.c')
-rw-r--r--generic/tclStringObj.c315
1 files changed, 207 insertions, 108 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index e51b98a..85cac83 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -418,12 +418,16 @@ Tcl_GetCharLength(
}
/*
- * Optimize the case where we're really dealing with a bytearray object
- * without string representation; we don't need to convert to a string to
- * perform the get-length operation.
+ * Optimize the case where we're really dealing with a bytearray object;
+ * we don't need to convert to a string to perform the get-length operation.
+ *
+ * NOTE that we do not need the bytearray to be "pure". A ByteArray value
+ * with a string rep cannot be trusted to represent the same value as the
+ * string rep, but it *can* be trusted to have the same character length
+ * as the string rep, which is all this routine cares about.
*/
- if (TclIsPureByteArray(objPtr)) {
+ if (objPtr->typePtr == &tclByteArrayType) {
int length;
(void) Tcl_GetByteArrayFromObj(objPtr, &length);
@@ -1650,6 +1654,7 @@ Tcl_AppendFormatToObj(
const char *span = format, *msg, *errCode;
int numBytes = 0, objIndex = 0, gotXpg = 0, gotSequential = 0;
int originalLength, limit;
+ Tcl_UniChar ch = 0;
static const char *mixedXPG =
"cannot mix \"%\" and \"%n$\" conversion specifiers";
static const char *const badIndex[2] = {
@@ -1677,8 +1682,7 @@ Tcl_AppendFormatToObj(
#endif
int newXpg, numChars, allocSegment = 0, segmentLimit, segmentNumBytes;
Tcl_Obj *segment;
- Tcl_UniChar ch;
- int step = Tcl_UtfToUniChar(format, &ch);
+ int step = TclUtfToUniChar(format, &ch);
format += step;
if (ch != '%') {
@@ -1702,7 +1706,7 @@ Tcl_AppendFormatToObj(
* Step 0. Handle special case of escaped format marker (i.e., %%).
*/
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
if (ch == '%') {
span = format;
numBytes = step;
@@ -1722,7 +1726,7 @@ Tcl_AppendFormatToObj(
newXpg = 1;
objIndex = position - 1;
format = end + 1;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
}
}
if (newXpg) {
@@ -1773,7 +1777,7 @@ Tcl_AppendFormatToObj(
}
if (sawFlag) {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
}
} while (sawFlag);
@@ -1785,7 +1789,7 @@ Tcl_AppendFormatToObj(
if (isdigit(UCHAR(ch))) {
width = strtoul(format, &end, 10);
format = end;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
} else if (ch == '*') {
if (objIndex >= objc - 1) {
msg = badIndex[gotXpg];
@@ -1801,7 +1805,7 @@ Tcl_AppendFormatToObj(
}
objIndex++;
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
}
if (width > limit) {
msg = overflow;
@@ -1817,12 +1821,12 @@ Tcl_AppendFormatToObj(
if (ch == '.') {
gotPrecision = 1;
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
}
if (isdigit(UCHAR(ch))) {
precision = strtoul(format, &end, 10);
format = end;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
} else if (ch == '*') {
if (objIndex >= objc - 1) {
msg = badIndex[gotXpg];
@@ -1843,7 +1847,7 @@ Tcl_AppendFormatToObj(
}
objIndex++;
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
}
/*
@@ -1853,14 +1857,14 @@ Tcl_AppendFormatToObj(
if (ch == 'h') {
useShort = 1;
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
} else if (ch == 'l') {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
if (ch == 'l') {
useBig = 1;
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
#ifndef TCL_WIDE_INT_IS_LONG
} else {
useWide = 1;
@@ -1869,20 +1873,20 @@ Tcl_AppendFormatToObj(
} else if (ch == 'I') {
if ((format[1] == '6') && (format[2] == '4')) {
format += (step + 2);
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
#ifndef TCL_WIDE_INT_IS_LONG
useWide = 1;
#endif
} else if ((format[1] == '3') && (format[2] == '2')) {
format += (step + 2);
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
} else {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
}
} else if ((ch == 't') || (ch == 'z')) {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
#ifndef TCL_WIDE_INT_IS_LONG
if (sizeof(size_t) > sizeof(int)) {
useWide = 1;
@@ -1890,7 +1894,7 @@ Tcl_AppendFormatToObj(
#endif
} else if ((ch == 'q') ||(ch == 'j')) {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
#ifndef TCL_WIDE_INT_IS_LONG
useWide = 1;
#endif
@@ -1939,6 +1943,11 @@ Tcl_AppendFormatToObj(
}
case 'u':
+ if (useBig) {
+ msg = "unsigned bignum format is invalid";
+ errCode = "BADUNSIGNED";
+ goto errorMsg;
+ }
case 'd':
case 'o':
case 'p':
@@ -1962,15 +1971,6 @@ Tcl_AppendFormatToObj(
goto error;
}
isNegative = (mp_cmp_d(&big, 0) == MP_LT);
- if (ch == 'u') {
- if (isNegative) {
- msg = "unsigned bignum format is invalid";
- errCode = "BADUNSIGNED";
- goto errorMsg;
- } else {
- ch = 'd';
- }
- }
#ifndef TCL_WIDE_INT_IS_LONG
} else if (useWide) {
if (Tcl_GetWideIntFromObj(NULL, segment, &w) != TCL_OK) {
@@ -2033,9 +2033,12 @@ Tcl_AppendFormatToObj(
segmentLimit -= 1;
precision--;
break;
+ case 'X':
+ Tcl_AppendToObj(segment, "0X", 2);
+ segmentLimit -= 2;
+ break;
case 'p':
case 'x':
- case 'X':
Tcl_AppendToObj(segment, "0x", 2);
segmentLimit -= 2;
break;
@@ -2043,6 +2046,10 @@ Tcl_AppendFormatToObj(
Tcl_AppendToObj(segment, "0b", 2);
segmentLimit -= 2;
break;
+ case 'd':
+ Tcl_AppendToObj(segment, "0d", 2);
+ segmentLimit -= 2;
+ break;
}
}
@@ -2200,7 +2207,11 @@ Tcl_AppendFormatToObj(
}
digitOffset = (int) (bits % base);
if (digitOffset > 9) {
- bytes[numDigits] = 'a' + digitOffset - 10;
+ if (ch == 'X') {
+ bytes[numDigits] = 'A' + digitOffset - 10;
+ } else {
+ bytes[numDigits] = 'a' + digitOffset - 10;
+ }
} else {
bytes[numDigits] = '0' + digitOffset;
}
@@ -2322,14 +2333,6 @@ Tcl_AppendFormatToObj(
goto error;
}
- switch (ch) {
- case 'E':
- case 'G':
- case 'X': {
- Tcl_SetObjLength(segment, Tcl_UtfToUpper(TclGetString(segment)));
- }
- }
-
if (width>0 && numChars<0) {
numChars = Tcl_GetCharLength(segment);
}
@@ -2784,7 +2787,7 @@ TclStringRepeat(
if (interp) {
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"string size overflow: unable to alloc %"
- TCL_LL_MODIFIER "u bytes",
+ TCL_LL_MODIFIER "d bytes",
(Tcl_WideUInt)STRING_SIZE(count*length)));
Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL);
}
@@ -2851,11 +2854,21 @@ TclStringCatObjv(
Tcl_Obj * const objv[],
Tcl_Obj **objPtrPtr)
{
- Tcl_Obj *objPtr, *objResultPtr, * const *ov;
- int oc, length = 0, binary = 1, first = 0;
+ Tcl_Obj *objResultPtr, * const *ov;
+ int oc, length = 0, binary = 1;
int allowUniChar = 1, requestUniChar = 0;
+ int first = objc - 1; /* Index of first value possibly not empty */
+ int last = 0; /* Index of last value possibly not empty */
+
+ /* assert ( objc >= 0 ) */
+
+ if (objc <= 1) {
+ /* Only one or no objects; return first or empty */
+ *objPtrPtr = objc ? objv[0] : Tcl_NewObj();
+ return TCL_OK;
+ }
- /* assert (objc >= 2) */
+ /* assert ( objc >= 2 ) */
/*
* Analyze to determine what representation result should be.
@@ -2865,8 +2878,8 @@ TclStringCatObjv(
*/
ov = objv, oc = objc;
- while (oc-- && (binary || allowUniChar)) {
- objPtr = *ov++;
+ do {
+ Tcl_Obj *objPtr = *ov++;
if (objPtr->bytes) {
/* Value has a string rep. */
@@ -2896,72 +2909,144 @@ TclStringCatObjv(
}
}
}
- }
+ } while (--oc && (binary || allowUniChar));
if (binary) {
/* Result will be pure byte array. Pre-size it */
ov = objv; oc = objc;
- while (oc-- && (length >= 0)) {
- objPtr = *ov++;
+ do {
+ Tcl_Obj *objPtr = *ov++;
if (objPtr->bytes == NULL) {
int numBytes;
Tcl_GetByteArrayFromObj(objPtr, &numBytes); /* PANIC? */
- if (length == 0) {
- first = objc - oc - 1;
+ if (numBytes) {
+ last = objc - oc;
+ if (length == 0) {
+ first = last;
+ } else if (numBytes > INT_MAX - length) {
+ goto overflow;
+ }
+ length += numBytes;
}
- length += numBytes;
}
- }
+ } while (--oc);
} else if (allowUniChar && requestUniChar) {
/* Result will be pure Tcl_UniChar array. Pre-size it. */
ov = objv; oc = objc;
- while (oc-- && (length >= 0)) {
- objPtr = *ov++;
+ do {
+ Tcl_Obj *objPtr = *ov++;
if ((objPtr->bytes == NULL) || (objPtr->length)) {
int numChars;
Tcl_GetUnicodeFromObj(objPtr, &numChars); /* PANIC? */
- if (length == 0) {
- first = objc - oc - 1;
+ if (numChars) {
+ last = objc - oc;
+ if (length == 0) {
+ first = last;
+ } else if (numChars > INT_MAX - length) {
+ goto overflow;
+ }
+ length += numChars;
}
- length += numChars;
}
- }
+ } while (--oc);
} else {
/* Result will be concat of string reps. Pre-size it. */
ov = objv; oc = objc;
- while (oc-- && (length >= 0)) {
- int numBytes;
+ do {
+ Tcl_Obj *pendingPtr = NULL;
+
+ /*
+ * Loop until a possibly non-empty value is reached.
+ * Keep string rep generation pending when possible.
+ */
- objPtr = *ov++;
+ do {
+ /* assert ( pendingPtr == NULL ) */
+ /* assert ( length == 0 ) */
- Tcl_GetStringFromObj(objPtr, &numBytes); /* PANIC? */
- if ((length == 0) && numBytes) {
- first = objc - oc - 1;
+ Tcl_Obj *objPtr = *ov++;
+
+ if (objPtr->bytes == NULL) {
+ /* No string rep; Take the chance we can avoid making it */
+ pendingPtr = objPtr;
+ } else {
+ Tcl_GetStringFromObj(objPtr, &length); /* PANIC? */
+ }
+ } while (--oc && (length == 0) && (pendingPtr == NULL));
+
+ /*
+ * Either we found a possibly non-empty value, and we
+ * remember this index as the first and last such value so
+ * far seen, or (oc == 0) and all values are known empty,
+ * so first = last = objc - 1 signals the right quick return.
+ */
+
+ first = last = objc - oc - 1;
+
+ if (oc && (length == 0)) {
+ int numBytes;
+
+ /* assert ( pendingPtr != NULL ) */
+
+ /*
+ * There's a pending value followed by more values.
+ * Loop over remaining values generating strings until
+ * a non-empty value is found, or the pending value gets
+ * its string generated.
+ */
+
+ do {
+ Tcl_Obj *objPtr = *ov++;
+ Tcl_GetStringFromObj(objPtr, &numBytes); /* PANIC? */
+ } while (--oc && numBytes == 0 && pendingPtr->bytes == NULL);
+
+ if (numBytes) {
+ last = objc -oc -1;
+ }
+ if (oc || numBytes) {
+ Tcl_GetStringFromObj(pendingPtr, &length);
+ }
+ if (length == 0) {
+ if (numBytes) {
+ first = last;
+ }
+ } else if (numBytes > INT_MAX - length) {
+ goto overflow;
+ }
+ length += numBytes;
}
- length += numBytes;
- }
- }
+ } while (oc && (length == 0));
- if (length < 0) {
- if (interp) {
- Tcl_SetObjResult(interp, Tcl_ObjPrintf(
- "max size for a Tcl value (%d bytes) exceeded", INT_MAX));
- Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL);
+ while (oc) {
+ int numBytes;
+ Tcl_Obj *objPtr = *ov++;
+
+ /* assert ( length > 0 && pendingPtr == NULL ) */
+
+ Tcl_GetStringFromObj(objPtr, &numBytes); /* PANIC? */
+ if (numBytes) {
+ last = objc - oc;
+ if (numBytes > INT_MAX - length) {
+ goto overflow;
+ }
+ length += numBytes;
+ }
+ --oc;
}
- return TCL_ERROR;
}
- if (length == 0) {
- /* Total length of zero means every value has length zero */
- *objPtrPtr = objv[0];
+ if (last <= first /*|| length == 0 */) {
+ /* Only one non-empty value or zero length; return first */
+ /* NOTE: (length == 0) implies (last <= first) */
+ *objPtrPtr = objv[first];
return TCL_OK;
}
- objv += first; objc -= first;
+ objv += first; objc = (last - first + 1);
if (binary) {
/* Efficiently produce a pure byte array result */
@@ -3008,7 +3093,7 @@ TclStringCatObjv(
if (interp) {
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"concatenation failed: unable to alloc %"
- TCL_LL_MODIFIER "u bytes",
+ TCL_LL_MODIFIER "d bytes",
(Tcl_WideUInt)STRING_SIZE(length)));
Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL);
}
@@ -3021,10 +3106,11 @@ TclStringCatObjv(
/* Ugly interface! No scheme to init array size. */
objResultPtr = Tcl_NewUnicodeObj(&ch, 0); /* PANIC? */
if (0 == Tcl_AttemptSetObjLength(objResultPtr, length)) {
+ Tcl_DecrRefCount(objResultPtr);
if (interp) {
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"concatenation failed: unable to alloc %"
- TCL_LL_MODIFIER "u bytes",
+ TCL_LL_MODIFIER "d bytes",
(Tcl_WideUInt)STRING_SIZE(length)));
Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL);
}
@@ -3062,12 +3148,13 @@ TclStringCatObjv(
return TCL_ERROR;
}
dst = Tcl_GetString(objResultPtr) + start;
- if (length > start) {
- TclFreeIntRep(objResultPtr);
- }
+
+ /* assert ( length > start ) */
+ TclFreeIntRep(objResultPtr);
} else {
objResultPtr = Tcl_NewObj(); /* PANIC? */
if (0 == Tcl_AttemptSetObjLength(objResultPtr, length)) {
+ Tcl_DecrRefCount(objResultPtr);
if (interp) {
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"concatenation failed: unable to alloc %u bytes",
@@ -3091,6 +3178,14 @@ TclStringCatObjv(
}
*objPtrPtr = objResultPtr;
return TCL_OK;
+
+ overflow:
+ if (interp) {
+ Tcl_SetObjResult(interp, Tcl_ObjPrintf(
+ "max size for a Tcl value (%d bytes) exceeded", INT_MAX));
+ Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL);
+ }
+ return TCL_ERROR;
}
/*
@@ -3151,40 +3246,44 @@ TclStringFind(
return -1;
}
+ /*
+ * Check if we have two strings of single-byte characters. If we have, we
+ * can use strstr() to do the search. Note that we can sometimes have
+ * multibyte characters when the string could be minimally represented
+ * using single byte characters; we can't assume that a mismatch here
+ * means no match.
+ */
+
lh = Tcl_GetCharLength(haystack);
- if (haystack->bytes && (lh == haystack->length)) {
- /* haystack is all single-byte chars */
+ if (haystack->bytes && (lh == haystack->length) && needle->bytes
+ && (ln == needle->length)) {
+ /*
+ * Both haystack and needle are all single-byte chars.
+ */
- if (needle->bytes && (ln == needle->length)) {
- /* needle is also all single-byte chars */
- char *found = strstr(haystack->bytes + start, needle->bytes);
+ char *found = strstr(haystack->bytes + start, needle->bytes);
- if (found) {
- return (found - haystack->bytes);
- } else {
- return -1;
- }
+ if (found) {
+ return (found - haystack->bytes);
} else {
- /*
- * Cannot find substring with a multi-byte char inside
- * a string with no multi-byte chars.
- */
return -1;
}
} else {
+ /*
+ * Do the search on the unicode representation for simplicity.
+ */
+
Tcl_UniChar *try, *end, *uh;
Tcl_UniChar *un = Tcl_GetUnicodeFromObj(needle, &ln);
uh = Tcl_GetUnicodeFromObj(haystack, &lh);
end = uh + lh;
- try = uh + start;
- while (try + ln <= end) {
- if ((*try == *un)
- && (0 == memcmp(try+1, un+1, (ln-1)*sizeof(Tcl_UniChar)))) {
+ for (try = uh + start; try + ln <= end; try++) {
+ if ((*try == *un) && (0 ==
+ memcmp(try + 1, un + 1, (ln-1) * sizeof(Tcl_UniChar)))) {
return (try - uh);
}
- try++;
}
return -1;
}
@@ -3339,7 +3438,7 @@ TclStringObjReverse(
Tcl_Obj *objPtr)
{
String *stringPtr;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
if (TclIsPureByteArray(objPtr)) {
int numBytes;
@@ -3367,7 +3466,6 @@ TclStringObjReverse(
* Tcl_SetObjLength into growing the unicode rep buffer.
*/
- ch = 0;
objPtr = Tcl_NewUnicodeObj(&ch, 1);
Tcl_SetObjLength(objPtr, stringPtr->numChars);
to = Tcl_GetUnicode(objPtr);
@@ -3413,7 +3511,7 @@ TclStringObjReverse(
* It's part of the contract for objPtr->bytes values.
* Thus, we can skip calling Tcl_UtfCharComplete() here.
*/
- int bytesInChar = Tcl_UtfToUniChar(from, &ch);
+ int bytesInChar = TclUtfToUniChar(from, &ch);
ReverseBytes((unsigned char *)to, (unsigned char *)from,
bytesInChar);
@@ -3470,7 +3568,7 @@ ExtendUnicodeRepWithString(
{
String *stringPtr = GET_STRING(objPtr);
int needed, numOrigChars = 0;
- Tcl_UniChar *dst;
+ Tcl_UniChar *dst, unichar = 0;
if (stringPtr->hasUnicode) {
numOrigChars = stringPtr->numChars;
@@ -3493,7 +3591,8 @@ ExtendUnicodeRepWithString(
numAppendChars = 0;
}
for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) {
- bytes += TclUtfToUniChar(bytes, dst);
+ bytes += TclUtfToUniChar(bytes, &unichar);
+ *dst = unichar;
}
*dst = 0;
}