diff options
Diffstat (limited to 'generic/tclStringObj.c')
| -rw-r--r-- | generic/tclStringObj.c | 315 |
1 files changed, 207 insertions, 108 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index e51b98a..85cac83 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -418,12 +418,16 @@ Tcl_GetCharLength( } /* - * Optimize the case where we're really dealing with a bytearray object - * without string representation; we don't need to convert to a string to - * perform the get-length operation. + * Optimize the case where we're really dealing with a bytearray object; + * we don't need to convert to a string to perform the get-length operation. + * + * NOTE that we do not need the bytearray to be "pure". A ByteArray value + * with a string rep cannot be trusted to represent the same value as the + * string rep, but it *can* be trusted to have the same character length + * as the string rep, which is all this routine cares about. */ - if (TclIsPureByteArray(objPtr)) { + if (objPtr->typePtr == &tclByteArrayType) { int length; (void) Tcl_GetByteArrayFromObj(objPtr, &length); @@ -1650,6 +1654,7 @@ Tcl_AppendFormatToObj( const char *span = format, *msg, *errCode; int numBytes = 0, objIndex = 0, gotXpg = 0, gotSequential = 0; int originalLength, limit; + Tcl_UniChar ch = 0; static const char *mixedXPG = "cannot mix \"%\" and \"%n$\" conversion specifiers"; static const char *const badIndex[2] = { @@ -1677,8 +1682,7 @@ Tcl_AppendFormatToObj( #endif int newXpg, numChars, allocSegment = 0, segmentLimit, segmentNumBytes; Tcl_Obj *segment; - Tcl_UniChar ch; - int step = Tcl_UtfToUniChar(format, &ch); + int step = TclUtfToUniChar(format, &ch); format += step; if (ch != '%') { @@ -1702,7 +1706,7 @@ Tcl_AppendFormatToObj( * Step 0. Handle special case of escaped format marker (i.e., %%). */ - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); if (ch == '%') { span = format; numBytes = step; @@ -1722,7 +1726,7 @@ Tcl_AppendFormatToObj( newXpg = 1; objIndex = position - 1; format = end + 1; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } } if (newXpg) { @@ -1773,7 +1777,7 @@ Tcl_AppendFormatToObj( } if (sawFlag) { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } } while (sawFlag); @@ -1785,7 +1789,7 @@ Tcl_AppendFormatToObj( if (isdigit(UCHAR(ch))) { width = strtoul(format, &end, 10); format = end; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } else if (ch == '*') { if (objIndex >= objc - 1) { msg = badIndex[gotXpg]; @@ -1801,7 +1805,7 @@ Tcl_AppendFormatToObj( } objIndex++; format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } if (width > limit) { msg = overflow; @@ -1817,12 +1821,12 @@ Tcl_AppendFormatToObj( if (ch == '.') { gotPrecision = 1; format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } if (isdigit(UCHAR(ch))) { precision = strtoul(format, &end, 10); format = end; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } else if (ch == '*') { if (objIndex >= objc - 1) { msg = badIndex[gotXpg]; @@ -1843,7 +1847,7 @@ Tcl_AppendFormatToObj( } objIndex++; format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } /* @@ -1853,14 +1857,14 @@ Tcl_AppendFormatToObj( if (ch == 'h') { useShort = 1; format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } else if (ch == 'l') { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); if (ch == 'l') { useBig = 1; format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG } else { useWide = 1; @@ -1869,20 +1873,20 @@ Tcl_AppendFormatToObj( } else if (ch == 'I') { if ((format[1] == '6') && (format[2] == '4')) { format += (step + 2); - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG useWide = 1; #endif } else if ((format[1] == '3') && (format[2] == '2')) { format += (step + 2); - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } else { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } } else if ((ch == 't') || (ch == 'z')) { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG if (sizeof(size_t) > sizeof(int)) { useWide = 1; @@ -1890,7 +1894,7 @@ Tcl_AppendFormatToObj( #endif } else if ((ch == 'q') ||(ch == 'j')) { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG useWide = 1; #endif @@ -1939,6 +1943,11 @@ Tcl_AppendFormatToObj( } case 'u': + if (useBig) { + msg = "unsigned bignum format is invalid"; + errCode = "BADUNSIGNED"; + goto errorMsg; + } case 'd': case 'o': case 'p': @@ -1962,15 +1971,6 @@ Tcl_AppendFormatToObj( goto error; } isNegative = (mp_cmp_d(&big, 0) == MP_LT); - if (ch == 'u') { - if (isNegative) { - msg = "unsigned bignum format is invalid"; - errCode = "BADUNSIGNED"; - goto errorMsg; - } else { - ch = 'd'; - } - } #ifndef TCL_WIDE_INT_IS_LONG } else if (useWide) { if (Tcl_GetWideIntFromObj(NULL, segment, &w) != TCL_OK) { @@ -2033,9 +2033,12 @@ Tcl_AppendFormatToObj( segmentLimit -= 1; precision--; break; + case 'X': + Tcl_AppendToObj(segment, "0X", 2); + segmentLimit -= 2; + break; case 'p': case 'x': - case 'X': Tcl_AppendToObj(segment, "0x", 2); segmentLimit -= 2; break; @@ -2043,6 +2046,10 @@ Tcl_AppendFormatToObj( Tcl_AppendToObj(segment, "0b", 2); segmentLimit -= 2; break; + case 'd': + Tcl_AppendToObj(segment, "0d", 2); + segmentLimit -= 2; + break; } } @@ -2200,7 +2207,11 @@ Tcl_AppendFormatToObj( } digitOffset = (int) (bits % base); if (digitOffset > 9) { - bytes[numDigits] = 'a' + digitOffset - 10; + if (ch == 'X') { + bytes[numDigits] = 'A' + digitOffset - 10; + } else { + bytes[numDigits] = 'a' + digitOffset - 10; + } } else { bytes[numDigits] = '0' + digitOffset; } @@ -2322,14 +2333,6 @@ Tcl_AppendFormatToObj( goto error; } - switch (ch) { - case 'E': - case 'G': - case 'X': { - Tcl_SetObjLength(segment, Tcl_UtfToUpper(TclGetString(segment))); - } - } - if (width>0 && numChars<0) { numChars = Tcl_GetCharLength(segment); } @@ -2784,7 +2787,7 @@ TclStringRepeat( if (interp) { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "string size overflow: unable to alloc %" - TCL_LL_MODIFIER "u bytes", + TCL_LL_MODIFIER "d bytes", (Tcl_WideUInt)STRING_SIZE(count*length))); Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL); } @@ -2851,11 +2854,21 @@ TclStringCatObjv( Tcl_Obj * const objv[], Tcl_Obj **objPtrPtr) { - Tcl_Obj *objPtr, *objResultPtr, * const *ov; - int oc, length = 0, binary = 1, first = 0; + Tcl_Obj *objResultPtr, * const *ov; + int oc, length = 0, binary = 1; int allowUniChar = 1, requestUniChar = 0; + int first = objc - 1; /* Index of first value possibly not empty */ + int last = 0; /* Index of last value possibly not empty */ + + /* assert ( objc >= 0 ) */ + + if (objc <= 1) { + /* Only one or no objects; return first or empty */ + *objPtrPtr = objc ? objv[0] : Tcl_NewObj(); + return TCL_OK; + } - /* assert (objc >= 2) */ + /* assert ( objc >= 2 ) */ /* * Analyze to determine what representation result should be. @@ -2865,8 +2878,8 @@ TclStringCatObjv( */ ov = objv, oc = objc; - while (oc-- && (binary || allowUniChar)) { - objPtr = *ov++; + do { + Tcl_Obj *objPtr = *ov++; if (objPtr->bytes) { /* Value has a string rep. */ @@ -2896,72 +2909,144 @@ TclStringCatObjv( } } } - } + } while (--oc && (binary || allowUniChar)); if (binary) { /* Result will be pure byte array. Pre-size it */ ov = objv; oc = objc; - while (oc-- && (length >= 0)) { - objPtr = *ov++; + do { + Tcl_Obj *objPtr = *ov++; if (objPtr->bytes == NULL) { int numBytes; Tcl_GetByteArrayFromObj(objPtr, &numBytes); /* PANIC? */ - if (length == 0) { - first = objc - oc - 1; + if (numBytes) { + last = objc - oc; + if (length == 0) { + first = last; + } else if (numBytes > INT_MAX - length) { + goto overflow; + } + length += numBytes; } - length += numBytes; } - } + } while (--oc); } else if (allowUniChar && requestUniChar) { /* Result will be pure Tcl_UniChar array. Pre-size it. */ ov = objv; oc = objc; - while (oc-- && (length >= 0)) { - objPtr = *ov++; + do { + Tcl_Obj *objPtr = *ov++; if ((objPtr->bytes == NULL) || (objPtr->length)) { int numChars; Tcl_GetUnicodeFromObj(objPtr, &numChars); /* PANIC? */ - if (length == 0) { - first = objc - oc - 1; + if (numChars) { + last = objc - oc; + if (length == 0) { + first = last; + } else if (numChars > INT_MAX - length) { + goto overflow; + } + length += numChars; } - length += numChars; } - } + } while (--oc); } else { /* Result will be concat of string reps. Pre-size it. */ ov = objv; oc = objc; - while (oc-- && (length >= 0)) { - int numBytes; + do { + Tcl_Obj *pendingPtr = NULL; + + /* + * Loop until a possibly non-empty value is reached. + * Keep string rep generation pending when possible. + */ - objPtr = *ov++; + do { + /* assert ( pendingPtr == NULL ) */ + /* assert ( length == 0 ) */ - Tcl_GetStringFromObj(objPtr, &numBytes); /* PANIC? */ - if ((length == 0) && numBytes) { - first = objc - oc - 1; + Tcl_Obj *objPtr = *ov++; + + if (objPtr->bytes == NULL) { + /* No string rep; Take the chance we can avoid making it */ + pendingPtr = objPtr; + } else { + Tcl_GetStringFromObj(objPtr, &length); /* PANIC? */ + } + } while (--oc && (length == 0) && (pendingPtr == NULL)); + + /* + * Either we found a possibly non-empty value, and we + * remember this index as the first and last such value so + * far seen, or (oc == 0) and all values are known empty, + * so first = last = objc - 1 signals the right quick return. + */ + + first = last = objc - oc - 1; + + if (oc && (length == 0)) { + int numBytes; + + /* assert ( pendingPtr != NULL ) */ + + /* + * There's a pending value followed by more values. + * Loop over remaining values generating strings until + * a non-empty value is found, or the pending value gets + * its string generated. + */ + + do { + Tcl_Obj *objPtr = *ov++; + Tcl_GetStringFromObj(objPtr, &numBytes); /* PANIC? */ + } while (--oc && numBytes == 0 && pendingPtr->bytes == NULL); + + if (numBytes) { + last = objc -oc -1; + } + if (oc || numBytes) { + Tcl_GetStringFromObj(pendingPtr, &length); + } + if (length == 0) { + if (numBytes) { + first = last; + } + } else if (numBytes > INT_MAX - length) { + goto overflow; + } + length += numBytes; } - length += numBytes; - } - } + } while (oc && (length == 0)); - if (length < 0) { - if (interp) { - Tcl_SetObjResult(interp, Tcl_ObjPrintf( - "max size for a Tcl value (%d bytes) exceeded", INT_MAX)); - Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL); + while (oc) { + int numBytes; + Tcl_Obj *objPtr = *ov++; + + /* assert ( length > 0 && pendingPtr == NULL ) */ + + Tcl_GetStringFromObj(objPtr, &numBytes); /* PANIC? */ + if (numBytes) { + last = objc - oc; + if (numBytes > INT_MAX - length) { + goto overflow; + } + length += numBytes; + } + --oc; } - return TCL_ERROR; } - if (length == 0) { - /* Total length of zero means every value has length zero */ - *objPtrPtr = objv[0]; + if (last <= first /*|| length == 0 */) { + /* Only one non-empty value or zero length; return first */ + /* NOTE: (length == 0) implies (last <= first) */ + *objPtrPtr = objv[first]; return TCL_OK; } - objv += first; objc -= first; + objv += first; objc = (last - first + 1); if (binary) { /* Efficiently produce a pure byte array result */ @@ -3008,7 +3093,7 @@ TclStringCatObjv( if (interp) { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "concatenation failed: unable to alloc %" - TCL_LL_MODIFIER "u bytes", + TCL_LL_MODIFIER "d bytes", (Tcl_WideUInt)STRING_SIZE(length))); Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL); } @@ -3021,10 +3106,11 @@ TclStringCatObjv( /* Ugly interface! No scheme to init array size. */ objResultPtr = Tcl_NewUnicodeObj(&ch, 0); /* PANIC? */ if (0 == Tcl_AttemptSetObjLength(objResultPtr, length)) { + Tcl_DecrRefCount(objResultPtr); if (interp) { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "concatenation failed: unable to alloc %" - TCL_LL_MODIFIER "u bytes", + TCL_LL_MODIFIER "d bytes", (Tcl_WideUInt)STRING_SIZE(length))); Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL); } @@ -3062,12 +3148,13 @@ TclStringCatObjv( return TCL_ERROR; } dst = Tcl_GetString(objResultPtr) + start; - if (length > start) { - TclFreeIntRep(objResultPtr); - } + + /* assert ( length > start ) */ + TclFreeIntRep(objResultPtr); } else { objResultPtr = Tcl_NewObj(); /* PANIC? */ if (0 == Tcl_AttemptSetObjLength(objResultPtr, length)) { + Tcl_DecrRefCount(objResultPtr); if (interp) { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "concatenation failed: unable to alloc %u bytes", @@ -3091,6 +3178,14 @@ TclStringCatObjv( } *objPtrPtr = objResultPtr; return TCL_OK; + + overflow: + if (interp) { + Tcl_SetObjResult(interp, Tcl_ObjPrintf( + "max size for a Tcl value (%d bytes) exceeded", INT_MAX)); + Tcl_SetErrorCode(interp, "TCL", "MEMORY", NULL); + } + return TCL_ERROR; } /* @@ -3151,40 +3246,44 @@ TclStringFind( return -1; } + /* + * Check if we have two strings of single-byte characters. If we have, we + * can use strstr() to do the search. Note that we can sometimes have + * multibyte characters when the string could be minimally represented + * using single byte characters; we can't assume that a mismatch here + * means no match. + */ + lh = Tcl_GetCharLength(haystack); - if (haystack->bytes && (lh == haystack->length)) { - /* haystack is all single-byte chars */ + if (haystack->bytes && (lh == haystack->length) && needle->bytes + && (ln == needle->length)) { + /* + * Both haystack and needle are all single-byte chars. + */ - if (needle->bytes && (ln == needle->length)) { - /* needle is also all single-byte chars */ - char *found = strstr(haystack->bytes + start, needle->bytes); + char *found = strstr(haystack->bytes + start, needle->bytes); - if (found) { - return (found - haystack->bytes); - } else { - return -1; - } + if (found) { + return (found - haystack->bytes); } else { - /* - * Cannot find substring with a multi-byte char inside - * a string with no multi-byte chars. - */ return -1; } } else { + /* + * Do the search on the unicode representation for simplicity. + */ + Tcl_UniChar *try, *end, *uh; Tcl_UniChar *un = Tcl_GetUnicodeFromObj(needle, &ln); uh = Tcl_GetUnicodeFromObj(haystack, &lh); end = uh + lh; - try = uh + start; - while (try + ln <= end) { - if ((*try == *un) - && (0 == memcmp(try+1, un+1, (ln-1)*sizeof(Tcl_UniChar)))) { + for (try = uh + start; try + ln <= end; try++) { + if ((*try == *un) && (0 == + memcmp(try + 1, un + 1, (ln-1) * sizeof(Tcl_UniChar)))) { return (try - uh); } - try++; } return -1; } @@ -3339,7 +3438,7 @@ TclStringObjReverse( Tcl_Obj *objPtr) { String *stringPtr; - Tcl_UniChar ch; + Tcl_UniChar ch = 0; if (TclIsPureByteArray(objPtr)) { int numBytes; @@ -3367,7 +3466,6 @@ TclStringObjReverse( * Tcl_SetObjLength into growing the unicode rep buffer. */ - ch = 0; objPtr = Tcl_NewUnicodeObj(&ch, 1); Tcl_SetObjLength(objPtr, stringPtr->numChars); to = Tcl_GetUnicode(objPtr); @@ -3413,7 +3511,7 @@ TclStringObjReverse( * It's part of the contract for objPtr->bytes values. * Thus, we can skip calling Tcl_UtfCharComplete() here. */ - int bytesInChar = Tcl_UtfToUniChar(from, &ch); + int bytesInChar = TclUtfToUniChar(from, &ch); ReverseBytes((unsigned char *)to, (unsigned char *)from, bytesInChar); @@ -3470,7 +3568,7 @@ ExtendUnicodeRepWithString( { String *stringPtr = GET_STRING(objPtr); int needed, numOrigChars = 0; - Tcl_UniChar *dst; + Tcl_UniChar *dst, unichar = 0; if (stringPtr->hasUnicode) { numOrigChars = stringPtr->numChars; @@ -3493,7 +3591,8 @@ ExtendUnicodeRepWithString( numAppendChars = 0; } for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) { - bytes += TclUtfToUniChar(bytes, dst); + bytes += TclUtfToUniChar(bytes, &unichar); + *dst = unichar; } *dst = 0; } |
