diff options
Diffstat (limited to 'generic/tclUtil.c')
-rw-r--r-- | generic/tclUtil.c | 2413 |
1 files changed, 1664 insertions, 749 deletions
diff --git a/generic/tclUtil.c b/generic/tclUtil.c index fb4e20b..13e54ec 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -10,11 +10,10 @@ * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclUtil.c,v 1.117 2010/08/22 18:53:26 nijtmans Exp $ */ #include "tclInt.h" +#include "tclParse.h" #include <math.h> /* @@ -27,31 +26,71 @@ static ProcessGlobalValue executableName = { }; /* - * The following values are used in the flags returned by Tcl_ScanElement and - * used by Tcl_ConvertElement. The values TCL_DONT_USE_BRACES and - * TCL_DONT_QUOTE_HASH are defined in tcl.h; make sure neither value overlaps - * with any of the values below. - * - * TCL_DONT_USE_BRACES - 1 means the string mustn't be enclosed in - * braces (e.g. it contains unmatched braces, or - * ends in a backslash character, or user just - * doesn't want braces); handle all special - * characters by adding backslashes. - * USE_BRACES - 1 means the string contains a special - * character that can be handled simply by - * enclosing the entire argument in braces. - * BRACES_UNMATCHED - 1 means that braces aren't properly matched in - * the argument. + * The following values are used in the flags arguments of Tcl*Scan*Element + * and Tcl*Convert*Element. The values TCL_DONT_USE_BRACES and + * TCL_DONT_QUOTE_HASH are defined in tcl.h, like so: + * +#define TCL_DONT_USE_BRACES 1 +#define TCL_DONT_QUOTE_HASH 8 + * + * Those are public flag bits which callers of the public routines + * Tcl_Convert*Element() can use to indicate: + * + * TCL_DONT_USE_BRACES - 1 means the caller is insisting that brace + * quoting not be used when converting the list + * element. * TCL_DONT_QUOTE_HASH - 1 means the caller insists that a leading hash * character ('#') should *not* be quoted. This * is appropriate when the caller can guarantee * the element is not the first element of a * list, so [eval] cannot mis-parse the element * as a comment. + * + * The remaining values which can be carried by the flags of these routines + * are for internal use only. Make sure they do not overlap with the public + * values above. + * + * The Tcl*Scan*Element() routines make a determination which of 4 modes of + * conversion is most appropriate for Tcl*Convert*Element() to perform, and + * sets two bits of the flags value to indicate the mode selected. + * + * CONVERT_NONE The element needs no quoting. Its literal string is + * suitable as is. + * CONVERT_BRACE The conversion should be enclosing the literal string + * in braces. + * CONVERT_ESCAPE The conversion should be using backslashes to escape + * any characters in the string that require it. + * CONVERT_MASK A mask value used to extract the conversion mode from + * the flags argument. + * Also indicates a strange conversion mode where all + * special characters are escaped with backslashes + * *except for braces*. This is a strange and unnecessary + * case, but it's part of the historical way in which + * lists have been formatted in Tcl. To experiment with + * removing this case, set the value of COMPAT to 0. + * + * One last flag value is used only by callers of TclScanElement(). The flag + * value produced by a call to Tcl*Scan*Element() will never leave this bit + * set. + * + * CONVERT_ANY The caller of TclScanElement() declares it can make no + * promise about what public flags will be passed to the + * matching call of TclConvertElement(). As such, + * TclScanElement() has to determine the worst case + * destination buffer length over all possibilities, and + * in other cases this means an overestimate of the + * required size. + * + * For more details, see the comments on the Tcl*Scan*Element and + * Tcl*Convert*Element routines. */ -#define USE_BRACES 2 -#define BRACES_UNMATCHED 4 +#define COMPAT 1 +#define CONVERT_NONE 0 +#define CONVERT_BRACE 2 +#define CONVERT_ESCAPE 4 +#define CONVERT_MASK (CONVERT_BRACE | CONVERT_ESCAPE) +#define CONVERT_ANY 16 /* * The following key is used by Tcl_PrintDouble and TclPrecTraceProc to @@ -88,6 +127,322 @@ const Tcl_ObjType tclEndOffsetType = { }; /* + * * STRING REPRESENTATION OF LISTS * * * + * + * The next several routines implement the conversions of strings to and from + * Tcl lists. To understand their operation, the rules of parsing and + * generating the string representation of lists must be known. Here we + * describe them in one place. + * + * A list is made up of zero or more elements. Any string is a list if it is + * made up of alternating substrings of element-separating ASCII whitespace + * and properly formatted elements. + * + * The ASCII characters which can make up the whitespace between list elements + * are: + * + * \u0009 \t TAB + * \u000A \n NEWLINE + * \u000B \v VERTICAL TAB + * \u000C \f FORM FEED + * \u000D \r CARRIAGE RETURN + * \u0020 SPACE + * + * NOTE: differences between this and other places where Tcl defines a role + * for "whitespace". + * + * * Unlike command parsing, here NEWLINE is just another whitespace + * character; its role as a command terminator in a script has no + * importance here. + * + * * Unlike command parsing, the BACKSLASH NEWLINE sequence is not + * considered to be a whitespace character. + * + * * Other Unicode whitespace characters (recognized by [string is space] + * or Tcl_UniCharIsSpace()) do not play any role as element separators + * in Tcl lists. + * + * * The NUL byte ought not appear, as it is not in strings properly + * encoded for Tcl, but if it is present, it is not treated as + * separating whitespace, or a string terminator. It is just another + * character in a list element. + * + * The interpretaton of a formatted substring as a list element follows rules + * similar to the parsing of the words of a command in a Tcl script. Backslash + * substitution plays a key role, and is defined exactly as it is in command + * parsing. The same routine, TclParseBackslash() is used in both command + * parsing and list parsing. + * + * NOTE: This means that if and when backslash substitution rules ever change + * for command parsing, the interpretation of strings as lists also changes. + * + * Backslash substitution replaces an "escape sequence" of one or more + * characters starting with + * \u005c \ BACKSLASH + * with a single character. The one character escape sequent case happens only + * when BACKSLASH is the last character in the string. In all other cases, the + * escape sequence is at least two characters long. + * + * The formatted substrings are interpreted as element values according to the + * following cases: + * + * * If the first character of a formatted substring is + * \u007b { OPEN BRACE + * then the end of the substring is the matching + * \u007d } CLOSE BRACE + * character, where matching is determined by counting nesting levels, and + * not including any brace characters that are contained within a backslash + * escape sequence in the nesting count. Having found the matching brace, + * all characters between the braces are the string value of the element. + * If no matching close brace is found before the end of the string, the + * string is not a Tcl list. If the character following the close brace is + * not an element separating whitespace character, or the end of the string, + * then the string is not a Tcl list. + * + * NOTE: this differs from a brace-quoted word in the parsing of a Tcl + * command only in its treatment of the backslash-newline sequence. In a + * list element, the literal characters in the backslash-newline sequence + * become part of the element value. In a script word, conversion to a + * single SPACE character is done. + * + * NOTE: Most list element values can be represented by a formatted + * substring using brace quoting. The exceptions are any element value that + * includes an unbalanced brace not in a backslash escape sequence, and any + * value that ends with a backslash not itself in a backslash escape + * sequence. + * + * * If the first character of a formatted substring is + * \u0022 " QUOTE + * then the end of the substring is the next QUOTE character, not counting + * any QUOTE characters that are contained within a backslash escape + * sequence. If no next QUOTE is found before the end of the string, the + * string is not a Tcl list. If the character following the closing QUOTE is + * not an element separating whitespace character, or the end of the string, + * then the string is not a Tcl list. Having found the limits of the + * substring, the element value is produced by performing backslash + * substitution on the character sequence between the open and close QUOTEs. + * + * NOTE: Any element value can be represented by this style of formatting, + * given suitable choice of backslash escape sequences. + * + * * All other formatted substrings are terminated by the next element + * separating whitespace character in the string. Having found the limits + * of the substring, the element value is produced by performing backslash + * substitution on it. + * + * NOTE: Any element value can be represented by this style of formatting, + * given suitable choice of backslash escape sequences, with one exception. + * The empty string cannot be represented as a list element without the use + * of either braces or quotes to delimit it. + * + * This collection of parsing rules is implemented in the routine + * TclFindElement(). + * + * In order to produce lists that can be parsed by these rules, we need the + * ability to distinguish between characters that are part of a list element + * value from characters providing syntax that define the structure of the + * list. This means that our code that generates lists must at a minimum be + * able to produce escape sequences for the 10 characters identified above + * that have significance to a list parser. + * + * * * CANONICAL LISTS * * * * * + * + * In addition to the basic rules for parsing strings into Tcl lists, there + * are additional properties to be met by the set of list values that are + * generated by Tcl. Such list values are often said to be in "canonical + * form": + * + * * When any canonical list is evaluated as a Tcl script, it is a script of + * either zero commands (an empty list) or exactly one command. The command + * word is exactly the first element of the list, and each argument word is + * exactly one of the following elements of the list. This means that any + * characters that have special meaning during script evaluation need + * special treatment when canonical lists are produced: + * + * * Whitespace between elements may not include NEWLINE. + * * The command terminating character, + * \u003b ; SEMICOLON + * must be BRACEd, QUOTEd, or escaped so that it does not terminate the + * command prematurely. + * * Any of the characters that begin substitutions in scripts, + * \u0024 $ DOLLAR + * \u005b [ OPEN BRACKET + * \u005c \ BACKSLASH + * need to be BRACEd or escaped. + * * In any list where the first character of the first element is + * \u0023 # HASH + * that HASH character must be BRACEd, QUOTEd, or escaped so that it + * does not convert the command into a comment. + * * Any list element that contains the character sequence BACKSLASH + * NEWLINE cannot be formatted with BRACEs. The BACKSLASH character + * must be represented by an escape sequence, and unless QUOTEs are + * used, the NEWLINE must be as well. + * + * * It is also guaranteed that one can use a canonical list as a building + * block of a larger script within command substitution, as in this example: + * set script "puts \[[list $cmd $arg]]"; eval $script + * To support this usage, any appearance of the character + * \u005d ] CLOSE BRACKET + * in a list element must be BRACEd, QUOTEd, or escaped. + * + * * Finally it is guaranteed that enclosing a canonical list in braces + * produces a new value that is also a canonical list. This new list has + * length 1, and its only element is the original canonical list. This same + * guarantee also makes it possible to construct scripts where an argument + * word is given a list value by enclosing the canonical form of that list + * in braces: + * set script "puts {[list $one $two $three]}"; eval $script + * This sort of coding was once fairly common, though it's become more + * idiomatic to see the following instead: + * set script [list puts [list $one $two $three]]; eval $script + * In order to support this guarantee, every canonical list must have + * balance when counting those braces that are not in escape sequences. + * + * Within these constraints, the canonical list generation routines + * TclScanElement() and TclConvertElement() attempt to generate the string for + * any list that is easiest to read. When an element value is itself + * acceptable as the formatted substring, it is usually used (CONVERT_NONE). + * When some quoting or escaping is required, use of BRACEs (CONVERT_BRACE) is + * usually preferred over the use of escape sequences (CONVERT_ESCAPE). There + * are some exceptions to both of these preferences for reasons of code + * simplicity, efficiency, and continuation of historical habits. Canonical + * lists never use the QUOTE formatting to delimit their elements because that + * form of quoting does not nest, which makes construction of nested lists far + * too much trouble. Canonical lists always use only a single SPACE character + * for element-separating whitespace. + * + * * * FUTURE CONSIDERATIONS * * * + * + * When a list element requires quoting or escaping due to a CLOSE BRACKET + * character or an internal QUOTE character, a strange formatting mode is + * recommended. For example, if the value "a{b]c}d" is converted by the usual + * modes: + * + * CONVERT_BRACE: a{b]c}d => {a{b]c}d} + * CONVERT_ESCAPE: a{b]c}d => a\{b\]c\}d + * + * we get perfectly usable formatted list elements. However, this is not what + * Tcl releases have been producing. Instead, we have: + * + * CONVERT_MASK: a{b]c}d => a{b\]c}d + * + * where the CLOSE BRACKET is escaped, but the BRACEs are not. The same effect + * can be seen replacing ] with " in this example. There does not appear to be + * any functional or aesthetic purpose for this strange additional mode. The + * sole purpose I can see for preserving it is to keep generating the same + * formatted lists programmers have become accustomed to, and perhaps written + * tests to expect. That is, compatibility only. The additional code + * complexity required to support this mode is significant. The lines of code + * supporting it are delimited in the routines below with #if COMPAT + * directives. This makes it easy to experiment with eliminating this + * formatting mode simply with "#define COMPAT 0" above. I believe this is + * worth considering. + * + * Another consideration is the treatment of QUOTE characters in list + * elements. TclConvertElement() must have the ability to produce the escape + * sequence \" so that when a list element begins with a QUOTE we do not + * confuse that first character with a QUOTE used as list syntax to define + * list structure. However, that is the only place where QUOTE characters need + * quoting. In this way, handling QUOTE could really be much more like the way + * we handle HASH which also needs quoting and escaping only in particular + * situations. Following up this could increase the set of list elements that + * can use the CONVERT_NONE formatting mode. + * + * More speculative is that the demands of canonical list form require brace + * balance for the list as a whole, while the current implementation achieves + * this by establishing brace balance for every element. + * + * Finally, a reminder that the rules for parsing and formatting lists are + * closely tied together with the rules for parsing and evaluating scripts, + * and will need to evolve in sync. + */ + +/* + *---------------------------------------------------------------------- + * + * TclMaxListLength -- + * + * Given 'bytes' pointing to 'numBytes' bytes, scan through them and + * count the number of whitespace runs that could be list element + * separators. If 'numBytes' is -1, scan to the terminating '\0'. Not a + * full list parser. Typically used to get a quick and dirty overestimate + * of length size in order to allocate space for an actual list parser to + * operate with. + * + * Results: + * Returns the largest number of list elements that could possibly be in + * this string, interpreted as a Tcl list. If 'endPtr' is not NULL, + * writes a pointer to the end of the string scanned there. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclMaxListLength( + const char *bytes, + int numBytes, + const char **endPtr) +{ + int count = 0; + + if ((numBytes == 0) || ((numBytes == -1) && (*bytes == '\0'))) { + /* Empty string case - quick exit */ + goto done; + } + + /* + * No list element before leading white space. + */ + + count += 1 - TclIsSpaceProc(*bytes); + + /* + * Count white space runs as potential element separators. + */ + + while (numBytes) { + if ((numBytes == -1) && (*bytes == '\0')) { + break; + } + if (TclIsSpaceProc(*bytes)) { + /* + * Space run started; bump count. + */ + + count++; + do { + bytes++; + numBytes -= (numBytes != -1); + } while (numBytes && TclIsSpaceProc(*bytes)); + if ((numBytes == 0) || ((numBytes == -1) && (*bytes == '\0'))) { + break; + } + + /* + * (*bytes) is non-space; return to counting state. + */ + } + bytes++; + numBytes -= (numBytes != -1); + } + + /* + * No list element following trailing white space. + */ + + count -= TclIsSpaceProc(bytes[-1]); + + done: + if (endPtr) { + *endPtr = bytes; + } + return count; +} + +/* *---------------------------------------------------------------------- * * TclFindElement -- @@ -107,13 +462,18 @@ const Tcl_ObjType tclEndOffsetType = { * that's part of the element. If this is the last argument in the list, * then *nextPtr will point just after the last character in the list * (i.e., at the character at list+listLength). If sizePtr is non-NULL, - * *sizePtr is filled in with the number of characters in the element. If - * the element is in braces, then *elementPtr will point to the character + * *sizePtr is filled in with the number of bytes in the element. If the + * element is in braces, then *elementPtr will point to the character * after the opening brace and *sizePtr will not include either of the * braces. If there isn't an element in the list, *sizePtr will be zero, - * and both *elementPtr and *termPtr will point just after the last - * character in the list. Note: this function does NOT collapse backslash - * sequences. + * and both *elementPtr and *nextPtr will point just after the last + * character in the list. If literalPtr is non-NULL, *literalPtr is set + * to a boolean value indicating whether the substring returned as the + * values of **elementPtr and *sizePtr is the literal value of a list + * element. If not, a call to TclCopyAndCollapse() is needed to produce + * the actual value of the list element. Note: this function does NOT + * collapse backslash sequences, but uses *literalPtr to tell callers + * when it is required for them to do so. * * Side effects: * None. @@ -137,8 +497,12 @@ TclFindElement( * argument (next arg or end of list). */ int *sizePtr, /* If non-zero, fill in with size of * element. */ - int *bracePtr) /* If non-zero, fill in with non-zero/zero to - * indicate that arg was/wasn't in braces. */ + int *literalPtr) /* If non-zero, fill in with non-zero/zero to + * indicate that the substring of *sizePtr + * bytes starting at **elementPtr is/is not + * the literal list element and therefore + * does not/does require a call to + * TclCopyAndCollapse() by the caller. */ { const char *p = list; const char *elemStart; /* Points to first byte of first element. */ @@ -147,6 +511,7 @@ TclFindElement( int inQuotes = 0; int size = 0; /* lint. */ int numChars; + int literal = 1; const char *p2; /* @@ -156,7 +521,7 @@ TclFindElement( */ limit = (list + listLength); - while ((p < limit) && (isspace(UCHAR(*p)))) { /* INTL: ISO space. */ + while ((p < limit) && (TclIsSpaceProc(*p))) { p++; } if (p == limit) { /* no element found */ @@ -172,9 +537,6 @@ TclFindElement( p++; } elemStart = p; - if (bracePtr != 0) { - *bracePtr = openBraces; - } /* * Find element's end (a space, close brace, or the end of the string). @@ -204,8 +566,7 @@ TclFindElement( } else if (openBraces == 1) { size = (p - elemStart); p++; - if ((p >= limit) - || isspace(UCHAR(*p))) { /* INTL: ISO space. */ + if ((p >= limit) || TclIsSpaceProc(*p)) { goto done; } @@ -215,14 +576,15 @@ TclFindElement( if (interp != NULL) { p2 = p; - while ((p2 < limit) - && (!isspace(UCHAR(*p2))) /* INTL: ISO space. */ + while ((p2 < limit) && (!TclIsSpaceProc(*p2)) && (p2 < p+20)) { p2++; } Tcl_SetObjResult(interp, Tcl_ObjPrintf( "list element in braces followed by \"%.*s\" " "instead of space", (int) (p2-p), p)); + Tcl_SetErrorCode(interp, "TCL", "VALUE", "LIST", "JUNK", + NULL); } return TCL_ERROR; } @@ -234,7 +596,17 @@ TclFindElement( */ case '\\': - Tcl_UtfBackslash(p, &numChars, NULL); + if (openBraces == 0) { + /* + * A backslash sequence not within a brace quoted element + * means the value of the element is different from the + * substring we are parsing. A call to TclCopyAndCollapse() is + * needed to produce the element value. Inform the caller. + */ + + literal = 0; + } + TclParseBackslash(p, limit - p, &numChars, NULL); p += (numChars - 1); break; @@ -263,8 +635,7 @@ TclFindElement( if (inQuotes) { size = (p - elemStart); p++; - if ((p >= limit) - || isspace(UCHAR(*p))) { /* INTL: ISO space */ + if ((p >= limit) || TclIsSpaceProc(*p)) { goto done; } @@ -274,14 +645,15 @@ TclFindElement( if (interp != NULL) { p2 = p; - while ((p2 < limit) - && (!isspace(UCHAR(*p2))) /* INTL: ISO space */ + while ((p2 < limit) && (!TclIsSpaceProc(*p2)) && (p2 < p+20)) { p2++; } Tcl_SetObjResult(interp, Tcl_ObjPrintf( "list element in quotes followed by \"%.*s\" " "instead of space", (int) (p2-p), p)); + Tcl_SetErrorCode(interp, "TCL", "VALUE", "LIST", "JUNK", + NULL); } return TCL_ERROR; } @@ -297,14 +669,18 @@ TclFindElement( if (p == limit) { if (openBraces != 0) { if (interp != NULL) { - Tcl_SetResult(interp, "unmatched open brace in list", - TCL_STATIC); + Tcl_SetObjResult(interp, Tcl_NewStringObj( + "unmatched open brace in list", -1)); + Tcl_SetErrorCode(interp, "TCL", "VALUE", "LIST", "BRACE", + NULL); } return TCL_ERROR; } else if (inQuotes) { if (interp != NULL) { - Tcl_SetResult(interp, "unmatched open quote in list", - TCL_STATIC); + Tcl_SetObjResult(interp, Tcl_NewStringObj( + "unmatched open quote in list", -1)); + Tcl_SetErrorCode(interp, "TCL", "VALUE", "LIST", "QUOTE", + NULL); } return TCL_ERROR; } @@ -312,7 +688,7 @@ TclFindElement( } done: - while ((p < limit) && (isspace(UCHAR(*p)))) { /* INTL: ISO space. */ + while ((p < limit) && (TclIsSpaceProc(*p))) { p++; } *elementPtr = elemStart; @@ -320,6 +696,9 @@ TclFindElement( if (sizePtr != 0) { *sizePtr = size; } + if (literalPtr != 0) { + *literalPtr = literal; + } return TCL_OK; } @@ -328,14 +707,13 @@ TclFindElement( * * TclCopyAndCollapse -- * - * Copy a string and eliminate any backslashes that aren't in braces. + * Copy a string and substitute all backslash escape sequences * * Results: - * Count characters get copied from src to dst. Along the way, if - * backslash sequences are found outside braces, the backslashes are - * eliminated in the copy. After scanning count chars from source, a null - * character is placed at the end of dst. Returns the number of - * characters that got copied. + * Count bytes get copied from src to dst. Along the way, backslash + * sequences are substituted in the copy. After scanning count bytes from + * src, a null character is placed at the end of dst. Returns the number + * of bytes that got written to dst. * * Side effects: * None. @@ -345,26 +723,29 @@ TclFindElement( int TclCopyAndCollapse( - int count, /* Number of characters to copy from src. */ + int count, /* Number of byte to copy from src. */ const char *src, /* Copy from here... */ char *dst) /* ... to here. */ { - register char c; - int numRead; int newCount = 0; - int backslashCount; - for (c = *src; count > 0; src++, c = *src, count--) { + while (count > 0) { + char c = *src; + if (c == '\\') { - backslashCount = Tcl_UtfBackslash(src, &numRead, dst); + int numRead; + int backslashCount = TclParseBackslash(src, count, &numRead, dst); + dst += backslashCount; newCount += backslashCount; - src += numRead-1; - count -= numRead-1; + src += numRead; + count -= numRead; } else { *dst = c; dst++; newCount++; + src++; + count--; } } *dst = 0; @@ -409,76 +790,55 @@ Tcl_SplitList( const char ***argvPtr) /* Pointer to place to store pointer to array * of pointers to list elements. */ { - const char **argv, *l, *element; + const char **argv, *end, *element; char *p; - int length, size, i, result, elSize, brace; + int length, size, i, result, elSize; /* - * Figure out how much space to allocate. There must be enough space for - * both the array of pointers and also for a copy of the list. To estimate - * the number of pointers needed, count the number of space characters in - * the list. + * Allocate enough space to work in. A (const char *) for each (possible) + * list element plus one more for terminating NULL, plus as many bytes as + * in the original string value, plus one more for a terminating '\0'. + * Space used to hold element separating white space in the original + * string gets re-purposed to hold '\0' characters in the argv array. */ - for (size = 2, l = list; *l != 0; l++) { - if (isspace(UCHAR(*l))) { /* INTL: ISO space. */ - size++; - - /* - * Consecutive space can only count as a single list delimiter. - */ - - while (1) { - char next = *(l + 1); + size = TclMaxListLength(list, -1, &end) + 1; + length = end - list; + argv = ckalloc((size * sizeof(char *)) + length + 1); - if (next == '\0') { - break; - } - l++; - if (isspace(UCHAR(next))) { /* INTL: ISO space. */ - continue; - } - break; - } - } - } - length = l - list; - argv = (const char **) ckalloc((unsigned) - ((size * sizeof(char *)) + length + 1)); for (i = 0, p = ((char *) argv) + size*sizeof(char *); *list != 0; i++) { const char *prevList = list; + int literal; result = TclFindElement(interp, list, length, &element, &list, - &elSize, &brace); + &elSize, &literal); length -= (list - prevList); if (result != TCL_OK) { - if (interp != NULL) { - Tcl_SetErrorCode(interp, "TCL", "VALUE", "LIST", NULL); - } - ckfree((char *) argv); + ckfree(argv); return result; } if (*element == 0) { break; } if (i >= size) { - ckfree((char *) argv); + ckfree(argv); if (interp != NULL) { - Tcl_SetResult(interp, "internal error in Tcl_SplitList", - TCL_STATIC); + Tcl_SetObjResult(interp, Tcl_NewStringObj( + "internal error in Tcl_SplitList", -1)); + Tcl_SetErrorCode(interp, "TCL", "INTERNAL", "Tcl_SplitList", + NULL); } return TCL_ERROR; } argv[i] = p; - if (brace) { + if (literal) { memcpy(p, element, (size_t) elSize); p += elSize; *p = 0; p++; } else { - TclCopyAndCollapse(elSize, element, p); - p += elSize+1; + p += 1 + TclCopyAndCollapse(elSize, element, p); } } @@ -491,128 +851,49 @@ Tcl_SplitList( /* *---------------------------------------------------------------------- * - * TclMarkList -- - * - * Marks the locations within a string where list elements start and - * computes where they end. + * Tcl_ScanElement -- * - * Results - * The return value is normally TCL_OK, which means that the list was - * successfully split up. If TCL_ERROR is returned, it means that "list" - * didn't have proper list structure; the interp's result will contain a - * more detailed error message. + * This function is a companion function to Tcl_ConvertElement. It scans + * a string to see what needs to be done to it (e.g. add backslashes or + * enclosing braces) to make the string into a valid Tcl list element. * - * *argvPtr will be filled in with the address of an array whose elements - * point to the places where the elements of list start, in order. - * *argcPtr will get filled in with the number of valid elements in the - * array. *argszPtr will get filled in with the address of an array whose - * elements are the lengths of the elements of the list, in order. - * Note: *argvPtr, *argcPtr and *argszPtr are only modified if the - * function returns normally. + * Results: + * The return value is an overestimate of the number of bytes that will + * be needed by Tcl_ConvertElement to produce a valid list element from + * src. The word at *flagPtr is filled in with a value needed by + * Tcl_ConvertElement when doing the actual conversion. * * Side effects: - * Memory is allocated. + * None. * *---------------------------------------------------------------------- */ int -TclMarkList( - Tcl_Interp *interp, /* Interpreter to use for error reporting. If - * NULL, no error message is left. */ - const char *list, /* Pointer to string with list structure. */ - const char *end, /* Pointer to first char after the list. */ - int *argcPtr, /* Pointer to location to fill in with the - * number of elements in the list. */ - const int **argszPtr, /* Pointer to place to store length of list - * elements. */ - const char ***argvPtr) /* Pointer to place to store pointer to array - * of pointers to list elements. */ +Tcl_ScanElement( + register const char *src, /* String to convert to list element. */ + register int *flagPtr) /* Where to store information to guide + * Tcl_ConvertCountedElement. */ { - const char **argv, *l, *element; - int *argn, length, size, i, result, elSize, brace; - - /* - * Figure out how much space to allocate. There must be enough space for - * the array of pointers and lengths. To estimate the number of pointers - * needed, count the number of whitespace characters in the list. - */ - - for (size=2, l=list ; l!=end ; l++) { - if (isspace(UCHAR(*l))) { /* INTL: ISO space. */ - size++; - - /* - * Consecutive space can only count as a single list delimiter. - */ - - while (1) { - char next = *(l + 1); - - if ((l+1) == end) { - break; - } - l++; - if (isspace(UCHAR(next))) { /* INTL: ISO space. */ - continue; - } - break; - } - } - } - length = l - list; - argv = (const char **) ckalloc((unsigned) size * sizeof(char *)); - argn = (int *) ckalloc((unsigned) size * sizeof(int *)); - - for (i = 0; list != end; i++) { - const char *prevList = list; - - result = TclFindElement(interp, list, length, &element, &list, - &elSize, &brace); - length -= (list - prevList); - if (result != TCL_OK) { - ckfree((char *) argv); - ckfree((char *) argn); - return result; - } - if (*element == 0) { - break; - } - if (i >= size) { - ckfree((char *) argv); - ckfree((char *) argn); - if (interp != NULL) { - Tcl_SetResult(interp, "internal error in TclMarkList", - TCL_STATIC); - } - return TCL_ERROR; - } - argv[i] = element; - argn[i] = elSize; - } - - argv[i] = NULL; - argn[i] = 0; - *argvPtr = argv; - *argszPtr = argn; - *argcPtr = i; - return TCL_OK; + return Tcl_ScanCountedElement(src, -1, flagPtr); } /* *---------------------------------------------------------------------- * - * Tcl_ScanElement -- + * Tcl_ScanCountedElement -- * - * This function is a companion function to Tcl_ConvertElement. It scans - * a string to see what needs to be done to it (e.g. add backslashes or - * enclosing braces) to make the string into a valid Tcl list element. + * This function is a companion function to Tcl_ConvertCountedElement. It + * scans a string to see what needs to be done to it (e.g. add + * backslashes or enclosing braces) to make the string into a valid Tcl + * list element. If length is -1, then the string is scanned from src up + * to the first null byte. * * Results: - * The return value is an overestimate of the number of characters that - * will be needed by Tcl_ConvertElement to produce a valid list element - * from string. The word at *flagPtr is filled in with a value needed by - * Tcl_ConvertElement when doing the actual conversion. + * The return value is an overestimate of the number of bytes that will + * be needed by Tcl_ConvertCountedElement to produce a valid list element + * from src. The word at *flagPtr is filled in with a value needed by + * Tcl_ConvertCountedElement when doing the actual conversion. * * Side effects: * None. @@ -621,30 +902,42 @@ TclMarkList( */ int -Tcl_ScanElement( - register const char *string,/* String to convert to list element. */ - register int *flagPtr) /* Where to store information to guide - * Tcl_ConvertCountedElement. */ +Tcl_ScanCountedElement( + const char *src, /* String to convert to Tcl list element. */ + int length, /* Number of bytes in src, or -1. */ + int *flagPtr) /* Where to store information to guide + * Tcl_ConvertElement. */ { - return Tcl_ScanCountedElement(string, -1, flagPtr); + int flags = CONVERT_ANY; + int numBytes = TclScanElement(src, length, &flags); + + *flagPtr = flags; + return numBytes; } /* *---------------------------------------------------------------------- * - * Tcl_ScanCountedElement -- + * TclScanElement -- * - * This function is a companion function to Tcl_ConvertCountedElement. It - * scans a string to see what needs to be done to it (e.g. add - * backslashes or enclosing braces) to make the string into a valid Tcl - * list element. If length is -1, then the string is scanned up to the - * first null byte. + * This function is a companion function to TclConvertElement. It scans a + * string to see what needs to be done to it (e.g. add backslashes or + * enclosing braces) to make the string into a valid Tcl list element. If + * length is -1, then the string is scanned from src up to the first null + * byte. A NULL value for src is treated as an empty string. The incoming + * value of *flagPtr is a report from the caller what additional flags it + * will pass to TclConvertElement(). * * Results: - * The return value is an overestimate of the number of characters that - * will be needed by Tcl_ConvertCountedElement to produce a valid list - * element from string. The word at *flagPtr is filled in with a value - * needed by Tcl_ConvertCountedElement when doing the actual conversion. + * The recommended formatting mode for the element is determined and a + * value is written to *flagPtr indicating that recommendation. This + * recommendation is combined with the incoming flag values in *flagPtr + * set by the caller to determine how many bytes will be needed by + * TclConvertElement() in which to write the formatted element following + * the recommendation modified by the flag values. This number of bytes + * is the return value of the routine. In some situations it may be an + * overestimate, but so long as the caller passes the same flags to + * TclConvertElement(), it will be large enough. * * Side effects: * None. @@ -653,115 +946,269 @@ Tcl_ScanElement( */ int -Tcl_ScanCountedElement( - const char *string, /* String to convert to Tcl list element. */ - int length, /* Number of bytes in string, or -1. */ +TclScanElement( + const char *src, /* String to convert to Tcl list element. */ + int length, /* Number of bytes in src, or -1. */ int *flagPtr) /* Where to store information to guide * Tcl_ConvertElement. */ { - int flags, nestingLevel; - register const char *p, *lastChar; - - /* - * This function and Tcl_ConvertElement together do two things: - * - * 1. They produce a proper list, one that will yield back the argument - * strings when evaluated or when disassembled with Tcl_SplitList. This - * is the most important thing. - * - * 2. They try to produce legible output, which means minimizing the use - * of backslashes (using braces instead). However, there are some - * situations where backslashes must be used (e.g. an element like - * "{abc": the leading brace will have to be backslashed. For each - * element, one of three things must be done: - * - * (a) Use the element as-is (it doesn't contain any special - * characters). This is the most desirable option. - * - * (b) Enclose the element in braces, but leave the contents alone. - * This happens if the element contains embedded space, or if it - * contains characters with special interpretation ($, [, ;, or \), - * or if it starts with a brace or double-quote, or if there are no - * characters in the element. - * - * (c) Don't enclose the element in braces, but add backslashes to - * prevent special interpretation of special characters. This is a - * last resort used when the argument would normally fall under - * case (b) but contains unmatched braces. It also occurs if the - * last character of the argument is a backslash or if the element - * contains a backslash followed by newline. - * - * The function figures out how many bytes will be needed to store the - * result (actually, it overestimates). It also collects information about - * the element in the form of a flags word. - * - * Note: list elements produced by this function and - * Tcl_ConvertCountedElement must have the property that they can be - * enclosing in curly braces to make sub-lists. This means, for example, - * that we must not leave unmatched curly braces in the resulting list - * element. This property is necessary in order for functions like - * Tcl_DStringStartSublist to work. - */ + const char *p = src; + int nestingLevel = 0; /* Brace nesting count */ + int forbidNone = 0; /* Do not permit CONVERT_NONE mode. Something + * needs protection or escape. */ + int requireEscape = 0; /* Force use of CONVERT_ESCAPE mode. For some + * reason bare or brace-quoted form fails. */ + int extra = 0; /* Count of number of extra bytes needed for + * formatted element, assuming we use escape + * sequences in formatting. */ + int bytesNeeded; /* Buffer length computed to complete the + * element formatting in the selected mode. */ +#if COMPAT + int preferEscape = 0; /* Use preferences to track whether to use */ + int preferBrace = 0; /* CONVERT_MASK mode. */ + int braceCount = 0; /* Count of all braces '{' '}' seen. */ +#endif /* COMPAT */ + + if ((p == NULL) || (length == 0) || ((*p == '\0') && (length == -1))) { + /* + * Empty string element must be brace quoted. + */ - nestingLevel = 0; - flags = 0; - if (string == NULL) { - string = ""; - } - if (length == -1) { - length = strlen(string); + *flagPtr = CONVERT_BRACE; + return 2; } - lastChar = string + length; - p = string; - if ((p == lastChar) || (*p == '{') || (*p == '"')) { - flags |= USE_BRACES; + + if ((*p == '{') || (*p == '"')) { + /* + * Must escape or protect so leading character of value is not + * misinterpreted as list element delimiting syntax. + */ + + forbidNone = 1; +#if COMPAT + preferBrace = 1; +#endif /* COMPAT */ } - for (; p < lastChar; p++) { + + while (length) { + if (CHAR_TYPE(*p) != TYPE_NORMAL) { switch (*p) { - case '{': + case '{': /* TYPE_BRACE */ +#if COMPAT + braceCount++; +#endif /* COMPAT */ + extra++; /* Escape '{' => '\{' */ nestingLevel++; break; - case '}': + case '}': /* TYPE_BRACE */ +#if COMPAT + braceCount++; +#endif /* COMPAT */ + extra++; /* Escape '}' => '\}' */ nestingLevel--; if (nestingLevel < 0) { - flags |= TCL_DONT_USE_BRACES|BRACES_UNMATCHED; + /* + * Unbalanced braces! Cannot format with brace quoting. + */ + + requireEscape = 1; } break; - case '[': - case '$': - case ';': - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': - flags |= USE_BRACES; + case ']': /* TYPE_CLOSE_BRACK */ + case '"': /* TYPE_SPACE */ +#if COMPAT + forbidNone = 1; + extra++; /* Escapes all just prepend a backslash */ + preferEscape = 1; break; - case '\\': - if ((p+1 == lastChar) || (p[1] == '\n')) { - flags = TCL_DONT_USE_BRACES | BRACES_UNMATCHED; - } else { - int size; +#else + /* FLOW THROUGH */ +#endif /* COMPAT */ + case '[': /* TYPE_SUBS */ + case '$': /* TYPE_SUBS */ + case ';': /* TYPE_COMMAND_END */ + case ' ': /* TYPE_SPACE */ + case '\f': /* TYPE_SPACE */ + case '\n': /* TYPE_COMMAND_END */ + case '\r': /* TYPE_SPACE */ + case '\t': /* TYPE_SPACE */ + case '\v': /* TYPE_SPACE */ + forbidNone = 1; + extra++; /* Escape sequences all one byte longer. */ +#if COMPAT + preferBrace = 1; +#endif /* COMPAT */ + break; + case '\\': /* TYPE_SUBS */ + extra++; /* Escape '\' => '\\' */ + if ((length == 1) || ((length == -1) && (p[1] == '\0'))) { + /* + * Final backslash. Cannot format with brace quoting. + */ + + requireEscape = 1; + break; + } + if (p[1] == '\n') { + extra++; /* Escape newline => '\n', one byte longer */ + + /* + * Backslash newline sequence. Brace quoting not permitted. + */ - Tcl_UtfBackslash(p, &size, NULL); - p += size-1; - flags |= USE_BRACES; + requireEscape = 1; + length -= (length > 0); + p++; + break; } + if ((p[1] == '{') || (p[1] == '}') || (p[1] == '\\')) { + extra++; /* Escape sequences all one byte longer. */ + length -= (length > 0); + p++; + } + forbidNone = 1; +#if COMPAT + preferBrace = 1; +#endif /* COMPAT */ + break; + case '\0': /* TYPE_SUBS */ + if (length == -1) { + goto endOfString; + } + /* TODO: Panic on improper encoding? */ break; } + } + length -= (length > 0); + p++; } + + endOfString: if (nestingLevel != 0) { - flags = TCL_DONT_USE_BRACES | BRACES_UNMATCHED; + /* + * Unbalanced braces! Cannot format with brace quoting. + */ + + requireEscape = 1; } - *flagPtr = flags; /* - * Allow enough space to backslash every character plus leave two spaces - * for braces. + * We need at least as many bytes as are in the element value... */ - return 2*(p-string) + 2; + bytesNeeded = p - src; + + if (requireEscape) { + /* + * We must use escape sequences. Add all the extra bytes needed to + * have room to create them. + */ + + bytesNeeded += extra; + + /* + * Make room to escape leading #, if needed. + */ + + if ((*src == '#') && !(*flagPtr & TCL_DONT_QUOTE_HASH)) { + bytesNeeded++; + } + *flagPtr = CONVERT_ESCAPE; + goto overflowCheck; + } + if (*flagPtr & CONVERT_ANY) { + /* + * The caller has not let us know what flags it will pass to + * TclConvertElement() so compute the max size we might need for any + * possible choice. Normally the formatting using escape sequences is + * the longer one, and a minimum "extra" value of 2 makes sure we + * don't request too small a buffer in those edge cases where that's + * not true. + */ + + if (extra < 2) { + extra = 2; + } + *flagPtr &= ~CONVERT_ANY; + *flagPtr |= TCL_DONT_USE_BRACES; + } + if (forbidNone) { + /* + * We must request some form of quoting of escaping... + */ + +#if COMPAT + if (preferEscape && !preferBrace) { + /* + * If we are quoting solely due to ] or internal " characters use + * the CONVERT_MASK mode where we escape all special characters + * except for braces. "extra" counted space needed to escape + * braces too, so substract "braceCount" to get our actual needs. + */ + + bytesNeeded += (extra - braceCount); + /* Make room to escape leading #, if needed. */ + if ((*src == '#') && !(*flagPtr & TCL_DONT_QUOTE_HASH)) { + bytesNeeded++; + } + + /* + * If the caller reports it will direct TclConvertElement() to + * use full escapes on the element, add back the bytes needed to + * escape the braces. + */ + + if (*flagPtr & TCL_DONT_USE_BRACES) { + bytesNeeded += braceCount; + } + *flagPtr = CONVERT_MASK; + goto overflowCheck; + } +#endif /* COMPAT */ + if (*flagPtr & TCL_DONT_USE_BRACES) { + /* + * If the caller reports it will direct TclConvertElement() to + * use escapes, add the extra bytes needed to have room for them. + */ + + bytesNeeded += extra; + + /* + * Make room to escape leading #, if needed. + */ + + if ((*src == '#') && !(*flagPtr & TCL_DONT_QUOTE_HASH)) { + bytesNeeded++; + } + } else { + /* + * Add 2 bytes for room for the enclosing braces. + */ + + bytesNeeded += 2; + } + *flagPtr = CONVERT_BRACE; + goto overflowCheck; + } + + /* + * So far, no need to quote or escape anything. + */ + + if ((*src == '#') && !(*flagPtr & TCL_DONT_QUOTE_HASH)) { + /* + * If we need to quote a leading #, make room to enclose in braces. + */ + + bytesNeeded += 2; + } + *flagPtr = CONVERT_NONE; + + overflowCheck: + if (bytesNeeded < 0) { + Tcl_Panic("TclScanElement: string length overflow"); + } + return bytesNeeded; } /* @@ -822,125 +1269,191 @@ Tcl_ConvertCountedElement( char *dst, /* Place to put list-ified element. */ int flags) /* Flags produced by Tcl_ScanElement. */ { - register char *p = dst; - register const char *lastChar; + int numBytes = TclConvertElement(src, length, dst, flags); + dst[numBytes] = '\0'; + return numBytes; +} + +/* + *---------------------------------------------------------------------- + * + * TclConvertElement -- + * + * This is a companion function to TclScanElement. Given the information + * produced by TclScanElement, this function converts a string to a list + * element equal to that string. + * + * Results: + * Information is copied to *dst in the form of a list element identical + * to src (i.e. if Tcl_SplitList is applied to dst it will produce a + * string identical to src). The return value is a count of the number of + * characters copied (not including the terminating NULL character). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclConvertElement( + register const char *src, /* Source information for list element. */ + int length, /* Number of bytes in src, or -1. */ + char *dst, /* Place to put list-ified element. */ + int flags) /* Flags produced by Tcl_ScanElement. */ +{ + int conversion = flags & CONVERT_MASK; + char *p = dst; /* - * See the comment block at the beginning of the Tcl_ScanElement code for - * details of how this works. + * Let the caller demand we use escape sequences rather than braces. */ - if (src && length == -1) { - length = strlen(src); + if ((flags & TCL_DONT_USE_BRACES) && (conversion & CONVERT_BRACE)) { + conversion = CONVERT_ESCAPE; } - if ((src == NULL) || (length == 0)) { - p[0] = '{'; - p[1] = '}'; - p[2] = 0; - return 2; + + /* + * No matter what the caller demands, empty string must be braced! + */ + + if ((src == NULL) || (length == 0) || (*src == '\0' && length == -1)) { + src = tclEmptyStringRep; + length = 0; + conversion = CONVERT_BRACE; } - lastChar = src + length; + + /* + * Escape leading hash as needed and requested. + */ + if ((*src == '#') && !(flags & TCL_DONT_QUOTE_HASH)) { - flags |= USE_BRACES; + if (conversion == CONVERT_ESCAPE) { + p[0] = '\\'; + p[1] = '#'; + p += 2; + src++; + length -= (length > 0); + } else { + conversion = CONVERT_BRACE; + } + } + + /* + * No escape or quoting needed. Copy the literal string value. + */ + + if (conversion == CONVERT_NONE) { + if (length == -1) { + /* TODO: INT_MAX overflow? */ + while (*src) { + *p++ = *src++; + } + return p - dst; + } else { + memcpy(dst, src, length); + return length; + } } - if ((flags & USE_BRACES) && !(flags & TCL_DONT_USE_BRACES)) { + + /* + * Formatted string is original string enclosed in braces. + */ + + if (conversion == CONVERT_BRACE) { *p = '{'; p++; - for (; src != lastChar; src++, p++) { - *p = *src; + if (length == -1) { + /* TODO: INT_MAX overflow? */ + while (*src) { + *p++ = *src++; + } + } else { + memcpy(p, src, length); + p += length; } *p = '}'; p++; - } else { - if (*src == '{') { - /* - * Can't have a leading brace unless the whole element is enclosed - * in braces. Add a backslash before the brace. Furthermore, this - * may destroy the balance between open and close braces, so set - * BRACES_UNMATCHED. - */ + return p - dst; + } - p[0] = '\\'; - p[1] = '{'; - p += 2; - src++; - flags |= BRACES_UNMATCHED; - } else if ((*src == '#') && !(flags & TCL_DONT_QUOTE_HASH)) { - /* - * Leading '#' could be seen by [eval] as the start of a comment, - * if on the first element of a list, so quote it. - */ + /* conversion == CONVERT_ESCAPE or CONVERT_MASK */ - p[0] = '\\'; - p[1] = '#'; - p += 2; - src++; - } - for (; src != lastChar; src++) { - switch (*src) { - case ']': - case '[': - case '$': - case ';': - case ' ': - case '\\': - case '"': - *p = '\\'; - p++; - break; - case '{': - case '}': - /* - * It may not seem necessary to backslash braces, but it is. - * The reason for this is that the resulting list element may - * actually be an element of a sub-list enclosed in braces - * (e.g. if Tcl_DStringStartSublist has been invoked), so - * there may be a brace mismatch if the braces aren't - * backslashed. - */ + /* + * Formatted string is original string converted to escape sequences. + */ - if (flags & BRACES_UNMATCHED) { - *p = '\\'; - p++; - } - break; - case '\f': - *p = '\\'; - p++; - *p = 'f'; - p++; - continue; - case '\n': - *p = '\\'; - p++; - *p = 'n'; - p++; - continue; - case '\r': - *p = '\\'; - p++; - *p = 'r'; - p++; - continue; - case '\t': - *p = '\\'; - p++; - *p = 't'; - p++; - continue; - case '\v': + for ( ; length; src++, length -= (length > 0)) { + switch (*src) { + case ']': + case '[': + case '$': + case ';': + case ' ': + case '\\': + case '"': + *p = '\\'; + p++; + break; + case '{': + case '}': +#if COMPAT + if (conversion == CONVERT_ESCAPE) +#endif /* COMPAT */ + { *p = '\\'; p++; - *p = 'v'; - p++; - continue; } - *p = *src; + break; + case '\f': + *p = '\\'; + p++; + *p = 'f'; + p++; + continue; + case '\n': + *p = '\\'; + p++; + *p = 'n'; + p++; + continue; + case '\r': + *p = '\\'; + p++; + *p = 'r'; + p++; + continue; + case '\t': + *p = '\\'; p++; + *p = 't'; + p++; + continue; + case '\v': + *p = '\\'; + p++; + *p = 'v'; + p++; + continue; + case '\0': + if (length == -1) { + return p - dst; + } + + /* + * If we reach this point, there's an embedded NULL in the string + * range being processed, which should not happen when the + * encoding rules for Tcl strings are properly followed. If the + * day ever comes when we stop tolerating such things, this is + * where to put the Tcl_Panic(). + */ + + break; } + *p = *src; + p++; } - *p = '\0'; - return p-dst; + return p - dst; } /* @@ -968,12 +1481,22 @@ Tcl_Merge( int argc, /* How many strings to merge. */ const char *const *argv) /* Array of string values. */ { -# define LOCAL_SIZE 20 - int localFlags[LOCAL_SIZE], *flagPtr; - int numChars; - char *result; - char *dst; - int i; +#define LOCAL_SIZE 20 + int localFlags[LOCAL_SIZE], *flagPtr = NULL; + int i, bytesNeeded = 0; + char *result, *dst; + const int maxFlags = UINT_MAX / sizeof(int); + + /* + * Handle empty list case first, so logic of the general case can be + * simpler. + */ + + if (argc == 0) { + result = ckalloc(1); + result[0] = '\0'; + return result; + } /* * Pass 1: estimate space, gather flags. @@ -981,35 +1504,51 @@ Tcl_Merge( if (argc <= LOCAL_SIZE) { flagPtr = localFlags; + } else if (argc > maxFlags) { + /* + * We cannot allocate a large enough flag array to format this list in + * one pass. We could imagine converting this routine to a multi-pass + * implementation, but for sizeof(int) == 4, the limit is a max of + * 2^30 list elements and since each element is at least one byte + * formatted, and requires one byte space between it and the next one, + * that a minimum space requirement of 2^31 bytes, which is already + * INT_MAX. If we tried to format a list of > maxFlags elements, we're + * just going to overflow the size limits on the formatted string + * anyway, so just issue that same panic early. + */ + + Tcl_Panic("max size for a Tcl value (%d bytes) exceeded", INT_MAX); } else { - flagPtr = (int *) ckalloc((unsigned) argc*sizeof(int)); + flagPtr = ckalloc(argc * sizeof(int)); } - numChars = 1; for (i = 0; i < argc; i++) { - numChars += Tcl_ScanElement(argv[i], &flagPtr[i]) + 1; + flagPtr[i] = ( i ? TCL_DONT_QUOTE_HASH : 0 ); + bytesNeeded += TclScanElement(argv[i], -1, &flagPtr[i]); + if (bytesNeeded < 0) { + Tcl_Panic("max size for a Tcl value (%d bytes) exceeded", INT_MAX); + } } + if (bytesNeeded > INT_MAX - argc + 1) { + Tcl_Panic("max size for a Tcl value (%d bytes) exceeded", INT_MAX); + } + bytesNeeded += argc; /* * Pass two: copy into the result area. */ - result = (char *) ckalloc((unsigned) numChars); + result = ckalloc(bytesNeeded); dst = result; for (i = 0; i < argc; i++) { - numChars = Tcl_ConvertElement(argv[i], dst, - flagPtr[i] | (i==0 ? 0 : TCL_DONT_QUOTE_HASH)); - dst += numChars; + flagPtr[i] |= ( i ? TCL_DONT_QUOTE_HASH : 0 ); + dst += TclConvertElement(argv[i], -1, dst, flagPtr[i]); *dst = ' '; dst++; } - if (dst == result) { - *dst = 0; - } else { - dst[-1] = 0; - } + dst[-1] = 0; if (flagPtr != localFlags) { - ckfree((char *) flagPtr); + ckfree(flagPtr); } return result; } @@ -1051,6 +1590,167 @@ Tcl_Backslash( /* *---------------------------------------------------------------------- * + * TclTrimRight -- + * + * Takes two counted strings in the Tcl encoding which must both be null + * terminated. Conceptually trims from the right side of the first string + * all characters found in the second string. + * + * Results: + * The number of bytes to be removed from the end of the string. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclTrimRight( + const char *bytes, /* String to be trimmed... */ + int numBytes, /* ...and its length in bytes */ + const char *trim, /* String of trim characters... */ + int numTrim) /* ...and its length in bytes */ +{ + const char *p = bytes + numBytes; + int pInc; + + if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) { + Tcl_Panic("TclTrimRight works only on null-terminated strings"); + } + + /* + * Empty strings -> nothing to do. + */ + + if ((numBytes == 0) || (numTrim == 0)) { + return 0; + } + + /* + * Outer loop: iterate over string to be trimmed. + */ + + do { + Tcl_UniChar ch1; + const char *q = trim; + int bytesLeft = numTrim; + + p = Tcl_UtfPrev(p, bytes); + pInc = TclUtfToUniChar(p, &ch1); + + /* + * Inner loop: scan trim string for match to current character. + */ + + do { + Tcl_UniChar ch2; + int qInc = TclUtfToUniChar(q, &ch2); + + if (ch1 == ch2) { + break; + } + + q += qInc; + bytesLeft -= qInc; + } while (bytesLeft); + + if (bytesLeft == 0) { + /* + * No match; trim task done; *p is last non-trimmed char. + */ + + p += pInc; + break; + } + } while (p > bytes); + + return numBytes - (p - bytes); +} + +/* + *---------------------------------------------------------------------- + * + * TclTrimLeft -- + * + * Takes two counted strings in the Tcl encoding which must both be null + * terminated. Conceptually trims from the left side of the first string + * all characters found in the second string. + * + * Results: + * The number of bytes to be removed from the start of the string. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclTrimLeft( + const char *bytes, /* String to be trimmed... */ + int numBytes, /* ...and its length in bytes */ + const char *trim, /* String of trim characters... */ + int numTrim) /* ...and its length in bytes */ +{ + const char *p = bytes; + + if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) { + Tcl_Panic("TclTrimLeft works only on null-terminated strings"); + } + + /* + * Empty strings -> nothing to do. + */ + + if ((numBytes == 0) || (numTrim == 0)) { + return 0; + } + + /* + * Outer loop: iterate over string to be trimmed. + */ + + do { + Tcl_UniChar ch1; + int pInc = TclUtfToUniChar(p, &ch1); + const char *q = trim; + int bytesLeft = numTrim; + + /* + * Inner loop: scan trim string for match to current character. + */ + + do { + Tcl_UniChar ch2; + int qInc = TclUtfToUniChar(q, &ch2); + + if (ch1 == ch2) { + break; + } + + q += qInc; + bytesLeft -= qInc; + } while (bytesLeft); + + if (bytesLeft == 0) { + /* + * No match; trim task done; *p is first non-trimmed char. + */ + + break; + } + + p += pInc; + numBytes -= pInc; + } while (numBytes); + + return p - bytes; +} + +/* + *---------------------------------------------------------------------- + * * Tcl_Concat -- * * Concatenate a set of strings into a single large string. @@ -1067,56 +1767,97 @@ Tcl_Backslash( *---------------------------------------------------------------------- */ +/* The whitespace characters trimmed during [concat] operations */ +#define CONCAT_WS " \f\v\r\t\n" +#define CONCAT_WS_SIZE (int) (sizeof(CONCAT_WS "") - 1) + char * Tcl_Concat( int argc, /* Number of strings to concatenate. */ const char *const *argv) /* Array of strings to concatenate. */ { - int totalSize, i; - char *p; - char *result; + int i, needSpace = 0, bytesNeeded = 0; + char *result, *p; + + /* + * Dispose of the empty result corner case first to simplify later code. + */ - for (totalSize = 1, i = 0; i < argc; i++) { - totalSize += strlen(argv[i]) + 1; - } - result = (char *) ckalloc((unsigned) totalSize); if (argc == 0) { - *result = '\0'; + result = (char *) ckalloc(1); + result[0] = '\0'; return result; } - for (p = result, i = 0; i < argc; i++) { - const char *element; - int length; + /* + * First allocate the result buffer at the size required. + */ + + for (i = 0; i < argc; i++) { + bytesNeeded += strlen(argv[i]); + if (bytesNeeded < 0) { + Tcl_Panic("Tcl_Concat: max size of Tcl value exceeded"); + } + } + if (bytesNeeded + argc - 1 < 0) { /* - * Clip white space off the front and back of the string to generate a - * neater result, and ignore any empty elements. + * Panic test could be tighter, but not going to bother for this + * legacy routine. */ + Tcl_Panic("Tcl_Concat: max size of Tcl value exceeded"); + } + + /* + * All element bytes + (argc - 1) spaces + 1 terminating NULL. + */ + + result = ckalloc((unsigned) (bytesNeeded + argc)); + + for (p = result, i = 0; i < argc; i++) { + int trim, elemLength; + const char *element; + element = argv[i]; - while (isspace(UCHAR(*element))) { /* INTL: ISO space. */ - element++; - } - for (length = strlen(element); - (length > 0) - && (isspace(UCHAR(element[length-1]))) /* INTL: ISO space. */ - && ((length < 2) || (element[length-2] != '\\')); - length--) { - /* Null loop body. */ - } - if (length == 0) { + elemLength = strlen(argv[i]); + + /* + * Trim away the leading whitespace. + */ + + trim = TclTrimLeft(element, elemLength, CONCAT_WS, CONCAT_WS_SIZE); + element += trim; + elemLength -= trim; + + /* + * Trim away the trailing whitespace. Do not permit trimming to expose + * a final backslash character. + */ + + trim = TclTrimRight(element, elemLength, CONCAT_WS, CONCAT_WS_SIZE); + trim -= trim && (element[elemLength - trim - 1] == '\\'); + elemLength -= trim; + + /* + * If we're left with empty element after trimming, do nothing. + */ + + if (elemLength == 0) { continue; } - memcpy(p, element, (size_t) length); - p += length; - *p = ' '; - p++; - } - if (p != result) { - p[-1] = 0; - } else { - *p = 0; + + /* + * Append to the result with space if needed. + */ + + if (needSpace) { + *p++ = ' '; + } + memcpy(p, element, (size_t) elemLength); + p += elemLength; + needSpace = 1; } + *p = '\0'; return result; } @@ -1143,64 +1884,39 @@ Tcl_ConcatObj( int objc, /* Number of objects to concatenate. */ Tcl_Obj *const objv[]) /* Array of objects to concatenate. */ { - int allocSize, finalSize, length, elemLength, i; - char *p; + int i, elemLength, needSpace = 0, bytesNeeded = 0; const char *element; - char *concatStr; Tcl_Obj *objPtr, *resPtr; /* * Check first to see if all the items are of list type or empty. If so, * we will concat them together as lists, and return a list object. This - * is only valid when the lists have no current string representation, - * since we don't know what the original type was. An original string rep - * may have lost some whitespace info when converted which could be - * important. + * is only valid when the lists are in canonical form. */ for (i = 0; i < objc; i++) { - List *listRepPtr; + int length; objPtr = objv[i]; - if (objPtr->typePtr != &tclListType) { - TclGetString(objPtr); - if (objPtr->length) { - break; - } else { - continue; - } + if (TclListObjIsCanonical(objPtr)) { + continue; } - listRepPtr = (List *) objPtr->internalRep.twoPtrValue.ptr1; - if (objPtr->bytes != NULL && !listRepPtr->canonicalFlag) { + Tcl_GetStringFromObj(objPtr, &length); + if (length > 0) { break; } } if (i == objc) { - Tcl_Obj **listv; - int listc; - resPtr = NULL; for (i = 0; i < objc; i++) { - /* - * Tcl_ListObjAppendList could be used here, but this saves us a - * bit of type checking (since we've already done it). Use of - * INT_MAX tells us to always put the new stuff on the end. It - * will be set right in Tcl_ListObjReplace. - * Note that all objs at this point are either lists or have an - * empty string rep. - */ - objPtr = objv[i]; - if (objPtr->bytes && !objPtr->length) { + if (objPtr->bytes && objPtr->length == 0) { continue; } - TclListObjGetElements(NULL, objPtr, &listc, &listv); - if (listc) { - if (resPtr) { - Tcl_ListObjReplace(NULL, resPtr, INT_MAX, 0, listc, listv); - } else { - resPtr = TclListObjCopy(NULL, objPtr); - } + if (resPtr) { + Tcl_ListObjAppendList(NULL, resPtr, objPtr); + } else { + resPtr = TclListObjCopy(NULL, objPtr); } } if (!resPtr) { @@ -1212,81 +1928,69 @@ Tcl_ConcatObj( /* * Something cannot be determined to be safe, so build the concatenation * the slow way, using the string representations. + * + * First try to pre-allocate the size required. */ - allocSize = 0; for (i = 0; i < objc; i++) { - objPtr = objv[i]; - element = TclGetStringFromObj(objPtr, &length); - if ((element != NULL) && (length > 0)) { - allocSize += (length + 1); + element = TclGetStringFromObj(objv[i], &elemLength); + bytesNeeded += elemLength; + if (bytesNeeded < 0) { + break; } } - if (allocSize == 0) { - allocSize = 1; /* enough for the NULL byte at end */ - } /* - * Allocate storage for the concatenated result. Note that allocSize is - * one more than the total number of characters, and so includes room for - * the terminating NULL byte. + * Does not matter if this fails, will simply try later to build up the + * string with each Append reallocating as needed with the usual string + * append algorithm. When that fails it will report the error. */ - concatStr = ckalloc((unsigned) allocSize); + TclNewObj(resPtr); + Tcl_AttemptSetObjLength(resPtr, bytesNeeded + objc - 1); + Tcl_SetObjLength(resPtr, 0); - /* - * Now concatenate the elements. Clip white space off the front and back - * to generate a neater result, and ignore any empty elements. Also put a - * null byte at the end. - */ + for (i = 0; i < objc; i++) { + int trim; + + element = TclGetStringFromObj(objv[i], &elemLength); - finalSize = 0; - if (objc == 0) { - *concatStr = '\0'; - } else { - p = concatStr; - for (i = 0; i < objc; i++) { - objPtr = objv[i]; - element = TclGetStringFromObj(objPtr, &elemLength); - while ((elemLength > 0) && (UCHAR(*element) < 127) - && isspace(UCHAR(*element))) { /* INTL: ISO C space. */ - element++; - elemLength--; - } + /* + * Trim away the leading whitespace. + */ - /* - * Trim trailing white space. But, be careful not to trim a space - * character if it is preceded by a backslash: in this case it - * could be significant. - */ + trim = TclTrimLeft(element, elemLength, CONCAT_WS, CONCAT_WS_SIZE); + element += trim; + elemLength -= trim; - while ((elemLength > 0) && (UCHAR(element[elemLength-1]) < 127) - && isspace(UCHAR(element[elemLength-1])) - /* INTL: ISO C space. */ - && ((elemLength < 2) || (element[elemLength-2] != '\\'))) { - elemLength--; - } - if (elemLength == 0) { - continue; /* nothing left of this element */ - } - memcpy(p, element, (size_t) elemLength); - p += elemLength; - *p = ' '; - p++; - finalSize += (elemLength + 1); + /* + * Trim away the trailing whitespace. Do not permit trimming to expose + * a final backslash character. + */ + + trim = TclTrimRight(element, elemLength, CONCAT_WS, CONCAT_WS_SIZE); + trim -= trim && (element[elemLength - trim - 1] == '\\'); + elemLength -= trim; + + /* + * If we're left with empty element after trimming, do nothing. + */ + + if (elemLength == 0) { + continue; } - if (p != concatStr) { - p[-1] = 0; - finalSize -= 1; /* we overwrote the final ' ' */ - } else { - *p = 0; + + /* + * Append to the result with space if needed. + */ + + if (needSpace) { + Tcl_AppendToObj(resPtr, " ", 1); } + Tcl_AppendToObj(resPtr, element, elemLength); + needSpace = 1; } - - TclNewObj(objPtr); - objPtr->bytes = concatStr; - objPtr->length = finalSize; - return objPtr; + return resPtr; } /* @@ -1683,6 +2387,7 @@ TclByteArrayMatch( /* * Matches ranges of form [a-z] or [z-a]. */ + break; } } else if (startChar == ch1) { @@ -1729,9 +2434,9 @@ TclByteArrayMatch( * * TclStringMatchObj -- * - * See if a particular string matches a particular pattern. - * Allows case insensitivity. This is the generic multi-type handler - * for the various matching algorithms. + * See if a particular string matches a particular pattern. Allows case + * insensitivity. This is the generic multi-type handler for the various + * matching algorithms. * * Results: * The return value is 1 if string matches pattern, and 0 otherwise. The @@ -1835,8 +2540,6 @@ Tcl_DStringAppend( * at end. */ { int newSize; - char *dst; - const char *end; if (length < 0) { length = strlen(bytes); @@ -1852,13 +2555,12 @@ Tcl_DStringAppend( if (newSize >= dsPtr->spaceAvl) { dsPtr->spaceAvl = newSize * 2; if (dsPtr->string == dsPtr->staticSpace) { - char *newString = ckalloc((unsigned) dsPtr->spaceAvl); + char *newString = ckalloc(dsPtr->spaceAvl); memcpy(newString, dsPtr->string, (size_t) dsPtr->length); dsPtr->string = newString; } else { - dsPtr->string = ckrealloc((void *) dsPtr->string, - (size_t) dsPtr->spaceAvl); + dsPtr->string = ckrealloc(dsPtr->string, dsPtr->spaceAvl); } } @@ -1866,18 +2568,46 @@ Tcl_DStringAppend( * Copy the new string into the buffer at the end of the old one. */ - for (dst = dsPtr->string + dsPtr->length, end = bytes+length; - bytes < end; bytes++, dst++) { - *dst = *bytes; - } - *dst = '\0'; + memcpy(dsPtr->string + dsPtr->length, bytes, length); dsPtr->length += length; + dsPtr->string[dsPtr->length] = '\0'; return dsPtr->string; } /* *---------------------------------------------------------------------- * + * TclDStringAppendObj, TclDStringAppendDString -- + * + * Simple wrappers round Tcl_DStringAppend that make it easier to append + * from particular sources of strings. + * + *---------------------------------------------------------------------- + */ + +char * +TclDStringAppendObj( + Tcl_DString *dsPtr, + Tcl_Obj *objPtr) +{ + int length; + char *bytes = Tcl_GetStringFromObj(objPtr, &length); + + return Tcl_DStringAppend(dsPtr, bytes, length); +} + +char * +TclDStringAppendDString( + Tcl_DString *dsPtr, + Tcl_DString *toAppendPtr) +{ + return Tcl_DStringAppend(dsPtr, Tcl_DStringValue(toAppendPtr), + Tcl_DStringLength(toAppendPtr)); +} + +/* + *---------------------------------------------------------------------- + * * Tcl_DStringAppendElement -- * * Append a list element to the current value of a dynamic string. @@ -1899,12 +2629,11 @@ Tcl_DStringAppendElement( const char *element) /* String to append. Must be * null-terminated. */ { - int newSize, flags, strSize; - char *dst; - - strSize = ((element== NULL) ? 0 : strlen(element)); - newSize = Tcl_ScanCountedElement(element, strSize, &flags) - + dsPtr->length + 1; + char *dst = dsPtr->string + dsPtr->length; + int needSpace = TclNeedSpace(dsPtr->string, dst); + int flags = needSpace ? TCL_DONT_QUOTE_HASH : 0; + int newSize = dsPtr->length + needSpace + + TclScanElement(element, -1, &flags); /* * Allocate a larger buffer for the string if the current one isn't large @@ -1917,14 +2646,14 @@ Tcl_DStringAppendElement( if (newSize >= dsPtr->spaceAvl) { dsPtr->spaceAvl = newSize * 2; if (dsPtr->string == dsPtr->staticSpace) { - char *newString = ckalloc((unsigned) dsPtr->spaceAvl); + char *newString = ckalloc(dsPtr->spaceAvl); memcpy(newString, dsPtr->string, (size_t) dsPtr->length); dsPtr->string = newString; } else { - dsPtr->string = (char *) ckrealloc((void *) dsPtr->string, - (size_t) dsPtr->spaceAvl); + dsPtr->string = ckrealloc(dsPtr->string, dsPtr->spaceAvl); } + dst = dsPtr->string + dsPtr->length; } /* @@ -1932,8 +2661,7 @@ Tcl_DStringAppendElement( * the end, with a space, if needed. */ - dst = dsPtr->string + dsPtr->length; - if (TclNeedSpace(dsPtr->string, dst)) { + if (needSpace) { *dst = ' '; dst++; dsPtr->length++; @@ -1946,7 +2674,8 @@ Tcl_DStringAppendElement( flags |= TCL_DONT_QUOTE_HASH; } - dsPtr->length += Tcl_ConvertCountedElement(element, strSize, dst, flags); + dsPtr->length += TclConvertElement(element, -1, dst, flags); + dsPtr->string[dsPtr->length] = '\0'; return dsPtr->string; } @@ -1999,13 +2728,12 @@ Tcl_DStringSetLength( dsPtr->spaceAvl = length + 1; } if (dsPtr->string == dsPtr->staticSpace) { - char *newString = ckalloc((unsigned) dsPtr->spaceAvl); + char *newString = ckalloc(dsPtr->spaceAvl); memcpy(newString, dsPtr->string, (size_t) dsPtr->length); dsPtr->string = newString; } else { - dsPtr->string = (char *) ckrealloc((void *) dsPtr->string, - (size_t) dsPtr->spaceAvl); + dsPtr->string = ckrealloc(dsPtr->string, dsPtr->spaceAvl); } } dsPtr->length = length; @@ -2068,23 +2796,8 @@ Tcl_DStringResult( Tcl_DString *dsPtr) /* Dynamic string that is to become the * result of interp. */ { - Interp *iPtr = (Interp *) interp; Tcl_ResetResult(interp); - - if (dsPtr->string != dsPtr->staticSpace) { - iPtr->result = dsPtr->string; - iPtr->freeProc = TCL_DYNAMIC; - } else if (dsPtr->length < TCL_RESULT_SIZE) { - iPtr->result = iPtr->resultSpace; - strcpy(iPtr->result, dsPtr->string); - } else { - Tcl_SetResult(interp, dsPtr->string, TCL_VOLATILE); - } - - dsPtr->string = dsPtr->staticSpace; - dsPtr->length = 0; - dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; - dsPtr->staticSpace[0] = '\0'; + Tcl_SetObjResult(interp, TclDStringToObj(dsPtr)); } /* @@ -2120,6 +2833,39 @@ Tcl_DStringGetResult( } /* + * Do more efficient transfer when we know the result is a Tcl_Obj. When + * there's no st`ring result, we only have to deal with two cases: + * + * 1. When the string rep is the empty string, when we don't copy but + * instead use the staticSpace in the DString to hold an empty string. + + * 2. When the string rep is not there or there's a real string rep, when + * we use Tcl_GetString to fetch (or generate) the string rep - which + * we know to have been allocated with ckalloc() - and use it to + * populate the DString space. Then, we free the internal rep. and set + * the object's string representation back to the canonical empty + * string. + */ + + if (!iPtr->result[0] && iPtr->objResultPtr + && !Tcl_IsShared(iPtr->objResultPtr)) { + if (iPtr->objResultPtr->bytes == tclEmptyStringRep) { + dsPtr->string = dsPtr->staticSpace; + dsPtr->string[0] = 0; + dsPtr->length = 0; + dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; + } else { + dsPtr->string = Tcl_GetString(iPtr->objResultPtr); + dsPtr->length = iPtr->objResultPtr->length; + dsPtr->spaceAvl = dsPtr->length + 1; + TclFreeIntRep(iPtr->objResultPtr); + iPtr->objResultPtr->bytes = tclEmptyStringRep; + iPtr->objResultPtr->length = 0; + } + return; + } + + /* * If the string result is empty, move the object result to the string * result, then reset the object result. */ @@ -2132,7 +2878,7 @@ Tcl_DStringGetResult( dsPtr->string = iPtr->result; dsPtr->spaceAvl = dsPtr->length+1; } else { - dsPtr->string = (char *) ckalloc((unsigned) dsPtr->length+1); + dsPtr->string = ckalloc(dsPtr->length+1); memcpy(dsPtr->string, iPtr->result, (unsigned) dsPtr->length+1); iPtr->freeProc(iPtr->result); } @@ -2143,7 +2889,7 @@ Tcl_DStringGetResult( dsPtr->string = dsPtr->staticSpace; dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; } else { - dsPtr->string = (char *) ckalloc((unsigned) dsPtr->length+1); + dsPtr->string = ckalloc(dsPtr->length+1); dsPtr->spaceAvl = dsPtr->length + 1; } memcpy(dsPtr->string, iPtr->result, (unsigned) dsPtr->length+1); @@ -2156,6 +2902,64 @@ Tcl_DStringGetResult( /* *---------------------------------------------------------------------- * + * TclDStringToObj -- + * + * This function moves a dynamic string's contents to a new Tcl_Obj. Be + * aware that this function does *not* check that the encoding of the + * contents of the dynamic string is correct; this is the caller's + * responsibility to enforce. + * + * Results: + * The newly-allocated untyped (i.e., typePtr==NULL) Tcl_Obj with a + * reference count of zero. + * + * Side effects: + * The string is "moved" to the object. dsPtr is reinitialized to an + * empty string; it does not need to be Tcl_DStringFree'd after this if + * not used further. + * + *---------------------------------------------------------------------- + */ + +Tcl_Obj * +TclDStringToObj( + Tcl_DString *dsPtr) +{ + Tcl_Obj *result; + + if (dsPtr->length == 0) { + TclNewObj(result); + } else if (dsPtr->string == dsPtr->staticSpace) { + /* + * Static buffer, so must copy. + */ + + TclNewStringObj(result, dsPtr->string, dsPtr->length); + } else { + /* + * Dynamic buffer, so transfer ownership and reset. + */ + + TclNewObj(result); + result->bytes = dsPtr->string; + result->length = dsPtr->length; + } + + /* + * Re-establish the DString as empty with no buffer allocated. + */ + + dsPtr->string = dsPtr->staticSpace; + dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; + dsPtr->length = 0; + dsPtr->staticSpace[0] = '\0'; + + return result; +} + +/* + *---------------------------------------------------------------------- + * * Tcl_DStringStartSublist -- * * This function adds the necessary information to a dynamic string @@ -2176,9 +2980,9 @@ Tcl_DStringStartSublist( Tcl_DString *dsPtr) /* Dynamic string. */ { if (TclNeedSpace(dsPtr->string, dsPtr->string + dsPtr->length)) { - Tcl_DStringAppend(dsPtr, " {", -1); + TclDStringAppendLiteral(dsPtr, " {"); } else { - Tcl_DStringAppend(dsPtr, "{", -1); + TclDStringAppendLiteral(dsPtr, "{"); } } @@ -2204,7 +3008,7 @@ void Tcl_DStringEndSublist( Tcl_DString *dsPtr) /* Dynamic string. */ { - Tcl_DStringAppend(dsPtr, "}", -1); + TclDStringAppendLiteral(dsPtr, "}"); } /* @@ -2239,125 +3043,148 @@ Tcl_PrintDouble( char *p, c; int exponent; int signum; - char buffer[TCL_DOUBLE_SPACE]; - Tcl_UniChar ch; - - int *precisionPtr = Tcl_GetThreadData(&precisionKey, (int)sizeof(int)); + char *digits; + char *end; + int *precisionPtr = Tcl_GetThreadData(&precisionKey, (int) sizeof(int)); /* - * If *precisionPtr == 0, then use TclDoubleDigits to develop a decimal - * significand and exponent, then format it in E or F format as - * appropriate. If *precisionPtr != 0, use the native sprintf and then add - * a trailing ".0" if there is no decimal point in the rep. + * Handle NaN. */ + + if (TclIsNaN(value)) { + TclFormatNaN(value, dst); + return; + } - if (*precisionPtr == 0) { + /* + * Handle infinities. + */ + + if (TclIsInfinite(value)) { /* - * Handle NaN. + * Remember to copy the terminating NUL too. */ - - if (TclIsNaN(value)) { - TclFormatNaN(value, dst); - return; + + if (value < 0) { + memcpy(dst, "-Inf", 5); + } else { + memcpy(dst, "Inf", 4); } + return; + } + /* + * Ordinary (normal and denormal) values. + */ + + if (*precisionPtr == 0) { + digits = TclDoubleDigits(value, -1, TCL_DD_SHORTEST, + &exponent, &signum, &end); + } else { /* - * Handle infinities. + * There are at least two possible interpretations for tcl_precision. + * + * The first is, "choose the decimal representation having + * $tcl_precision digits of significance that is nearest to the given + * number, breaking ties by rounding to even, and then trimming + * trailing zeros." This gives the greatest possible precision in the + * decimal string, but offers the anomaly that [expr 0.1] will be + * "0.10000000000000001". + * + * The second is "choose the decimal representation having at most + * $tcl_precision digits of significance that is nearest to the given + * number. If no such representation converts exactly to the given + * number, choose the one that is closest, breaking ties by rounding + * to even. If more than one such representation converts exactly to + * the given number, choose the shortest, breaking ties in favour of + * the nearest, breaking remaining ties in favour of the one ending in + * an even digit." + * + * Tcl 8.4 implements the first of these, which gives rise to + * anomalies in formatting: + * + * % expr 0.1 + * 0.10000000000000001 + * % expr 0.01 + * 0.01 + * % expr 1e-7 + * 9.9999999999999995e-08 + * + * For human readability, it appears better to choose the second rule, + * and let [expr 0.1] return 0.1. But for 8.4 compatibility, we prefer + * the first (the recommended zero value for tcl_precision avoids the + * problem entirely). + * + * Uncomment TCL_DD_SHORTEN_FLAG in the next call to prefer the method + * that allows floating point values to be shortened if it can be done + * without loss of precision. */ - if (TclIsInfinite(value)) { - if (value < 0) { - strcpy(dst, "-Inf"); - } else { - strcpy(dst, "Inf"); + digits = TclDoubleDigits(value, *precisionPtr, + TCL_DD_E_FORMAT /* | TCL_DD_SHORTEN_FLAG */, + &exponent, &signum, &end); + } + if (signum) { + *dst++ = '-'; + } + p = digits; + if (exponent < -4 || exponent > 16) { + /* + * E format for numbers < 1e-3 or >= 1e17. + */ + + *dst++ = *p++; + c = *p; + if (c != '\0') { + *dst++ = '.'; + while (c != '\0') { + *dst++ = c; + c = *++p; } - return; } /* - * Ordinary (normal and denormal) values. + * Tcl 8.4 appears to format with at least a two-digit exponent; + * preserve that behaviour when tcl_precision != 0 */ - exponent = TclDoubleDigits(buffer, value, &signum); - if (signum) { - *dst++ = '-'; - } - p = buffer; - if (exponent < -3 || exponent > 17) { - /* - * E format for numbers < 1e-3 or >= 1e17. - */ - - *dst++ = *p++; - c = *p; - if (c != '\0') { - *dst++ = '.'; - while (c != '\0') { - *dst++ = c; - c = *++p; - } - } - sprintf(dst, "e%+d", exponent-1); + if (*precisionPtr == 0) { + sprintf(dst, "e%+d", exponent); } else { - /* - * F format for others. - */ - - if (exponent <= 0) { - *dst++ = '0'; - } - c = *p; - while (exponent-- > 0) { - if (c != '\0') { - *dst++ = c; - c = *++p; - } else { - *dst++ = '0'; - } - } - *dst++ = '.'; - if (c == '\0') { - *dst++ = '0'; - } else { - while (++exponent < 0) { - *dst++ = '0'; - } - while (c != '\0') { - *dst++ = c; - c = *++p; - } - } - *dst++ = '\0'; + sprintf(dst, "e%+03d", exponent); } } else { /* - * tcl_precision is supplied, pass it to the native sprintf. + * F format for others. */ - - sprintf(dst, "%.*g", *precisionPtr, value); - - /* - * If the ASCII result looks like an integer, add ".0" so that it - * doesn't look like an integer anymore. This prevents floating-point - * values from being converted to integers unintentionally. Check for - * ASCII specifically to speed up the function. - */ - - for (p = dst; *p != 0;) { - if (UCHAR(*p) < 0x80) { - c = *p++; + + if (exponent < 0) { + *dst++ = '0'; + } + c = *p; + while (exponent-- >= 0) { + if (c != '\0') { + *dst++ = c; + c = *++p; } else { - p += Tcl_UtfToUniChar(p, &ch); - c = UCHAR(ch); + *dst++ = '0'; + } + } + *dst++ = '.'; + if (c == '\0') { + *dst++ = '0'; + } else { + while (++exponent < -1) { + *dst++ = '0'; } - if ((c == '.') || isalpha(UCHAR(c))) { /* INTL: ISO only. */ - return; + while (c != '\0') { + *dst++ = c; + c = *++p; } } - p[0] = '.'; - p[1] = '0'; - p[2] = 0; + *dst++ = '\0'; } + ckfree(digits); } /* @@ -2508,6 +3335,7 @@ TclNeedSpace( * NOTE: Remove this if other Unicode spaces ever get accepted as * list-element separators. */ + return 1; } switch (*end) { @@ -2527,6 +3355,94 @@ TclNeedSpace( /* *---------------------------------------------------------------------- * + * TclFormatInt -- + * + * This procedure formats an integer into a sequence of decimal digit + * characters in a buffer. If the integer is negative, a minus sign is + * inserted at the start of the buffer. A null character is inserted at + * the end of the formatted characters. It is the caller's responsibility + * to ensure that enough storage is available. This procedure has the + * effect of sprintf(buffer, "%ld", n) but is faster as proven in + * benchmarks. This is key to UpdateStringOfInt, which is a common path + * for a lot of code (e.g. int-indexed arrays). + * + * Results: + * An integer representing the number of characters formatted, not + * including the terminating \0. + * + * Side effects: + * The formatted characters are written into the storage pointer to by + * the "buffer" argument. + * + *---------------------------------------------------------------------- + */ + +int +TclFormatInt( + char *buffer, /* Points to the storage into which the + * formatted characters are written. */ + long n) /* The integer to format. */ +{ + long intVal; + int i; + int numFormatted, j; + const char *digits = "0123456789"; + + /* + * Check first whether "n" is zero. + */ + + if (n == 0) { + buffer[0] = '0'; + buffer[1] = 0; + return 1; + } + + /* + * Check whether "n" is the maximum negative value. This is -2^(m-1) for + * an m-bit word, and has no positive equivalent; negating it produces the + * same value. + */ + + intVal = -n; /* [Bug 3390638] Workaround for*/ + if (n == -n || intVal == n) { /* broken compiler optimizers. */ + return sprintf(buffer, "%ld", n); + } + + /* + * Generate the characters of the result backwards in the buffer. + */ + + intVal = (n < 0? -n : n); + i = 0; + buffer[0] = '\0'; + do { + i++; + buffer[i] = digits[intVal % 10]; + intVal = intVal/10; + } while (intVal > 0); + if (n < 0) { + i++; + buffer[i] = '-'; + } + numFormatted = i; + + /* + * Now reverse the characters. + */ + + for (j = 0; j < i; j++, i--) { + char tmp = buffer[i]; + + buffer[i] = buffer[j]; + buffer[j] = tmp; + } + return numFormatted; +} + +/* + *---------------------------------------------------------------------- + * * TclGetIntForIndex -- * * This function returns an integer corresponding to the list index held @@ -2584,7 +3500,7 @@ TclGetIntForIndex( * Leading whitespace is acceptable in an index. */ - while (length && isspace(UCHAR(*bytes))) { /* INTL: ISO space. */ + while (length && TclIsSpaceProc(*bytes)) { bytes++; length--; } @@ -2597,7 +3513,7 @@ TclGetIntForIndex( if ((savedOp != '+') && (savedOp != '-')) { goto parseError; } - if (isspace(UCHAR(opPtr[1]))) { + if (TclIsSpaceProc(opPtr[1])) { goto parseError; } *opPtr = '\0'; @@ -2623,16 +3539,10 @@ TclGetIntForIndex( parseError: if (interp != NULL) { - /* - * The result might not be empty; this resets it which should be both - * a cheap operation, and of little problem because this is an - * error-generation path anyway. - */ - bytes = Tcl_GetString(objPtr); - Tcl_ResetResult(interp); - Tcl_AppendResult(interp, "bad index \"", bytes, - "\": must be integer?[+-]integer? or end?[+-]integer?", NULL); + Tcl_SetObjResult(interp, Tcl_ObjPrintf( + "bad index \"%s\": must be integer?[+-]integer? or" + " end?[+-]integer?", bytes)); if (!strncmp(bytes, "end-", 4)) { bytes += 4; } @@ -2667,10 +3577,10 @@ static void UpdateStringOfEndOffset( register Tcl_Obj *objPtr) { - char buffer[TCL_INTEGER_SPACE + sizeof("end") + 1]; + char buffer[TCL_INTEGER_SPACE + 5]; register int len; - strcpy(buffer, "end"); + memcpy(buffer, "end", 4); len = sizeof("end") - 1; if (objPtr->internalRep.longValue != 0) { buffer[len++] = '-'; @@ -2724,9 +3634,8 @@ SetEndOffsetFromAny( if ((*bytes != 'e') || (strncmp(bytes, "end", (size_t)((length > 3) ? 3 : length)) != 0)) { if (interp != NULL) { - Tcl_ResetResult(interp); - Tcl_AppendResult(interp, "bad index \"", bytes, - "\": must be end?[+-]integer?", NULL); + Tcl_SetObjResult(interp, Tcl_ObjPrintf( + "bad index \"%s\": must be end?[+-]integer?", bytes)); Tcl_SetErrorCode(interp, "TCL", "VALUE", "INDEX", NULL); } return TCL_ERROR; @@ -2744,8 +3653,8 @@ SetEndOffsetFromAny( * after "end-" to Tcl_GetInt, then reverse for offset. */ - if (isspace(UCHAR(bytes[4]))) { - return TCL_ERROR; + if (TclIsSpaceProc(bytes[4])) { + goto badIndexFormat; } if (Tcl_GetInt(interp, bytes+4, &offset) != TCL_OK) { return TCL_ERROR; @@ -2758,10 +3667,10 @@ SetEndOffsetFromAny( * Conversion failed. Report the error. */ + badIndexFormat: if (interp != NULL) { - Tcl_ResetResult(interp); - Tcl_AppendResult(interp, "bad index \"", bytes, - "\": must be end?[+-]integer?", NULL); + Tcl_SetObjResult(interp, Tcl_ObjPrintf( + "bad index \"%s\": must be end?[+-]integer?", bytes)); Tcl_SetErrorCode(interp, "TCL", "VALUE", "INDEX", NULL); } return TCL_ERROR; @@ -2810,7 +3719,7 @@ TclCheckBadOctal( * zero. Try to generate a meaningful error message. */ - while (isspace(UCHAR(*p))) { /* INTL: ISO space. */ + while (TclIsSpaceProc(*p)) { p++; } if (*p == '+' || *p == '-') { @@ -2823,7 +3732,7 @@ TclCheckBadOctal( while (isdigit(UCHAR(*p))) { /* INTL: digit. */ p++; } - while (isspace(UCHAR(*p))) { /* INTL: ISO space. */ + while (TclIsSpaceProc(*p)) { p++; } if (*p == '\0') { @@ -2837,8 +3746,8 @@ TclCheckBadOctal( * be added to an existing error message as extra info. */ - Tcl_AppendResult(interp, " (looks like invalid octal number)", - NULL); + Tcl_AppendToObj(Tcl_GetObjResult(interp), + " (looks like invalid octal number)", -1); } return 1; } @@ -2865,7 +3774,8 @@ ClearHash( for (hPtr = Tcl_FirstHashEntry(tablePtr, &search); hPtr != NULL; hPtr = Tcl_NextHashEntry(&search)) { - Tcl_Obj *objPtr = (Tcl_Obj *) Tcl_GetHashValue(hPtr); + Tcl_Obj *objPtr = Tcl_GetHashValue(hPtr); + Tcl_DecrRefCount(objPtr); Tcl_DeleteHashEntry(hPtr); } @@ -2893,12 +3803,12 @@ static Tcl_HashTable * GetThreadHash( Tcl_ThreadDataKey *keyPtr) { - Tcl_HashTable **tablePtrPtr = (Tcl_HashTable **) - Tcl_GetThreadData(keyPtr, (int) sizeof(Tcl_HashTable *)); + Tcl_HashTable **tablePtrPtr = + Tcl_GetThreadData(keyPtr, sizeof(Tcl_HashTable *)); if (NULL == *tablePtrPtr) { - *tablePtrPtr = (Tcl_HashTable *)ckalloc(sizeof(Tcl_HashTable)); - Tcl_CreateThreadExitHandler(FreeThreadHash, (ClientData)*tablePtrPtr); + *tablePtrPtr = ckalloc(sizeof(Tcl_HashTable)); + Tcl_CreateThreadExitHandler(FreeThreadHash, *tablePtrPtr); Tcl_InitHashTable(*tablePtrPtr, TCL_ONE_WORD_KEYS); } return *tablePtrPtr; @@ -2922,11 +3832,11 @@ static void FreeThreadHash( ClientData clientData) { - Tcl_HashTable *tablePtr = (Tcl_HashTable *) clientData; + Tcl_HashTable *tablePtr = clientData; ClearHash(tablePtr); Tcl_DeleteHashTable(tablePtr); - ckfree((char *) tablePtr); + ckfree(tablePtr); } /* @@ -2944,7 +3854,7 @@ static void FreeProcessGlobalValue( ClientData clientData) { - ProcessGlobalValue *pgvPtr = (ProcessGlobalValue *) clientData; + ProcessGlobalValue *pgvPtr = clientData; pgvPtr->epoch++; pgvPtr->numBytes = 0; @@ -2989,10 +3899,10 @@ TclSetProcessGlobalValue( if (NULL != pgvPtr->value) { ckfree(pgvPtr->value); } else { - Tcl_CreateExitHandler(FreeProcessGlobalValue, (ClientData) pgvPtr); + Tcl_CreateExitHandler(FreeProcessGlobalValue, pgvPtr); } bytes = Tcl_GetStringFromObj(newValue, &pgvPtr->numBytes); - pgvPtr->value = ckalloc((unsigned) pgvPtr->numBytes + 1); + pgvPtr->value = ckalloc(pgvPtr->numBytes + 1); memcpy(pgvPtr->value, bytes, (unsigned) pgvPtr->numBytes + 1); if (pgvPtr->encoding) { Tcl_FreeEncoding(pgvPtr->encoding); @@ -3008,8 +3918,7 @@ TclSetProcessGlobalValue( Tcl_IncrRefCount(newValue); cacheMap = GetThreadHash(&pgvPtr->key); ClearHash(cacheMap); - hPtr = Tcl_CreateHashEntry(cacheMap, - INT2PTR(pgvPtr->epoch), &dummy); + hPtr = Tcl_CreateHashEntry(cacheMap, INT2PTR(pgvPtr->epoch), &dummy); Tcl_SetHashValue(hPtr, newValue); Tcl_MutexUnlock(&pgvPtr->mutex); } @@ -3058,8 +3967,7 @@ TclGetProcessGlobalValue( Tcl_DStringLength(&native), &newValue); Tcl_DStringFree(&native); ckfree(pgvPtr->value); - pgvPtr->value = ckalloc((unsigned) - Tcl_DStringLength(&newValue) + 1); + pgvPtr->value = ckalloc(Tcl_DStringLength(&newValue) + 1); memcpy(pgvPtr->value, Tcl_DStringValue(&newValue), (size_t) Tcl_DStringLength(&newValue) + 1); Tcl_DStringFree(&newValue); @@ -3273,9 +4181,9 @@ TclReToGlob( Tcl_DString *dsPtr, int *exactPtr) { - int anchorLeft, anchorRight, lastIsStar; + int anchorLeft, anchorRight, lastIsStar, numStars; char *dsStr, *dsStrStart; - const char *msg, *p, *strEnd; + const char *msg, *p, *strEnd, *code; strEnd = reStr + reStrLen; Tcl_DStringInit(dsPtr); @@ -3286,9 +4194,10 @@ TclReToGlob( if ((reStrLen >= 4) && (memcmp("***=", reStr, 4) == 0)) { /* - * At most, the glob pattern has length 2*reStrLen + 2 to - * backslash escape every character and have * at each end. + * At most, the glob pattern has length 2*reStrLen + 2 to backslash + * escape every character and have * at each end. */ + Tcl_DStringSetLength(dsPtr, reStrLen + 2); dsStr = dsStrStart = Tcl_DStringValue(dsPtr); *dsStr++ = '*'; @@ -3312,8 +4221,8 @@ TclReToGlob( } /* - * At most, the glob pattern has length reStrLen + 2 to account - * for possible * at each end. + * At most, the glob pattern has length reStrLen + 2 to account for + * possible * at each end. */ Tcl_DStringSetLength(dsPtr, reStrLen + 2); @@ -3323,15 +4232,16 @@ TclReToGlob( * Check for anchored REs (ie ^foo$), so we can use string equal if * possible. Do not alter the start of str so we can free it correctly. * - * Keep track of the last char being an unescaped star to prevent - * multiple instances. Simpler than checking that the last star - * may be escaped. + * Keep track of the last char being an unescaped star to prevent multiple + * instances. Simpler than checking that the last star may be escaped. */ msg = NULL; + code = NULL; p = reStr; anchorRight = 0; lastIsStar = 0; + numStars = 0; if (*p == '^') { anchorLeft = 1; @@ -3384,6 +4294,7 @@ TclReToGlob( break; default: msg = "invalid escape sequence"; + code = "BADESCAPE"; goto invalidGlob; } break; @@ -3395,6 +4306,7 @@ TclReToGlob( if (!lastIsStar) { *dsStr++ = '*'; lastIsStar = 1; + numStars++; } continue; } else if (p[1] == '+') { @@ -3402,6 +4314,7 @@ TclReToGlob( *dsStr++ = '?'; *dsStr++ = '*'; lastIsStar = 1; + numStars++; continue; } } @@ -3410,6 +4323,7 @@ TclReToGlob( case '$': if (p+1 != strEnd) { msg = "$ not anchor"; + code = "NONANCHOR"; goto invalidGlob; } anchorRight = 1; @@ -3417,14 +4331,25 @@ TclReToGlob( case '*': case '+': case '?': case '|': case '^': case '{': case '}': case '(': case ')': case '[': case ']': msg = "unhandled RE special char"; + code = "UNHANDLED"; goto invalidGlob; - break; default: *dsStr++ = *p; break; } lastIsStar = 0; } + if (numStars > 1) { + /* + * Heuristic: if >1 non-anchoring *, the risk is large that glob + * matching is slower than the RE engine, so report invalid. + */ + + msg = "excessive recursive glob backtrack potential"; + code = "OVERCOMPLEX"; + goto invalidGlob; + } + if (!anchorRight && !lastIsStar) { *dsStr++ = '*'; } @@ -3434,22 +4359,12 @@ TclReToGlob( *exactPtr = (anchorLeft && anchorRight); } -#if 0 - fprintf(stderr, "INPUT RE '%.*s' OUTPUT GLOB '%s' anchor %d:%d \n", - reStrLen, reStr, - Tcl_DStringValue(dsPtr), anchorLeft, anchorRight); - fflush(stderr); -#endif return TCL_OK; invalidGlob: -#if 0 - fprintf(stderr, "INPUT RE '%.*s' NO OUTPUT GLOB %s (%c)\n", - reStrLen, reStr, msg, *p); - fflush(stderr); -#endif if (interp != NULL) { - Tcl_AppendResult(interp, msg, NULL); + Tcl_SetObjResult(interp, Tcl_NewStringObj(msg, -1)); + Tcl_SetErrorCode(interp, "TCL", "RE2GLOB", code, NULL); } Tcl_DStringFree(dsPtr); return TCL_ERROR; |