summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordgp <dgp@users.sourceforge.net>2014-12-05 12:28:01 (GMT)
committerdgp <dgp@users.sourceforge.net>2014-12-05 12:28:01 (GMT)
commitbc8ffb392220e949c4aadbe13d7b97912a7ea7ab (patch)
tree94c5a2756c009d91338f8ee894ae0122dd4f35e9
parentbff00baca2d262fe129758037163387e65b99b46 (diff)
parentbfb321eb38ad0ef45285f0637c865cf701949b90 (diff)
downloadtcl-bc8ffb392220e949c4aadbe13d7b97912a7ea7ab.zip
tcl-bc8ffb392220e949c4aadbe13d7b97912a7ea7ab.tar.gz
tcl-bc8ffb392220e949c4aadbe13d7b97912a7ea7ab.tar.bz2
[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow that constraint. This requires not trusting isalnum(.) and isalpha(.) to deliver portable identical results.
-rw-r--r--generic/tclCompExpr.c51
-rw-r--r--generic/tclInt.h1
-rw-r--r--generic/tclParse.c63
-rw-r--r--tests/parse.test3
-rw-r--r--tests/parseExpr.test9
5 files changed, 83 insertions, 44 deletions
diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c
index 94c1bd6..448f99c 100644
--- a/generic/tclCompExpr.c
+++ b/generic/tclCompExpr.c
@@ -1967,7 +1967,7 @@ ParseLexeme(
case 'i':
if ((numBytes > 1) && (start[1] == 'n')
- && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
+ && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
/*
* Must make this check so we can tell the difference between the
* "in" operator and the "int" function name and the "infinity"
@@ -1981,14 +1981,15 @@ ParseLexeme(
case 'e':
if ((numBytes > 1) && (start[1] == 'q')
- && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
+ && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
*lexemePtr = STREQ;
return 2;
}
break;
case 'n':
- if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
+ if ((numBytes > 1)
+ && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
switch (start[1]) {
case 'e':
*lexemePtr = STRNEQ;
@@ -2003,8 +2004,7 @@ ParseLexeme(
literal = Tcl_NewObj();
if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end,
TCL_PARSE_NO_WHITESPACE) == TCL_OK) {
- if (end < start + numBytes && !isalnum(UCHAR(*end))
- && UCHAR(*end) != '_') {
+ if (end < start + numBytes && !TclIsBareword(*end)) {
number:
TclInitStringRep(literal, start, end-start);
@@ -2029,7 +2029,7 @@ ParseLexeme(
const char *p = start;
while (p < end) {
- if (!isalnum(UCHAR(*p++))) {
+ if (!TclIsBareword(*p++)) {
/*
* The number has non-bareword characters, so we
* must treat it as a number.
@@ -2054,33 +2054,30 @@ ParseLexeme(
}
}
- if (Tcl_UtfCharComplete(start, numBytes)) {
- scanned = Tcl_UtfToUniChar(start, &ch);
- } else {
- char utfBytes[TCL_UTF_MAX];
-
- memcpy(utfBytes, start, (size_t) numBytes);
- utfBytes[numBytes] = '\0';
- scanned = Tcl_UtfToUniChar(utfBytes, &ch);
- }
- if (!isalnum(UCHAR(ch))) {
- *lexemePtr = INVALID;
- Tcl_DecrRefCount(literal);
- return scanned;
- }
- end = start;
- while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) {
- end += scanned;
- numBytes -= scanned;
- if (Tcl_UtfCharComplete(end, numBytes)) {
- scanned = Tcl_UtfToUniChar(end, &ch);
+ /*
+ * We reject leading underscores in bareword. No sensible reason why.
+ * Might be inspired by reserved identifier rules in C, which of course
+ * have no direct relevance here.
+ */
+
+ if (!TclIsBareword(*start) || *start == '_') {
+ if (Tcl_UtfCharComplete(start, numBytes)) {
+ scanned = Tcl_UtfToUniChar(start, &ch);
} else {
char utfBytes[TCL_UTF_MAX];
- memcpy(utfBytes, end, (size_t) numBytes);
+ memcpy(utfBytes, start, (size_t) numBytes);
utfBytes[numBytes] = '\0';
scanned = Tcl_UtfToUniChar(utfBytes, &ch);
}
+ *lexemePtr = INVALID;
+ Tcl_DecrRefCount(literal);
+ return scanned;
+ }
+ end = start;
+ while (numBytes && TclIsBareword(*end)) {
+ end += 1;
+ numBytes -= 1;
}
*lexemePtr = BAREWORD;
if (literalPtr) {
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 860c2a3..c989eda 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -2987,6 +2987,7 @@ MODULE_SCOPE void TclInitSubsystems(void);
MODULE_SCOPE int TclInterpReady(Tcl_Interp *interp);
MODULE_SCOPE int TclIsLocalScalar(const char *src, int len);
MODULE_SCOPE int TclIsSpaceProc(char byte);
+MODULE_SCOPE int TclIsBareword(char byte);
MODULE_SCOPE Tcl_Obj * TclJoinPath(int elements, Tcl_Obj * const objv[]);
MODULE_SCOPE int TclJoinThread(Tcl_ThreadId id, int *result);
MODULE_SCOPE void TclLimitRemoveAllHandlers(Tcl_Interp *interp);
diff --git a/generic/tclParse.c b/generic/tclParse.c
index ee0d4c4..ca12be5 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -621,6 +621,47 @@ TclIsSpaceProc(
/*
*----------------------------------------------------------------------
*
+ * TclIsBareword--
+ *
+ * Report whether byte is one that can be part of a "bareword".
+ * This concept is named in expression parsing, where it determines
+ * what can be a legal function name, but is the same definition used
+ * in determining what variable names can be parsed as variable
+ * substitutions without the benefit of enclosing braces. The set of
+ * ASCII chars that are accepted are the numeric chars ('0'-'9'),
+ * the alphabetic chars ('a'-'z', 'A'-'Z') and underscore ('_').
+ *
+ * Results:
+ * Returns 1, if byte is in the accepted set of chars, 0 otherwise.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclIsBareword(
+ char byte)
+{
+ if (byte < '0' || byte > 'z') {
+ return 0;
+ }
+ if (byte <= '9' || byte >= 'a') {
+ return 1;
+ }
+ if (byte == '_') {
+ return 1;
+ }
+ if (byte < 'A' || byte > 'Z') {
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* ParseWhiteSpace --
*
* Scans up to numBytes bytes starting at src, consuming white space
@@ -1346,9 +1387,7 @@ Tcl_ParseVarName(
{
Tcl_Token *tokenPtr;
register const char *src;
- unsigned char c;
- int varIndex, offset;
- Tcl_UniChar ch;
+ int varIndex;
unsigned array;
if ((numBytes == 0) || (start == NULL)) {
@@ -1431,22 +1470,12 @@ Tcl_ParseVarName(
tokenPtr->numComponents = 0;
while (numBytes) {
- if (Tcl_UtfCharComplete(src, numBytes)) {
- offset = Tcl_UtfToUniChar(src, &ch);
- } else {
- char utfBytes[TCL_UTF_MAX];
-
- memcpy(utfBytes, src, (size_t) numBytes);
- utfBytes[numBytes] = '\0';
- offset = Tcl_UtfToUniChar(utfBytes, &ch);
- }
- c = UCHAR(ch);
- if (isalnum(c) || (c == '_')) { /* INTL: ISO only, UCHAR. */
- src += offset;
- numBytes -= offset;
+ if (TclIsBareword(*src)) {
+ src += 1;
+ numBytes -= 1;
continue;
}
- if ((c == ':') && (numBytes != 1) && (src[1] == ':')) {
+ if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) {
src += 2;
numBytes -= 2;
while (numBytes && (*src == ':')) {
diff --git a/tests/parse.test b/tests/parse.test
index 5d8afeb..d73c725 100644
--- a/tests/parse.test
+++ b/tests/parse.test
@@ -663,6 +663,9 @@ test parse-12.24 {Tcl_ParseVarName procedure, missing close paren in array refer
test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser {
testparser {$x(a$y(b$z))} 0
} {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}}
+test parse-12.26 {Tcl_ParseVarName [d2ffcca163] non-ascii} testparser {
+ testparser "$\u0433" -1
+} "- {$\u0433} 1 word {$\u0433} 2 text {$} 0 text \u0433 0 {}"
test parse-13.1 {Tcl_ParseVar procedure} testparsevar {
set abc 24
diff --git a/tests/parseExpr.test b/tests/parseExpr.test
index 5c7986a..ef05454 100644
--- a/tests/parseExpr.test
+++ b/tests/parseExpr.test
@@ -1064,6 +1064,15 @@ test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body {
dict get $o -errorcode
} -result {TCL PARSE EXPR BADNUMBER BINARY}
+test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body {
+ testexprparser \u0433 -1
+} -returnCodes error -match glob -result {*invalid character*}
+test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body {
+ testexprparser \u043f -1
+} -returnCodes error -match glob -result {*invalid character*}
+test parseExpr-22.21 {Bug d2ffcca163} -constraints testexprparser -body {
+ testexprparser in\u0433(0) -1
+} -returnCodes error -match glob -result {missing operand*}
# cleanup
cleanupTests