From 61855b36b423d2293d99ccb6381e6622980e2cd9 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 25 Nov 2014 21:24:06 +0000 Subject: One way to fix the parser of $-substitution accepting non-ASCII varnames. --- generic/tclParse.c | 3 +++ tests/parse.test | 3 +++ 2 files changed, 6 insertions(+) diff --git a/generic/tclParse.c b/generic/tclParse.c index e475fb8..1523eb3 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -1438,6 +1438,9 @@ Tcl_ParseVarName( offset = Tcl_UtfToUniChar(utfBytes, &ch); } c = UCHAR(ch); + if (c != ch) { + break; + } if (isalnum(c) || (c == '_')) { /* INTL: ISO only, UCHAR. */ src += offset; numBytes -= offset; diff --git a/tests/parse.test b/tests/parse.test index d7de5ff..cd02386 100644 --- a/tests/parse.test +++ b/tests/parse.test @@ -656,6 +656,9 @@ test parse-12.24 {Tcl_ParseVarName procedure, missing close paren in array refer test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser { testparser {$x(a$y(b$z))} 0 } {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}} +test parse-12.26 {Tcl_ParseVarName [d2ffcca163] non-ascii} testparser { + testparser "$\u0433" -1 +} "- {$\u0433} 1 word {$\u0433} 2 text {$} 0 text \u0433 0 {}" test parse-13.1 {Tcl_ParseVar procedure} testparsevar { set abc 24 -- cgit v0.12 From 63678d528b5d4384d8a6fd8941b73888399227e5 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 26 Nov 2014 16:22:40 +0000 Subject: I like this patch better. Retain the byte orientation of the parser. --- generic/tclParse.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/generic/tclParse.c b/generic/tclParse.c index 1523eb3..90ec43d 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -1344,8 +1344,7 @@ Tcl_ParseVarName( Tcl_Token *tokenPtr; register const char *src; unsigned char c; - int varIndex, offset; - Tcl_UniChar ch; + int varIndex; unsigned array; if ((numBytes == 0) || (start == NULL)) { @@ -1428,22 +1427,10 @@ Tcl_ParseVarName( tokenPtr->numComponents = 0; while (numBytes) { - if (Tcl_UtfCharComplete(src, numBytes)) { - offset = Tcl_UtfToUniChar(src, &ch); - } else { - char utfBytes[TCL_UTF_MAX]; - - memcpy(utfBytes, src, (size_t) numBytes); - utfBytes[numBytes] = '\0'; - offset = Tcl_UtfToUniChar(utfBytes, &ch); - } - c = UCHAR(ch); - if (c != ch) { - break; - } + c = UCHAR(*src); if (isalnum(c) || (c == '_')) { /* INTL: ISO only, UCHAR. */ - src += offset; - numBytes -= offset; + src += 1; + numBytes -= 1; continue; } if ((c == ':') && (numBytes != 1) && (src[1] == ':')) { -- cgit v0.12 From f12dc54f7578abc9e2d5f625157263ec5a0bc40e Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 26 Nov 2014 17:00:55 +0000 Subject: Same issue in expr parser also tested and fixed. --- generic/tclCompExpr.c | 32 ++++++++++++-------------------- tests/parseExpr.test | 6 ++++++ 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 9142e2b..dde4e56 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1969,31 +1969,23 @@ ParseLexeme( } } - if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = Tcl_UtfToUniChar(start, &ch); - } else { - char utfBytes[TCL_UTF_MAX]; - memcpy(utfBytes, start, (size_t) numBytes); - utfBytes[numBytes] = '\0'; - scanned = Tcl_UtfToUniChar(utfBytes, &ch); - } - if (!isalnum(UCHAR(ch))) { - *lexemePtr = INVALID; - Tcl_DecrRefCount(literal); - return scanned; - } - end = start; - while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) { - end += scanned; - numBytes -= scanned; - if (Tcl_UtfCharComplete(end, numBytes)) { - scanned = Tcl_UtfToUniChar(end, &ch); + if (!isalnum(UCHAR(*start))) { + if (Tcl_UtfCharComplete(start, numBytes)) { + scanned = Tcl_UtfToUniChar(start, &ch); } else { char utfBytes[TCL_UTF_MAX]; - memcpy(utfBytes, end, (size_t) numBytes); + memcpy(utfBytes, start, (size_t) numBytes); utfBytes[numBytes] = '\0'; scanned = Tcl_UtfToUniChar(utfBytes, &ch); } + *lexemePtr = INVALID; + Tcl_DecrRefCount(literal); + return scanned; + } + end = start; + while (numBytes && (isalnum(UCHAR(*end)) || (UCHAR(*end) == '_'))) { + end += 1; + numBytes -= 1; } *lexemePtr = BAREWORD; if (literalPtr) { diff --git a/tests/parseExpr.test b/tests/parseExpr.test index c1c489b..3e0df29 100644 --- a/tests/parseExpr.test +++ b/tests/parseExpr.test @@ -1051,6 +1051,12 @@ test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body { testexprparser 0b02 -1 } -returnCodes error -match glob -result {*invalid binary number*} +test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body { + testexprparser \u0433 -1 +} -returnCodes error -match glob -result {*invalid character*} +test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body { + testexprparser \u043f -1 +} -returnCodes error -match glob -result {*invalid character*} # cleanup cleanupTests -- cgit v0.12 From f9fb8c8daf918e57d3a50a0428623a5e5c260e70 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 4 Dec 2014 18:26:31 +0000 Subject: Stop using isalnum(.). Its results are not portable. Replace with our own private routine TclIsBareword() that does exactly what we want. --- generic/tclCompExpr.c | 15 ++++++++++----- generic/tclInt.h | 1 + generic/tclParse.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index dde4e56..2470931 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1920,8 +1920,7 @@ ParseLexeme( literal = Tcl_NewObj(); if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end, TCL_PARSE_NO_WHITESPACE) == TCL_OK) { - if (end < start + numBytes && !isalnum(UCHAR(*end)) - && UCHAR(*end) != '_') { + if (end < start + numBytes && !TclIsBareword(*end)) { number: TclInitStringRep(literal, start, end-start); @@ -1945,7 +1944,7 @@ ParseLexeme( if (literal->typePtr == &tclDoubleType) { const char *p = start; while (p < end) { - if (!isalnum(UCHAR(*p++))) { + if (!TclIsBareword(*p++)) { /* * The number has non-bareword characters, so we * must treat it as a number. @@ -1969,7 +1968,13 @@ ParseLexeme( } } - if (!isalnum(UCHAR(*start))) { + /* + * We reject leading underscores in bareword. No sensible reason why. + * Might be inspired by reserved identifier rules in C, which of course + * have no direct relevance here. + */ + + if (!TclIsBareword(*start) || *start == '_') { if (Tcl_UtfCharComplete(start, numBytes)) { scanned = Tcl_UtfToUniChar(start, &ch); } else { @@ -1983,7 +1988,7 @@ ParseLexeme( return scanned; } end = start; - while (numBytes && (isalnum(UCHAR(*end)) || (UCHAR(*end) == '_'))) { + while (numBytes && TclIsBareword(*end)) { end += 1; numBytes -= 1; } diff --git a/generic/tclInt.h b/generic/tclInt.h index dd66d76..255ee23 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2608,6 +2608,7 @@ MODULE_SCOPE void TclInitSubsystems(void); MODULE_SCOPE int TclInterpReady(Tcl_Interp *interp); MODULE_SCOPE int TclIsLocalScalar(const char *src, int len); MODULE_SCOPE int TclIsSpaceProc(char byte); +MODULE_SCOPE int TclIsBareword(char byte); MODULE_SCOPE int TclJoinThread(Tcl_ThreadId id, int *result); MODULE_SCOPE void TclLimitRemoveAllHandlers(Tcl_Interp *interp); MODULE_SCOPE Tcl_Obj * TclLindexList(Tcl_Interp *interp, diff --git a/generic/tclParse.c b/generic/tclParse.c index 90ec43d..025304c 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -628,6 +628,47 @@ TclIsSpaceProc( /* *---------------------------------------------------------------------- * + * TclIsBareword-- + * + * Report whether byte is one that can be part of a "bareword". + * This concept is named in expression parsing, where it determines + * what can be a legal function name, but is the same definition used + * in determining what variable names can be parsed as variable + * substitutions without the benefit of enclosing braces. The set of + * ASCII chars that are accepted are the numeric chars ('0'-'9'), + * the alphabetic chars ('a'-'z', 'A'-'Z') and underscore ('_'). + * + * Results: + * Returns 1, if byte is in the accepted set of chars, 0 otherwise. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclIsBareword( + char byte) +{ + if (byte < '0' || byte > 'z') { + return 0; + } + if (byte <= '9' || byte >= 'a') { + return 1; + } + if (byte == '_') { + return 1; + } + if (byte < 'A' || byte > 'Z') { + return 0; + } + return 1; +} + +/* + *---------------------------------------------------------------------- + * * ParseWhiteSpace -- * * Scans up to numBytes bytes starting at src, consuming white space @@ -1343,7 +1384,6 @@ Tcl_ParseVarName( { Tcl_Token *tokenPtr; register const char *src; - unsigned char c; int varIndex; unsigned array; @@ -1427,13 +1467,12 @@ Tcl_ParseVarName( tokenPtr->numComponents = 0; while (numBytes) { - c = UCHAR(*src); - if (isalnum(c) || (c == '_')) { /* INTL: ISO only, UCHAR. */ + if (TclIsBareword(*src)) { src += 1; numBytes -= 1; continue; } - if ((c == ':') && (numBytes != 1) && (src[1] == ':')) { + if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) { src += 2; numBytes -= 2; while (numBytes && (*src == ':')) { -- cgit v0.12 From c4c311b0e2ca10bc95a35e3f60d9a939b55c26e4 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 4 Dec 2014 20:45:11 +0000 Subject: The isalpha(.) calls remaining in the expr parser still bring nonportability. Commit a test that demonstrates that. --- tests/parseExpr.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/parseExpr.test b/tests/parseExpr.test index 3e0df29..c3b0d71 100644 --- a/tests/parseExpr.test +++ b/tests/parseExpr.test @@ -1057,6 +1057,9 @@ test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body { test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body { testexprparser \u043f -1 } -returnCodes error -match glob -result {*invalid character*} +test parseExpr-22.21 {Bug d2ffcca163} -constraints testexprparser -body { + testexprparser in\u0433(0) -1 +} -returnCodes error -match glob -result {missing operand*} # cleanup cleanupTests -- cgit v0.12 From 640e5762bc7e0aa585f1135dde28e2fa9434e79e Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 4 Dec 2014 21:29:18 +0000 Subject: Limit isalpha(.) calls in the expr parser to only apply to known ASCII arguments to make the results portable. --- generic/tclCompExpr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 2470931..7b67970 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1883,7 +1883,7 @@ ParseLexeme( case 'i': if ((numBytes > 1) && (start[1] == 'n') - && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { /* * Must make this check so we can tell the difference between @@ -1898,14 +1898,15 @@ ParseLexeme( case 'e': if ((numBytes > 1) && (start[1] == 'q') - && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { *lexemePtr = STREQ; return 2; } break; case 'n': - if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + if ((numBytes > 1) + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { switch (start[1]) { case 'e': *lexemePtr = STRNEQ; -- cgit v0.12