diff options
author | dgp <dgp@users.sourceforge.net> | 2014-12-04 22:10:57 (GMT) |
---|---|---|
committer | dgp <dgp@users.sourceforge.net> | 2014-12-04 22:10:57 (GMT) |
commit | 3041f3f9a1d8a242105ffe99eebae201a7079549 (patch) | |
tree | ca5b53d4a09396ccb6458f4999e411a421a36961 /generic | |
parent | 886a7575564f8e4e164cb67dff4554778650868e (diff) | |
parent | 640e5762bc7e0aa585f1135dde28e2fa9434e79e (diff) | |
download | tcl-3041f3f9a1d8a242105ffe99eebae201a7079549.zip tcl-3041f3f9a1d8a242105ffe99eebae201a7079549.tar.gz tcl-3041f3f9a1d8a242105ffe99eebae201a7079549.tar.bz2 |
[d2ffcca163] Make the parsing routines that are documented to accept only
ASCII chars actually enforce that constraint. This requires not trusting
isalnum(.) and isalpha(.) to deliver the same results on every platform.
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclCompExpr.c | 50 | ||||
-rw-r--r-- | generic/tclInt.h | 1 | ||||
-rw-r--r-- | generic/tclParse.c | 63 |
3 files changed, 71 insertions, 43 deletions
diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 9142e2b..7b67970 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1883,7 +1883,7 @@ ParseLexeme( case 'i': if ((numBytes > 1) && (start[1] == 'n') - && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { /* * Must make this check so we can tell the difference between @@ -1898,14 +1898,15 @@ ParseLexeme( case 'e': if ((numBytes > 1) && (start[1] == 'q') - && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { *lexemePtr = STREQ; return 2; } break; case 'n': - if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + if ((numBytes > 1) + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { switch (start[1]) { case 'e': *lexemePtr = STRNEQ; @@ -1920,8 +1921,7 @@ ParseLexeme( literal = Tcl_NewObj(); if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end, TCL_PARSE_NO_WHITESPACE) == TCL_OK) { - if (end < start + numBytes && !isalnum(UCHAR(*end)) - && UCHAR(*end) != '_') { + if (end < start + numBytes && !TclIsBareword(*end)) { number: TclInitStringRep(literal, start, end-start); @@ -1945,7 +1945,7 @@ ParseLexeme( if (literal->typePtr == &tclDoubleType) { const char *p = start; while (p < end) { - if (!isalnum(UCHAR(*p++))) { + if (!TclIsBareword(*p++)) { /* * The number has non-bareword characters, so we * must treat it as a number. 
@@ -1969,31 +1969,29 @@ ParseLexeme( } } - if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = Tcl_UtfToUniChar(start, &ch); - } else { - char utfBytes[TCL_UTF_MAX]; - memcpy(utfBytes, start, (size_t) numBytes); - utfBytes[numBytes] = '\0'; - scanned = Tcl_UtfToUniChar(utfBytes, &ch); - } - if (!isalnum(UCHAR(ch))) { - *lexemePtr = INVALID; - Tcl_DecrRefCount(literal); - return scanned; - } - end = start; - while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) { - end += scanned; - numBytes -= scanned; - if (Tcl_UtfCharComplete(end, numBytes)) { - scanned = Tcl_UtfToUniChar(end, &ch); + /* + * We reject leading underscores in bareword. No sensible reason why. + * Might be inspired by reserved identifier rules in C, which of course + * have no direct relevance here. + */ + + if (!TclIsBareword(*start) || *start == '_') { + if (Tcl_UtfCharComplete(start, numBytes)) { + scanned = Tcl_UtfToUniChar(start, &ch); } else { char utfBytes[TCL_UTF_MAX]; - memcpy(utfBytes, end, (size_t) numBytes); + memcpy(utfBytes, start, (size_t) numBytes); utfBytes[numBytes] = '\0'; scanned = Tcl_UtfToUniChar(utfBytes, &ch); } + *lexemePtr = INVALID; + Tcl_DecrRefCount(literal); + return scanned; + } + end = start; + while (numBytes && TclIsBareword(*end)) { + end += 1; + numBytes -= 1; } *lexemePtr = BAREWORD; if (literalPtr) { diff --git a/generic/tclInt.h b/generic/tclInt.h index dd66d76..255ee23 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2608,6 +2608,7 @@ MODULE_SCOPE void TclInitSubsystems(void); MODULE_SCOPE int TclInterpReady(Tcl_Interp *interp); MODULE_SCOPE int TclIsLocalScalar(const char *src, int len); MODULE_SCOPE int TclIsSpaceProc(char byte); +MODULE_SCOPE int TclIsBareword(char byte); MODULE_SCOPE int TclJoinThread(Tcl_ThreadId id, int *result); MODULE_SCOPE void TclLimitRemoveAllHandlers(Tcl_Interp *interp); MODULE_SCOPE Tcl_Obj * TclLindexList(Tcl_Interp *interp, diff --git a/generic/tclParse.c b/generic/tclParse.c index e475fb8..025304c 100644 --- 
a/generic/tclParse.c +++ b/generic/tclParse.c @@ -628,6 +628,47 @@ TclIsSpaceProc( /* *---------------------------------------------------------------------- * + * TclIsBareword-- + * + * Report whether byte is one that can be part of a "bareword". + * This concept is named in expression parsing, where it determines + * what can be a legal function name, but is the same definition used + * in determining what variable names can be parsed as variable + * substitutions without the benefit of enclosing braces. The set of + * ASCII chars that are accepted are the numeric chars ('0'-'9'), + * the alphabetic chars ('a'-'z', 'A'-'Z') and underscore ('_'). + * + * Results: + * Returns 1, if byte is in the accepted set of chars, 0 otherwise. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclIsBareword( + char byte) +{ + if (byte < '0' || byte > 'z') { + return 0; + } + if (byte <= '9' || byte >= 'a') { + return 1; + } + if (byte == '_') { + return 1; + } + if (byte < 'A' || byte > 'Z') { + return 0; + } + return 1; +} + +/* + *---------------------------------------------------------------------- + * * ParseWhiteSpace -- * * Scans up to numBytes bytes starting at src, consuming white space @@ -1343,9 +1384,7 @@ Tcl_ParseVarName( { Tcl_Token *tokenPtr; register const char *src; - unsigned char c; - int varIndex, offset; - Tcl_UniChar ch; + int varIndex; unsigned array; if ((numBytes == 0) || (start == NULL)) { @@ -1428,22 +1467,12 @@ Tcl_ParseVarName( tokenPtr->numComponents = 0; while (numBytes) { - if (Tcl_UtfCharComplete(src, numBytes)) { - offset = Tcl_UtfToUniChar(src, &ch); - } else { - char utfBytes[TCL_UTF_MAX]; - - memcpy(utfBytes, src, (size_t) numBytes); - utfBytes[numBytes] = '\0'; - offset = Tcl_UtfToUniChar(utfBytes, &ch); - } - c = UCHAR(ch); - if (isalnum(c) || (c == '_')) { /* INTL: ISO only, UCHAR. 
*/ - src += offset; - numBytes -= offset; + if (TclIsBareword(*src)) { + src += 1; + numBytes -= 1; continue; } - if ((c == ':') && (numBytes != 1) && (src[1] == ':')) { + if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) { src += 2; numBytes -= 2; while (numBytes && (*src == ':')) { |