diff options
author | dgp <dgp@users.sourceforge.net> | 2014-12-05 12:28:01 (GMT) |
---|---|---|
committer | dgp <dgp@users.sourceforge.net> | 2014-12-05 12:28:01 (GMT) |
commit | 71a48e59c2eb9d90233530198576fc445abfda42 (patch) | |
tree | 94c5a2756c009d91338f8ee894ae0122dd4f35e9 /generic | |
parent | 423020bd63284883ef0e1343574dd256b7023ae0 (diff) | |
parent | 3041f3f9a1d8a242105ffe99eebae201a7079549 (diff) | |
download | tcl-71a48e59c2eb9d90233530198576fc445abfda42.zip tcl-71a48e59c2eb9d90233530198576fc445abfda42.tar.gz tcl-71a48e59c2eb9d90233530198576fc445abfda42.tar.bz2 |
[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow that constraint. This requires not trusting isalnum(.) and isalpha(.) to deliver portable identical results.
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclCompExpr.c | 51 | ||||
-rw-r--r-- | generic/tclInt.h | 1 | ||||
-rw-r--r-- | generic/tclParse.c | 63 |
3 files changed, 71 insertions, 44 deletions
diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 94c1bd6..448f99c 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1967,7 +1967,7 @@ ParseLexeme( case 'i': if ((numBytes > 1) && (start[1] == 'n') - && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { /* * Must make this check so we can tell the difference between the * "in" operator and the "int" function name and the "infinity" @@ -1981,14 +1981,15 @@ ParseLexeme( case 'e': if ((numBytes > 1) && (start[1] == 'q') - && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { *lexemePtr = STREQ; return 2; } break; case 'n': - if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { + if ((numBytes > 1) + && ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) { switch (start[1]) { case 'e': *lexemePtr = STRNEQ; @@ -2003,8 +2004,7 @@ ParseLexeme( literal = Tcl_NewObj(); if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end, TCL_PARSE_NO_WHITESPACE) == TCL_OK) { - if (end < start + numBytes && !isalnum(UCHAR(*end)) - && UCHAR(*end) != '_') { + if (end < start + numBytes && !TclIsBareword(*end)) { number: TclInitStringRep(literal, start, end-start); @@ -2029,7 +2029,7 @@ ParseLexeme( const char *p = start; while (p < end) { - if (!isalnum(UCHAR(*p++))) { + if (!TclIsBareword(*p++)) { /* * The number has non-bareword characters, so we * must treat it as a number. @@ -2054,33 +2054,30 @@ ParseLexeme( } } - if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = Tcl_UtfToUniChar(start, &ch); - } else { - char utfBytes[TCL_UTF_MAX]; - - memcpy(utfBytes, start, (size_t) numBytes); - utfBytes[numBytes] = '\0'; - scanned = Tcl_UtfToUniChar(utfBytes, &ch); - } - if (!isalnum(UCHAR(ch))) { - *lexemePtr = INVALID; - Tcl_DecrRefCount(literal); - return scanned; - } - end = start; - while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) { - end += scanned; - numBytes -= scanned; - if (Tcl_UtfCharComplete(end, numBytes)) { - scanned = Tcl_UtfToUniChar(end, &ch); + /* + * We reject leading underscores in bareword. No sensible reason why. + * Might be inspired by reserved identifier rules in C, which of course + * have no direct relevance here. + */ + + if (!TclIsBareword(*start) || *start == '_') { + if (Tcl_UtfCharComplete(start, numBytes)) { + scanned = Tcl_UtfToUniChar(start, &ch); } else { char utfBytes[TCL_UTF_MAX]; - memcpy(utfBytes, end, (size_t) numBytes); + memcpy(utfBytes, start, (size_t) numBytes); utfBytes[numBytes] = '\0'; scanned = Tcl_UtfToUniChar(utfBytes, &ch); } + *lexemePtr = INVALID; + Tcl_DecrRefCount(literal); + return scanned; + } + end = start; + while (numBytes && TclIsBareword(*end)) { + end += 1; + numBytes -= 1; } *lexemePtr = BAREWORD; if (literalPtr) { diff --git a/generic/tclInt.h b/generic/tclInt.h index 860c2a3..c989eda 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2987,6 +2987,7 @@ MODULE_SCOPE void TclInitSubsystems(void); MODULE_SCOPE int TclInterpReady(Tcl_Interp *interp); MODULE_SCOPE int TclIsLocalScalar(const char *src, int len); MODULE_SCOPE int TclIsSpaceProc(char byte); +MODULE_SCOPE int TclIsBareword(char byte); MODULE_SCOPE Tcl_Obj * TclJoinPath(int elements, Tcl_Obj * const objv[]); MODULE_SCOPE int TclJoinThread(Tcl_ThreadId id, int *result); MODULE_SCOPE void TclLimitRemoveAllHandlers(Tcl_Interp *interp); diff --git a/generic/tclParse.c b/generic/tclParse.c index ee0d4c4..ca12be5 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -621,6 +621,47 @@ TclIsSpaceProc( /* *---------------------------------------------------------------------- * + * TclIsBareword-- + * + * Report whether byte is one that can be part of a "bareword". + * This concept is named in expression parsing, where it determines + * what can be a legal function name, but is the same definition used + * in determining what variable names can be parsed as variable + * substitutions without the benefit of enclosing braces. The set of + * ASCII chars that are accepted are the numeric chars ('0'-'9'), + * the alphabetic chars ('a'-'z', 'A'-'Z') and underscore ('_'). + * + * Results: + * Returns 1, if byte is in the accepted set of chars, 0 otherwise. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclIsBareword( + char byte) +{ + if (byte < '0' || byte > 'z') { + return 0; + } + if (byte <= '9' || byte >= 'a') { + return 1; + } + if (byte == '_') { + return 1; + } + if (byte < 'A' || byte > 'Z') { + return 0; + } + return 1; +} + +/* + *---------------------------------------------------------------------- + * * ParseWhiteSpace -- * * Scans up to numBytes bytes starting at src, consuming white space @@ -1346,9 +1387,7 @@ Tcl_ParseVarName( { Tcl_Token *tokenPtr; register const char *src; - unsigned char c; - int varIndex, offset; - Tcl_UniChar ch; + int varIndex; unsigned array; if ((numBytes == 0) || (start == NULL)) { @@ -1431,22 +1470,12 @@ Tcl_ParseVarName( tokenPtr->numComponents = 0; while (numBytes) { - if (Tcl_UtfCharComplete(src, numBytes)) { - offset = Tcl_UtfToUniChar(src, &ch); - } else { - char utfBytes[TCL_UTF_MAX]; - - memcpy(utfBytes, src, (size_t) numBytes); - utfBytes[numBytes] = '\0'; - offset = Tcl_UtfToUniChar(utfBytes, &ch); - } - c = UCHAR(ch); - if (isalnum(c) || (c == '_')) { /* INTL: ISO only, UCHAR. */ - src += offset; - numBytes -= offset; + if (TclIsBareword(*src)) { + src += 1; + numBytes -= 1; continue; } - if ((c == ':') && (numBytes != 1) && (src[1] == ':')) { + if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) { src += 2; numBytes -= 2; while (numBytes && (*src == ':')) { |