diff options
| -rw-r--r-- | generic/tclCompExpr.c | 51 | ||||
| -rw-r--r-- | generic/tclInt.h | 1 | ||||
| -rw-r--r-- | generic/tclParse.c | 63 | ||||
| -rw-r--r-- | tests/parse.test | 3 | ||||
| -rw-r--r-- | tests/parseExpr.test | 9 | 
5 files changed, 83 insertions, 44 deletions
| diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 94c1bd6..448f99c 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1967,7 +1967,7 @@ ParseLexeme(      case 'i':  	if ((numBytes > 1) && (start[1] == 'n') -		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { +		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {  	    /*  	     * Must make this check so we can tell the difference between the  	     * "in" operator and the "int" function name and the "infinity" @@ -1981,14 +1981,15 @@ ParseLexeme(      case 'e':  	if ((numBytes > 1) && (start[1] == 'q') -		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { +		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {  	    *lexemePtr = STREQ;  	    return 2;  	}  	break;      case 'n': -	if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) { +	if ((numBytes > 1) +		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {  	    switch (start[1]) {  	    case 'e':  		*lexemePtr = STRNEQ; @@ -2003,8 +2004,7 @@ ParseLexeme(      literal = Tcl_NewObj();      if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end,  	    TCL_PARSE_NO_WHITESPACE) == TCL_OK) { -	if (end < start + numBytes && !isalnum(UCHAR(*end)) -		&& UCHAR(*end) != '_') { +	if (end < start + numBytes && !TclIsBareword(*end)) {  	number:  	    TclInitStringRep(literal, start, end-start); @@ -2029,7 +2029,7 @@ ParseLexeme(  		const char *p = start;  		while (p < end) { -		    if (!isalnum(UCHAR(*p++))) { +		    if (!TclIsBareword(*p++)) {  			/*  			 * The number has non-bareword characters, so we   			 * must treat it as a number. @@ -2054,33 +2054,30 @@ ParseLexeme(  	}      } -    if (Tcl_UtfCharComplete(start, numBytes)) { -	scanned = Tcl_UtfToUniChar(start, &ch); -    } else { -	char utfBytes[TCL_UTF_MAX]; - -	memcpy(utfBytes, start, (size_t) numBytes); -	utfBytes[numBytes] = '\0'; -	scanned = Tcl_UtfToUniChar(utfBytes, &ch); -    } -    if (!isalnum(UCHAR(ch))) { -	*lexemePtr = INVALID; -	Tcl_DecrRefCount(literal); -	return scanned; -    } -    end = start; -    while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) { -	end += scanned; -	numBytes -= scanned; -	if (Tcl_UtfCharComplete(end, numBytes)) { -	    scanned = Tcl_UtfToUniChar(end, &ch); +    /* +     * We reject leading underscores in bareword.  No sensible reason why. +     * Might be inspired by reserved identifier rules in C, which of course +     * have no direct relevance here. +     */   + +    if (!TclIsBareword(*start) || *start == '_') { +	if (Tcl_UtfCharComplete(start, numBytes)) { +	    scanned = Tcl_UtfToUniChar(start, &ch);  	} else {  	    char utfBytes[TCL_UTF_MAX]; -	    memcpy(utfBytes, end, (size_t) numBytes); +	    memcpy(utfBytes, start, (size_t) numBytes);  	    utfBytes[numBytes] = '\0';  	    scanned = Tcl_UtfToUniChar(utfBytes, &ch);  	} +	*lexemePtr = INVALID; +	Tcl_DecrRefCount(literal); +	return scanned; +    } +    end = start; +    while (numBytes && TclIsBareword(*end)) { +	end += 1; +	numBytes -= 1;      }      *lexemePtr = BAREWORD;      if (literalPtr) { diff --git a/generic/tclInt.h b/generic/tclInt.h index 860c2a3..c989eda 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2987,6 +2987,7 @@ MODULE_SCOPE void	TclInitSubsystems(void);  MODULE_SCOPE int	TclInterpReady(Tcl_Interp *interp);  MODULE_SCOPE int	TclIsLocalScalar(const char *src, int len);  MODULE_SCOPE int	TclIsSpaceProc(char byte); +MODULE_SCOPE int	TclIsBareword(char byte);  MODULE_SCOPE Tcl_Obj *	TclJoinPath(int elements, Tcl_Obj * const objv[]);  MODULE_SCOPE int	TclJoinThread(Tcl_ThreadId id, int *result);  MODULE_SCOPE void	TclLimitRemoveAllHandlers(Tcl_Interp *interp); diff --git a/generic/tclParse.c b/generic/tclParse.c index ee0d4c4..ca12be5 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -621,6 +621,47 @@ TclIsSpaceProc(  /*   *----------------------------------------------------------------------   * + * TclIsBareword-- + * + *	Report whether byte is one that can be part of a "bareword". + *	This concept is named in expression parsing, where it determines + *	what can be a legal function name, but is the same definition used + *	in determining what variable names can be parsed as variable + *	substitutions without the benefit of enclosing braces.  The set of + *	ASCII chars that are accepted are the numeric chars ('0'-'9'), + *	the alphabetic chars ('a'-'z', 'A'-'Z')	and underscore ('_'). + * + * Results: + *	Returns 1, if byte is in the accepted set of chars, 0 otherwise. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +TclIsBareword( +    char byte) +{ +    if (byte < '0' || byte > 'z') { +	return 0; +    } +    if (byte <= '9' || byte >= 'a') { +	return 1; +    } +    if (byte == '_') { +	return 1; +    } +    if (byte < 'A' || byte > 'Z') { +	return 0; +    } +    return 1; +} + +/* + *---------------------------------------------------------------------- + *   * ParseWhiteSpace --   *   *	Scans up to numBytes bytes starting at src, consuming white space @@ -1346,9 +1387,7 @@ Tcl_ParseVarName(  {      Tcl_Token *tokenPtr;      register const char *src; -    unsigned char c; -    int varIndex, offset; -    Tcl_UniChar ch; +    int varIndex;      unsigned array;      if ((numBytes == 0) || (start == NULL)) { @@ -1431,22 +1470,12 @@ Tcl_ParseVarName(  	tokenPtr->numComponents = 0;  	while (numBytes) { -	    if (Tcl_UtfCharComplete(src, numBytes)) { -		offset = Tcl_UtfToUniChar(src, &ch); -	    } else { -		char utfBytes[TCL_UTF_MAX]; - -		memcpy(utfBytes, src, (size_t) numBytes); -		utfBytes[numBytes] = '\0'; -		offset = Tcl_UtfToUniChar(utfBytes, &ch); -	    } -	    c = UCHAR(ch); -	    if (isalnum(c) || (c == '_')) {	/* INTL: ISO only, UCHAR. */ -		src += offset; -		numBytes -= offset; +	    if (TclIsBareword(*src)) { +		src += 1; +		numBytes -= 1;  		continue;  	    } -	    if ((c == ':') && (numBytes != 1) && (src[1] == ':')) { +	    if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) {  		src += 2;  		numBytes -= 2;  		while (numBytes && (*src == ':')) { diff --git a/tests/parse.test b/tests/parse.test index 5d8afeb..d73c725 100644 --- a/tests/parse.test +++ b/tests/parse.test @@ -663,6 +663,9 @@ test parse-12.24 {Tcl_ParseVarName procedure, missing close paren in array refer  test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser {      testparser {$x(a$y(b$z))} 0  } {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}} +test parse-12.26 {Tcl_ParseVarName [d2ffcca163] non-ascii} testparser { +    testparser "$\u0433" -1 +} "- {$\u0433} 1 word {$\u0433} 2 text {$} 0 text \u0433 0 {}"  test parse-13.1 {Tcl_ParseVar procedure} testparsevar {      set abc 24 diff --git a/tests/parseExpr.test b/tests/parseExpr.test index 5c7986a..ef05454 100644 --- a/tests/parseExpr.test +++ b/tests/parseExpr.test @@ -1064,6 +1064,15 @@ test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body {      dict get $o -errorcode  } -result {TCL PARSE EXPR BADNUMBER BINARY} +test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body { +    testexprparser \u0433 -1 +} -returnCodes error -match glob -result {*invalid character*} +test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body { +    testexprparser \u043f -1 +} -returnCodes error -match glob -result {*invalid character*} +test parseExpr-22.21 {Bug d2ffcca163} -constraints testexprparser -body { +    testexprparser in\u0433(0) -1 +} -returnCodes error -match glob -result {missing operand*}  # cleanup  cleanupTests | 
