From 61855b36b423d2293d99ccb6381e6622980e2cd9 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Tue, 25 Nov 2014 21:24:06 +0000
Subject: One way to fix the parser of $-substitution accepting non-ASCII
 varnames.

---
 generic/tclParse.c | 3 +++
 tests/parse.test   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/generic/tclParse.c b/generic/tclParse.c
index e475fb8..1523eb3 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -1438,6 +1438,9 @@ Tcl_ParseVarName(
 		offset = Tcl_UtfToUniChar(utfBytes, &ch);
 	    }
 	    c = UCHAR(ch);
+	    if (c != ch) {
+		break;
+	    }
 	    if (isalnum(c) || (c == '_')) {	/* INTL: ISO only, UCHAR. */
 		src += offset;
 		numBytes -= offset;
diff --git a/tests/parse.test b/tests/parse.test
index d7de5ff..cd02386 100644
--- a/tests/parse.test
+++ b/tests/parse.test
@@ -656,6 +656,9 @@ test parse-12.24 {Tcl_ParseVarName procedure, missing close paren in array refer
 test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser {
     testparser {$x(a$y(b$z))} 0
 } {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}}
+test parse-12.26 {Tcl_ParseVarName [d2ffcca163] non-ascii} testparser {
+    testparser "$\u0433" -1
+} "- {$\u0433} 1 word {$\u0433} 2 text {$} 0 text \u0433 0 {}"
 
 test parse-13.1 {Tcl_ParseVar procedure} testparsevar {
     set abc 24
-- 
cgit v0.12


From 63678d528b5d4384d8a6fd8941b73888399227e5 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Wed, 26 Nov 2014 16:22:40 +0000
Subject: I like this patch better.  Retain the byte orientation of the parser.

---
 generic/tclParse.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/generic/tclParse.c b/generic/tclParse.c
index 1523eb3..90ec43d 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -1344,8 +1344,7 @@ Tcl_ParseVarName(
     Tcl_Token *tokenPtr;
     register const char *src;
     unsigned char c;
-    int varIndex, offset;
-    Tcl_UniChar ch;
+    int varIndex;
     unsigned array;
 
     if ((numBytes == 0) || (start == NULL)) {
@@ -1428,22 +1427,10 @@ Tcl_ParseVarName(
 	tokenPtr->numComponents = 0;
 
 	while (numBytes) {
-	    if (Tcl_UtfCharComplete(src, numBytes)) {
-		offset = Tcl_UtfToUniChar(src, &ch);
-	    } else {
-		char utfBytes[TCL_UTF_MAX];
-
-		memcpy(utfBytes, src, (size_t) numBytes);
-		utfBytes[numBytes] = '\0';
-		offset = Tcl_UtfToUniChar(utfBytes, &ch);
-	    }
-	    c = UCHAR(ch);
-	    if (c != ch) {
-		break;
-	    }
+	    c = UCHAR(*src);
 	    if (isalnum(c) || (c == '_')) {	/* INTL: ISO only, UCHAR. */
-		src += offset;
-		numBytes -= offset;
+		src += 1;
+		numBytes -= 1;
 		continue;
 	    }
 	    if ((c == ':') && (numBytes != 1) && (src[1] == ':')) {
-- 
cgit v0.12


From f12dc54f7578abc9e2d5f625157263ec5a0bc40e Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Wed, 26 Nov 2014 17:00:55 +0000
Subject: Same issue in expr parser also tested and fixed.

---
 generic/tclCompExpr.c | 32 ++++++++++++--------------------
 tests/parseExpr.test  |  6 ++++++
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c
index 9142e2b..dde4e56 100644
--- a/generic/tclCompExpr.c
+++ b/generic/tclCompExpr.c
@@ -1969,31 +1969,23 @@ ParseLexeme(
 	}
     }
 
-    if (Tcl_UtfCharComplete(start, numBytes)) {
-	scanned = Tcl_UtfToUniChar(start, &ch);
-    } else {
-	char utfBytes[TCL_UTF_MAX];
-	memcpy(utfBytes, start, (size_t) numBytes);
-	utfBytes[numBytes] = '\0';
-	scanned = Tcl_UtfToUniChar(utfBytes, &ch);
-    }
-    if (!isalnum(UCHAR(ch))) {
-	*lexemePtr = INVALID;
-	Tcl_DecrRefCount(literal);
-	return scanned;
-    }
-    end = start;
-    while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) {
-	end += scanned;
-	numBytes -= scanned;
-	if (Tcl_UtfCharComplete(end, numBytes)) {
-	    scanned = Tcl_UtfToUniChar(end, &ch);
+    if (!isalnum(UCHAR(*start))) {
+	if (Tcl_UtfCharComplete(start, numBytes)) {
+	    scanned = Tcl_UtfToUniChar(start, &ch);
 	} else {
 	    char utfBytes[TCL_UTF_MAX];
-	    memcpy(utfBytes, end, (size_t) numBytes);
+	    memcpy(utfBytes, start, (size_t) numBytes);
 	    utfBytes[numBytes] = '\0';
 	    scanned = Tcl_UtfToUniChar(utfBytes, &ch);
 	}
+	*lexemePtr = INVALID;
+	Tcl_DecrRefCount(literal);
+	return scanned;
+    }
+    end = start;
+    while (numBytes && (isalnum(UCHAR(*end)) || (UCHAR(*end) == '_'))) {
+	end += 1;
+	numBytes -= 1;
     }
     *lexemePtr = BAREWORD;
     if (literalPtr) {
diff --git a/tests/parseExpr.test b/tests/parseExpr.test
index c1c489b..3e0df29 100644
--- a/tests/parseExpr.test
+++ b/tests/parseExpr.test
@@ -1051,6 +1051,12 @@ test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body {
     testexprparser 0b02 -1
 } -returnCodes error -match glob -result {*invalid binary number*}
 
+test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body {
+    testexprparser \u0433 -1
+} -returnCodes error -match glob -result {*invalid character*}
+test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body {
+    testexprparser \u043f -1
+} -returnCodes error -match glob -result {*invalid character*}
 
 # cleanup
 cleanupTests
-- 
cgit v0.12


From f9fb8c8daf918e57d3a50a0428623a5e5c260e70 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 4 Dec 2014 18:26:31 +0000
Subject: Stop using isalnum(.).  Its results are not portable.  Replace with
 our own private routine TclIsBareword() that does exactly what we want.

---
 generic/tclCompExpr.c | 15 ++++++++++-----
 generic/tclInt.h      |  1 +
 generic/tclParse.c    | 47 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c
index dde4e56..2470931 100644
--- a/generic/tclCompExpr.c
+++ b/generic/tclCompExpr.c
@@ -1920,8 +1920,7 @@ ParseLexeme(
     literal = Tcl_NewObj();
     if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end,
 	    TCL_PARSE_NO_WHITESPACE) == TCL_OK) {
-	if (end < start + numBytes && !isalnum(UCHAR(*end))
-		&& UCHAR(*end) != '_') {
+	if (end < start + numBytes && !TclIsBareword(*end)) {
 	
 	number:
 	    TclInitStringRep(literal, start, end-start);
@@ -1945,7 +1944,7 @@ ParseLexeme(
 	    if (literal->typePtr == &tclDoubleType) {
 		const char *p = start;
 		while (p < end) {
-		    if (!isalnum(UCHAR(*p++))) {
+		    if (!TclIsBareword(*p++)) {
 			/*
 			 * The number has non-bareword characters, so we 
 			 * must treat it as a number.
@@ -1969,7 +1968,13 @@ ParseLexeme(
 	}
     }
 
-    if (!isalnum(UCHAR(*start))) {
+    /*
+     * We reject a leading underscore in a bareword.  No sensible reason why.
+     * Might be inspired by reserved identifier rules in C, which of course
+     * have no direct relevance here.
+     */
+
+    if (!TclIsBareword(*start) || *start == '_') {
 	if (Tcl_UtfCharComplete(start, numBytes)) {
 	    scanned = Tcl_UtfToUniChar(start, &ch);
 	} else {
@@ -1983,7 +1988,7 @@ ParseLexeme(
 	return scanned;
     }
     end = start;
-    while (numBytes && (isalnum(UCHAR(*end)) || (UCHAR(*end) == '_'))) {
+    while (numBytes && TclIsBareword(*end)) {
 	end += 1;
 	numBytes -= 1;
     }
diff --git a/generic/tclInt.h b/generic/tclInt.h
index dd66d76..255ee23 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -2608,6 +2608,7 @@ MODULE_SCOPE void	TclInitSubsystems(void);
 MODULE_SCOPE int	TclInterpReady(Tcl_Interp *interp);
 MODULE_SCOPE int	TclIsLocalScalar(const char *src, int len);
 MODULE_SCOPE int	TclIsSpaceProc(char byte);
+MODULE_SCOPE int	TclIsBareword(char byte);
 MODULE_SCOPE int	TclJoinThread(Tcl_ThreadId id, int *result);
 MODULE_SCOPE void	TclLimitRemoveAllHandlers(Tcl_Interp *interp);
 MODULE_SCOPE Tcl_Obj *	TclLindexList(Tcl_Interp *interp,
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 90ec43d..025304c 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -628,6 +628,47 @@ TclIsSpaceProc(
 /*
  *----------------------------------------------------------------------
  *
+ * TclIsBareword --
+ *
+ *	Report whether byte is one that can be part of a "bareword".
+ *	This concept is named in expression parsing, where it determines
+ *	what can be a legal function name, but is the same definition used
+ *	in determining what variable names can be parsed as variable
+ *	substitutions without the benefit of enclosing braces.  The accepted
+ *	ASCII chars are the numeric chars ('0'-'9'), the alphabetic chars
+ *	('a'-'z', 'A'-'Z') and underscore ('_').
+ *
+ * Results:
+ *	Returns 1, if byte is in the accepted set of chars, 0 otherwise.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclIsBareword(
+    char byte)
+{
+    /*
+     * Plain range comparisons on the byte; no <ctype.h> call is involved.
+     */
+    if (byte >= '0' && byte <= '9') {
+	return 1;
+    }
+    if (byte >= 'a' && byte <= 'z') {
+	return 1;
+    }
+    if (byte >= 'A' && byte <= 'Z') {
+	return 1;
+    }
+    return (byte == '_');
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
  * ParseWhiteSpace --
  *
  *	Scans up to numBytes bytes starting at src, consuming white space
@@ -1343,7 +1384,6 @@ Tcl_ParseVarName(
 {
     Tcl_Token *tokenPtr;
     register const char *src;
-    unsigned char c;
     int varIndex;
     unsigned array;
 
@@ -1427,13 +1467,12 @@ Tcl_ParseVarName(
 	tokenPtr->numComponents = 0;
 
 	while (numBytes) {
-	    c = UCHAR(*src);
-	    if (isalnum(c) || (c == '_')) {	/* INTL: ISO only, UCHAR. */
+	    if (TclIsBareword(*src)) {
 		src += 1;
 		numBytes -= 1;
 		continue;
 	    }
-	    if ((c == ':') && (numBytes != 1) && (src[1] == ':')) {
+	    if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) {
 		src += 2;
 		numBytes -= 2;
 		while (numBytes && (*src == ':')) {
-- 
cgit v0.12


From c4c311b0e2ca10bc95a35e3f60d9a939b55c26e4 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 4 Dec 2014 20:45:11 +0000
Subject: The isalpha(.) calls remaining in the expr parser still bring
 nonportability. Commit a test that demonstrates that.

---
 tests/parseExpr.test | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/parseExpr.test b/tests/parseExpr.test
index 3e0df29..c3b0d71 100644
--- a/tests/parseExpr.test
+++ b/tests/parseExpr.test
@@ -1057,6 +1057,9 @@ test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body {
 test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body {
     testexprparser \u043f -1
 } -returnCodes error -match glob -result {*invalid character*}
+test parseExpr-22.21 {Bug d2ffcca163} -constraints testexprparser -body {
+    testexprparser in\u0433(0) -1
+} -returnCodes error -match glob -result {missing operand*}
 
 # cleanup
 cleanupTests
-- 
cgit v0.12


From 640e5762bc7e0aa585f1135dde28e2fa9434e79e Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Thu, 4 Dec 2014 21:29:18 +0000
Subject: Limit isalpha(.) calls in the expr parser to only apply to known
 ASCII arguments to make the results portable.

---
 generic/tclCompExpr.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c
index 2470931..7b67970 100644
--- a/generic/tclCompExpr.c
+++ b/generic/tclCompExpr.c
@@ -1883,7 +1883,7 @@ ParseLexeme(
 
     case 'i':
 	if ((numBytes > 1) && (start[1] == 'n')
-		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
+		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
 
 	    /*
 	     * Must make this check so we can tell the difference between
@@ -1898,14 +1898,15 @@ ParseLexeme(
 
     case 'e':
 	if ((numBytes > 1) && (start[1] == 'q')
-		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
+		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
 	    *lexemePtr = STREQ;
 	    return 2;
 	}
 	break;
 
     case 'n':
-	if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
+	if ((numBytes > 1)
+		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
 	    switch (start[1]) {
 	    case 'e':
 		*lexemePtr = STRNEQ;
-- 
cgit v0.12