summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2020-04-24 15:10:23 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2020-04-24 15:10:23 (GMT)
commit88661ac0e967d6c68f8ecdb76646b652214fe25b (patch)
treeacc60b1cd89061ad1e6245799aaefbaba21c2a08
parent423df1f681b111dff8a5633b5aaa6e0049aaeddf (diff)
downloadtcl-88661ac0e967d6c68f8ecdb76646b652214fe25b.zip
tcl-88661ac0e967d6c68f8ecdb76646b652214fe25b.tar.gz
tcl-88661ac0e967d6c68f8ecdb76646b652214fe25b.tar.bz2
Add protections against overflow in Unicode values. Backported from 8.6. Also remove some out-of-date comments.
-rw-r--r--generic/regc_lex.c22
-rw-r--r--generic/tclEncoding.c13
-rw-r--r--generic/tclInt.h2
-rw-r--r--generic/tclParse.c152
-rw-r--r--tests/encoding.test14
5 files changed, 115 insertions, 88 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index dab061a..f49c162 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -63,7 +63,7 @@
/*
- lexstart - set up lexical stuff, scan leading options
- ^ static VOID lexstart(struct vars *);
+ ^ static void lexstart(struct vars *);
*/
static void
lexstart(
@@ -89,7 +89,7 @@ lexstart(
/*
- prefixes - implement various special prefixes
- ^ static VOID prefixes(struct vars *);
+ ^ static void prefixes(struct vars *);
*/
static void
prefixes(
@@ -207,7 +207,7 @@ prefixes(
- lexnest - "call a subroutine", interpolating string at the lexical level
* Note, this is not a very general facility. There are a number of
* implicit assumptions about what sorts of strings can be subroutines.
- ^ static VOID lexnest(struct vars *, const chr *, const chr *);
+ ^ static void lexnest(struct vars *, const chr *, const chr *);
*/
static void
lexnest(
@@ -288,7 +288,7 @@ static const chr brbackw[] = { /* \w within brackets */
/*
- lexword - interpolate a bracket expression for word characters
* Possibly ought to inquire whether there is a "word" character class.
- ^ static VOID lexword(struct vars *);
+ ^ static void lexword(struct vars *);
*/
static void
lexword(
@@ -755,6 +755,7 @@ lexescape(
struct vars *v)
{
chr c;
+ int i;
static const chr alert[] = {
CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
};
@@ -831,17 +832,26 @@ lexescape(
RETV(PLAIN, CHR('\t'));
break;
case CHR('u'):
- c = lexdigits(v, 16, 4, 4);
+ c = (uchr) lexdigits(v, 16, 1, 4);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
RETV(PLAIN, c);
break;
case CHR('U'):
- c = lexdigits(v, 16, 8, 8);
+ i = lexdigits(v, 16, 8, 8);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
+#if CHRBITS > 16
+ if ((unsigned)i > 0x10FFFF) {
+ i = 0xFFFD;
+ }
+#else
+ if ((unsigned)i & ~0xFFFF) {
+ i = 0xFFFD;
+ }
+#endif
RETV(PLAIN, c);
break;
case CHR('v'):
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index ac9f667..0824c4f 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2471,11 +2471,6 @@ UtfToUnicodeProc(
break;
}
src += TclUtfToUniChar(src, &ch);
- /*
- * Need to handle this in a way that won't cause misalignment
- * by casting dst to a Tcl_UniChar. [Bug 1122671]
- * XXX: This hard-codes the assumed size of Tcl_UniChar as 2.
- */
#if TCL_UTF_MAX > 3
if (ch & ~0xFFFF) {
ch = 0xFFFD;
@@ -2685,12 +2680,8 @@ TableFromUtfProc(
len = TclUtfToUniChar(src, &ch);
#if TCL_UTF_MAX > 3
- /*
- * This prevents a crash condition. More evaluation is required for
- * full support of int Tcl_UniChar. [Bug 1004065]
- */
-
- if (ch & 0xffff0000) {
+ /* Unicode chars > +U0FFFF cannot be represented in any table encoding */
+ if (ch & ~0xFFFF) {
word = 0;
} else
#endif
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 3fa14e7..b6d6a88 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -2650,8 +2650,6 @@ MODULE_SCOPE int TclObjUnsetVar2(Tcl_Interp *interp,
Tcl_Obj *part1Ptr, Tcl_Obj *part2Ptr, int flags);
MODULE_SCOPE int TclParseBackslash(const char *src,
int numBytes, int *readPtr, char *dst);
-MODULE_SCOPE int TclParseHex(const char *src, int numBytes,
- Tcl_UniChar *resultPtr);
MODULE_SCOPE int TclParseNumber(Tcl_Interp *interp, Tcl_Obj *objPtr,
const char *expected, const char *bytes,
int numBytes, const char **endPtrPtr, int flags);
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 7bead99..cfd6337 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -176,19 +176,21 @@ static int ParseTokens(const char *src, int numBytes, int mask,
int flags, Tcl_Parse *parsePtr);
static int ParseWhiteSpace(const char *src, int numBytes,
int *incompletePtr, char *typePtr);
+static int ParseHex(const char *src, int numBytes,
+ int *resultPtr);
/*
*----------------------------------------------------------------------
*
* TclParseInit --
*
- * Initialize the fields of a Tcl_Parse struct.
+ * Initialize the fields of a Tcl_Parse struct.
*
* Results:
- * None.
+ * None.
*
* Side effects:
- * The Tcl_Parse struct pointed to by parsePtr gets initialized.
+ * The Tcl_Parse struct pointed to by parsePtr gets initialized.
*
*----------------------------------------------------------------------
*/
@@ -251,7 +253,7 @@ Tcl_ParseCommand(
* command terminator. If zero, then close
* bracket has no special meaning. */
register Tcl_Parse *parsePtr)
- /* Structure to fill in with information about
+ /* Structure to fill in with information about
* the parsed command; any previous
* information in the structure is ignored. */
{
@@ -496,9 +498,10 @@ Tcl_ParseCommand(
* tokens representing the expanded list.
*/
- CONST char *listStart;
+ const char *listStart;
int growthNeeded = wordIndex + 2*elemCount
- parsePtr->numTokens;
+
parsePtr->numWords += elemCount - 1;
if (growthNeeded > 0) {
TclGrowParseTokenArray(parsePtr, growthNeeded);
@@ -713,7 +716,7 @@ ParseWhiteSpace(
if (p[1] != '\n') {
break;
}
- p+=2;
+ p += 2;
if (--numBytes == 0) {
*incompletePtr = 1;
break;
@@ -761,7 +764,7 @@ TclParseAllWhiteSpace(
/*
*----------------------------------------------------------------------
*
- * TclParseHex --
+ * ParseHex --
*
* Scans a hexadecimal number as a Tcl_UniChar value (e.g., for parsing
* \x and \u escape sequences). At most numBytes bytes are scanned.
@@ -781,24 +784,24 @@ TclParseAllWhiteSpace(
*/
int
-TclParseHex(
+ParseHex(
const char *src, /* First character to parse. */
int numBytes, /* Max number of byes to scan */
- Tcl_UniChar *resultPtr) /* Points to storage provided by caller where
- * the Tcl_UniChar resulting from the
+ int *resultPtr) /* Points to storage provided by caller where
+ * the character resulting from the
* conversion is to be written. */
{
- Tcl_UniChar result = 0;
+ int result = 0;
register const char *p = src;
while (numBytes--) {
unsigned char digit = UCHAR(*p);
- if (!isxdigit(digit)) {
+ if (!isxdigit(digit) || (result > 0x10FFF)) {
break;
}
- ++p;
+ p++;
result <<= 4;
if (digit >= 'a') {
@@ -823,14 +826,14 @@ TclParseHex(
* sequence as defined by Tcl's parsing rules.
*
* Results:
- * Records at readPtr the number of bytes making up the backslash
- * sequence. Records at dst the UTF-8 encoded equivalent of that
- * backslash sequence. Returns the number of bytes written to dst, at
- * most TCL_UTF_MAX. Either readPtr or dst may be NULL, if the results
- * are not needed, but the return value is the same either way.
+ * Records at readPtr the number of bytes making up the backslash
+ * sequence. Records at dst the UTF-8 encoded equivalent of that
+ * backslash sequence. Returns the number of bytes written to dst, at
+ * most TCL_UTF_MAX. Either readPtr or dst may be NULL, if the results
+ * are not needed, but the return value is the same either way.
*
* Side effects:
- * None.
+ * None.
*
*----------------------------------------------------------------------
*/
@@ -848,7 +851,8 @@ TclParseBackslash(
* written there. */
{
register const char *p = src+1;
- Tcl_UniChar result;
+ Tcl_UniChar unichar = 0;
+ int result;
int count;
char buf[TCL_UTF_MAX];
@@ -876,7 +880,7 @@ TclParseBackslash(
count = 2;
switch (*p) {
/*
- * Note: in the conversions below, use absolute values (e.g., 0xa)
+ * Note: in the conversions below, use absolute values (e.g., 0xA)
* rather than symbolic values (e.g. \n) that get converted by the
* compiler. It's possible that compilers on some platforms will do
* the symbolic conversions differently, which could result in
@@ -890,22 +894,22 @@ TclParseBackslash(
result = 0x8;
break;
case 'f':
- result = 0xc;
+ result = 0xC;
break;
case 'n':
- result = 0xa;
+ result = 0xA;
break;
case 'r':
- result = 0xd;
+ result = 0xD;
break;
case 't':
result = 0x9;
break;
case 'v':
- result = 0xb;
+ result = 0xB;
break;
case 'x':
- count += TclParseHex(p+1, numBytes-2, &result);
+ count += ParseHex(p+1, numBytes-2, &result);
if (count == 2) {
/*
* No hexadigits -> This is just "x".
@@ -920,7 +924,7 @@ TclParseBackslash(
}
break;
case 'u':
- count += TclParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result);
+ count += ParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result);
if (count == 2) {
/*
* No hexadigits -> This is just "u".
@@ -928,6 +932,20 @@ TclParseBackslash(
result = 'u';
}
break;
+#if TCL_UTF_MAX > 3
+ case 'U':
+ count += ParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
+ if (count == 2) {
+ /*
+ * No hexadigits -> This is just "U".
+ */
+ result = 'U';
+ } else if ((result | 0x7FF) == 0xDFFF) {
+ /* Upper or lower surrogate, not allowed in this syntax. */
+ result = 0xFFFD;
+ }
+ break;
+#endif
case '\n':
count--;
do {
@@ -946,21 +964,21 @@ TclParseBackslash(
*/
if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
- result = (unsigned char)(*p - '0');
+ result = UCHAR(*p - '0');
p++;
if ((numBytes == 2) || !isdigit(UCHAR(*p)) /* INTL: digit */
|| (UCHAR(*p) >= '8')) {
break;
}
count = 3;
- result = (unsigned char)((result << 3) + (*p - '0'));
+ result = UCHAR((result << 3) + (*p - '0'));
p++;
if ((numBytes == 3) || !isdigit(UCHAR(*p)) /* INTL: digit */
|| (UCHAR(*p) >= '8')) {
break;
}
count = 4;
- result = (unsigned char)((result << 3) + (*p - '0'));
+ result = UCHAR((result << 3) + (*p - '0'));
break;
}
@@ -972,14 +990,15 @@ TclParseBackslash(
*/
if (Tcl_UtfCharComplete(p, numBytes - 1)) {
- count = Tcl_UtfToUniChar(p, &result) + 1; /* +1 for '\' */
+ count = Tcl_UtfToUniChar(p, &unichar) + 1; /* +1 for '\' */
} else {
char utfBytes[TCL_UTF_MAX];
memcpy(utfBytes, p, (size_t) (numBytes - 1));
utfBytes[numBytes - 1] = '\0';
- count = Tcl_UtfToUniChar(utfBytes, &result) + 1;
+ count = Tcl_UtfToUniChar(utfBytes, &unichar) + 1;
}
+ result = unichar;
break;
}
@@ -987,7 +1006,7 @@ TclParseBackslash(
if (readPtr != NULL) {
*readPtr = count;
}
- return Tcl_UniCharToUtf((int) result, dst);
+ return Tcl_UniCharToUtf(result, dst);
}
/*
@@ -999,11 +1018,11 @@ TclParseBackslash(
* defined by Tcl's parsing rules.
*
* Results:
- * Records in parsePtr information about the parse. Returns the number of
- * bytes consumed.
+ * Records in parsePtr information about the parse. Returns the number of
+ * bytes consumed.
*
* Side effects:
- * None.
+ * None.
*
*----------------------------------------------------------------------
*/
@@ -1156,7 +1175,7 @@ ParseTokens(
}
/*
- * This is a variable reference. Call Tcl_ParseVarName to do all
+ * This is a variable reference. Call Tcl_ParseVarName to do all
* the dirty work of parsing the name.
*/
@@ -1180,7 +1199,7 @@ ParseTokens(
}
/*
- * Command substitution. Call Tcl_ParseCommand recursively (and
+ * Command substitution. Call Tcl_ParseCommand recursively (and
* repeatedly) to parse the nested command(s), then throw away the
* parse information.
*/
@@ -1191,7 +1210,7 @@ ParseTokens(
TclStackAlloc(parsePtr->interp, sizeof(Tcl_Parse));
while (1) {
const char *curEnd;
-
+
if (Tcl_ParseCommand(parsePtr->interp, src, numBytes, 1,
nestedPtr) != TCL_OK) {
parsePtr->errorType = nestedPtr->errorType;
@@ -1337,7 +1356,7 @@ Tcl_FreeParse(
* call to Tcl_ParseCommand. */
{
if (parsePtr->tokenPtr != parsePtr->staticTokens) {
- ckfree((char *) parsePtr->tokenPtr);
+ ckfree((char *)parsePtr->tokenPtr);
parsePtr->tokenPtr = parsePtr->staticTokens;
}
}
@@ -1661,7 +1680,7 @@ Tcl_ParseBraces(
* the string consists of all bytes up to the
* first null character. */
register Tcl_Parse *parsePtr,
- /* Structure to fill in with information about
+ /* Structure to fill in with information about
* the string. */
int append, /* Non-zero means append tokens to existing
* information in parsePtr; zero means ignore
@@ -1862,7 +1881,7 @@ Tcl_ParseQuotedString(
* the string consists of all bytes up to the
* first null character. */
register Tcl_Parse *parsePtr,
- /* Structure to fill in with information about
+ /* Structure to fill in with information about
* the string. */
int append, /* Non-zero means append tokens to existing
* information in parsePtr; zero means ignore
@@ -2167,13 +2186,13 @@ Tcl_SubstObj(
* non-TCL_OK completion code arises.
*
* Results:
- * The return value is a standard Tcl completion code. The result in
- * interp is the substituted value, or an error message if TCL_ERROR is
- * returned. If tokensLeftPtr is not NULL, then it points to an int where
- * the number of tokens remaining to be processed is written.
+ * The return value is a standard Tcl completion code. The result in
+ * interp is the substituted value, or an error message if TCL_ERROR is
+ * returned. If tokensLeftPtr is not NULL, then it points to an int where
+ * the number of tokens remaining to be processed is written.
*
* Side effects:
- * Can be anything, depending on the types of substitution done.
+ * Can be anything, depending on the types of substitution done.
*
*----------------------------------------------------------------------
*/
@@ -2212,8 +2231,8 @@ TclSubstTokens(
int code = TCL_OK;
#define NUM_STATIC_POS 20
int isLiteral, maxNumCL, numCL, i, adjust;
- int* clPosition = NULL;
- Interp* iPtr = (Interp*) interp;
+ int *clPosition = NULL;
+ Interp *iPtr = (Interp *) interp;
int inFile = iPtr->evalFlags & TCL_EVAL_FILE;
/*
@@ -2230,24 +2249,24 @@ TclSubstTokens(
* For the handling of continuation lines in literals we first check if
* this is actually a literal. For if not we can forego the additional
* processing. Otherwise we pre-allocate a small table to store the
- * locations of all continuation lines we find in this literal, if
- * any. The table is extended if needed.
+ * locations of all continuation lines we find in this literal, if any.
+ * The table is extended if needed.
*/
- numCL = 0;
- maxNumCL = 0;
+ numCL = 0;
+ maxNumCL = 0;
isLiteral = 1;
for (i=0 ; i < count; i++) {
- if ((tokenPtr[i].type != TCL_TOKEN_TEXT) &&
- (tokenPtr[i].type != TCL_TOKEN_BS)) {
+ if ((tokenPtr[i].type != TCL_TOKEN_TEXT)
+ && (tokenPtr[i].type != TCL_TOKEN_BS)) {
isLiteral = 0;
break;
}
}
if (isLiteral) {
- maxNumCL = NUM_STATIC_POS;
- clPosition = (int*) ckalloc (maxNumCL*sizeof(int));
+ maxNumCL = NUM_STATIC_POS;
+ clPosition = (int *)ckalloc(maxNumCL * sizeof(int));
}
adjust = 0;
@@ -2268,6 +2287,7 @@ TclSubstTokens(
appendByteLength = TclParseBackslash(tokenPtr->start,
tokenPtr->size, NULL, utfCharBytes);
append = utfCharBytes;
+
/*
* If the backslash sequence we found is in a literal, and
* represented a continuation line, we compute and store its
@@ -2287,6 +2307,7 @@ TclSubstTokens(
(tokenPtr->start[1] == '\n')) {
if (isLiteral) {
int clPos;
+
if (result == 0) {
clPos = 0;
} else {
@@ -2295,13 +2316,13 @@ TclSubstTokens(
if (numCL >= maxNumCL) {
maxNumCL *= 2;
- clPosition = (int*) ckrealloc ((char*)clPosition,
+ clPosition = (int *)ckrealloc ((char*)clPosition,
maxNumCL*sizeof(int));
}
clPosition[numCL] = clPos;
- numCL ++;
+ numCL++;
}
- adjust ++;
+ adjust++;
}
break;
@@ -2316,8 +2337,9 @@ TclSubstTokens(
*/
int theline;
- TclAdvanceContinuations (&line, &clNextOuter,
- tokenPtr->start - outerScript);
+
+ TclAdvanceContinuations(&line, &clNextOuter,
+ tokenPtr->start - outerScript);
theline = line + adjust;
/* TIP #280: Transfer line information to nested command */
code = TclEvalEx(interp, tokenPtr->start+1, tokenPtr->size-2,
@@ -2326,7 +2348,8 @@ TclSubstTokens(
* Restore flag reset by nested eval for future bracketed
* commands and their cmdframe setup
*/
- if (inFile) {
+
+ if (inFile) {
iPtr->evalFlags |= TCL_EVAL_FILE;
}
}
@@ -2429,6 +2452,7 @@ TclSubstTokens(
if (code != TCL_ERROR) { /* Keep error message in result! */
if (result != NULL) {
Tcl_SetObjResult(interp, result);
+
/*
* If the code found continuation lines (which implies that this
* word is a literal), then we store the accumulated table of
@@ -2447,7 +2471,7 @@ TclSubstTokens(
*/
if (maxNumCL) {
- ckfree ((char*) clPosition);
+ ckfree((char*) clPosition);
}
} else {
Tcl_ResetResult(interp);
diff --git a/tests/encoding.test b/tests/encoding.test
index 8722a93..c259327 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -31,6 +31,7 @@ proc runtests {} {
# Some tests require the testencoding command
testConstraint testencoding [llength [info commands testencoding]]
testConstraint exec [llength [info commands exec]]
+testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}]
# TclInitEncodingSubsystem is tested by the rest of this file
# TclFinalizeEncodingSubsystem is not currently tested
@@ -267,16 +268,16 @@ test encoding-11.6 {LoadEncodingFile: invalid file} {testencoding} {
# OpenEncodingFile is fully tested by the rest of the tests in this file.
test encoding-12.1 {LoadTableEncoding: normal encoding} {
- set x [encoding convertto iso8859-3 \u120]
- append x [encoding convertto iso8859-3 \ud5]
- append x [encoding convertfrom iso8859-3 \xd5]
+ set x [encoding convertto iso8859-3 \u0120]
+ append x [encoding convertto iso8859-3 \xD5]
+ append x [encoding convertfrom iso8859-3 \xD5]
} "\xd5?\u120"
test encoding-12.2 {LoadTableEncoding: single-byte encoding} {
set x [encoding convertto iso8859-3 ab\u0120g]
- append x [encoding convertfrom iso8859-3 ab\xd5g]
+ append x [encoding convertfrom iso8859-3 ab\xD5g]
} "ab\xd5gab\u120g"
test encoding-12.3 {LoadTableEncoding: multi-byte encoding} {
- set x [encoding convertto shiftjis ab\u4e4eg]
+ set x [encoding convertto shiftjis ab\u4E4Eg]
append x [encoding convertfrom shiftjis ab\x8c\xc1g]
} "ab\x8c\xc1gab\u4e4eg"
test encoding-12.4 {LoadTableEncoding: double-byte encoding} {
@@ -288,6 +289,9 @@ test encoding-12.5 {LoadTableEncoding: symbol encoding} {
append x [encoding convertto symbol \u67]
append x [encoding convertfrom symbol \x67]
} "\x67\x67\u3b3"
+test encoding-12.6 {LoadTableEncoding: overflow in char value} fullutf {
+ encoding convertto iso8859-3 \U01000
+} "?"
test encoding-13.1 {LoadEscapeTable} {
viewable [set x [encoding convertto iso2022 ab\u4e4e\u68d9g]]