diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-04-24 15:10:23 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-04-24 15:10:23 (GMT) |
| commit | 88661ac0e967d6c68f8ecdb76646b652214fe25b (patch) | |
| tree | acc60b1cd89061ad1e6245799aaefbaba21c2a08 | |
| parent | 423df1f681b111dff8a5633b5aaa6e0049aaeddf (diff) | |
| download | tcl-88661ac0e967d6c68f8ecdb76646b652214fe25b.zip tcl-88661ac0e967d6c68f8ecdb76646b652214fe25b.tar.gz tcl-88661ac0e967d6c68f8ecdb76646b652214fe25b.tar.bz2 | |
Add protections against overflow in Unicode values. Backported from 8.6. Also remove some out-of-date comments.
| -rw-r--r-- | generic/regc_lex.c | 22 | ||||
| -rw-r--r-- | generic/tclEncoding.c | 13 | ||||
| -rw-r--r-- | generic/tclInt.h | 2 | ||||
| -rw-r--r-- | generic/tclParse.c | 152 | ||||
| -rw-r--r-- | tests/encoding.test | 14 |
5 files changed, 115 insertions, 88 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c index dab061a..f49c162 100644 --- a/generic/regc_lex.c +++ b/generic/regc_lex.c @@ -63,7 +63,7 @@ /* - lexstart - set up lexical stuff, scan leading options - ^ static VOID lexstart(struct vars *); + ^ static void lexstart(struct vars *); */ static void lexstart( @@ -89,7 +89,7 @@ lexstart( /* - prefixes - implement various special prefixes - ^ static VOID prefixes(struct vars *); + ^ static void prefixes(struct vars *); */ static void prefixes( @@ -207,7 +207,7 @@ prefixes( - lexnest - "call a subroutine", interpolating string at the lexical level * Note, this is not a very general facility. There are a number of * implicit assumptions about what sorts of strings can be subroutines. - ^ static VOID lexnest(struct vars *, const chr *, const chr *); + ^ static void lexnest(struct vars *, const chr *, const chr *); */ static void lexnest( @@ -288,7 +288,7 @@ static const chr brbackw[] = { /* \w within brackets */ /* - lexword - interpolate a bracket expression for word characters * Possibly ought to inquire whether there is a "word" character class. - ^ static VOID lexword(struct vars *); + ^ static void lexword(struct vars *); */ static void lexword( @@ -755,6 +755,7 @@ lexescape( struct vars *v) { chr c; + int i; static const chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; @@ -831,17 +832,26 @@ lexescape( RETV(PLAIN, CHR('\t')); break; case CHR('u'): - c = lexdigits(v, 16, 4, 4); + c = (uchr) lexdigits(v, 16, 1, 4); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; case CHR('U'): - c = lexdigits(v, 16, 8, 8); + i = lexdigits(v, 16, 8, 8); if (ISERR()) { FAILW(REG_EESCAPE); } +#if CHRBITS > 16 + if ((unsigned)i > 0x10FFFF) { + i = 0xFFFD; + } +#else + if ((unsigned)i & ~0xFFFF) { + i = 0xFFFD; + } +#endif RETV(PLAIN, c); break; case CHR('v'): diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index ac9f667..0824c4f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2471,11 +2471,6 @@ UtfToUnicodeProc( break; } src += TclUtfToUniChar(src, &ch); - /* - * Need to handle this in a way that won't cause misalignment - * by casting dst to a Tcl_UniChar. [Bug 1122671] - * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. - */ #if TCL_UTF_MAX > 3 if (ch & ~0xFFFF) { ch = 0xFFFD; @@ -2685,12 +2680,8 @@ TableFromUtfProc( len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX > 3 - /* - * This prevents a crash condition. More evaluation is required for - * full support of int Tcl_UniChar. [Bug 1004065] - */ - - if (ch & 0xffff0000) { + /* Unicode chars > +U0FFFF cannot be represented in any table encoding */ + if (ch & ~0xFFFF) { word = 0; } else #endif diff --git a/generic/tclInt.h b/generic/tclInt.h index 3fa14e7..b6d6a88 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2650,8 +2650,6 @@ MODULE_SCOPE int TclObjUnsetVar2(Tcl_Interp *interp, Tcl_Obj *part1Ptr, Tcl_Obj *part2Ptr, int flags); MODULE_SCOPE int TclParseBackslash(const char *src, int numBytes, int *readPtr, char *dst); -MODULE_SCOPE int TclParseHex(const char *src, int numBytes, - Tcl_UniChar *resultPtr); MODULE_SCOPE int TclParseNumber(Tcl_Interp *interp, Tcl_Obj *objPtr, const char *expected, const char *bytes, int numBytes, const char **endPtrPtr, int flags); diff --git a/generic/tclParse.c b/generic/tclParse.c index 7bead99..cfd6337 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -176,19 +176,21 @@ static int ParseTokens(const char *src, int numBytes, int mask, int flags, Tcl_Parse *parsePtr); static int ParseWhiteSpace(const char *src, int numBytes, int *incompletePtr, char *typePtr); +static int ParseHex(const char *src, int numBytes, + int *resultPtr); /* *---------------------------------------------------------------------- * * TclParseInit -- * - * Initialize the fields of a Tcl_Parse struct. + * Initialize the fields of a Tcl_Parse struct. * * Results: - * None. + * None. * * Side effects: - * The Tcl_Parse struct pointed to by parsePtr gets initialized. + * The Tcl_Parse struct pointed to by parsePtr gets initialized. * *---------------------------------------------------------------------- */ @@ -251,7 +253,7 @@ Tcl_ParseCommand( * command terminator. If zero, then close * bracket has no special meaning. */ register Tcl_Parse *parsePtr) - /* Structure to fill in with information about + /* Structure to fill in with information about * the parsed command; any previous * information in the structure is ignored. */ { @@ -496,9 +498,10 @@ Tcl_ParseCommand( * tokens representing the expanded list. */ - CONST char *listStart; + const char *listStart; int growthNeeded = wordIndex + 2*elemCount - parsePtr->numTokens; + parsePtr->numWords += elemCount - 1; if (growthNeeded > 0) { TclGrowParseTokenArray(parsePtr, growthNeeded); @@ -713,7 +716,7 @@ ParseWhiteSpace( if (p[1] != '\n') { break; } - p+=2; + p += 2; if (--numBytes == 0) { *incompletePtr = 1; break; @@ -761,7 +764,7 @@ TclParseAllWhiteSpace( /* *---------------------------------------------------------------------- * - * TclParseHex -- + * ParseHex -- * * Scans a hexadecimal number as a Tcl_UniChar value (e.g., for parsing * \x and \u escape sequences). At most numBytes bytes are scanned. @@ -781,24 +784,24 @@ TclParseAllWhiteSpace( */ int -TclParseHex( +ParseHex( const char *src, /* First character to parse. */ int numBytes, /* Max number of byes to scan */ - Tcl_UniChar *resultPtr) /* Points to storage provided by caller where - * the Tcl_UniChar resulting from the + int *resultPtr) /* Points to storage provided by caller where + * the character resulting from the * conversion is to be written. */ { - Tcl_UniChar result = 0; + int result = 0; register const char *p = src; while (numBytes--) { unsigned char digit = UCHAR(*p); - if (!isxdigit(digit)) { + if (!isxdigit(digit) || (result > 0x10FFF)) { break; } - ++p; + p++; result <<= 4; if (digit >= 'a') { @@ -823,14 +826,14 @@ TclParseHex( * sequence as defined by Tcl's parsing rules. * * Results: - * Records at readPtr the number of bytes making up the backslash - * sequence. Records at dst the UTF-8 encoded equivalent of that - * backslash sequence. Returns the number of bytes written to dst, at - * most TCL_UTF_MAX. Either readPtr or dst may be NULL, if the results - * are not needed, but the return value is the same either way. + * Records at readPtr the number of bytes making up the backslash + * sequence. Records at dst the UTF-8 encoded equivalent of that + * backslash sequence. Returns the number of bytes written to dst, at + * most TCL_UTF_MAX. Either readPtr or dst may be NULL, if the results + * are not needed, but the return value is the same either way. * * Side effects: - * None. + * None. * *---------------------------------------------------------------------- */ @@ -848,7 +851,8 @@ TclParseBackslash( * written there. */ { register const char *p = src+1; - Tcl_UniChar result; + Tcl_UniChar unichar = 0; + int result; int count; char buf[TCL_UTF_MAX]; @@ -876,7 +880,7 @@ TclParseBackslash( count = 2; switch (*p) { /* - * Note: in the conversions below, use absolute values (e.g., 0xa) + * Note: in the conversions below, use absolute values (e.g., 0xA) * rather than symbolic values (e.g. \n) that get converted by the * compiler. It's possible that compilers on some platforms will do * the symbolic conversions differently, which could result in @@ -890,22 +894,22 @@ TclParseBackslash( result = 0x8; break; case 'f': - result = 0xc; + result = 0xC; break; case 'n': - result = 0xa; + result = 0xA; break; case 'r': - result = 0xd; + result = 0xD; break; case 't': result = 0x9; break; case 'v': - result = 0xb; + result = 0xB; break; case 'x': - count += TclParseHex(p+1, numBytes-2, &result); + count += ParseHex(p+1, numBytes-2, &result); if (count == 2) { /* * No hexadigits -> This is just "x". @@ -920,7 +924,7 @@ TclParseBackslash( } break; case 'u': - count += TclParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result); + count += ParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result); if (count == 2) { /* * No hexadigits -> This is just "u". @@ -928,6 +932,20 @@ TclParseBackslash( result = 'u'; } break; +#if TCL_UTF_MAX > 3 + case 'U': + count += ParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result); + if (count == 2) { + /* + * No hexadigits -> This is just "U". + */ + result = 'U'; + } else if ((result | 0x7FF) == 0xDFFF) { + /* Upper or lower surrogate, not allowed in this syntax. */ + result = 0xFFFD; + } + break; +#endif case '\n': count--; do { @@ -946,21 +964,21 @@ TclParseBackslash( */ if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */ - result = (unsigned char)(*p - '0'); + result = UCHAR(*p - '0'); p++; if ((numBytes == 2) || !isdigit(UCHAR(*p)) /* INTL: digit */ || (UCHAR(*p) >= '8')) { break; } count = 3; - result = (unsigned char)((result << 3) + (*p - '0')); + result = UCHAR((result << 3) + (*p - '0')); p++; if ((numBytes == 3) || !isdigit(UCHAR(*p)) /* INTL: digit */ || (UCHAR(*p) >= '8')) { break; } count = 4; - result = (unsigned char)((result << 3) + (*p - '0')); + result = UCHAR((result << 3) + (*p - '0')); break; } @@ -972,14 +990,15 @@ TclParseBackslash( */ if (Tcl_UtfCharComplete(p, numBytes - 1)) { - count = Tcl_UtfToUniChar(p, &result) + 1; /* +1 for '\' */ + count = Tcl_UtfToUniChar(p, &unichar) + 1; /* +1 for '\' */ } else { char utfBytes[TCL_UTF_MAX]; memcpy(utfBytes, p, (size_t) (numBytes - 1)); utfBytes[numBytes - 1] = '\0'; - count = Tcl_UtfToUniChar(utfBytes, &result) + 1; + count = Tcl_UtfToUniChar(utfBytes, &unichar) + 1; } + result = unichar; break; } @@ -987,7 +1006,7 @@ TclParseBackslash( if (readPtr != NULL) { *readPtr = count; } - return Tcl_UniCharToUtf((int) result, dst); + return Tcl_UniCharToUtf(result, dst); } /* @@ -999,11 +1018,11 @@ TclParseBackslash( * defined by Tcl's parsing rules. * * Results: - * Records in parsePtr information about the parse. Returns the number of - * bytes consumed. + * Records in parsePtr information about the parse. Returns the number of + * bytes consumed. * * Side effects: - * None. + * None. * *---------------------------------------------------------------------- */ @@ -1156,7 +1175,7 @@ ParseTokens( } /* - * This is a variable reference. Call Tcl_ParseVarName to do all + * This is a variable reference. Call Tcl_ParseVarName to do all * the dirty work of parsing the name. */ @@ -1180,7 +1199,7 @@ ParseTokens( } /* - * Command substitution. Call Tcl_ParseCommand recursively (and + * Command substitution. Call Tcl_ParseCommand recursively (and * repeatedly) to parse the nested command(s), then throw away the * parse information. */ @@ -1191,7 +1210,7 @@ ParseTokens( TclStackAlloc(parsePtr->interp, sizeof(Tcl_Parse)); while (1) { const char *curEnd; - + if (Tcl_ParseCommand(parsePtr->interp, src, numBytes, 1, nestedPtr) != TCL_OK) { parsePtr->errorType = nestedPtr->errorType; @@ -1337,7 +1356,7 @@ Tcl_FreeParse( * call to Tcl_ParseCommand. */ { if (parsePtr->tokenPtr != parsePtr->staticTokens) { - ckfree((char *) parsePtr->tokenPtr); + ckfree((char *)parsePtr->tokenPtr); parsePtr->tokenPtr = parsePtr->staticTokens; } } @@ -1661,7 +1680,7 @@ Tcl_ParseBraces( * the string consists of all bytes up to the * first null character. */ register Tcl_Parse *parsePtr, - /* Structure to fill in with information about + /* Structure to fill in with information about * the string. */ int append, /* Non-zero means append tokens to existing * information in parsePtr; zero means ignore @@ -1862,7 +1881,7 @@ Tcl_ParseQuotedString( * the string consists of all bytes up to the * first null character. */ register Tcl_Parse *parsePtr, - /* Structure to fill in with information about + /* Structure to fill in with information about * the string. */ int append, /* Non-zero means append tokens to existing * information in parsePtr; zero means ignore @@ -2167,13 +2186,13 @@ Tcl_SubstObj( * non-TCL_OK completion code arises. * * Results: - * The return value is a standard Tcl completion code. The result in - * interp is the substituted value, or an error message if TCL_ERROR is - * returned. If tokensLeftPtr is not NULL, then it points to an int where - * the number of tokens remaining to be processed is written. + * The return value is a standard Tcl completion code. The result in + * interp is the substituted value, or an error message if TCL_ERROR is + * returned. If tokensLeftPtr is not NULL, then it points to an int where + * the number of tokens remaining to be processed is written. * * Side effects: - * Can be anything, depending on the types of substitution done. + * Can be anything, depending on the types of substitution done. * *---------------------------------------------------------------------- */ @@ -2212,8 +2231,8 @@ TclSubstTokens( int code = TCL_OK; #define NUM_STATIC_POS 20 int isLiteral, maxNumCL, numCL, i, adjust; - int* clPosition = NULL; - Interp* iPtr = (Interp*) interp; + int *clPosition = NULL; + Interp *iPtr = (Interp *) interp; int inFile = iPtr->evalFlags & TCL_EVAL_FILE; /* @@ -2230,24 +2249,24 @@ TclSubstTokens( * For the handling of continuation lines in literals we first check if * this is actually a literal. For if not we can forego the additional * processing. Otherwise we pre-allocate a small table to store the - * locations of all continuation lines we find in this literal, if - * any. The table is extended if needed. + * locations of all continuation lines we find in this literal, if any. + * The table is extended if needed. */ - numCL = 0; - maxNumCL = 0; + numCL = 0; + maxNumCL = 0; isLiteral = 1; for (i=0 ; i < count; i++) { - if ((tokenPtr[i].type != TCL_TOKEN_TEXT) && - (tokenPtr[i].type != TCL_TOKEN_BS)) { + if ((tokenPtr[i].type != TCL_TOKEN_TEXT) + && (tokenPtr[i].type != TCL_TOKEN_BS)) { isLiteral = 0; break; } } if (isLiteral) { - maxNumCL = NUM_STATIC_POS; - clPosition = (int*) ckalloc (maxNumCL*sizeof(int)); + maxNumCL = NUM_STATIC_POS; + clPosition = (int *)ckalloc(maxNumCL * sizeof(int)); } adjust = 0; @@ -2268,6 +2287,7 @@ TclSubstTokens( appendByteLength = TclParseBackslash(tokenPtr->start, tokenPtr->size, NULL, utfCharBytes); append = utfCharBytes; + /* * If the backslash sequence we found is in a literal, and * represented a continuation line, we compute and store its @@ -2287,6 +2307,7 @@ TclSubstTokens( (tokenPtr->start[1] == '\n')) { if (isLiteral) { int clPos; + if (result == 0) { clPos = 0; } else { @@ -2295,13 +2316,13 @@ TclSubstTokens( if (numCL >= maxNumCL) { maxNumCL *= 2; - clPosition = (int*) ckrealloc ((char*)clPosition, + clPosition = (int *)ckrealloc ((char*)clPosition, maxNumCL*sizeof(int)); } clPosition[numCL] = clPos; - numCL ++; + numCL++; } - adjust ++; + adjust++; } break; @@ -2316,8 +2337,9 @@ TclSubstTokens( */ int theline; - TclAdvanceContinuations (&line, &clNextOuter, - tokenPtr->start - outerScript); + + TclAdvanceContinuations(&line, &clNextOuter, + tokenPtr->start - outerScript); theline = line + adjust; /* TIP #280: Transfer line information to nested command */ code = TclEvalEx(interp, tokenPtr->start+1, tokenPtr->size-2, @@ -2326,7 +2348,8 @@ TclSubstTokens( * Restore flag reset by nested eval for future bracketed * commands and their cmdframe setup */ - if (inFile) { + + if (inFile) { iPtr->evalFlags |= TCL_EVAL_FILE; } } @@ -2429,6 +2452,7 @@ TclSubstTokens( if (code != TCL_ERROR) { /* Keep error message in result! */ if (result != NULL) { Tcl_SetObjResult(interp, result); + /* * If the code found continuation lines (which implies that this * word is a literal), then we store the accumulated table of @@ -2447,7 +2471,7 @@ TclSubstTokens( */ if (maxNumCL) { - ckfree ((char*) clPosition); + ckfree((char*) clPosition); } } else { Tcl_ResetResult(interp); diff --git a/tests/encoding.test b/tests/encoding.test index 8722a93..c259327 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -31,6 +31,7 @@ proc runtests {} { # Some tests require the testencoding command testConstraint testencoding [llength [info commands testencoding]] testConstraint exec [llength [info commands exec]] +testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] # TclInitEncodingSubsystem is tested by the rest of this file # TclFinalizeEncodingSubsystem is not currently tested @@ -267,16 +268,16 @@ test encoding-11.6 {LoadEncodingFile: invalid file} {testencoding} { # OpenEncodingFile is fully tested by the rest of the tests in this file. test encoding-12.1 {LoadTableEncoding: normal encoding} { - set x [encoding convertto iso8859-3 \u120] - append x [encoding convertto iso8859-3 \ud5] - append x [encoding convertfrom iso8859-3 \xd5] + set x [encoding convertto iso8859-3 \u0120] + append x [encoding convertto iso8859-3 \xD5] + append x [encoding convertfrom iso8859-3 \xD5] } "\xd5?\u120" test encoding-12.2 {LoadTableEncoding: single-byte encoding} { set x [encoding convertto iso8859-3 ab\u0120g] - append x [encoding convertfrom iso8859-3 ab\xd5g] + append x [encoding convertfrom iso8859-3 ab\xD5g] } "ab\xd5gab\u120g" test encoding-12.3 {LoadTableEncoding: multi-byte encoding} { - set x [encoding convertto shiftjis ab\u4e4eg] + set x [encoding convertto shiftjis ab\u4E4Eg] append x [encoding convertfrom shiftjis ab\x8c\xc1g] } "ab\x8c\xc1gab\u4e4eg" test encoding-12.4 {LoadTableEncoding: double-byte encoding} { @@ -288,6 +289,9 @@ test encoding-12.5 {LoadTableEncoding: symbol encoding} { append x [encoding convertto symbol \u67] append x [encoding convertfrom symbol \x67] } "\x67\x67\u3b3" +test encoding-12.6 {LoadTableEncoding: overflow in char value} fullutf { + encoding convertto iso8859-3 \U01000 +} "?" test encoding-13.1 {LoadEscapeTable} { viewable [set x [encoding convertto iso2022 ab\u4e4e\u68d9g]] |
