author    | Lysandros Nikolaou <lisandrosnik@gmail.com> | 2023-10-16 16:59:18 (GMT)
committer | GitHub <noreply@github.com>                 | 2023-10-16 16:59:18 (GMT)
commit    | 3b87e520fcbe4ceb94e3a46a874a450ee9d50b01 (patch)
tree      | 52a87cc4616bd7ebf88f50e1fdb0f88aca662082
parent    | b8e5b1b28ac4855ce6b4d04c777fb35811a82d42 (diff)
download  | cpython-3b87e520fcbe4ceb94e3a46a874a450ee9d50b01.zip
          | cpython-3b87e520fcbe4ceb94e3a46a874a450ee9d50b01.tar.gz
          | cpython-3b87e520fcbe4ceb94e3a46a874a450ee9d50b01.tar.bz2
[3.12] gh-107450: Check for overflow in the tokenizer and fix overflow test (GH-110832) (#110931)
(cherry picked from commit a1ac5590e0f8fe008e5562d22edab65d0c1c5507)
Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Co-authored-by: Filipe Laíns <lains@riseup.net>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
-rw-r--r-- | Include/errcode.h           | 38
-rw-r--r-- | Lib/test/test_exceptions.py | 16
-rw-r--r-- | Parser/pegen_errors.c       | 11
-rw-r--r-- | Parser/tokenizer.c          |  4
4 files changed, 40 insertions, 29 deletions
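For readers skimming the diff below, the core of the change is a guard on the tokenizer's per-line column counter: stop advancing it once it would no longer fit in an int, and record a dedicated error code instead of checking late in the error-reporting path. The following standalone C sketch illustrates that pattern under simplified assumptions; the demo_* struct and function names are hypothetical stand-ins, not CPython's real tok_state / tok_nextc().

/*
 * Minimal sketch of the overflow guard added to the tokenizer fast path.
 * demo_* names are illustrative only.
 */
#include <limits.h>
#include <stdio.h>

enum { DEMO_E_OK = 10, DEMO_E_COLUMNOVERFLOW = 29 };   /* values mirror errcode.h */

struct demo_tok_state {
    const char *cur;   /* next character to consume */
    const char *inp;   /* end of the buffered input */
    int col_offset;    /* column offset on the current line */
    int done;          /* status/error code */
};

/* Return the next character, or EOF once the column counter would overflow. */
static int demo_nextc(struct demo_tok_state *tok)
{
    if (tok->cur != tok->inp) {
        /* Same idea as the patched tok_nextc(): bail out before the increment
           could push col_offset past INT_MAX. */
        if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
            tok->done = DEMO_E_COLUMNOVERFLOW;
            return EOF;
        }
        tok->col_offset++;
        return (unsigned char) *tok->cur++;
    }
    return EOF;   /* the real tokenizer would try to refill its buffer here */
}

int main(void)
{
    char line[] = "x = 1";
    struct demo_tok_state tok = { line, line + sizeof(line) - 1, 0, DEMO_E_OK };
    int c;
    while ((c = demo_nextc(&tok)) != EOF) {
        putchar(c);
    }
    printf("\ncol_offset=%d, done=%d\n", tok.col_offset, tok.done);
    return 0;
}

Compiled and run, this prints the consumed characters and ends with col_offset=5, done=10; on a line longer than INT_MAX characters, done would instead flip to the overflow code.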
diff --git a/Include/errcode.h b/Include/errcode.h
index 54ae929..bd9066b 100644
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -4,7 +4,6 @@
 extern "C" {
 #endif
 
-
 /* Error codes passed around between file input, tokenizer, parser and
    interpreter. This is necessary so we can turn them into Python
    exceptions at a higher level. Note that some errors have a
@@ -13,24 +12,25 @@ extern "C" {
    the parser only returns E_EOF when it hits EOF immediately, and it
    never returns E_OK. */
 
-#define E_OK            10      /* No error */
-#define E_EOF           11      /* End Of File */
-#define E_INTR          12      /* Interrupted */
-#define E_TOKEN         13      /* Bad token */
-#define E_SYNTAX        14      /* Syntax error */
-#define E_NOMEM         15      /* Ran out of memory */
-#define E_DONE          16      /* Parsing complete */
-#define E_ERROR         17      /* Execution error */
-#define E_TABSPACE      18      /* Inconsistent mixing of tabs and spaces */
-#define E_OVERFLOW      19      /* Node had too many children */
-#define E_TOODEEP       20      /* Too many indentation levels */
-#define E_DEDENT        21      /* No matching outer block for dedent */
-#define E_DECODE        22      /* Error in decoding into Unicode */
-#define E_EOFS          23      /* EOF in triple-quoted string */
-#define E_EOLS          24      /* EOL in single-quoted string */
-#define E_LINECONT      25      /* Unexpected characters after a line continuation */
-#define E_BADSINGLE     27      /* Ill-formed single statement input */
-#define E_INTERACT_STOP 28      /* Interactive mode stopped tokenization */
+#define E_OK              10      /* No error */
+#define E_EOF             11      /* End Of File */
+#define E_INTR            12      /* Interrupted */
+#define E_TOKEN           13      /* Bad token */
+#define E_SYNTAX          14      /* Syntax error */
+#define E_NOMEM           15      /* Ran out of memory */
+#define E_DONE            16      /* Parsing complete */
+#define E_ERROR           17      /* Execution error */
+#define E_TABSPACE        18      /* Inconsistent mixing of tabs and spaces */
+#define E_OVERFLOW        19      /* Node had too many children */
+#define E_TOODEEP         20      /* Too many indentation levels */
+#define E_DEDENT          21      /* No matching outer block for dedent */
+#define E_DECODE          22      /* Error in decoding into Unicode */
+#define E_EOFS            23      /* EOF in triple-quoted string */
+#define E_EOLS            24      /* EOL in single-quoted string */
+#define E_LINECONT        25      /* Unexpected characters after a line continuation */
+#define E_BADSINGLE       27      /* Ill-formed single statement input */
+#define E_INTERACT_STOP   28      /* Interactive mode stopped tokenization */
+#define E_COLUMNOVERFLOW  29      /* Column offset overflow */
 
 #ifdef __cplusplus
 }
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index 9de7e73..3049015 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -18,6 +18,12 @@ from test.support.os_helper import TESTFN, unlink
 from test.support.warnings_helper import check_warnings
 from test import support
 
+try:
+    from _testcapi import INT_MAX
+except ImportError:
+    INT_MAX = 2**31 - 1
+
+
 class NaiveException(Exception):
     def __init__(self, x):
@@ -318,11 +324,13 @@ class ExceptionTests(unittest.TestCase):
         check('(yield i) = 2', 1, 2)
         check('def f(*):\n pass', 1, 7)
 
+    @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
     @support.requires_resource('cpu')
-    @support.bigmemtest(support._2G, memuse=1.5)
-    def testMemoryErrorBigSource(self, _size):
-        with self.assertRaises(OverflowError):
-            exec(f"if True:\n {' ' * 2**31}print('hello world')")
+    @support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
+    def testMemoryErrorBigSource(self, size):
+        src = b"if True:\n%*s" % (size, b"pass")
+        with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
+            compile(src, '<fragment>', 'exec')
 
     @cpython_only
     def testSettingException(self):
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 71c4765..6390a66 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -66,6 +66,7 @@ _Pypegen_tokenizer_error(Parser *p)
     const char *msg = NULL;
     PyObject* errtype = PyExc_SyntaxError;
     Py_ssize_t col_offset = -1;
+    p->error_indicator = 1;
     switch (p->tok->done) {
         case E_TOKEN:
             msg = "invalid token";
             break;
@@ -101,6 +102,10 @@ _Pypegen_tokenizer_error(Parser *p)
             msg = "unexpected character after line continuation character";
             break;
         }
+        case E_COLUMNOVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                            "Parser column offset overflow - source line is too big");
+            return -1;
         default:
             msg = "unknown parsing error";
     }
@@ -233,12 +238,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *err
             col_offset = 0;
         } else {
             const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
-            if (p->tok->cur - start > INT_MAX) {
-                PyErr_SetString(PyExc_OverflowError,
-                                "Parser column offset overflow - source line is too big");
-                p->error_indicator = 1;
-                return NULL;
-            }
             col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
         }
     } else {
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index a7786d0..a59b728 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1366,6 +1366,10 @@ tok_nextc(struct tok_state *tok)
     int rc;
     for (;;) {
         if (tok->cur != tok->inp) {
+            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
+                tok->done = E_COLUMNOVERFLOW;
+                return EOF;
+            }
             tok->col_offset++;
             return Py_CHARMASK(*tok->cur++); /* Fast path */
         }
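Taken together: errcode.h gains the E_COLUMNOVERFLOW code, tok_nextc() sets it as soon as col_offset would reach INT_MAX, and _Pypegen_tokenizer_error() translates it into an OverflowError ("Parser column offset overflow - source line is too big") while now also setting p->error_indicator. That makes the earlier ad-hoc INT_MAX check in _PyPegen_raise_error() redundant, so it is removed. The test is reworked to compile() a roughly INT_MAX-byte source line under @support.bigmemtest, and is skipped where INT_MAX >= sys.maxsize, since downcasting the offset to int cannot overflow there.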