summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLysandros Nikolaou <lisandrosnik@gmail.com>2023-10-16 16:59:18 (GMT)
committerGitHub <noreply@github.com>2023-10-16 16:59:18 (GMT)
commit3b87e520fcbe4ceb94e3a46a874a450ee9d50b01 (patch)
tree52a87cc4616bd7ebf88f50e1fdb0f88aca662082
parentb8e5b1b28ac4855ce6b4d04c777fb35811a82d42 (diff)
downloadcpython-3b87e520fcbe4ceb94e3a46a874a450ee9d50b01.zip
cpython-3b87e520fcbe4ceb94e3a46a874a450ee9d50b01.tar.gz
cpython-3b87e520fcbe4ceb94e3a46a874a450ee9d50b01.tar.bz2
[3.12] gh-107450: Check for overflow in the tokenizer and fix overflow test (GH-110832) (#110931)
(cherry picked from commit a1ac5590e0f8fe008e5562d22edab65d0c1c5507) Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com> Co-authored-by: Filipe Laíns <lains@riseup.net> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
-rw-r--r--Include/errcode.h38
-rw-r--r--Lib/test/test_exceptions.py16
-rw-r--r--Parser/pegen_errors.c11
-rw-r--r--Parser/tokenizer.c4
4 files changed, 40 insertions, 29 deletions
diff --git a/Include/errcode.h b/Include/errcode.h
index 54ae929..bd9066b 100644
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -4,7 +4,6 @@
extern "C" {
#endif
-
/* Error codes passed around between file input, tokenizer, parser and
interpreter. This is necessary so we can turn them into Python
exceptions at a higher level. Note that some errors have a
@@ -13,24 +12,25 @@ extern "C" {
the parser only returns E_EOF when it hits EOF immediately, and it
never returns E_OK. */
-#define E_OK 10 /* No error */
-#define E_EOF 11 /* End Of File */
-#define E_INTR 12 /* Interrupted */
-#define E_TOKEN 13 /* Bad token */
-#define E_SYNTAX 14 /* Syntax error */
-#define E_NOMEM 15 /* Ran out of memory */
-#define E_DONE 16 /* Parsing complete */
-#define E_ERROR 17 /* Execution error */
-#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
-#define E_OVERFLOW 19 /* Node had too many children */
-#define E_TOODEEP 20 /* Too many indentation levels */
-#define E_DEDENT 21 /* No matching outer block for dedent */
-#define E_DECODE 22 /* Error in decoding into Unicode */
-#define E_EOFS 23 /* EOF in triple-quoted string */
-#define E_EOLS 24 /* EOL in single-quoted string */
-#define E_LINECONT 25 /* Unexpected characters after a line continuation */
-#define E_BADSINGLE 27 /* Ill-formed single statement input */
-#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
+#define E_OK 10 /* No error */
+#define E_EOF 11 /* End Of File */
+#define E_INTR 12 /* Interrupted */
+#define E_TOKEN 13 /* Bad token */
+#define E_SYNTAX 14 /* Syntax error */
+#define E_NOMEM 15 /* Ran out of memory */
+#define E_DONE 16 /* Parsing complete */
+#define E_ERROR 17 /* Execution error */
+#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
+#define E_OVERFLOW 19 /* Node had too many children */
+#define E_TOODEEP 20 /* Too many indentation levels */
+#define E_DEDENT 21 /* No matching outer block for dedent */
+#define E_DECODE 22 /* Error in decoding into Unicode */
+#define E_EOFS 23 /* EOF in triple-quoted string */
+#define E_EOLS 24 /* EOL in single-quoted string */
+#define E_LINECONT 25 /* Unexpected characters after a line continuation */
+#define E_BADSINGLE 27 /* Ill-formed single statement input */
+#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
+#define E_COLUMNOVERFLOW 29 /* Column offset overflow */
#ifdef __cplusplus
}
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index 9de7e73..3049015 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -18,6 +18,12 @@ from test.support.os_helper import TESTFN, unlink
from test.support.warnings_helper import check_warnings
from test import support
+try:
+ from _testcapi import INT_MAX
+except ImportError:
+ INT_MAX = 2**31 - 1
+
+
class NaiveException(Exception):
def __init__(self, x):
@@ -318,11 +324,13 @@ class ExceptionTests(unittest.TestCase):
check('(yield i) = 2', 1, 2)
check('def f(*):\n pass', 1, 7)
+ @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
@support.requires_resource('cpu')
- @support.bigmemtest(support._2G, memuse=1.5)
- def testMemoryErrorBigSource(self, _size):
- with self.assertRaises(OverflowError):
- exec(f"if True:\n {' ' * 2**31}print('hello world')")
+ @support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
+ def testMemoryErrorBigSource(self, size):
+ src = b"if True:\n%*s" % (size, b"pass")
+ with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
+ compile(src, '<fragment>', 'exec')
@cpython_only
def testSettingException(self):
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 71c4765..6390a66 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -66,6 +66,7 @@ _Pypegen_tokenizer_error(Parser *p)
const char *msg = NULL;
PyObject* errtype = PyExc_SyntaxError;
Py_ssize_t col_offset = -1;
+ p->error_indicator = 1;
switch (p->tok->done) {
case E_TOKEN:
msg = "invalid token";
@@ -101,6 +102,10 @@ _Pypegen_tokenizer_error(Parser *p)
msg = "unexpected character after line continuation character";
break;
}
+ case E_COLUMNOVERFLOW:
+ PyErr_SetString(PyExc_OverflowError,
+ "Parser column offset overflow - source line is too big");
+ return -1;
default:
msg = "unknown parsing error";
}
@@ -233,12 +238,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *err
col_offset = 0;
} else {
const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
- if (p->tok->cur - start > INT_MAX) {
- PyErr_SetString(PyExc_OverflowError,
- "Parser column offset overflow - source line is too big");
- p->error_indicator = 1;
- return NULL;
- }
col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
}
} else {
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index a7786d0..a59b728 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1366,6 +1366,10 @@ tok_nextc(struct tok_state *tok)
int rc;
for (;;) {
if (tok->cur != tok->inp) {
+ if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
+ tok->done = E_COLUMNOVERFLOW;
+ return EOF;
+ }
tok->col_offset++;
return Py_CHARMASK(*tok->cur++); /* Fast path */
}