From 4325a766f5f603ef6dfb8c4d5798e5e73cb5efd5 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 12 Dec 2021 07:06:50 +0000 Subject: bpo-46054: Fix parsing error when parsing non-utf8 characters in source files (GH-30068) --- Lib/test/test_exceptions.py | 12 ++++++++++++ .../2021-12-12-05-30-21.bpo-46054.2P-foG.rst | 2 ++ Parser/tokenizer.c | 13 +++++-------- 3 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index c861d8f..1021026 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -2387,6 +2387,18 @@ class SyntaxErrorTests(unittest.TestCase): finally: unlink(TESTFN) + def test_non_utf8(self): + # Check non utf-8 characters + try: + with open(TESTFN, 'bw') as testfile: + testfile.write(b'\x7fELF\x02\x01\x01\x00\x00\x00') + rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN) + err = err.decode('utf-8').splitlines() + + self.assertEqual(err[-1], "SyntaxError: invalid non-printable character U+007F") + finally: + unlink(TESTFN) + def test_attributes_new_constructor(self): args = ("bad.py", 1, 2, "abcdefg", 1, 100) the_exception = SyntaxError("bad bad", args) diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst b/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst new file mode 100644 index 0000000..6ca91f0 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst @@ -0,0 +1,2 @@ +Fix parser error when parsing non-utf8 characters in source files. Patch by +Pablo Galindo. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 6358cdf..a560572 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -819,10 +819,10 @@ tok_readline_raw(struct tok_state *tok) tok_concatenate_interactive_new_line(tok, line) == -1) { return 0; } - if (*tok->inp == '\0') { + tok->inp = strchr(tok->inp, '\0'); + if (tok->inp == tok->buf) { return 0; } - tok->inp = strchr(tok->inp, '\0'); } while (tok->inp[-1] != '\n'); return 1; } @@ -984,12 +984,9 @@ tok_underflow_file(struct tok_state *tok) { } /* The default encoding is UTF-8, so make sure we don't have any non-UTF-8 sequences in it. */ - if (!tok->encoding - && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) { - if (!ensure_utf8(tok->cur, tok)) { - error_ret(tok); - return 0; - } + if (!tok->encoding && !ensure_utf8(tok->cur, tok)) { + error_ret(tok); + return 0; } assert(tok->done == E_OK); return tok->done == E_OK; -- cgit v0.12