diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2022-09-06 23:40:17 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-09-06 23:40:17 (GMT) |
commit | bb0dab5c48de0fabec6b3eb27316394e3b65ee2c (patch) | |
tree | 0d85aa6ab4846f9038e572ceb4665d6ddefa15ad | |
parent | a389fdb0958746c4c4ee8849a71a276516f33776 (diff) | |
download | cpython-bb0dab5c48de0fabec6b3eb27316394e3b65ee2c.zip cpython-bb0dab5c48de0fabec6b3eb27316394e3b65ee2c.tar.gz cpython-bb0dab5c48de0fabec6b3eb27316394e3b65ee2c.tar.bz2 |
gh-96611: Fix error message for invalid UTF-8 in mid-multiline string (GH-96623)
(cherry picked from commit 05692c67c51b78a5a5a7bb61d646519025e38015)
Co-authored-by: Michael Droettboom <mdboom@gmail.com>
-rw-r--r-- | Lib/test/test_source_encoding.py | 12 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst | 2 | ||||
-rw-r--r-- | Parser/tokenizer.c | 2 |
3 files changed, 16 insertions, 0 deletions
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 8d7b573..d37914d 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -148,6 +148,18 @@ class MiscSourceEncodingTest(unittest.TestCase):
         self.assertTrue(c.exception.args[0].startswith(expected),
                         msg=c.exception.args[0])
 
+    def test_file_parse_error_multiline(self):
+        # gh96611:
+        with open(TESTFN, "wb") as fd:
+            fd.write(b'print("""\n\xb1""")\n')
+
+        try:
+            retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN)
+
+            self.assertGreater(retcode, 0)
+            self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr)
+        finally:
+            os.unlink(TESTFN)
 
 class AbstractSourceEncodingTest:
 
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst
new file mode 100644
index 0000000..08bd409
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst
@@ -0,0 +1,2 @@
+When loading a file with invalid UTF-8 inside a multi-line string, a correct
+SyntaxError is emitted.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index b61ac12..b5ebcd0 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1945,6 +1945,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
         /* Get rest of string */
         while (end_quote_size != quote_size) {
             c = tok_nextc(tok);
+            if (tok->done == E_DECODE)
+                break;
             if (c == EOF || (quote_size == 1 && c == '\n')) {
                 assert(tok->multi_line_start != NULL);
                 // shift the tok_state's location into