summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Michael Droettboom <mdboom@gmail.com> 2022-09-06 23:12:16 (GMT)
committer: GitHub <noreply@github.com> 2022-09-06 23:12:16 (GMT)
commit: 05692c67c51b78a5a5a7bb61d646519025e38015 (patch)
tree: 1fe2f031c0d77bcfdd09d1acb20a752d176f8047
parent: 67444902a0f10419a557d0a2d3b8675c31b075a9 (diff)
download: cpython-05692c67c51b78a5a5a7bb61d646519025e38015.zip
cpython-05692c67c51b78a5a5a7bb61d646519025e38015.tar.gz
cpython-05692c67c51b78a5a5a7bb61d646519025e38015.tar.bz2
gh-96611: Fix error message for invalid UTF-8 in mid-multiline string (#96623)
-rw-r--r-- Lib/test/test_source_encoding.py 12
-rw-r--r-- Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst 2
-rw-r--r-- Parser/tokenizer.c 2
3 files changed, 16 insertions, 0 deletions
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 8e68b4e..feaff47 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -147,6 +147,18 @@ class MiscSourceEncodingTest(unittest.TestCase):
self.assertTrue(c.exception.args[0].startswith(expected),
msg=c.exception.args[0])
+ def test_file_parse_error_multiline(self):
+ # gh96611:
+ with open(TESTFN, "wb") as fd:
+ fd.write(b'print("""\n\xb1""")\n')
+
+ try:
+ retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN)
+
+ self.assertGreater(retcode, 0)
+ self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr)
+ finally:
+ os.unlink(TESTFN)
class AbstractSourceEncodingTest:
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst
new file mode 100644
index 0000000..08bd409
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst
@@ -0,0 +1,2 @@
+When loading a file with invalid UTF-8 inside a multi-line string, a correct
+SyntaxError is emitted.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index f2606f1..6d08db5 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1936,6 +1936,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
/* Get rest of string */
while (end_quote_size != quote_size) {
c = tok_nextc(tok);
+ if (tok->done == E_DECODE)
+ break;
if (c == EOF || (quote_size == 1 && c == '\n')) {
assert(tok->multi_line_start != NULL);
// shift the tok_state's location into