diff options
author | Michael Droettboom <mdboom@gmail.com> | 2022-09-07 21:23:54 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-09-07 21:23:54 (GMT) |
commit | 8bc356a7dd50cbdb46d10b8c7e457832431f5d9e (patch) | |
tree | 9426a7c4149963ad5e85964a8a2ac3f6b7bf72cc /Lib | |
parent | 3e26de3c1f24bf0810eaaf7d75a4332775870e78 (diff) | |
download | cpython-8bc356a7dd50cbdb46d10b8c7e457832431f5d9e.zip cpython-8bc356a7dd50cbdb46d10b8c7e457832431f5d9e.tar.gz cpython-8bc356a7dd50cbdb46d10b8c7e457832431f5d9e.tar.bz2 |
gh-96268: Fix loading invalid UTF-8 (#96270)
This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8.
It also fixes an off-by-one error, introduced in 3.10, in the line number reported when the tokenizer encounters invalid UTF-8.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/test/test_source_encoding.py | 13 |
1 file changed, 10 insertions, 3 deletions
```diff
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index feaff47..cfc4b13 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -247,8 +247,10 @@ class UTF8ValidatorTest(unittest.TestCase):
         # test it is to write actual files to disk.
 
         # Each example is put inside a string at the top of the file so
-        # it's an otherwise valid Python source file.
-        template = b'"%s"\n'
+        # it's an otherwise valid Python source file. Put some newlines
+        # beforehand so we can assert that the error is reported on the
+        # correct line.
+        template = b'\n\n\n"%s"\n'
 
         fn = TESTFN
         self.addCleanup(unlink, fn)
@@ -256,7 +258,12 @@ class UTF8ValidatorTest(unittest.TestCase):
         def check(content):
             with open(fn, 'wb') as fp:
                 fp.write(template % content)
-            script_helper.assert_python_failure(fn)
+            rc, stdout, stderr = script_helper.assert_python_failure(fn)
+            # We want to assert that the python subprocess failed gracefully,
+            # not via a signal.
+            self.assertGreaterEqual(rc, 1)
+            self.assertIn(b"Non-UTF-8 code starting with", stderr)
+            self.assertIn(b"on line 4", stderr)
 
         # continuation bytes in a sequence of 2, 3, or 4 bytes
         continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
```