diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2022-08-16 16:26:40 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-08-16 16:26:40 (GMT) |
commit | 2bb363cfcd7563fdd29ac93563f95b8a5205b008 (patch) | |
tree | 06996ca844d715e896eedfd379de3b0f27c7d6c6 | |
parent | f6aa6ebf43edb1be3874e0ce137f315df99eafc8 (diff) | |
download | cpython-2bb363cfcd7563fdd29ac93563f95b8a5205b008.zip cpython-2bb363cfcd7563fdd29ac93563f95b8a5205b008.tar.gz cpython-2bb363cfcd7563fdd29ac93563f95b8a5205b008.tar.bz2 |
[3.11] gh-94823: Improve coverage in tokenizer.c:valid_utf8 (GH-94856) (#96029)
Co-authored-by: Michael Droettboom <mdboom@gmail.com>
-rw-r--r-- | Lib/test/test_source_encoding.py | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index a0375fd..e1b0de2 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -224,6 +224,67 @@ class AbstractSourceEncodingTest: out = self.check_script_output(src, br"'\n\n\n'") +class UTF8ValidatorTest(unittest.TestCase): + @unittest.skipIf(not sys.platform.startswith("linux"), + "Too slow to run on non-Linux platforms") + def test_invalid_utf8(self): + # This is a port of test_utf8_decode_invalid_sequences in + # test_unicode.py to exercise the separate utf8 validator in + # Parser/tokenizer.c used when reading source files. + + # That file is written using low-level C file I/O, so the only way to + # test it is to write actual files to disk. + + # Each example is put inside a string at the top of the file so + # it's an otherwise valid Python source file. + template = b'"%s"\n' + + with tempfile.TemporaryDirectory() as tmpd: + fn = os.path.join(tmpd, 'test.py') + + def check(content): + with open(fn, 'wb') as fp: + fp.write(template % content) + script_helper.assert_python_failure(fn) + + # continuation bytes in a sequence of 2, 3, or 4 bytes + continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] + # start bytes of a 2-byte sequence equivalent to code points < 0x7F + invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] + # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF + invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)] + invalid_start_bytes = ( + continuation_bytes + invalid_2B_seq_start_bytes + + invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)] + ) + + for byte in invalid_start_bytes: + check(byte) + + for sb in invalid_2B_seq_start_bytes: + for cb in continuation_bytes: + check(sb + cb) + + for sb in invalid_4B_seq_start_bytes: + for cb1 in continuation_bytes[:3]: + for cb3 in continuation_bytes[:3]: + check(sb+cb1+b'\x80'+cb3) + + for cb in [bytes([x]) for x in range(0x80, 0xA0)]: + check(b'\xE0'+cb+b'\x80') + check(b'\xE0'+cb+b'\xBF') + # surrogates + for cb in [bytes([x]) for x in range(0xA0, 0xC0)]: + check(b'\xED'+cb+b'\x80') + check(b'\xED'+cb+b'\xBF') + for cb in [bytes([x]) for x in range(0x80, 0x90)]: + check(b'\xF0'+cb+b'\x80\x80') + check(b'\xF0'+cb+b'\xBF\xBF') + for cb in [bytes([x]) for x in range(0x90, 0xC0)]: + check(b'\xF4'+cb+b'\x80\x80') + check(b'\xF4'+cb+b'\xBF\xBF') + + class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): def check_script_output(self, src, expected): |