diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2015-10-01 19:54:51 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2015-10-01 19:54:51 (GMT) |
commit | 01ada3996bded57d1baf9c54b050fc55907d9b13 (patch) | |
tree | 5f795a91278beb5ae831a75eef3f4b20c5c3a6f8 /Lib/test/test_codecs.py | |
parent | 29a1445136c7353543b516a085c38b8be9ce5109 (diff) | |
download | cpython-01ada3996bded57d1baf9c54b050fc55907d9b13.zip cpython-01ada3996bded57d1baf9c54b050fc55907d9b13.tar.gz cpython-01ada3996bded57d1baf9c54b050fc55907d9b13.tar.bz2 |
Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
handlers: ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
Patch co-written with Serhiy Storchaka.
Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r-- | Lib/test/test_codecs.py | 37 |
1 files changed, 27 insertions, 10 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 254c0c1..4658497 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -361,6 +361,12 @@ class ReadTest(MixInCheckStateHandling): self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), "[?]".encode(self.encoding)) + # sequential surrogate characters + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), + "[]".encode(self.encoding)) + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), + "[??]".encode(self.encoding)) + bom = "".encode(self.encoding) for before, after in [("\U00010fff", "A"), ("[", "]"), ("A", "\U00010fff")]: @@ -753,6 +759,7 @@ class UTF8Test(ReadTest, unittest.TestCase): encoding = "utf-8" ill_formed_sequence = b"\xed\xb2\x80" ill_formed_sequence_replace = "\ufffd" * 3 + BOM = b'' def test_partial(self): self.check_partial( @@ -785,23 +792,32 @@ class UTF8Test(ReadTest, unittest.TestCase): super().test_lone_surrogates() # not sure if this is making sense for # UTF-16 and UTF-32 - self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), - b'[\x80]') + self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), + self.BOM + b'[\x80]') + + with self.assertRaises(UnicodeEncodeError) as cm: + "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") + exc = cm.exception + self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF') def test_surrogatepass_handler(self): - self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), - b"abc\xed\xa0\x80def") - self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), + self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), + self.BOM + b"abc\xed\xa0\x80def") + self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), + self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), + self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') + + self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), "abc\ud800def") - self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"), - b"\xf0\x90\xbf\xbf\xed\xa0\x80") - self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"), + self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"), "\U00010fff\uD800") + self.assertTrue(codecs.lookup_error("surrogatepass")) with self.assertRaises(UnicodeDecodeError): - b"abc\xed\xa0".decode("utf-8", "surrogatepass") + b"abc\xed\xa0".decode(self.encoding, "surrogatepass") with self.assertRaises(UnicodeDecodeError): - b"abc\xed\xa0z".decode("utf-8", "surrogatepass") + b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") @unittest.skipUnless(sys.platform == 'win32', @@ -1008,6 +1024,7 @@ class ReadBufferTest(unittest.TestCase): class UTF8SigTest(UTF8Test, unittest.TestCase): encoding = "utf-8-sig" + BOM = codecs.BOM_UTF8 def test_partial(self): self.check_partial( |