diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
commit | 58cf607d13c178f41aed05458296b68e985c5fff (patch) | |
tree | d9a39a30200eef16fec17f0ed934186e8e864149 /Lib | |
parent | a938bcfe952975cd117994acfef3712d61221f20 (diff) | |
download | cpython-58cf607d13c178f41aed05458296b68e985c5fff.zip cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.gz cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.bz2 |
Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points
(U+D800-U+DFFF) to be encoded.
The utf-32* decoders no longer decode byte sequences that correspond to
surrogate code points.
The surrogatepass error handler now works with the utf-16* and utf-32* codecs.
Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/test/test_codecs.py | 68 |
1 files changed, 56 insertions, 12 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 55becf4..31bd089 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -300,8 +300,46 @@ class ReadTest(MixInCheckStateHandling): self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), "") + ill_formed_sequence_replace = "\ufffd" + + def test_lone_surrogates(self): + self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) + self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), + "[\\udc80]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), + "[�]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), + "[]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), + "[?]".encode(self.encoding)) + + bom = "".encode(self.encoding) + for before, after in [("\U00010fff", "A"), ("[", "]"), + ("A", "\U00010fff")]: + before_sequence = before.encode(self.encoding)[len(bom):] + after_sequence = after.encode(self.encoding)[len(bom):] + test_string = before + "\uDC80" + after + test_sequence = (bom + before_sequence + + self.ill_formed_sequence + after_sequence) + self.assertRaises(UnicodeDecodeError, test_sequence.decode, + self.encoding) + self.assertEqual(test_string.encode(self.encoding, + "surrogatepass"), + test_sequence) + self.assertEqual(test_sequence.decode(self.encoding, + "surrogatepass"), + test_string) + self.assertEqual(test_sequence.decode(self.encoding, "ignore"), + before + after) + self.assertEqual(test_sequence.decode(self.encoding, "replace"), + before + self.ill_formed_sequence_replace + after) + class UTF32Test(ReadTest, unittest.TestCase): encoding = "utf-32" + if sys.byteorder == 'little': + ill_formed_sequence = b"\x80\xdc\x00\x00" + else: + ill_formed_sequence = b"\x00\x00\xdc\x80" spamle = (b'\xff\xfe\x00\x00' b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' @@ -393,6 +431,7 @@ class UTF32Test(ReadTest, unittest.TestCase): class UTF32LETest(ReadTest, unittest.TestCase): encoding = "utf-32-le" + ill_formed_sequence = b"\x80\xdc\x00\x00" def test_partial(self): self.check_partial( @@ -437,6 +476,7 @@ class UTF32LETest(ReadTest, unittest.TestCase): class UTF32BETest(ReadTest, unittest.TestCase): encoding = "utf-32-be" + ill_formed_sequence = b"\x00\x00\xdc\x80" def test_partial(self): self.check_partial( @@ -482,6 +522,10 @@ class UTF32BETest(ReadTest, unittest.TestCase): class UTF16Test(ReadTest, unittest.TestCase): encoding = "utf-16" + if sys.byteorder == 'little': + ill_formed_sequence = b"\x80\xdc" + else: + ill_formed_sequence = b"\xdc\x80" spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' @@ -562,6 +606,7 @@ class UTF16Test(ReadTest, unittest.TestCase): class UTF16LETest(ReadTest, unittest.TestCase): encoding = "utf-16-le" + ill_formed_sequence = b"\x80\xdc" def test_partial(self): self.check_partial( @@ -605,6 +650,7 @@ class UTF16LETest(ReadTest, unittest.TestCase): class UTF16BETest(ReadTest, unittest.TestCase): encoding = "utf-16-be" + ill_formed_sequence = b"\xdc\x80" def test_partial(self): self.check_partial( @@ -648,6 +694,8 @@ class UTF16BETest(ReadTest, unittest.TestCase): class UTF8Test(ReadTest, unittest.TestCase): encoding = "utf-8" + ill_formed_sequence = b"\xed\xb2\x80" + ill_formed_sequence_replace = "\ufffd" * 3 def test_partial(self): self.check_partial( @@ -677,18 +725,11 @@ class UTF8Test(ReadTest, unittest.TestCase): u, u.encode(self.encoding)) def test_lone_surrogates(self): - self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") - self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") - self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), - b'[\\udc80]') - self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"), - b'[�]') - self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"), + super().test_lone_surrogates() + # not sure if this is making sense for + # UTF-16 and UTF-32 + self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), b'[\x80]') - self.assertEqual("[\uDC80]".encode("utf-8", "ignore"), - b'[]') - self.assertEqual("[\uDC80]".encode("utf-8", "replace"), - b'[?]') def test_surrogatepass_handler(self): self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), @@ -851,6 +892,9 @@ class UTF7Test(ReadTest, unittest.TestCase): self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') + test_lone_surrogates = None + + class UTF16ExTest(unittest.TestCase): def test_errors(self): @@ -875,7 +919,7 @@ class ReadBufferTest(unittest.TestCase): self.assertRaises(TypeError, codecs.readbuffer_encode) self.assertRaises(TypeError, codecs.readbuffer_encode, 42) -class UTF8SigTest(ReadTest, unittest.TestCase): +class UTF8SigTest(UTF8Test, unittest.TestCase): encoding = "utf-8-sig" def test_partial(self): |