diff options
Diffstat (limited to 'Lib/test')
-rw-r--r-- | Lib/test/test_codeccallbacks.py | 7 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 140 |
2 files changed, 145 insertions, 2 deletions
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index f76ec65..9b731d5 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase): def test_longstrings(self): # test long strings to check for memory overflow problems - errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] + errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", + "backslashreplace"] # register the handlers under different names, # to prevent the codec from recognizing the name for err in errors: @@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase): l = 1000 errors += [ "test." + err for err in errors ] for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]: - for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): + for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", + "utf-8", "utf-7", "utf-16", "utf-32"): for err in errors: try: uni.encode(enc, err) @@ -812,6 +814,7 @@ class CodecCallbackTest(unittest.TestCase): ("utf-7", b"++"), ("utf-8", b"\xff"), ("utf-16", b"\xff"), + ("utf-32", b"\xff"), ("unicode-escape", b"\\u123g"), ("raw-unicode-escape", b"\\u123g"), ("unicode-internal", b"\xff"), diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 89a3473..f2ee524 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -277,6 +277,143 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling): self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), "") +class UTF32Test(ReadTest): + encoding = "utf-32" + + spamle = (b'\xff\xfe\x00\x00' + b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' + b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') + spambe = (b'\x00\x00\xfe\xff' + b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' + b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') + + def test_only_one_bom(self): + _,_,reader,writer = codecs.lookup(self.encoding) + # encode some stream + s = io.BytesIO() + f = writer(s) + f.write("spam") + f.write("spam") + d = s.getvalue() + # check whether there is exactly one BOM in it + self.assert_(d == self.spamle or d == self.spambe) + # try to read it back + s = io.BytesIO(d) + f = reader(s) + self.assertEquals(f.read(), "spamspam") + + def test_badbom(self): + s = io.BytesIO(4*b"\xff") + f = codecs.getreader(self.encoding)(s) + self.assertRaises(UnicodeError, f.read) + + s = io.BytesIO(8*b"\xff") + f = codecs.getreader(self.encoding)(s) + self.assertRaises(UnicodeError, f.read) + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", # first byte of BOM read + "", # second byte of BOM read + "", # third byte of BOM read + "", # fourth byte of BOM read => byteorder known + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, + b"\xff", "strict", True) + + def test_decoder_state(self): + self.check_state_handling_decode(self.encoding, + "spamspam", self.spamle) + self.check_state_handling_decode(self.encoding, + "spamspam", self.spambe) + +class UTF32LETest(ReadTest): + encoding = "utf-32-le" + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_simple(self): + self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, + b"\xff", "strict", True) + +class UTF32BETest(ReadTest): + encoding = "utf-32-be" + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_simple(self): + self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, + b"\xff", "strict", True) + class UTF16Test(ReadTest): encoding = "utf-16" @@ -1284,6 +1421,9 @@ class WithStmtTest(unittest.TestCase): def test_main(): test_support.run_unittest( + UTF32Test, + UTF32LETest, + UTF32BETest, UTF16Test, UTF16LETest, UTF16BETest, |