summaryrefslogtreecommitdiffstats
path: root/Lib/test
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/test')
-rw-r--r--Lib/test/test_codeccallbacks.py7
-rw-r--r--Lib/test/test_codecs.py140
2 files changed, 145 insertions, 2 deletions
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index f76ec65..9b731d5 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase):
def test_longstrings(self):
# test long strings to check for memory overflow problems
- errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+ errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
+ "backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
@@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase):
l = 1000
errors += [ "test." + err for err in errors ]
for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
- for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+ for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
+ "utf-8", "utf-7", "utf-16", "utf-32"):
for err in errors:
try:
uni.encode(enc, err)
@@ -812,6 +814,7 @@ class CodecCallbackTest(unittest.TestCase):
("utf-7", b"++"),
("utf-8", b"\xff"),
("utf-16", b"\xff"),
+ ("utf-32", b"\xff"),
("unicode-escape", b"\\u123g"),
("raw-unicode-escape", b"\\u123g"),
("unicode-internal", b"\xff"),
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 89a3473..f2ee524 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -277,6 +277,143 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling):
self.assertEqual(reader.readline(), s5)
self.assertEqual(reader.readline(), "")
+class UTF32Test(ReadTest):
+ encoding = "utf-32"
+
+ spamle = (b'\xff\xfe\x00\x00'
+ b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
+ b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
+ spambe = (b'\x00\x00\xfe\xff'
+ b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
+ b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
+
+ def test_only_one_bom(self):
+ _,_,reader,writer = codecs.lookup(self.encoding)
+ # encode some stream
+ s = io.BytesIO()
+ f = writer(s)
+ f.write("spam")
+ f.write("spam")
+ d = s.getvalue()
+ # check whether there is exactly one BOM in it
+ self.assert_(d == self.spamle or d == self.spambe)
+ # try to read it back
+ s = io.BytesIO(d)
+ f = reader(s)
+ self.assertEquals(f.read(), "spamspam")
+
+ def test_badbom(self):
+ s = io.BytesIO(4*b"\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ s = io.BytesIO(8*b"\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ def test_partial(self):
+ self.check_partial(
+ "\x00\xff\u0100\uffff",
+ [
+ "", # first byte of BOM read
+ "", # second byte of BOM read
+ "", # third byte of BOM read
+ "", # fourth byte of BOM read => byteorder known
+ "",
+ "",
+ "",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
+ b"\xff", "strict", True)
+
+ def test_decoder_state(self):
+ self.check_state_handling_decode(self.encoding,
+ "spamspam", self.spamle)
+ self.check_state_handling_decode(self.encoding,
+ "spamspam", self.spambe)
+
+class UTF32LETest(ReadTest):
+ encoding = "utf-32-le"
+
+ def test_partial(self):
+ self.check_partial(
+ "\x00\xff\u0100\uffff",
+ [
+ "",
+ "",
+ "",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_simple(self):
+ self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
+ b"\xff", "strict", True)
+
+class UTF32BETest(ReadTest):
+ encoding = "utf-32-be"
+
+ def test_partial(self):
+ self.check_partial(
+ "\x00\xff\u0100\uffff",
+ [
+ "",
+ "",
+ "",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_simple(self):
+ self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
+ b"\xff", "strict", True)
+
class UTF16Test(ReadTest):
encoding = "utf-16"
@@ -1284,6 +1421,9 @@ class WithStmtTest(unittest.TestCase):
def test_main():
test_support.run_unittest(
+ UTF32Test,
+ UTF32LETest,
+ UTF32BETest,
UTF16Test,
UTF16LETest,
UTF16BETest,