Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and utf-32-be). On narrow builds, the codecs combine surrogate pairs in the unicode object into one codepoint on encoding and create surrogate pairs for codepoints outside the BMP on decoding. Lone surrogates are passed through unchanged in all cases. Backport to the trunk will follow.
author: Walter Dörwald <walter@livinglogic.de> 2007-08-16 21:55:45 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2007-08-16 21:55:45 (GMT)
commit: 41980caf644163f1ff74a793b30f1c424eeede82 (patch)
tree: dba1c68090fce4379eced5a27a5b8d4b4f55340c /Lib/test
parent: 066100909ae45e7acd59b2ac81338d3cfcf44384 (diff)
download: cpython-41980caf644163f1ff74a793b30f1c424eeede82.zip
cpython-41980caf644163f1ff74a793b30f1c424eeede82.tar.gz
cpython-41980caf644163f1ff74a793b30f1c424eeede82.tar.bz2
2 files changed, 145 insertions, 2 deletions
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index f76ec65..9b731d5 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase):
 
     def test_longstrings(self):
         # test long strings to check for memory overflow problems
-        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
+                   "backslashreplace"]
         # register the handlers under different names,
         # to prevent the codec from recognizing the name
         for err in errors:
@@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase):
         l = 1000
         errors += [ "test." + err for err in errors ]
         for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
-            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
+                        "utf-8", "utf-7", "utf-16", "utf-32"):
                 for err in errors:
                     try:
                         uni.encode(enc, err)
@@ -812,6 +814,7 @@ class CodecCallbackTest(unittest.TestCase):
             ("utf-7", b"++"),
             ("utf-8",  b"\xff"),
             ("utf-16", b"\xff"),
+            ("utf-32", b"\xff"),
             ("unicode-escape", b"\\u123g"),
             ("raw-unicode-escape", b"\\u123g"),
             ("unicode-internal", b"\xff"),
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 89a3473..f2ee524 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -277,6 +277,143 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling):
         self.assertEqual(reader.readline(), s5)
         self.assertEqual(reader.readline(), "")
 
+class UTF32Test(ReadTest):
+    encoding = "utf-32"
+
+    spamle = (b'\xff\xfe\x00\x00'
+              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
+              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
+    spambe = (b'\x00\x00\xfe\xff'
+              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
+              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
+
+    def test_only_one_bom(self):
+        _,_,reader,writer = codecs.lookup(self.encoding)
+        # encode some stream
+        s = io.BytesIO()
+        f = writer(s)
+        f.write("spam")
+        f.write("spam")
+        d = s.getvalue()
+        # check whether there is exactly one BOM in it
+        self.assert_(d == self.spamle or d == self.spambe)
+        # try to read it back
+        s = io.BytesIO(d)
+        f = reader(s)
+        self.assertEquals(f.read(), "spamspam")
+
+    def test_badbom(self):
+        s = io.BytesIO(4*b"\xff")
+        f = codecs.getreader(self.encoding)(s)
+        self.assertRaises(UnicodeError, f.read)
+
+        s = io.BytesIO(8*b"\xff")
+        f = codecs.getreader(self.encoding)(s)
+        self.assertRaises(UnicodeError, f.read)
+
+    def test_partial(self):
+        self.check_partial(
+            "\x00\xff\u0100\uffff",
+            [
+                "", # first byte of BOM read
+                "", # second byte of BOM read
+                "", # third byte of BOM read
+                "", # fourth byte of BOM read => byteorder known
+                "",
+                "",
+                "",
+                "\x00",
+                "\x00",
+                "\x00",
+                "\x00",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100\uffff",
+            ]
+        )
+
+    def test_errors(self):
+        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
+                          b"\xff", "strict", True)
+
+    def test_decoder_state(self):
+        self.check_state_handling_decode(self.encoding,
+                                         "spamspam", self.spamle)
+        self.check_state_handling_decode(self.encoding,
+                                         "spamspam", self.spambe)
+
+class UTF32LETest(ReadTest):
+    encoding = "utf-32-le"
+
+    def test_partial(self):
+        self.check_partial(
+            "\x00\xff\u0100\uffff",
+            [
+                "",
+                "",
+                "",
+                "\x00",
+                "\x00",
+                "\x00",
+                "\x00",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100\uffff",
+            ]
+        )
+
+    def test_simple(self):
+        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")
+
+    def test_errors(self):
+        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
+                          b"\xff", "strict", True)
+
+class UTF32BETest(ReadTest):
+    encoding = "utf-32-be"
+
+    def test_partial(self):
+        self.check_partial(
+            "\x00\xff\u0100\uffff",
+            [
+                "",
+                "",
+                "",
+                "\x00",
+                "\x00",
+                "\x00",
+                "\x00",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100",
+                "\x00\xff\u0100\uffff",
+            ]
+        )
+
+    def test_simple(self):
+        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")
+
+    def test_errors(self):
+        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
+                          b"\xff", "strict", True)
+
 class UTF16Test(ReadTest):
     encoding = "utf-16"
 
@@ -1284,6 +1421,9 @@ class WithStmtTest(unittest.TestCase):
 
 def test_main():
     test_support.run_unittest(
+        UTF32Test,
+        UTF32LETest,
+        UTF32BETest,
         UTF16Test,
         UTF16LETest,
         UTF16BETest,
author	Walter Dörwald <walter@livinglogic.de>	2007-08-16 21:55:45 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2007-08-16 21:55:45 (GMT)
commit	41980caf644163f1ff74a793b30f1c424eeede82 (patch)
tree	dba1c68090fce4379eced5a27a5b8d4b4f55340c /Lib/test
parent	066100909ae45e7acd59b2ac81338d3cfcf44384 (diff)
download	cpython-41980caf644163f1ff74a793b30f1c424eeede82.zip cpython-41980caf644163f1ff74a793b30f1c424eeede82.tar.gz cpython-41980caf644163f1ff74a793b30f1c424eeede82.tar.bz2