SF bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain about illegal code points.

The codec now supports PEP 293 style error handlers. (This is a variant of Nik Haldimann's patch that detects truncated data.)
author: Walter Dörwald <walter@livinglogic.de> 2005-08-30 10:23:14 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2005-08-30 10:23:14 (GMT)
commit: a47d1c08d0911f2f49d92b8c6035593a672af436 (patch)
tree: b89cf4f689e9037da807a5e2509d87715d64057f /Lib
parent: 523c9f0709d5e7af4d45817b92cf5ce01609269c (diff)
download: cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.zip
cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.tar.gz
cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.tar.bz2
2 files changed, 82 insertions, 3 deletions
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 8f0d590..f8e59cd 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -111,7 +111,7 @@ class CodecCallbackTest(unittest.TestCase):
             sout += "\\U%08x" % sys.maxunicode
         self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
 
-    def test_relaxedutf8(self):
+    def test_decoderelaxedutf8(self):
         # This is the test for a decoding callback handler,
         # that relaxes the UTF-8 minimal encoding restriction.
         # A null byte that is encoded as "\xc0\x80" will be
@@ -158,6 +158,35 @@ class CodecCallbackTest(unittest.TestCase):
         charmap[ord("?")] = u"XYZ"
         self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
 
+    def test_decodeunicodeinternal(self):
+        self.assertRaises(
+            UnicodeDecodeError,
+            "\x00\x00\x00\x00\x00".decode,
+            "unicode-internal",
+        )
+        if sys.maxunicode > 0xffff:
+            def handler_unicodeinternal(exc):
+                if not isinstance(exc, UnicodeDecodeError):
+                    raise TypeError("don't know how to handle %r" % exc)
+                return (u"\x01", 1)
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
+                u"\u0000"
+            )
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
+                u"\u0000\ufffd"
+            )
+
+            codecs.register_error("test.hui", handler_unicodeinternal)
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
+                u"\u0000\u0001\u0000"
+            )
+
     def test_callbacks(self):
         def handler1(exc):
             if not isinstance(exc, UnicodeEncodeError) \
@@ -503,7 +532,8 @@ class CodecCallbackTest(unittest.TestCase):
             for (enc, bytes) in (
                 ("ascii", "\xff"),
                 ("utf-8", "\xff"),
-                ("utf-7", "+x-")
+                ("utf-7", "+x-"),
+                ("unicode-internal", "\x00"),
             ):
                 self.assertRaises(
                     TypeError,
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 5189e80..a4d58c6 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,7 +1,7 @@
 from test import test_support
 import unittest
 import codecs
-import StringIO
+import sys, StringIO
 
 class Queue(object):
     """
@@ -453,6 +453,54 @@ class PunycodeTest(unittest.TestCase):
         for uni, puny in punycode_testcases:
             self.assertEquals(uni, puny.decode("punycode"))
 
+class UnicodeInternalTest(unittest.TestCase):
+    def test_bug1251300(self):
+        # Decoding with unicode_internal used to not correctly handle "code
+        # points" above 0x10ffff on UCS-4 builds.
+        if sys.maxunicode > 0xffff:
+            ok = [
+                ("\x00\x10\xff\xff", u"\U0010ffff"),
+                ("\x00\x00\x01\x01", u"\U00000101"),
+                ("", u""),
+            ]
+            not_ok = [
+                "\x7f\xff\xff\xff",
+                "\x80\x00\x00\x00",
+                "\x81\x00\x00\x00",
+                "\x00",
+                "\x00\x00\x00\x00\x00",
+            ]
+            for internal, uni in ok:
+                if sys.byteorder == "little":
+                    internal = "".join(reversed(internal))
+                self.assertEquals(uni, internal.decode("unicode_internal"))
+            for internal in not_ok:
+                if sys.byteorder == "little":
+                    internal = "".join(reversed(internal))
+                self.assertRaises(UnicodeDecodeError, internal.decode,
+                    "unicode_internal")
+
+    def test_decode_error_attributes(self):
+        if sys.maxunicode > 0xffff:
+            try:
+                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+            except UnicodeDecodeError, ex:
+                self.assertEquals("unicode_internal", ex.encoding)
+                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
+                self.assertEquals(4, ex.start)
+                self.assertEquals(8, ex.end)
+            else:
+                self.fail()
+
+    def test_decode_callback(self):
+        if sys.maxunicode > 0xffff:
+            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+            decoder = codecs.getdecoder("unicode_internal")
+            ab = u"ab".encode("unicode_internal")
+            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
+                "UnicodeInternalTest")
+            self.assertEquals((u"ab", 12), ignored)
+
 # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
 nameprep_tests = [
     # 3.1 Map to nothing.
@@ -885,6 +933,7 @@ def test_main():
         EscapeDecodeTest,
         RecodingTest,
         PunycodeTest,
+        UnicodeInternalTest,
         NameprepTest,
         CodecTest,
         CodecsModuleTest,
author	Walter Dörwald <walter@livinglogic.de>	2005-08-30 10:23:14 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2005-08-30 10:23:14 (GMT)
commit	a47d1c08d0911f2f49d92b8c6035593a672af436 (patch)
tree	b89cf4f689e9037da807a5e2509d87715d64057f /Lib
parent	523c9f0709d5e7af4d45817b92cf5ce01609269c (diff)
download	cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.zip cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.tar.gz cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.tar.bz2