diff options
Diffstat (limited to 'Lib/test/test_codecs.py')
| -rw-r--r-- | Lib/test/test_codecs.py | 143 | 
1 files changed, 133 insertions, 10 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index b93e0ab..4740b68 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -27,6 +27,7 @@ def coding_checker(self, coder):          self.assertEqual(coder(input), (expect, len(input)))      return check +  class Queue(object):      """      queue: write bytes at one end, read bytes from the other end @@ -47,6 +48,7 @@ class Queue(object):              self._buffer = self._buffer[size:]              return s +  class MixInCheckStateHandling:      def check_state_handling_decode(self, encoding, u, s):          for i in range(len(s)+1): @@ -80,6 +82,7 @@ class MixInCheckStateHandling:              part2 = d.encode(u[i:], True)              self.assertEqual(s, part1+part2) +  class ReadTest(MixInCheckStateHandling):      def check_partial(self, input, partialresults):          # get a StreamReader for the encoding and feed the bytestring version @@ -358,6 +361,12 @@ class ReadTest(MixInCheckStateHandling):          self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),                           "[?]".encode(self.encoding)) +        # sequential surrogate characters +        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), +                         "[]".encode(self.encoding)) +        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), +                         "[??]".encode(self.encoding)) +          bom = "".encode(self.encoding)          for before, after in [("\U00010fff", "A"), ("[", "]"),                                ("A", "\U00010fff")]: @@ -383,6 +392,7 @@ class ReadTest(MixInCheckStateHandling):              self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),                               before + backslashreplace + after) +  class UTF32Test(ReadTest, unittest.TestCase):      encoding = "utf-32"      if sys.byteorder == 'little': @@ -478,6 +488,7 @@ class UTF32Test(ReadTest, unittest.TestCase):          
self.assertEqual('\U00010000' * 1024,                           codecs.utf_32_decode(encoded_be)[0]) +  class UTF32LETest(ReadTest, unittest.TestCase):      encoding = "utf-32-le"      ill_formed_sequence = b"\x80\xdc\x00\x00" @@ -523,6 +534,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):          self.assertEqual('\U00010000' * 1024,                           codecs.utf_32_le_decode(encoded)[0]) +  class UTF32BETest(ReadTest, unittest.TestCase):      encoding = "utf-32-be"      ill_formed_sequence = b"\x00\x00\xdc\x80" @@ -747,6 +759,7 @@ class UTF8Test(ReadTest, unittest.TestCase):      encoding = "utf-8"      ill_formed_sequence = b"\xed\xb2\x80"      ill_formed_sequence_replace = "\ufffd" * 3 +    BOM = b''      def test_partial(self):          self.check_partial( @@ -775,27 +788,49 @@ class UTF8Test(ReadTest, unittest.TestCase):          self.check_state_handling_decode(self.encoding,                                           u, u.encode(self.encoding)) +    def test_decode_error(self): +        for data, error_handler, expected in ( +            (b'[\x80\xff]', 'ignore', '[]'), +            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), +            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), +            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), +        ): +            with self.subTest(data=data, error_handler=error_handler, +                              expected=expected): +                self.assertEqual(data.decode(self.encoding, error_handler), +                                 expected) +      def test_lone_surrogates(self):          super().test_lone_surrogates()          # not sure if this is making sense for          # UTF-16 and UTF-32 -        self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), -                         b'[\x80]') +        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), +                         self.BOM + b'[\x80]') + +        with self.assertRaises(UnicodeEncodeError) 
as cm: +            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") +        exc = cm.exception +        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')      def test_surrogatepass_handler(self): -        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), -                         b"abc\xed\xa0\x80def") -        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), +        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), +                         self.BOM + b"abc\xed\xa0\x80def") +        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), +                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") +        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), +                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') + +        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),                           "abc\ud800def") -        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"), -                         b"\xf0\x90\xbf\xbf\xed\xa0\x80") -        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"), +        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),                           "\U00010fff\uD800") +          self.assertTrue(codecs.lookup_error("surrogatepass"))          with self.assertRaises(UnicodeDecodeError): -            b"abc\xed\xa0".decode("utf-8", "surrogatepass") +            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")          with self.assertRaises(UnicodeDecodeError): -            b"abc\xed\xa0z".decode("utf-8", "surrogatepass") +            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") +  @unittest.skipUnless(sys.platform == 'win32',                       'cp65001 is a Windows-only codec') @@ -1059,6 +1094,7 @@ class ReadBufferTest(unittest.TestCase):  class 
UTF8SigTest(UTF8Test, unittest.TestCase):      encoding = "utf-8-sig" +    BOM = codecs.BOM_UTF8      def test_partial(self):          self.check_partial( @@ -1194,6 +1230,7 @@ class EscapeDecodeTest(unittest.TestCase):          self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))          self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) +  class RecodingTest(unittest.TestCase):      def test_recoding(self):          f = io.BytesIO() @@ -1313,6 +1350,7 @@ for i in punycode_testcases:      if len(i)!=2:          print(repr(i)) +  class PunycodeTest(unittest.TestCase):      def test_encode(self):          for uni, puny in punycode_testcases: @@ -1332,6 +1370,7 @@ class PunycodeTest(unittest.TestCase):              puny = puny.decode("ascii").encode("ascii")              self.assertEqual(uni, puny.decode("punycode")) +  class UnicodeInternalTest(unittest.TestCase):      @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')      def test_bug1251300(self): @@ -1586,6 +1625,7 @@ class NameprepTest(unittest.TestCase):                  except Exception as e:                      raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) +  class IDNACodecTest(unittest.TestCase):      def test_builtin_decode(self):          self.assertEqual(str(b"python.org", "idna"), "python.org") @@ -1672,6 +1712,7 @@ class IDNACodecTest(unittest.TestCase):              self.assertRaises(Exception,                  b"python.org".decode, "idna", errors) +  class CodecsModuleTest(unittest.TestCase):      def test_decode(self): @@ -1780,6 +1821,7 @@ class CodecsModuleTest(unittest.TestCase):              self.assertRaises(UnicodeError,                  codecs.decode, b'abc', 'undefined', errors) +  class StreamReaderTest(unittest.TestCase):      def setUp(self): @@ -1790,6 +1832,7 @@ class StreamReaderTest(unittest.TestCase):          f = self.reader(self.stream)          self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00']) +  class 
EncodedFileTest(unittest.TestCase):      def test_basic(self): @@ -1920,6 +1963,7 @@ broken_unicode_with_stateful = [      "unicode_internal"  ] +  class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):      def test_basics(self):          s = "abc123"  # all codecs should be able to encode these @@ -2082,6 +2126,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):                  self.check_state_handling_decode(encoding, u, u.encode(encoding))                  self.check_state_handling_encode(encoding, u, u.encode(encoding)) +  class CharmapTest(unittest.TestCase):      def test_decode_with_string_map(self):          self.assertEqual( @@ -2332,6 +2377,7 @@ class WithStmtTest(unittest.TestCase):                                         info.streamwriter, 'strict') as srw:              self.assertEqual(srw.read(), "\xfc") +  class TypesTest(unittest.TestCase):      def test_decode_unicode(self):          # Most decoders don't accept unicode input @@ -2622,6 +2668,7 @@ else:      bytes_transform_encodings.append("bz2_codec")      transform_aliases["bz2_codec"] = ["bz2"] +  class TransformCodecTest(unittest.TestCase):      def test_basics(self): @@ -3099,5 +3146,81 @@ class CodePageTest(unittest.TestCase):          self.assertEqual(decoded, ('abc', 3)) +class ASCIITest(unittest.TestCase): +    def test_encode(self): +        self.assertEqual('abc123'.encode('ascii'), b'abc123') + +    def test_encode_error(self): +        for data, error_handler, expected in ( +            ('[\x80\xff\u20ac]', 'ignore', b'[]'), +            ('[\x80\xff\u20ac]', 'replace', b'[???]'), +            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'), +            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace', +             b'[\\x80\\xff\\u20ac\\U000abcde]'), +            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), +        ): +            with self.subTest(data=data, error_handler=error_handler, +                              
expected=expected): +                self.assertEqual(data.encode('ascii', error_handler), +                                 expected) + +    def test_encode_surrogateescape_error(self): +        with self.assertRaises(UnicodeEncodeError): +            # the first character can be decoded, but not the second +            '\udc80\xff'.encode('ascii', 'surrogateescape') + +    def test_decode(self): +        self.assertEqual(b'abc'.decode('ascii'), 'abc') + +    def test_decode_error(self): +        for data, error_handler, expected in ( +            (b'[\x80\xff]', 'ignore', '[]'), +            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), +            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), +            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), +        ): +            with self.subTest(data=data, error_handler=error_handler, +                              expected=expected): +                self.assertEqual(data.decode('ascii', error_handler), +                                 expected) + + +class Latin1Test(unittest.TestCase): +    def test_encode(self): +        for data, expected in ( +            ('abc', b'abc'), +            ('\x80\xe9\xff', b'\x80\xe9\xff'), +        ): +            with self.subTest(data=data, expected=expected): +                self.assertEqual(data.encode('latin1'), expected) + +    def test_encode_errors(self): +        for data, error_handler, expected in ( +            ('[\u20ac\udc80]', 'ignore', b'[]'), +            ('[\u20ac\udc80]', 'replace', b'[??]'), +            ('[\u20ac\U000abcde]', 'backslashreplace', +             b'[\\u20ac\\U000abcde]'), +            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'), +            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), +        ): +            with self.subTest(data=data, error_handler=error_handler, +                              expected=expected): +                self.assertEqual(data.encode('latin1', error_handler), +                            
     expected) + +    def test_encode_surrogateescape_error(self): +        with self.assertRaises(UnicodeEncodeError): +            # the first character can be decoded, but not the second +            '\udc80\u20ac'.encode('latin1', 'surrogateescape') + +    def test_decode(self): +        for data, expected in ( +            (b'abc', 'abc'), +            (b'[\x80\xff]', '[\x80\xff]'), +        ): +            with self.subTest(data=data, expected=expected): +                self.assertEqual(data.decode('latin1'), expected) + +  if __name__ == "__main__":      unittest.main()  | 
