summaryrefslogtreecommitdiffstats
path: root/Lib/test/test_codecs.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r--Lib/test/test_codecs.py401
1 files changed, 352 insertions, 49 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 4899a59..5daaa19 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,8 +1,25 @@
-from test import support
-import unittest
+import _testcapi
import codecs
+import io
import locale
-import sys, _testcapi, io
+import sys
+import unittest
+import warnings
+
+from test import support
+
+if sys.platform == 'win32':
+ VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
+else:
+ VISTA_OR_LATER = False
+
+try:
+ import ctypes
+except ImportError:
+ ctypes = None
+ SIZEOF_WCHAR_T = -1
+else:
+ SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
class Queue(object):
"""
@@ -622,8 +639,113 @@ class UTF8Test(ReadTest):
b"abc\xed\xa0\x80def")
self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
"abc\ud800def")
+ self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
+ b"\xf0\x90\xbf\xbf\xed\xa0\x80")
+ self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
+ "\U00010fff\uD800")
self.assertTrue(codecs.lookup_error("surrogatepass"))
+@unittest.skipUnless(sys.platform == 'win32',
+ 'cp65001 is a Windows-only codec')
+class CP65001Test(ReadTest):
+ encoding = "cp65001"
+
+ def test_encode(self):
+ tests = [
+ ('abc', 'strict', b'abc'),
+ ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
+ ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+ ]
+ if VISTA_OR_LATER:
+ tests.extend((
+ ('\udc80', 'strict', None),
+ ('\udc80', 'ignore', b''),
+ ('\udc80', 'replace', b'?'),
+ ('\udc80', 'backslashreplace', b'\\udc80'),
+ ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
+ ))
+ else:
+ tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
+ for text, errors, expected in tests:
+ if expected is not None:
+ try:
+ encoded = text.encode('cp65001', errors)
+ except UnicodeEncodeError as err:
+ self.fail('Unable to encode %a to cp65001 with '
+ 'errors=%r: %s' % (text, errors, err))
+ self.assertEqual(encoded, expected,
+ '%a.encode("cp65001", %r)=%a != %a'
+ % (text, errors, encoded, expected))
+ else:
+ self.assertRaises(UnicodeEncodeError,
+ text.encode, "cp65001", errors)
+
+ def test_decode(self):
+ tests = [
+ (b'abc', 'strict', 'abc'),
+ (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
+ (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
+ (b'\xef\xbf\xbd', 'strict', '\ufffd'),
+ (b'[\xc3\xa9]', 'strict', '[\xe9]'),
+ # invalid bytes
+ (b'[\xff]', 'strict', None),
+ (b'[\xff]', 'ignore', '[]'),
+ (b'[\xff]', 'replace', '[\ufffd]'),
+ (b'[\xff]', 'surrogateescape', '[\udcff]'),
+ ]
+ if VISTA_OR_LATER:
+ tests.extend((
+ (b'[\xed\xb2\x80]', 'strict', None),
+ (b'[\xed\xb2\x80]', 'ignore', '[]'),
+ (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
+ ))
+ else:
+ tests.extend((
+ (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
+ ))
+ for raw, errors, expected in tests:
+ if expected is not None:
+ try:
+ decoded = raw.decode('cp65001', errors)
+ except UnicodeDecodeError as err:
+ self.fail('Unable to decode %a from cp65001 with '
+ 'errors=%r: %s' % (raw, errors, err))
+ self.assertEqual(decoded, expected,
+ '%a.decode("cp65001", %r)=%a != %a'
+ % (raw, errors, decoded, expected))
+ else:
+ self.assertRaises(UnicodeDecodeError,
+ raw.decode, 'cp65001', errors)
+
+ @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
+ def test_lone_surrogates(self):
+ self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
+ self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
+ self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
+ b'[\\udc80]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
+ b'[�]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
+ b'[\x80]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
+ b'[]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
+ b'[?]')
+
+ @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
+ def test_surrogatepass_handler(self):
+ self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
+ b"abc\xed\xa0\x80def")
+ self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
+ "abc\ud800def")
+ self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
+ b"\xf0\x90\xbf\xbf\xed\xa0\x80")
+ self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
+ "\U00010fff\uD800")
+ self.assertTrue(codecs.lookup_error("surrogatepass"))
+
+
+
class UTF7Test(ReadTest):
encoding = "utf-7"
@@ -884,61 +1006,80 @@ class PunycodeTest(unittest.TestCase):
self.assertEqual(uni, puny.decode("punycode"))
class UnicodeInternalTest(unittest.TestCase):
+ @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_bug1251300(self):
# Decoding with unicode_internal used to not correctly handle "code
# points" above 0x10ffff on UCS-4 builds.
- if sys.maxunicode > 0xffff:
- ok = [
- (b"\x00\x10\xff\xff", "\U0010ffff"),
- (b"\x00\x00\x01\x01", "\U00000101"),
- (b"", ""),
- ]
- not_ok = [
- b"\x7f\xff\xff\xff",
- b"\x80\x00\x00\x00",
- b"\x81\x00\x00\x00",
- b"\x00",
- b"\x00\x00\x00\x00\x00",
- ]
- for internal, uni in ok:
- if sys.byteorder == "little":
- internal = bytes(reversed(internal))
+ ok = [
+ (b"\x00\x10\xff\xff", "\U0010ffff"),
+ (b"\x00\x00\x01\x01", "\U00000101"),
+ (b"", ""),
+ ]
+ not_ok = [
+ b"\x7f\xff\xff\xff",
+ b"\x80\x00\x00\x00",
+ b"\x81\x00\x00\x00",
+ b"\x00",
+ b"\x00\x00\x00\x00\x00",
+ ]
+ for internal, uni in ok:
+ if sys.byteorder == "little":
+ internal = bytes(reversed(internal))
+ with support.check_warnings():
self.assertEqual(uni, internal.decode("unicode_internal"))
- for internal in not_ok:
- if sys.byteorder == "little":
- internal = bytes(reversed(internal))
+ for internal in not_ok:
+ if sys.byteorder == "little":
+ internal = bytes(reversed(internal))
+ with support.check_warnings(('unicode_internal codec has been '
+ 'deprecated', DeprecationWarning)):
self.assertRaises(UnicodeDecodeError, internal.decode,
- "unicode_internal")
-
+ "unicode_internal")
+ if sys.byteorder == "little":
+ invalid = b"\x00\x00\x11\x00"
+ else:
+ invalid = b"\x00\x11\x00\x00"
+ with support.check_warnings():
+ self.assertRaises(UnicodeDecodeError,
+ invalid.decode, "unicode_internal")
+ with support.check_warnings():
+ self.assertEqual(invalid.decode("unicode_internal", "replace"),
+ '\ufffd')
+
+ @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_decode_error_attributes(self):
- if sys.maxunicode > 0xffff:
- try:
+ try:
+ with support.check_warnings(('unicode_internal codec has been '
+ 'deprecated', DeprecationWarning)):
b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
- except UnicodeDecodeError as ex:
- self.assertEqual("unicode_internal", ex.encoding)
- self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
- self.assertEqual(4, ex.start)
- self.assertEqual(8, ex.end)
- else:
- self.fail()
+ except UnicodeDecodeError as ex:
+ self.assertEqual("unicode_internal", ex.encoding)
+ self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
+ self.assertEqual(4, ex.start)
+ self.assertEqual(8, ex.end)
+ else:
+ self.fail()
+ @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_decode_callback(self):
- if sys.maxunicode > 0xffff:
- codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
- decoder = codecs.getdecoder("unicode_internal")
+ codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+ decoder = codecs.getdecoder("unicode_internal")
+ with support.check_warnings(('unicode_internal codec has been '
+ 'deprecated', DeprecationWarning)):
ab = "ab".encode("unicode_internal").decode()
ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
"ascii"),
"UnicodeInternalTest")
- self.assertEqual(("ab", 12), ignored)
+ self.assertEqual(("ab", 12), ignored)
def test_encode_length(self):
- # Issue 3739
- encoder = codecs.getencoder("unicode_internal")
- self.assertEqual(encoder("a")[1], 1)
- self.assertEqual(encoder("\xe9\u0142")[1], 2)
+ with support.check_warnings(('unicode_internal codec has been '
+ 'deprecated', DeprecationWarning)):
+ # Issue 3739
+ encoder = codecs.getencoder("unicode_internal")
+ self.assertEqual(encoder("a")[1], 1)
+ self.assertEqual(encoder("\xe9\u0142")[1], 2)
- self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
+ self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
@@ -1262,7 +1403,7 @@ class EncodedFileTest(unittest.TestCase):
self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')
f = io.BytesIO()
- ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
+ ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
ef.write(b'\xc3\xbc')
self.assertEqual(f.getvalue(), b'\xfc')
@@ -1394,10 +1535,13 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
elif encoding == "latin_1":
name = "latin_1"
self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
- (b, size) = codecs.getencoder(encoding)(s)
- self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
- (chars, size) = codecs.getdecoder(encoding)(b)
- self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
+
+ with support.check_warnings():
+ # unicode-internal has been deprecated
+ (b, size) = codecs.getencoder(encoding)(s)
+ self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
+ (chars, size) = codecs.getdecoder(encoding)(b)
+ self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
if encoding not in broken_unicode_with_streams:
# check stream reader/writer
@@ -1501,7 +1645,9 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_bad_encode_args(self):
for encoding in all_unicode_encodings:
encoder = codecs.getencoder(encoding)
- self.assertRaises(TypeError, encoder)
+ with support.check_warnings():
+ # unicode-internal has been deprecated
+ self.assertRaises(TypeError, encoder)
def test_encoding_map_type_initialized(self):
from encodings import cp1140
@@ -1593,6 +1739,12 @@ class TypesTest(unittest.TestCase):
self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
+ self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
+ self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+
+ self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
+ self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+
class SurrogateEscapeTest(unittest.TestCase):
def test_utf8(self):
@@ -1623,7 +1775,7 @@ class SurrogateEscapeTest(unittest.TestCase):
def test_latin1(self):
# Issue6373
- self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin1", "surrogateescape"),
+ self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
b"\xe4\xeb\xef\xf6\xfc")
@@ -1732,6 +1884,155 @@ class TransformCodecTest(unittest.TestCase):
self.assertEqual(sout, b"\x80")
+@unittest.skipUnless(sys.platform == 'win32',
+ 'code pages are specific to Windows')
+class CodePageTest(unittest.TestCase):
+ # CP_UTF8 is already tested by CP65001Test
+ CP_UTF8 = 65001
+
+ def test_invalid_code_page(self):
+ self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
+ self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
+ self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
+ self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
+
+ def test_code_page_name(self):
+ self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
+ codecs.code_page_encode, 932, '\xff')
+ self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
+ codecs.code_page_decode, 932, b'\x81\x00')
+ self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
+ codecs.code_page_decode, self.CP_UTF8, b'\xff')
+
+ def check_decode(self, cp, tests):
+ for raw, errors, expected in tests:
+ if expected is not None:
+ try:
+ decoded = codecs.code_page_decode(cp, raw, errors)
+ except UnicodeDecodeError as err:
+ self.fail('Unable to decode %a from "cp%s" with '
+ 'errors=%r: %s' % (raw, cp, errors, err))
+ self.assertEqual(decoded[0], expected,
+ '%a.decode("cp%s", %r)=%a != %a'
+ % (raw, cp, errors, decoded[0], expected))
+ # assert 0 <= decoded[1] <= len(raw)
+ self.assertGreaterEqual(decoded[1], 0)
+ self.assertLessEqual(decoded[1], len(raw))
+ else:
+ self.assertRaises(UnicodeDecodeError,
+ codecs.code_page_decode, cp, raw, errors)
+
+ def check_encode(self, cp, tests):
+ for text, errors, expected in tests:
+ if expected is not None:
+ try:
+ encoded = codecs.code_page_encode(cp, text, errors)
+ except UnicodeEncodeError as err:
+ self.fail('Unable to encode %a to "cp%s" with '
+ 'errors=%r: %s' % (text, cp, errors, err))
+ self.assertEqual(encoded[0], expected,
+ '%a.encode("cp%s", %r)=%a != %a'
+ % (text, cp, errors, encoded[0], expected))
+ self.assertEqual(encoded[1], len(text))
+ else:
+ self.assertRaises(UnicodeEncodeError,
+ codecs.code_page_encode, cp, text, errors)
+
+ def test_cp932(self):
+ self.check_encode(932, (
+ ('abc', 'strict', b'abc'),
+ ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
+ # test error handlers
+ ('\xff', 'strict', None),
+ ('[\xff]', 'ignore', b'[]'),
+ ('[\xff]', 'replace', b'[y]'),
+ ('[\u20ac]', 'replace', b'[?]'),
+ ('[\xff]', 'backslashreplace', b'[\\xff]'),
+ ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
+ ))
+ self.check_decode(932, (
+ (b'abc', 'strict', 'abc'),
+ (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
+ # invalid bytes
+ (b'[\xff]', 'strict', None),
+ (b'[\xff]', 'ignore', '[]'),
+ (b'[\xff]', 'replace', '[\ufffd]'),
+ (b'[\xff]', 'surrogateescape', '[\udcff]'),
+ (b'\x81\x00abc', 'strict', None),
+ (b'\x81\x00abc', 'ignore', '\x00abc'),
+ (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
+ ))
+
+ def test_cp1252(self):
+ self.check_encode(1252, (
+ ('abc', 'strict', b'abc'),
+ ('\xe9\u20ac', 'strict', b'\xe9\x80'),
+ ('\xff', 'strict', b'\xff'),
+ ('\u0141', 'strict', None),
+ ('\u0141', 'ignore', b''),
+ ('\u0141', 'replace', b'L'),
+ ))
+ self.check_decode(1252, (
+ (b'abc', 'strict', 'abc'),
+ (b'\xe9\x80', 'strict', '\xe9\u20ac'),
+ (b'\xff', 'strict', '\xff'),
+ ))
+
+ def test_cp_utf7(self):
+ cp = 65000
+ self.check_encode(cp, (
+ ('abc', 'strict', b'abc'),
+ ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
+ ('\U0010ffff', 'strict', b'+2//f/w-'),
+ ('\udc80', 'strict', b'+3IA-'),
+ ('\ufffd', 'strict', b'+//0-'),
+ ))
+ self.check_decode(cp, (
+ (b'abc', 'strict', 'abc'),
+ (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
+ (b'+2//f/w-', 'strict', '\U0010ffff'),
+ (b'+3IA-', 'strict', '\udc80'),
+ (b'+//0-', 'strict', '\ufffd'),
+ # invalid bytes
+ (b'[+/]', 'strict', '[]'),
+ (b'[\xff]', 'strict', '[\xff]'),
+ ))
+
+ def test_multibyte_encoding(self):
+ self.check_decode(932, (
+ (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
+ (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
+ ))
+ self.check_decode(self.CP_UTF8, (
+ (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
+ (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
+ ))
+ if VISTA_OR_LATER:
+ self.check_encode(self.CP_UTF8, (
+ ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+ ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+ ))
+
+ def test_incremental(self):
+ decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
+ self.assertEqual(decoded, ('', 0))
+
+ decoded = codecs.code_page_decode(932,
+ b'\xe9\x80\xe9', 'strict',
+ False)
+ self.assertEqual(decoded, ('\u9a3e', 2))
+
+ decoded = codecs.code_page_decode(932,
+ b'\xe9\x80\xe9\x80', 'strict',
+ False)
+ self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
+
+ decoded = codecs.code_page_decode(932,
+ b'abc', 'strict',
+ False)
+ self.assertEqual(decoded, ('abc', 3))
+
+
def test_main():
support.run_unittest(
UTF32Test,
@@ -1742,6 +2043,7 @@ def test_main():
UTF16BETest,
UTF8Test,
UTF8SigTest,
+ CP65001Test,
UTF7Test,
UTF16ExTest,
ReadBufferTest,
@@ -1760,6 +2062,7 @@ def test_main():
SurrogateEscapeTest,
BomTest,
TransformCodecTest,
+ CodePageTest,
)