Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r--  Lib/test/test_codecs.py  |  520
1 file changed, 486 insertions(+), 34 deletions(-)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index e412a64..43886fc 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,8 +1,14 @@
from test import support
import unittest
import codecs
+import locale
import sys, _testcapi, io
+def coding_checker(self, coder):
+ def check(input, expect):
+ self.assertEqual(coder(input), (expect, len(input)))
+ return check
+
class Queue(object):
"""
queue: write bytes at one end, read bytes from the other end
@@ -29,7 +35,7 @@ class MixInCheckStateHandling:
d = codecs.getincrementaldecoder(encoding)()
part1 = d.decode(s[:i])
state = d.getstate()
- self.assertTrue(isinstance(state[1], int))
+ self.assertIsInstance(state[1], int)
# Check that the condition stated in the documentation for
# IncrementalDecoder.getstate() holds
if not state[1]:
@@ -72,7 +78,6 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling):
# check that there's nothing left in the buffers
self.assertEqual(r.read(), "")
self.assertEqual(r.bytebuffer, b"")
- self.assertEqual(r.charbuffer, "")
 # do the check again, this time using an incremental decoder
d = codecs.getincrementaldecoder(self.encoding)()
@@ -313,7 +318,7 @@ class UTF32Test(ReadTest):
def test_partial(self):
self.check_partial(
- "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
[
"", # first byte of BOM read
"", # second byte of BOM read
@@ -335,6 +340,10 @@ class UTF32Test(ReadTest):
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
]
)
@@ -369,7 +378,7 @@ class UTF32LETest(ReadTest):
def test_partial(self):
self.check_partial(
- "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
[
"",
"",
@@ -387,6 +396,10 @@ class UTF32LETest(ReadTest):
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
]
)
@@ -409,7 +422,7 @@ class UTF32BETest(ReadTest):
def test_partial(self):
self.check_partial(
- "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
[
"",
"",
@@ -427,6 +440,10 @@ class UTF32BETest(ReadTest):
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
]
)
@@ -477,7 +494,7 @@ class UTF16Test(ReadTest):
def test_partial(self):
self.check_partial(
- "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
[
"", # first byte of BOM read
"", # second byte of BOM read => byteorder known
@@ -489,6 +506,10 @@ class UTF16Test(ReadTest):
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
]
)
@@ -526,7 +547,7 @@ class UTF16LETest(ReadTest):
def test_partial(self):
self.check_partial(
- "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
[
"",
"\x00",
@@ -536,19 +557,40 @@ class UTF16LETest(ReadTest):
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
]
)
def test_errors(self):
- self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
- b"\xff", "strict", True)
+ tests = [
+ (b'\xff', '\ufffd'),
+ (b'A\x00Z', 'A\ufffd'),
+ (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
+ (b'\x00\xd8', '\ufffd'),
+ (b'\x00\xd8A', '\ufffd'),
+ (b'\x00\xd8A\x00', '\ufffdA'),
+ (b'\x00\xdcA\x00', '\ufffdA'),
+ ]
+ for raw, expected in tests:
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
+ raw, 'strict', True)
+ self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
+
+ def test_nonbmp(self):
+ self.assertEqual("\U00010203".encode(self.encoding),
+ b'\x00\xd8\x03\xde')
+ self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
+ "\U00010203")
class UTF16BETest(ReadTest):
encoding = "utf-16-be"
def test_partial(self):
self.check_partial(
- "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
[
"",
"\x00",
@@ -558,19 +600,40 @@ class UTF16BETest(ReadTest):
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff",
+ "\x00\xff\u0100\uffff\U00010000",
]
)
def test_errors(self):
- self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
- b"\xff", "strict", True)
+ tests = [
+ (b'\xff', '\ufffd'),
+ (b'\x00A\xff', 'A\ufffd'),
+ (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
+ (b'\xd8\x00', '\ufffd'),
+ (b'\xd8\x00\xdc', '\ufffd'),
+ (b'\xd8\x00\x00A', '\ufffdA'),
+ (b'\xdc\x00\x00A', '\ufffdA'),
+ ]
+ for raw, expected in tests:
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
+ raw, 'strict', True)
+ self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
+
+ def test_nonbmp(self):
+ self.assertEqual("\U00010203".encode(self.encoding),
+ b'\xd8\x00\xde\x03')
+ self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
+ "\U00010203")
class UTF8Test(ReadTest):
encoding = "utf-8"
def test_partial(self):
self.check_partial(
- "\x00\xff\u07ff\u0800\uffff",
+ "\x00\xff\u07ff\u0800\uffff\U00010000",
[
"\x00",
"\x00",
@@ -583,6 +646,10 @@ class UTF8Test(ReadTest):
"\x00\xff\u07ff\u0800",
"\x00\xff\u07ff\u0800",
"\x00\xff\u07ff\u0800\uffff",
+ "\x00\xff\u07ff\u0800\uffff",
+ "\x00\xff\u07ff\u0800\uffff",
+ "\x00\xff\u07ff\u0800\uffff",
+ "\x00\xff\u07ff\u0800\uffff\U00010000",
]
)
@@ -611,6 +678,10 @@ class UTF8Test(ReadTest):
self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
"abc\ud800def")
self.assertTrue(codecs.lookup_error("surrogatepass"))
+ with self.assertRaises(UnicodeDecodeError):
+ b"abc\xed\xa0".decode("utf-8", "surrogatepass")
+ with self.assertRaises(UnicodeDecodeError):
+ b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
class UTF7Test(ReadTest):
encoding = "utf-7"
@@ -651,24 +722,12 @@ class ReadBufferTest(unittest.TestCase):
self.assertRaises(TypeError, codecs.readbuffer_encode)
self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
-class CharBufferTest(unittest.TestCase):
-
- def test_string(self):
- self.assertEqual(codecs.charbuffer_encode(b"spam"), (b"spam", 4))
-
- def test_empty(self):
- self.assertEqual(codecs.charbuffer_encode(b""), (b"", 0))
-
- def test_bad_args(self):
- self.assertRaises(TypeError, codecs.charbuffer_encode)
- self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
-
class UTF8SigTest(ReadTest):
encoding = "utf-8-sig"
def test_partial(self):
self.check_partial(
- "\ufeff\x00\xff\u07ff\u0800\uffff",
+ "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
[
"",
"",
@@ -687,6 +746,10 @@ class UTF8SigTest(ReadTest):
"\ufeff\x00\xff\u07ff\u0800",
"\ufeff\x00\xff\u07ff\u0800",
"\ufeff\x00\xff\u07ff\u0800\uffff",
+ "\ufeff\x00\xff\u07ff\u0800\uffff",
+ "\ufeff\x00\xff\u07ff\u0800\uffff",
+ "\ufeff\x00\xff\u07ff\u0800\uffff",
+ "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
]
)
@@ -745,7 +808,55 @@ class UTF8SigTest(ReadTest):
class EscapeDecodeTest(unittest.TestCase):
def test_empty(self):
- self.assertEqual(codecs.escape_decode(""), ("", 0))
+ self.assertEqual(codecs.escape_decode(b""), (b"", 0))
+
+ def test_raw(self):
+ decode = codecs.escape_decode
+ for b in range(256):
+ b = bytes([b])
+ if b != b'\\':
+ self.assertEqual(decode(b + b'0'), (b + b'0', 2))
+
+ def test_escape(self):
+ decode = codecs.escape_decode
+ check = coding_checker(self, decode)
+ check(b"[\\\n]", b"[]")
+ check(br'[\"]', b'["]')
+ check(br"[\']", b"[']")
+ check(br"[\\]", br"[\]")
+ check(br"[\a]", b"[\x07]")
+ check(br"[\b]", b"[\x08]")
+ check(br"[\t]", b"[\x09]")
+ check(br"[\n]", b"[\x0a]")
+ check(br"[\v]", b"[\x0b]")
+ check(br"[\f]", b"[\x0c]")
+ check(br"[\r]", b"[\x0d]")
+ check(br"[\7]", b"[\x07]")
+ check(br"[\8]", br"[\8]")
+ check(br"[\78]", b"[\x078]")
+ check(br"[\41]", b"[!]")
+ check(br"[\418]", b"[!8]")
+ check(br"[\101]", b"[A]")
+ check(br"[\1010]", b"[A0]")
+ check(br"[\501]", b"[A]")
+ check(br"[\x41]", b"[A]")
+ check(br"[\X41]", br"[\X41]")
+ check(br"[\x410]", b"[A0]")
+ for b in range(256):
+ if b not in b'\n"\'\\abtnvfr01234567x':
+ b = bytes([b])
+ check(b'\\' + b, b'\\' + b)
+
+ def test_errors(self):
+ decode = codecs.escape_decode
+ self.assertRaises(ValueError, decode, br"\x")
+ self.assertRaises(ValueError, decode, br"[\x]")
+ self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
+ self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
+ self.assertRaises(ValueError, decode, br"\x0")
+ self.assertRaises(ValueError, decode, br"[\x0]")
+ self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
+ self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
class RecodingTest(unittest.TestCase):
def test_recoding(self):
@@ -1231,6 +1342,19 @@ class CodecsModuleTest(unittest.TestCase):
self.assertRaises(TypeError, codecs.getwriter)
self.assertRaises(LookupError, codecs.getwriter, "__spam__")
+ def test_lookup_issue1813(self):
+ # Issue #1813: under Turkish locales, lookup of some codecs failed
+ # because 'I' is lowercased as "ı" (dotless i)
+ oldlocale = locale.setlocale(locale.LC_CTYPE)
+ self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+ try:
+ locale.setlocale(locale.LC_CTYPE, 'tr_TR')
+ except locale.Error:
+ # Unsupported locale on this system
+ self.skipTest('test needs Turkish locale')
+ c = codecs.lookup('ASCII')
+ self.assertEqual(c.name, 'ascii')
+
class StreamReaderTest(unittest.TestCase):
def setUp(self):
@@ -1274,6 +1398,7 @@ all_unicode_encodings = [
"cp424",
"cp437",
"cp500",
+ "cp720",
"cp737",
"cp775",
"cp850",
@@ -1281,6 +1406,7 @@ all_unicode_encodings = [
"cp855",
"cp856",
"cp857",
+ "cp858",
"cp860",
"cp861",
"cp862",
@@ -1369,11 +1495,6 @@ broken_incremental_coders = broken_unicode_with_streams + [
"idna",
]
-# The following encodings only support "strict" mode
-only_strict_mode = [
- "idna",
-]
-
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
s = "abc123" # all codecs should be able to encode these
@@ -1448,7 +1569,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
self.assertEqual(result, "")
- if encoding not in only_strict_mode:
+ if encoding not in ("idna", "mbcs"):
# check incremental decoder/encoder with errors argument
try:
encoder = codecs.getincrementalencoder(encoding)("ignore")
@@ -1514,6 +1635,14 @@ class CharmapTest(unittest.TestCase):
("abc", 3)
)
+ self.assertRaises(UnicodeDecodeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
+ )
+
+ self.assertRaises(UnicodeDecodeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
+ )
+
self.assertEqual(
codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
("ab\ufffd", 3)
@@ -1540,6 +1669,149 @@ class CharmapTest(unittest.TestCase):
("", len(allbytes))
)
+ def test_decode_with_int2str_map(self):
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "strict",
+ {0: 'a', 1: 'b', 2: 'c'}),
+ ("abc", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "strict",
+ {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
+ ("AaBbCc", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "strict",
+ {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
+ ("\U0010FFFFbc", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "strict",
+ {0: 'a', 1: 'b', 2: ''}),
+ ("ab", 3)
+ )
+
+ self.assertRaises(UnicodeDecodeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict",
+ {0: 'a', 1: 'b'}
+ )
+
+ self.assertRaises(UnicodeDecodeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict",
+ {0: 'a', 1: 'b', 2: None}
+ )
+
+ # Issue #14850
+ self.assertRaises(UnicodeDecodeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict",
+ {0: 'a', 1: 'b', 2: '\ufffe'}
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "replace",
+ {0: 'a', 1: 'b'}),
+ ("ab\ufffd", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "replace",
+ {0: 'a', 1: 'b', 2: None}),
+ ("ab\ufffd", 3)
+ )
+
+ # Issue #14850
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "replace",
+ {0: 'a', 1: 'b', 2: '\ufffe'}),
+ ("ab\ufffd", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "ignore",
+ {0: 'a', 1: 'b'}),
+ ("ab", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "ignore",
+ {0: 'a', 1: 'b', 2: None}),
+ ("ab", 3)
+ )
+
+ # Issue #14850
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "ignore",
+ {0: 'a', 1: 'b', 2: '\ufffe'}),
+ ("ab", 3)
+ )
+
+ allbytes = bytes(range(256))
+ self.assertEqual(
+ codecs.charmap_decode(allbytes, "ignore", {}),
+ ("", len(allbytes))
+ )
+
+ def test_decode_with_int2int_map(self):
+ a = ord('a')
+ b = ord('b')
+ c = ord('c')
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "strict",
+ {0: a, 1: b, 2: c}),
+ ("abc", 3)
+ )
+
+ # Issue #15379
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "strict",
+ {0: 0x10FFFF, 1: b, 2: c}),
+ ("\U0010FFFFbc", 3)
+ )
+
+ self.assertRaises(TypeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict",
+ {0: 0x110000, 1: b, 2: c}
+ )
+
+ self.assertRaises(UnicodeDecodeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict",
+ {0: a, 1: b},
+ )
+
+ self.assertRaises(UnicodeDecodeError,
+ codecs.charmap_decode, b"\x00\x01\x02", "strict",
+ {0: a, 1: b, 2: 0xFFFE},
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "replace",
+ {0: a, 1: b}),
+ ("ab\ufffd", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "replace",
+ {0: a, 1: b, 2: 0xFFFE}),
+ ("ab\ufffd", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "ignore",
+ {0: a, 1: b}),
+ ("ab", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "ignore",
+ {0: a, 1: b, 2: 0xFFFE}),
+ ("ab", 3)
+ )
+
+
class WithStmtTest(unittest.TestCase):
def test_encodedfile(self):
f = io.BytesIO(b"\xc3\xbc")
@@ -1583,6 +1855,135 @@ class TypesTest(unittest.TestCase):
self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
+
+class UnicodeEscapeTest(unittest.TestCase):
+ def test_empty(self):
+ self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
+ self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
+
+ def test_raw_encode(self):
+ encode = codecs.unicode_escape_encode
+ for b in range(32, 127):
+ if b != b'\\'[0]:
+ self.assertEqual(encode(chr(b)), (bytes([b]), 1))
+
+ def test_raw_decode(self):
+ decode = codecs.unicode_escape_decode
+ for b in range(256):
+ if b != b'\\'[0]:
+ self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
+
+ def test_escape_encode(self):
+ encode = codecs.unicode_escape_encode
+ check = coding_checker(self, encode)
+ check('\t', br'\t')
+ check('\n', br'\n')
+ check('\r', br'\r')
+ check('\\', br'\\')
+ for b in range(32):
+ if chr(b) not in '\t\n\r':
+ check(chr(b), ('\\x%02x' % b).encode())
+ for b in range(127, 256):
+ check(chr(b), ('\\x%02x' % b).encode())
+ check('\u20ac', br'\u20ac')
+ check('\U0001d120', br'\U0001d120')
+
+ def test_escape_decode(self):
+ decode = codecs.unicode_escape_decode
+ check = coding_checker(self, decode)
+ check(b"[\\\n]", "[]")
+ check(br'[\"]', '["]')
+ check(br"[\']", "[']")
+ check(br"[\\]", r"[\]")
+ check(br"[\a]", "[\x07]")
+ check(br"[\b]", "[\x08]")
+ check(br"[\t]", "[\x09]")
+ check(br"[\n]", "[\x0a]")
+ check(br"[\v]", "[\x0b]")
+ check(br"[\f]", "[\x0c]")
+ check(br"[\r]", "[\x0d]")
+ check(br"[\7]", "[\x07]")
+ check(br"[\8]", r"[\8]")
+ check(br"[\78]", "[\x078]")
+ check(br"[\41]", "[!]")
+ check(br"[\418]", "[!8]")
+ check(br"[\101]", "[A]")
+ check(br"[\1010]", "[A0]")
+ check(br"[\x41]", "[A]")
+ check(br"[\x410]", "[A0]")
+ check(br"\u20ac", "\u20ac")
+ check(br"\U0001d120", "\U0001d120")
+ for b in range(256):
+ if b not in b'\n"\'\\abtnvfr01234567xuUN':
+ check(b'\\' + bytes([b]), '\\' + chr(b))
+
+ def test_decode_errors(self):
+ decode = codecs.unicode_escape_decode
+ for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
+ for i in range(d):
+ self.assertRaises(UnicodeDecodeError, decode,
+ b"\\" + c + b"0"*i)
+ self.assertRaises(UnicodeDecodeError, decode,
+ b"[\\" + c + b"0"*i + b"]")
+ data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
+ self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
+ self.assertEqual(decode(data, "replace"),
+ ("[\ufffd]\ufffd", len(data)))
+ self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
+ self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
+ self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
+
+
+class RawUnicodeEscapeTest(unittest.TestCase):
+ def test_empty(self):
+ self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
+ self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
+
+ def test_raw_encode(self):
+ encode = codecs.raw_unicode_escape_encode
+ for b in range(256):
+ self.assertEqual(encode(chr(b)), (bytes([b]), 1))
+
+ def test_raw_decode(self):
+ decode = codecs.raw_unicode_escape_decode
+ for b in range(256):
+ self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
+
+ def test_escape_encode(self):
+ encode = codecs.raw_unicode_escape_encode
+ check = coding_checker(self, encode)
+ for b in range(256):
+ if b not in b'uU':
+ check('\\' + chr(b), b'\\' + bytes([b]))
+ check('\u20ac', br'\u20ac')
+ check('\U0001d120', br'\U0001d120')
+
+ def test_escape_decode(self):
+ decode = codecs.raw_unicode_escape_decode
+ check = coding_checker(self, decode)
+ for b in range(256):
+ if b not in b'uU':
+ check(b'\\' + bytes([b]), '\\' + chr(b))
+ check(br"\u20ac", "\u20ac")
+ check(br"\U0001d120", "\U0001d120")
+
+ def test_decode_errors(self):
+ decode = codecs.raw_unicode_escape_decode
+ for c, d in (b'u', 4), (b'U', 4):
+ for i in range(d):
+ self.assertRaises(UnicodeDecodeError, decode,
+ b"\\" + c + b"0"*i)
+ self.assertRaises(UnicodeDecodeError, decode,
+ b"[\\" + c + b"0"*i + b"]")
+ data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
+ self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
+ self.assertEqual(decode(data, "replace"),
+ ("[\ufffd]\ufffd", len(data)))
+ self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
+ self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
+ self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
+
+
class SurrogateEscapeTest(unittest.TestCase):
def test_utf8(self):
@@ -1674,6 +2075,54 @@ class BomTest(unittest.TestCase):
self.assertEqual(f.read(), data * 2)
+bytes_transform_encodings = [
+ "base64_codec",
+ "uu_codec",
+ "quopri_codec",
+ "hex_codec",
+]
+try:
+ import zlib
+except ImportError:
+ pass
+else:
+ bytes_transform_encodings.append("zlib_codec")
+try:
+ import bz2
+except ImportError:
+ pass
+else:
+ bytes_transform_encodings.append("bz2_codec")
+
+class TransformCodecTest(unittest.TestCase):
+
+ def test_basics(self):
+ binput = bytes(range(256))
+ for encoding in bytes_transform_encodings:
+ # generic codecs interface
+ (o, size) = codecs.getencoder(encoding)(binput)
+ self.assertEqual(size, len(binput))
+ (i, size) = codecs.getdecoder(encoding)(o)
+ self.assertEqual(size, len(o))
+ self.assertEqual(i, binput)
+
+ def test_read(self):
+ for encoding in bytes_transform_encodings:
+ sin = codecs.encode(b"\x80", encoding)
+ reader = codecs.getreader(encoding)(io.BytesIO(sin))
+ sout = reader.read()
+ self.assertEqual(sout, b"\x80")
+
+ def test_readline(self):
+ for encoding in bytes_transform_encodings:
+ if encoding in ['uu_codec', 'zlib_codec']:
+ continue
+ sin = codecs.encode(b"\x80", encoding)
+ reader = codecs.getreader(encoding)(io.BytesIO(sin))
+ sout = reader.readline()
+ self.assertEqual(sout, b"\x80")
+
+
def test_main():
support.run_unittest(
UTF32Test,
@@ -1684,10 +2133,10 @@ def test_main():
UTF16BETest,
UTF8Test,
UTF8SigTest,
+ EscapeDecodeTest,
UTF7Test,
UTF16ExTest,
ReadBufferTest,
- CharBufferTest,
RecodingTest,
PunycodeTest,
UnicodeInternalTest,
@@ -1700,8 +2149,11 @@ def test_main():
CharmapTest,
WithStmtTest,
TypesTest,
+ UnicodeEscapeTest,
+ RawUnicodeEscapeTest,
SurrogateEscapeTest,
BomTest,
+ TransformCodecTest,
)