From d267ac20c309e37d85a986b4417aa8ab4d05dabc Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 10 May 2019 03:19:54 +0200 Subject: bpo-36778: cp65001 encoding becomes an alias to utf_8 (GH-13230) --- Doc/library/codecs.rst | 3 +- Lib/encodings/aliases.py | 1 + Lib/encodings/cp65001.py | 43 ----------- Lib/test/test_codecs.py | 89 ---------------------- .../2019-05-10-01-06-36.bpo-36778.GRqeiS.rst | 2 + 5 files changed, 4 insertions(+), 134 deletions(-) delete mode 100644 Lib/encodings/cp65001.py create mode 100644 Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index b324637..8d3daa3 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1106,8 +1106,7 @@ particular, the following variants typically exist: +-----------------+--------------------------------+--------------------------------+ | cp1258 | windows-1258 | Vietnamese | +-----------------+--------------------------------+--------------------------------+ -| cp65001 | | Windows only: Windows UTF-8 | -| | | (``CP_UTF8``) | +| cp65001 | | Alias to ``utf_8`` encoding | | | | | | | | .. versionadded:: 3.3 | +-----------------+--------------------------------+--------------------------------+ diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index 2e63c2f..5ef40a3 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -534,6 +534,7 @@ aliases = { 'utf8' : 'utf_8', 'utf8_ucs2' : 'utf_8', 'utf8_ucs4' : 'utf_8', + 'cp65001' : 'utf_8', # uu_codec codec 'uu' : 'uu_codec', diff --git a/Lib/encodings/cp65001.py b/Lib/encodings/cp65001.py deleted file mode 100644 index 95cb2ae..0000000 --- a/Lib/encodings/cp65001.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Code page 65001: Windows UTF-8 (CP_UTF8). 
-""" - -import codecs -import functools - -if not hasattr(codecs, 'code_page_encode'): - raise LookupError("cp65001 encoding is only available on Windows") - -### Codec APIs - -encode = functools.partial(codecs.code_page_encode, 65001) -_decode = functools.partial(codecs.code_page_decode, 65001) - -def decode(input, errors='strict'): - return codecs.code_page_decode(65001, input, errors, True) - -class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - return encode(input, self.errors)[0] - -class IncrementalDecoder(codecs.BufferedIncrementalDecoder): - _buffer_decode = _decode - -class StreamWriter(codecs.StreamWriter): - encode = encode - -class StreamReader(codecs.StreamReader): - decode = _decode - -### encodings module API - -def getregentry(): - return codecs.CodecInfo( - name='cp65001', - encode=encode, - decode=decode, - incrementalencoder=IncrementalEncoder, - incrementaldecoder=IncrementalDecoder, - streamreader=StreamReader, - streamwriter=StreamWriter, - ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 027a84e..8c14f59 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -875,95 +875,6 @@ class UTF8Test(ReadTest, unittest.TestCase): b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") -@unittest.skipUnless(sys.platform == 'win32', - 'cp65001 is a Windows-only codec') -class CP65001Test(ReadTest, unittest.TestCase): - encoding = "cp65001" - - def test_encode(self): - tests = [ - ('abc', 'strict', b'abc'), - ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), - ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), - ('\udc80', 'strict', None), - ('\udc80', 'ignore', b''), - ('\udc80', 'replace', b'?'), - ('\udc80', 'backslashreplace', b'\\udc80'), - ('\udc80', 'namereplace', b'\\udc80'), - ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), - ] - for text, errors, expected in tests: - if expected is not None: - try: - encoded = text.encode('cp65001', errors) - except UnicodeEncodeError as 
err: - self.fail('Unable to encode %a to cp65001 with ' - 'errors=%r: %s' % (text, errors, err)) - self.assertEqual(encoded, expected, - '%a.encode("cp65001", %r)=%a != %a' - % (text, errors, encoded, expected)) - else: - self.assertRaises(UnicodeEncodeError, - text.encode, "cp65001", errors) - - def test_decode(self): - tests = [ - (b'abc', 'strict', 'abc'), - (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), - (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), - (b'\xef\xbf\xbd', 'strict', '\ufffd'), - (b'[\xc3\xa9]', 'strict', '[\xe9]'), - # invalid bytes - (b'[\xff]', 'strict', None), - (b'[\xff]', 'ignore', '[]'), - (b'[\xff]', 'replace', '[\ufffd]'), - (b'[\xff]', 'surrogateescape', '[\udcff]'), - (b'[\xed\xb2\x80]', 'strict', None), - (b'[\xed\xb2\x80]', 'ignore', '[]'), - (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), - ] - for raw, errors, expected in tests: - if expected is not None: - try: - decoded = raw.decode('cp65001', errors) - except UnicodeDecodeError as err: - self.fail('Unable to decode %a from cp65001 with ' - 'errors=%r: %s' % (raw, errors, err)) - self.assertEqual(decoded, expected, - '%a.decode("cp65001", %r)=%a != %a' - % (raw, errors, decoded, expected)) - else: - self.assertRaises(UnicodeDecodeError, - raw.decode, 'cp65001', errors) - - def test_lone_surrogates(self): - self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") - self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") - self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"), - b'[\\udc80]') - self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"), - b'[\\udc80]') - self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"), - b'[&#56448;]') - self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"), - b'[\x80]') - self.assertEqual("[\uDC80]".encode("cp65001", "ignore"), - b'[]') - self.assertEqual("[\uDC80]".encode("cp65001", "replace"), - b'[?]') - - def test_surrogatepass_handler(self): - 
self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), - b"abc\xed\xa0\x80def") - self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"), - "abc\ud800def") - self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"), - b"\xf0\x90\xbf\xbf\xed\xa0\x80") - self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"), - "\U00010fff\uD800") - self.assertTrue(codecs.lookup_error("surrogatepass")) - - class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" diff --git a/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst b/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst new file mode 100644 index 0000000..5e594a3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst @@ -0,0 +1,2 @@ +``cp65001`` encoding (Windows code page 65001) becomes an alias to ``utf_8`` +encoding. -- cgit v0.12