summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Victor Stinner <vstinner@redhat.com> 2019-05-10 01:19:54 (GMT)
committer GitHub <noreply@github.com> 2019-05-10 01:19:54 (GMT)
commit d267ac20c309e37d85a986b4417aa8ab4d05dabc (patch)
tree 701e11585f2a69916429f798895ba6fac1ec0545
parent 137be34180a20dba53948d126b961069f299f153 (diff)
download cpython-d267ac20c309e37d85a986b4417aa8ab4d05dabc.zip
cpython-d267ac20c309e37d85a986b4417aa8ab4d05dabc.tar.gz
cpython-d267ac20c309e37d85a986b4417aa8ab4d05dabc.tar.bz2
bpo-36778: cp65001 encoding becomes an alias to utf_8 (GH-13230)
-rw-r--r-- Doc/library/codecs.rst 3
-rw-r--r-- Lib/encodings/aliases.py 1
-rw-r--r-- Lib/encodings/cp65001.py 43
-rw-r--r-- Lib/test/test_codecs.py 89
-rw-r--r-- Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst 2
5 files changed, 4 insertions, 134 deletions
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index b324637..8d3daa3 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1106,8 +1106,7 @@ particular, the following variants typically exist:
+-----------------+--------------------------------+--------------------------------+
| cp1258 | windows-1258 | Vietnamese |
+-----------------+--------------------------------+--------------------------------+
-| cp65001 | | Windows only: Windows UTF-8 |
-| | | (``CP_UTF8``) |
+| cp65001 | | Alias to ``utf_8`` encoding |
| | | |
| | | .. versionadded:: 3.3 |
+-----------------+--------------------------------+--------------------------------+
diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py
index 2e63c2f..5ef40a3 100644
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@@ -534,6 +534,7 @@ aliases = {
'utf8' : 'utf_8',
'utf8_ucs2' : 'utf_8',
'utf8_ucs4' : 'utf_8',
+ 'cp65001' : 'utf_8',
# uu_codec codec
'uu' : 'uu_codec',
diff --git a/Lib/encodings/cp65001.py b/Lib/encodings/cp65001.py
deleted file mode 100644
index 95cb2ae..0000000
--- a/Lib/encodings/cp65001.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""
-Code page 65001: Windows UTF-8 (CP_UTF8).
-"""
-
-import codecs
-import functools
-
-if not hasattr(codecs, 'code_page_encode'):
- raise LookupError("cp65001 encoding is only available on Windows")
-
-### Codec APIs
-
-encode = functools.partial(codecs.code_page_encode, 65001)
-_decode = functools.partial(codecs.code_page_decode, 65001)
-
-def decode(input, errors='strict'):
- return codecs.code_page_decode(65001, input, errors, True)
-
-class IncrementalEncoder(codecs.IncrementalEncoder):
- def encode(self, input, final=False):
- return encode(input, self.errors)[0]
-
-class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
- _buffer_decode = _decode
-
-class StreamWriter(codecs.StreamWriter):
- encode = encode
-
-class StreamReader(codecs.StreamReader):
- decode = _decode
-
-### encodings module API
-
-def getregentry():
- return codecs.CodecInfo(
- name='cp65001',
- encode=encode,
- decode=decode,
- incrementalencoder=IncrementalEncoder,
- incrementaldecoder=IncrementalDecoder,
- streamreader=StreamReader,
- streamwriter=StreamWriter,
- )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 027a84e..8c14f59 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -875,95 +875,6 @@ class UTF8Test(ReadTest, unittest.TestCase):
b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
-@unittest.skipUnless(sys.platform == 'win32',
- 'cp65001 is a Windows-only codec')
-class CP65001Test(ReadTest, unittest.TestCase):
- encoding = "cp65001"
-
- def test_encode(self):
- tests = [
- ('abc', 'strict', b'abc'),
- ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
- ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
- ('\udc80', 'strict', None),
- ('\udc80', 'ignore', b''),
- ('\udc80', 'replace', b'?'),
- ('\udc80', 'backslashreplace', b'\\udc80'),
- ('\udc80', 'namereplace', b'\\udc80'),
- ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
- ]
- for text, errors, expected in tests:
- if expected is not None:
- try:
- encoded = text.encode('cp65001', errors)
- except UnicodeEncodeError as err:
- self.fail('Unable to encode %a to cp65001 with '
- 'errors=%r: %s' % (text, errors, err))
- self.assertEqual(encoded, expected,
- '%a.encode("cp65001", %r)=%a != %a'
- % (text, errors, encoded, expected))
- else:
- self.assertRaises(UnicodeEncodeError,
- text.encode, "cp65001", errors)
-
- def test_decode(self):
- tests = [
- (b'abc', 'strict', 'abc'),
- (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
- (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
- (b'\xef\xbf\xbd', 'strict', '\ufffd'),
- (b'[\xc3\xa9]', 'strict', '[\xe9]'),
- # invalid bytes
- (b'[\xff]', 'strict', None),
- (b'[\xff]', 'ignore', '[]'),
- (b'[\xff]', 'replace', '[\ufffd]'),
- (b'[\xff]', 'surrogateescape', '[\udcff]'),
- (b'[\xed\xb2\x80]', 'strict', None),
- (b'[\xed\xb2\x80]', 'ignore', '[]'),
- (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
- ]
- for raw, errors, expected in tests:
- if expected is not None:
- try:
- decoded = raw.decode('cp65001', errors)
- except UnicodeDecodeError as err:
- self.fail('Unable to decode %a from cp65001 with '
- 'errors=%r: %s' % (raw, errors, err))
- self.assertEqual(decoded, expected,
- '%a.decode("cp65001", %r)=%a != %a'
- % (raw, errors, decoded, expected))
- else:
- self.assertRaises(UnicodeDecodeError,
- raw.decode, 'cp65001', errors)
-
- def test_lone_surrogates(self):
- self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
- self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
- self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
- b'[\\udc80]')
- self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
- b'[\\udc80]')
- self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
- b'[&#56448;]')
- self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
- b'[\x80]')
- self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
- b'[]')
- self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
- b'[?]')
-
- def test_surrogatepass_handler(self):
- self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
- b"abc\xed\xa0\x80def")
- self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
- "abc\ud800def")
- self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
- b"\xf0\x90\xbf\xbf\xed\xa0\x80")
- self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
- "\U00010fff\uD800")
- self.assertTrue(codecs.lookup_error("surrogatepass"))
-
-
class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7"
diff --git a/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst b/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst
new file mode 100644
index 0000000..5e594a3
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst
@@ -0,0 +1,2 @@
+``cp65001`` encoding (Windows code page 65001) becomes an alias to ``utf_8``
+encoding.