summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@haypocalc.com>2011-10-26 23:38:56 (GMT)
committerVictor Stinner <victor.stinner@haypocalc.com>2011-10-26 23:38:56 (GMT)
commit2f3ca9f20efad37aad479d557c282e08481602d0 (patch)
tree1830560daa4865d29734aa3c538eacd3199e110e
parentcc9695643fc40780f51719d5e9a272283a743077 (diff)
downloadcpython-2f3ca9f20efad37aad479d557c282e08481602d0.zip
cpython-2f3ca9f20efad37aad479d557c282e08481602d0.tar.gz
cpython-2f3ca9f20efad37aad479d557c282e08481602d0.tar.bz2
Close #13247: Add cp65001 codec, the Windows UTF-8 (CP_UTF8)
-rw-r--r--Doc/library/codecs.rst5
-rw-r--r--Doc/whatsnew/3.3.rst5
-rw-r--r--Lib/encodings/cp65001.py40
-rw-r--r--Lib/test/test_codecs.py176
-rw-r--r--Misc/NEWS2
5 files changed, 168 insertions, 60 deletions
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index 4b33c61..4523c7f 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1011,6 +1011,11 @@ particular, the following variants typically exist:
+-----------------+--------------------------------+--------------------------------+
| cp1258 | windows-1258 | Vietnamese |
+-----------------+--------------------------------+--------------------------------+
+| cp65001 | | Windows only: Windows UTF-8 |
+| | | (``CP_UTF8``) |
+| | | |
+| | | .. versionadded:: 3.3 |
++-----------------+--------------------------------+--------------------------------+
| euc_jp | eucjp, ujis, u-jis | Japanese |
+-----------------+--------------------------------+--------------------------------+
| euc_jis_2004 | jisx0213, eucjis2004 | Japanese |
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
index 6ae8315..1ee9c1b 100644
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -225,6 +225,11 @@ The :mod:`~encodings.mbcs` codec has be rewritten to handle correclty
:mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of
only ``replace`` to encode and ``ignore`` to decode.
+A new Windows-only codec has been added: ``cp65001`` (:issue:`13247`). It is
+the Windows code page 65001 (Windows UTF-8, ``CP_UTF8``). For example, it is
+used by ``sys.stdout`` if the console output code page is set to cp65001 (e.g.
+using the ``chcp 65001`` command).
+
Multibyte CJK decoders now resynchronize faster. They only ignore the first
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
'replace')`` now returns a ``\n`` after the replacement character.
diff --git a/Lib/encodings/cp65001.py b/Lib/encodings/cp65001.py
new file mode 100644
index 0000000..287eb87
--- /dev/null
+++ b/Lib/encodings/cp65001.py
@@ -0,0 +1,40 @@
+"""
+Code page 65001: Windows UTF-8 (CP_UTF8).
+"""
+
+import codecs
+import functools
+
+if not hasattr(codecs, 'code_page_encode'):
+ raise LookupError("cp65001 encoding is only available on Windows")
+
+### Codec APIs
+
+encode = functools.partial(codecs.code_page_encode, 65001)
+decode = functools.partial(codecs.code_page_decode, 65001)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def encode(self, input, final=False):
+ return encode(input, self.errors)[0]
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ _buffer_decode = decode
+
+class StreamWriter(codecs.StreamWriter):
+ encode = encode
+
+class StreamReader(codecs.StreamReader):
+ decode = decode
+
+### encodings module API
+
+def getregentry():
+ return codecs.CodecInfo(
+ name='cp65001',
+ encode=encode,
+ decode=decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+ )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index fa257b8..ffd2d79 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -4,6 +4,11 @@ import codecs
import locale
import sys, _testcapi, io
+if sys.platform == 'win32':
+ VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
+else:
+ VISTA_OR_LATER = False
+
try:
import ctypes
except ImportError:
@@ -636,6 +641,107 @@ class UTF8Test(ReadTest):
"\U00010fff\uD800")
self.assertTrue(codecs.lookup_error("surrogatepass"))
+@unittest.skipUnless(sys.platform == 'win32',
+ 'cp65001 is a Windows-only codec')
+class CP65001Test(ReadTest):
+ encoding = "cp65001"
+
+ def test_encode(self):
+ tests = [
+ ('abc', 'strict', b'abc'),
+ ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
+ ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+ ]
+ if VISTA_OR_LATER:
+ tests.extend((
+ ('\udc80', 'strict', None),
+ ('\udc80', 'ignore', b''),
+ ('\udc80', 'replace', b'?'),
+ ('\udc80', 'backslashreplace', b'\\udc80'),
+ ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
+ ))
+ else:
+ tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
+ for text, errors, expected in tests:
+ if expected is not None:
+ try:
+ encoded = text.encode('cp65001', errors)
+ except UnicodeEncodeError as err:
+ self.fail('Unable to encode %a to cp65001 with '
+ 'errors=%r: %s' % (text, errors, err))
+ self.assertEqual(encoded, expected,
+ '%a.encode("cp65001", %r)=%a != %a'
+ % (text, errors, encoded, expected))
+ else:
+ self.assertRaises(UnicodeEncodeError,
+ text.encode, "cp65001", errors)
+
+ def test_decode(self):
+ tests = [
+ (b'abc', 'strict', 'abc'),
+ (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
+ (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
+ (b'\xef\xbf\xbd', 'strict', '\ufffd'),
+ (b'[\xc3\xa9]', 'strict', '[\xe9]'),
+ # invalid bytes
+ (b'[\xff]', 'strict', None),
+ (b'[\xff]', 'ignore', '[]'),
+ (b'[\xff]', 'replace', '[\ufffd]'),
+ (b'[\xff]', 'surrogateescape', '[\udcff]'),
+ ]
+ if VISTA_OR_LATER:
+ tests.extend((
+ (b'[\xed\xb2\x80]', 'strict', None),
+ (b'[\xed\xb2\x80]', 'ignore', '[]'),
+ (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
+ ))
+ else:
+ tests.extend((
+ (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
+ ))
+ for raw, errors, expected in tests:
+ if expected is not None:
+ try:
+ decoded = raw.decode('cp65001', errors)
+ except UnicodeDecodeError as err:
+ self.fail('Unable to decode %a from cp65001 with '
+ 'errors=%r: %s' % (raw, errors, err))
+ self.assertEqual(decoded, expected,
+ '%a.decode("cp65001", %r)=%a != %a'
+ % (raw, errors, decoded, expected))
+ else:
+ self.assertRaises(UnicodeDecodeError,
+ raw.decode, 'cp65001', errors)
+
+ @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
+ def test_lone_surrogates(self):
+ self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
+ self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
+ self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
+ b'[\\udc80]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
+ b'[&#56448;]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
+ b'[\x80]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
+ b'[]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
+ b'[?]')
+
+ @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
+ def test_surrogatepass_handler(self):
+ self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
+ b"abc\xed\xa0\x80def")
+ self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
+ "abc\ud800def")
+ self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
+ b"\xf0\x90\xbf\xbf\xed\xa0\x80")
+ self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
+ "\U00010fff\uD800")
+ self.assertTrue(codecs.lookup_error("surrogatepass"))
+
+
+
class UTF7Test(ReadTest):
encoding = "utf-7"
@@ -1747,11 +1853,9 @@ class TransformCodecTest(unittest.TestCase):
@unittest.skipUnless(sys.platform == 'win32',
'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
+ # CP_UTF8 is already tested by CP65001Test
CP_UTF8 = 65001
- def vista_or_later(self):
- return (sys.getwindowsversion().major >= 6)
-
def test_invalid_code_page(self):
self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
@@ -1804,19 +1908,22 @@ class CodePageTest(unittest.TestCase):
self.check_encode(932, (
('abc', 'strict', b'abc'),
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
- # not encodable
+ # test error handlers
('\xff', 'strict', None),
('[\xff]', 'ignore', b'[]'),
('[\xff]', 'replace', b'[y]'),
('[\u20ac]', 'replace', b'[?]'),
+ ('[\xff]', 'backslashreplace', b'[\\xff]'),
+ ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
))
self.check_decode(932, (
(b'abc', 'strict', 'abc'),
(b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
# invalid bytes
- (b'\xff', 'strict', None),
- (b'\xff', 'ignore', ''),
- (b'\xff', 'replace', '\ufffd'),
+ (b'[\xff]', 'strict', None),
+ (b'[\xff]', 'ignore', '[]'),
+ (b'[\xff]', 'replace', '[\ufffd]'),
+ (b'[\xff]', 'surrogateescape', '[\udcff]'),
(b'\x81\x00abc', 'strict', None),
(b'\x81\x00abc', 'ignore', '\x00abc'),
(b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
@@ -1857,58 +1964,6 @@ class CodePageTest(unittest.TestCase):
(b'[\xff]', 'strict', '[\xff]'),
))
- def test_cp_utf8(self):
- cp = self.CP_UTF8
-
- tests = [
- ('abc', 'strict', b'abc'),
- ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
- ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
- ]
- if self.vista_or_later():
- tests.append(('\udc80', 'strict', None))
- tests.append(('\udc80', 'ignore', b''))
- tests.append(('\udc80', 'replace', b'?'))
- else:
- tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
- self.check_encode(cp, tests)
-
- tests = [
- (b'abc', 'strict', 'abc'),
- (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
- (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
- (b'\xef\xbf\xbd', 'strict', '\ufffd'),
- (b'[\xc3\xa9]', 'strict', '[\xe9]'),
- # invalid bytes
- (b'[\xff]', 'strict', None),
- (b'[\xff]', 'ignore', '[]'),
- (b'[\xff]', 'replace', '[\ufffd]'),
- ]
- if self.vista_or_later():
- tests.extend((
- (b'[\xed\xb2\x80]', 'strict', None),
- (b'[\xed\xb2\x80]', 'ignore', '[]'),
- (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
- ))
- else:
- tests.extend((
- (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
- ))
- self.check_decode(cp, tests)
-
- def test_error_handlers(self):
- self.check_encode(932, (
- ('\xff', 'backslashreplace', b'\\xff'),
- ('\xff', 'xmlcharrefreplace', b'&#255;'),
- ))
- self.check_decode(932, (
- (b'\xff', 'surrogateescape', '\udcff'),
- ))
- if self.vista_or_later():
- self.check_encode(self.CP_UTF8, (
- ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
- ))
-
def test_multibyte_encoding(self):
self.check_decode(932, (
(b'\x84\xe9\x80', 'ignore', '\u9a3e'),
@@ -1918,7 +1973,7 @@ class CodePageTest(unittest.TestCase):
(b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
(b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
))
- if self.vista_or_later():
+ if VISTA_OR_LATER:
self.check_encode(self.CP_UTF8, (
('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
@@ -1951,6 +2006,7 @@ def test_main():
UTF16BETest,
UTF8Test,
UTF8SigTest,
+ CP65001Test,
UTF7Test,
UTF16ExTest,
ReadBufferTest,
diff --git a/Misc/NEWS b/Misc/NEWS
index d0aa17c..0a7f41a 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -341,6 +341,8 @@ Core and Builtins
Library
-------
+- Issue #13247: Add the cp65001 codec, the Windows UTF-8 code page (CP_UTF8).
+
- Issue #13226: Add RTLD_xxx constants to the os module. These constants can be
used with sys.setdlopenflags().