From 2f3ca9f20efad37aad479d557c282e08481602d0 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 27 Oct 2011 01:38:56 +0200 Subject: Close #13247: Add cp65001 codec, the Windows UTF-8 (CP_UTF8) --- Doc/library/codecs.rst | 5 ++ Doc/whatsnew/3.3.rst | 5 ++ Lib/encodings/cp65001.py | 40 +++++++++++ Lib/test/test_codecs.py | 176 +++++++++++++++++++++++++++++++---------------- Misc/NEWS | 2 + 5 files changed, 168 insertions(+), 60 deletions(-) create mode 100644 Lib/encodings/cp65001.py diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 4b33c61..4523c7f 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1011,6 +1011,11 @@ particular, the following variants typically exist: +-----------------+--------------------------------+--------------------------------+ | cp1258 | windows-1258 | Vietnamese | +-----------------+--------------------------------+--------------------------------+ +| cp65001 | | Windows only: Windows UTF-8 | +| | | (``CP_UTF8``) | +| | | | +| | | .. versionadded:: 3.3 | ++-----------------+--------------------------------+--------------------------------+ | euc_jp | eucjp, ujis, u-jis | Japanese | +-----------------+--------------------------------+--------------------------------+ | euc_jis_2004 | jisx0213, eucjis2004 | Japanese | diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst index 6ae8315..1ee9c1b 100644 --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -225,6 +225,11 @@ The :mod:`~encodings.mbcs` codec has be rewritten to handle correclty :mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of only ``replace`` to encode and ``ignore`` to decode. +A new Windows-only codec has been added: ``cp65001`` (:issue:`13247`). It is +the Windows code page 65001 (Windows UTF-8, ``CP_UTF8``). For example, it is +used by ``sys.stdout`` if the console output code page is set to cp65001 (e.g. +using ``chcp 65001`` command). + Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312', 'replace')`` now returns a ``\n`` after the replacement character. diff --git a/Lib/encodings/cp65001.py b/Lib/encodings/cp65001.py new file mode 100644 index 0000000..287eb87 --- /dev/null +++ b/Lib/encodings/cp65001.py @@ -0,0 +1,40 @@ +""" +Code page 65001: Windows UTF-8 (CP_UTF8). +""" + +import codecs +import functools + +if not hasattr(codecs, 'code_page_encode'): + raise LookupError("cp65001 encoding is only available on Windows") + +### Codec APIs + +encode = functools.partial(codecs.code_page_encode, 65001) +decode = functools.partial(codecs.code_page_decode, 65001) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = decode + +class StreamWriter(codecs.StreamWriter): + encode = encode + +class StreamReader(codecs.StreamReader): + decode = decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='cp65001', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index fa257b8..ffd2d79 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -4,6 +4,11 @@ import codecs import locale import sys, _testcapi, io +if sys.platform == 'win32': + VISTA_OR_LATER = (sys.getwindowsversion().major >= 6) +else: + VISTA_OR_LATER = False + try: import ctypes except ImportError: @@ -636,6 +641,107 @@ class UTF8Test(ReadTest): "\U00010fff\uD800") self.assertTrue(codecs.lookup_error("surrogatepass")) +@unittest.skipUnless(sys.platform == 'win32', + 'cp65001 is a Windows-only codec') +class CP65001Test(ReadTest): + encoding = "cp65001" + + def test_encode(self): + tests = [ + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), + ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), + ] + if VISTA_OR_LATER: + tests.extend(( + ('\udc80', 'strict', None), + ('\udc80', 'ignore', b''), + ('\udc80', 'replace', b'?'), + ('\udc80', 'backslashreplace', b'\\udc80'), + ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), + )) + else: + tests.append(('\udc80', 'strict', b'\xed\xb2\x80')) + for text, errors, expected in tests: + if expected is not None: + try: + encoded = text.encode('cp65001', errors) + except UnicodeEncodeError as err: + self.fail('Unable to encode %a to cp65001 with ' + 'errors=%r: %s' % (text, errors, err)) + self.assertEqual(encoded, expected, + '%a.encode("cp65001", %r)=%a != %a' + % (text, errors, encoded, expected)) + else: + self.assertRaises(UnicodeEncodeError, + text.encode, "cp65001", errors) + + def test_decode(self): + tests = [ + (b'abc', 'strict', 'abc'), + (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), + (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), + (b'\xef\xbf\xbd', 'strict', '\ufffd'), + (b'[\xc3\xa9]', 'strict', '[\xe9]'), + # invalid bytes + (b'[\xff]', 'strict', None), + (b'[\xff]', 'ignore', '[]'), + (b'[\xff]', 'replace', '[\ufffd]'), + (b'[\xff]', 'surrogateescape', '[\udcff]'), + ] + if VISTA_OR_LATER: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', None), + (b'[\xed\xb2\x80]', 'ignore', '[]'), + (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), + )) + else: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), + )) + for raw, errors, expected in tests: + if expected is not None: + try: + decoded = raw.decode('cp65001', errors) + except UnicodeDecodeError as err: + self.fail('Unable to decode %a from cp65001 with ' + 'errors=%r: %s' % (raw, errors, err)) + self.assertEqual(decoded, expected, + '%a.decode("cp65001", %r)=%a != %a' + % (raw, errors, decoded, expected)) + else: + self.assertRaises(UnicodeDecodeError, + raw.decode, 'cp65001', errors) + + @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') + def test_lone_surrogates(self): + self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") + self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") + self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"), + b'[\\udc80]') + self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"), + b'[�]') + self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"), + b'[\x80]') + self.assertEqual("[\uDC80]".encode("cp65001", "ignore"), + b'[]') + self.assertEqual("[\uDC80]".encode("cp65001", "replace"), + b'[?]') + + @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') + def test_surrogatepass_handler(self): + self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), + b"abc\xed\xa0\x80def") + self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"), + "abc\ud800def") + self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"), + b"\xf0\x90\xbf\xbf\xed\xa0\x80") + self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"), + "\U00010fff\uD800") + self.assertTrue(codecs.lookup_error("surrogatepass")) + + + class UTF7Test(ReadTest): encoding = "utf-7" @@ -1747,11 +1853,9 @@ class TransformCodecTest(unittest.TestCase): @unittest.skipUnless(sys.platform == 'win32', 'code pages are specific to Windows') class CodePageTest(unittest.TestCase): + # CP_UTF8 is already tested by CP65001Test CP_UTF8 = 65001 - def vista_or_later(self): - return (sys.getwindowsversion().major >= 6) - def test_invalid_code_page(self): self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') @@ -1804,19 +1908,22 @@ class CodePageTest(unittest.TestCase): self.check_encode(932, ( ('abc', 'strict', b'abc'), ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), - # not encodable + # test error handlers ('\xff', 'strict', None), ('[\xff]', 'ignore', b'[]'), ('[\xff]', 'replace', b'[y]'), ('[\u20ac]', 'replace', b'[?]'), + ('[\xff]', 'backslashreplace', b'[\\xff]'), + ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), )) self.check_decode(932, ( (b'abc', 'strict', 'abc'), (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), # invalid bytes - (b'\xff', 'strict', None), - (b'\xff', 'ignore', ''), - (b'\xff', 'replace', '\ufffd'), + (b'[\xff]', 'strict', None), + (b'[\xff]', 'ignore', '[]'), + (b'[\xff]', 'replace', '[\ufffd]'), + (b'[\xff]', 'surrogateescape', '[\udcff]'), (b'\x81\x00abc', 'strict', None), (b'\x81\x00abc', 'ignore', '\x00abc'), (b'\x81\x00abc', 'replace', '\ufffd\x00abc'), @@ -1857,58 +1964,6 @@ class CodePageTest(unittest.TestCase): (b'[\xff]', 'strict', '[\xff]'), )) - def test_cp_utf8(self): - cp = self.CP_UTF8 - - tests = [ - ('abc', 'strict', b'abc'), - ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), - ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), - ] - if self.vista_or_later(): - tests.append(('\udc80', 'strict', None)) - tests.append(('\udc80', 'ignore', b'')) - tests.append(('\udc80', 'replace', b'?')) - else: - tests.append(('\udc80', 'strict', b'\xed\xb2\x80')) - self.check_encode(cp, tests) - - tests = [ - (b'abc', 'strict', 'abc'), - (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), - (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), - (b'\xef\xbf\xbd', 'strict', '\ufffd'), - (b'[\xc3\xa9]', 'strict', '[\xe9]'), - # invalid bytes - (b'[\xff]', 'strict', None), - (b'[\xff]', 'ignore', '[]'), - (b'[\xff]', 'replace', '[\ufffd]'), - ] - if self.vista_or_later(): - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', None), - (b'[\xed\xb2\x80]', 'ignore', '[]'), - (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), - )) - else: - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), - )) - self.check_decode(cp, tests) - - def test_error_handlers(self): - self.check_encode(932, ( - ('\xff', 'backslashreplace', b'\\xff'), - ('\xff', 'xmlcharrefreplace', b'ÿ'), - )) - self.check_decode(932, ( - (b'\xff', 'surrogateescape', '\udcff'), - )) - if self.vista_or_later(): - self.check_encode(self.CP_UTF8, ( - ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), - )) - def test_multibyte_encoding(self): self.check_decode(932, ( (b'\x84\xe9\x80', 'ignore', '\u9a3e'), @@ -1918,7 +1973,7 @@ class CodePageTest(unittest.TestCase): (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), )) - if self.vista_or_later(): + if VISTA_OR_LATER: self.check_encode(self.CP_UTF8, ( ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), @@ -1951,6 +2006,7 @@ def test_main(): UTF16BETest, UTF8Test, UTF8SigTest, + CP65001Test, UTF7Test, UTF16ExTest, ReadBufferTest, diff --git a/Misc/NEWS b/Misc/NEWS index d0aa17c..0a7f41a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -341,6 +341,8 @@ Core and Builtins Library ------- +- Issue #13247: Add cp65001 codec, the Windows UTF-8 (CP_UTF8). + - Issue #13226: Add RTLD_xxx constants to the os module. These constants can be used with sys.setdlopenflags(). -- cgit v0.12