From f5aba58480bb0dd45181f609487ac2ecfcc98673 Mon Sep 17 00:00:00 2001 From: Steve Dower Date: Tue, 6 Sep 2016 19:42:27 -0700 Subject: Issue #27959: Adds oem encoding, alias ansi to mbcs, move aliasmbcs to codec lookup --- Include/unicodeobject.h | 2 +- Lib/encodings/__init__.py | 10 +++++ Lib/encodings/aliases.py | 1 + Lib/encodings/oem.py | 41 ++++++++++++++++++++ Lib/site.py | 16 -------- Lib/test/test_codecs.py | 62 ++++++++++++++---------------- Modules/_codecsmodule.c | 36 ++++++++++++++++++ Modules/clinic/_codecsmodule.c.h | 81 +++++++++++++++++++++++++++++++++++++++- 8 files changed, 198 insertions(+), 51 deletions(-) create mode 100644 Lib/encodings/oem.py diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 3d7431d..d38721f 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1663,7 +1663,7 @@ PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( const char *string, /* MBCS encoded string */ - Py_ssize_t length, /* size of string */ + Py_ssize_t length, /* size of string */ const char *errors /* error handling */ ); diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 320011b..9a9b90b 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -29,6 +29,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). """#" import codecs +import sys from . import aliases _cache = {} @@ -151,3 +152,12 @@ def search_function(encoding): # Register the search_function in the Python codec registry codecs.register(search_function) + +if sys.platform == 'win32': + def _alias_mbcs(encoding): + import _bootlocale + if encoding == _bootlocale.getpreferredencoding(False): + import encodings.mbcs + return encodings.mbcs.getregentry() + + codecs.register(_alias_mbcs) diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index 67c828d..2e63c2f 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -458,6 +458,7 @@ aliases = { 'macturkish' : 'mac_turkish', # mbcs codec + 'ansi' : 'mbcs', 'dbcs' : 'mbcs', # ptcp154 codec diff --git a/Lib/encodings/oem.py b/Lib/encodings/oem.py new file mode 100644 index 0000000..2c3426b --- /dev/null +++ b/Lib/encodings/oem.py @@ -0,0 +1,41 @@ +""" Python 'oem' Codec for Windows + +""" +# Import them explicitly to cause an ImportError +# on non-Windows systems +from codecs import oem_encode, oem_decode +# for IncrementalDecoder, IncrementalEncoder, ... +import codecs + +### Codec APIs + +encode = oem_encode + +def decode(input, errors='strict'): + return oem_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return oem_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = oem_decode + +class StreamWriter(codecs.StreamWriter): + encode = oem_encode + +class StreamReader(codecs.StreamReader): + decode = oem_decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='oem', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/site.py b/Lib/site.py index a84e3bb..a536ef1 100644 --- a/Lib/site.py +++ b/Lib/site.py @@ -423,21 +423,6 @@ def enablerlcompleter(): sys.__interactivehook__ = register_readline -def aliasmbcs(): - """On Windows, some default encodings are not provided by Python, - while they are always available as "mbcs" in each locale. Make - them usable by aliasing to "mbcs" in such a case.""" - if sys.platform == 'win32': - import _bootlocale, codecs - enc = _bootlocale.getpreferredencoding(False) - if enc.startswith('cp'): # "cp***" ? - try: - codecs.lookup(enc) - except LookupError: - import encodings - encodings._cache[enc] = encodings._unknown - encodings.aliases.aliases[enc] = 'mbcs' - CONFIG_LINE = r'^(?P(\w|[-_])+)\s*=\s*(?P.*)\s*$' def venv(known_paths): @@ -560,7 +545,6 @@ def main(): setcopyright() sethelper() enablerlcompleter() - aliasmbcs() execsitecustomize() if ENABLE_USER_SITE: execusercustomize() diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index d875340..825a7dd 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -8,11 +8,6 @@ import encodings from test import support -if sys.platform == 'win32': - VISTA_OR_LATER = (sys.getwindowsversion().major >= 6) -else: - VISTA_OR_LATER = False - try: import ctypes except ImportError: @@ -841,18 +836,13 @@ class CP65001Test(ReadTest, unittest.TestCase): ('abc', 'strict', b'abc'), ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), + ('\udc80', 'strict', None), + ('\udc80', 'ignore', b''), + ('\udc80', 'replace', b'?'), + ('\udc80', 'backslashreplace', b'\\udc80'), + ('\udc80', 'namereplace', b'\\udc80'), + ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), ] - if VISTA_OR_LATER: - tests.extend(( - ('\udc80', 'strict', None), - ('\udc80', 'ignore', b''), - ('\udc80', 'replace', b'?'), - ('\udc80', 'backslashreplace', b'\\udc80'), - ('\udc80', 'namereplace', b'\\udc80'), - ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), - )) - else: - tests.append(('\udc80', 'strict', b'\xed\xb2\x80')) for text, errors, expected in tests: if expected is not None: try: @@ -879,17 +869,10 @@ class CP65001Test(ReadTest, unittest.TestCase): (b'[\xff]', 'ignore', '[]'), (b'[\xff]', 'replace', '[\ufffd]'), (b'[\xff]', 'surrogateescape', '[\udcff]'), + (b'[\xed\xb2\x80]', 'strict', None), + (b'[\xed\xb2\x80]', 'ignore', '[]'), + (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), ] - if VISTA_OR_LATER: - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', None), - (b'[\xed\xb2\x80]', 'ignore', '[]'), - (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), - )) - else: - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), - )) for raw, errors, expected in tests: if expected is not None: try: @@ -904,7 +887,6 @@ class CP65001Test(ReadTest, unittest.TestCase): self.assertRaises(UnicodeDecodeError, raw.decode, 'cp65001', errors) - @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') def test_lone_surrogates(self): self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") @@ -921,7 +903,6 @@ class CP65001Test(ReadTest, unittest.TestCase): self.assertEqual("[\uDC80]".encode("cp65001", "replace"), b'[?]') - @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') def test_surrogatepass_handler(self): self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), b"abc\xed\xa0\x80def") @@ -1951,6 +1932,8 @@ all_unicode_encodings = [ if hasattr(codecs, "mbcs_encode"): all_unicode_encodings.append("mbcs") +if hasattr(codecs, "oem_encode"): + all_unicode_encodings.append("oem") # The following encoding is not tested, because it's not supposed # to work: @@ -3119,11 +3102,10 @@ class CodePageTest(unittest.TestCase): (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), )) - if VISTA_OR_LATER: - self.check_encode(self.CP_UTF8, ( - ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), - ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), - )) + self.check_encode(self.CP_UTF8, ( + ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), + ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), + )) def test_incremental(self): decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) @@ -3144,6 +3126,20 @@ class CodePageTest(unittest.TestCase): False) self.assertEqual(decoded, ('abc', 3)) + def test_mbcs_alias(self): + # Check that looking up our 'default' codepage will return + # mbcs when we don't have a more specific one available + import _bootlocale + def _get_fake_codepage(*a): + return 'cp123' + old_getpreferredencoding = _bootlocale.getpreferredencoding + _bootlocale.getpreferredencoding = _get_fake_codepage + try: + codec = codecs.lookup('cp123') + self.assertEqual(codec.name, 'mbcs') + finally: + _bootlocale.getpreferredencoding = old_getpreferredencoding + class ASCIITest(unittest.TestCase): def test_encode(self): diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 4e75995..4d25a53 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -626,6 +626,25 @@ _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data, } /*[clinic input] +_codecs.oem_decode + data: Py_buffer + errors: str(accept={str, NoneType}) = NULL + final: int(c_default="0") = False + / +[clinic start generated code]*/ + +static PyObject * +_codecs_oem_decode_impl(PyObject *module, Py_buffer *data, + const char *errors, int final) +/*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/ +{ + Py_ssize_t consumed = data->len; + PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP, + data->buf, data->len, errors, final ? NULL : &consumed); + return codec_tuple(decoded, consumed); +} + +/*[clinic input] _codecs.code_page_decode codepage: int data: Py_buffer @@ -971,6 +990,21 @@ _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors) } /*[clinic input] +_codecs.oem_encode + str: unicode + errors: str(accept={str, NoneType}) = NULL + / +[clinic start generated code]*/ + +static PyObject * +_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors) +/*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/ +{ + return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors), + PyUnicode_GET_LENGTH(str)); +} + +/*[clinic input] _codecs.code_page_encode code_page: int str: unicode @@ -1075,6 +1109,8 @@ static PyMethodDef _codecs_functions[] = { _CODECS_READBUFFER_ENCODE_METHODDEF _CODECS_MBCS_ENCODE_METHODDEF _CODECS_MBCS_DECODE_METHODDEF + _CODECS_OEM_ENCODE_METHODDEF + _CODECS_OEM_DECODE_METHODDEF _CODECS_CODE_PAGE_ENCODE_METHODDEF _CODECS_CODE_PAGE_DECODE_METHODDEF _CODECS_REGISTER_ERROR_METHODDEF diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index 090b1ef..6a63cec 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -805,6 +805,45 @@ exit: #if defined(HAVE_MBCS) +PyDoc_STRVAR(_codecs_oem_decode__doc__, +"oem_decode($module, data, errors=None, final=False, /)\n" +"--\n" +"\n"); + +#define _CODECS_OEM_DECODE_METHODDEF \ + {"oem_decode", (PyCFunction)_codecs_oem_decode, METH_VARARGS, _codecs_oem_decode__doc__}, + +static PyObject * +_codecs_oem_decode_impl(PyObject *module, Py_buffer *data, + const char *errors, int final); + +static PyObject * +_codecs_oem_decode(PyObject *module, PyObject *args) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + const char *errors = NULL; + int final = 0; + + if (!PyArg_ParseTuple(args, "y*|zi:oem_decode", + &data, &errors, &final)) { + goto exit; + } + return_value = _codecs_oem_decode_impl(module, &data, errors, final); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + +#endif /* defined(HAVE_MBCS) */ + +#if defined(HAVE_MBCS) + PyDoc_STRVAR(_codecs_code_page_decode__doc__, "code_page_decode($module, codepage, data, errors=None, final=False, /)\n" "--\n" @@ -1346,6 +1385,38 @@ exit: #if defined(HAVE_MBCS) +PyDoc_STRVAR(_codecs_oem_encode__doc__, +"oem_encode($module, str, errors=None, /)\n" +"--\n" +"\n"); + +#define _CODECS_OEM_ENCODE_METHODDEF \ + {"oem_encode", (PyCFunction)_codecs_oem_encode, METH_VARARGS, _codecs_oem_encode__doc__}, + +static PyObject * +_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors); + +static PyObject * +_codecs_oem_encode(PyObject *module, PyObject *args) +{ + PyObject *return_value = NULL; + PyObject *str; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "U|z:oem_encode", + &str, &errors)) { + goto exit; + } + return_value = _codecs_oem_encode_impl(module, str, errors); + +exit: + return return_value; +} + +#endif /* defined(HAVE_MBCS) */ + +#if defined(HAVE_MBCS) + PyDoc_STRVAR(_codecs_code_page_encode__doc__, "code_page_encode($module, code_page, str, errors=None, /)\n" "--\n" @@ -1446,6 +1517,10 @@ exit: #define _CODECS_MBCS_DECODE_METHODDEF #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */ +#ifndef _CODECS_OEM_DECODE_METHODDEF + #define _CODECS_OEM_DECODE_METHODDEF +#endif /* !defined(_CODECS_OEM_DECODE_METHODDEF) */ + #ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF #define _CODECS_CODE_PAGE_DECODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */ @@ -1454,7 +1529,11 @@ exit: #define _CODECS_MBCS_ENCODE_METHODDEF #endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */ +#ifndef _CODECS_OEM_ENCODE_METHODDEF + #define _CODECS_OEM_ENCODE_METHODDEF +#endif /* !defined(_CODECS_OEM_ENCODE_METHODDEF) */ + #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=0221e4eece62c905 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=7874e2d559d49368 input=a9049054013a1b77]*/ -- cgit v0.12