From 3a50e7056ebfb835785c84df925d79bde6d0d209 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 18 Oct 2011 21:21:00 +0200 Subject: Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore error handlers on all Windows versions. The MBCS codec is now supporting all error handlers, instead of only replace to encode and ignore to decode. --- Doc/library/codecs.rst | 7 +- Doc/whatsnew/3.3.rst | 5 + Include/unicodeobject.h | 16 +- Lib/test/test_codecs.py | 198 +++++++++++++ Misc/NEWS | 4 + Modules/_codecsmodule.c | 50 ++++ Objects/unicodeobject.c | 750 +++++++++++++++++++++++++++++++++++++++--------- Python/pythonrun.c | 5 +- 8 files changed, 888 insertions(+), 147 deletions(-) diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 84593f2..2a7abf9 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1280,12 +1280,13 @@ functions can be used directly if desired. .. module:: encodings.mbcs :synopsis: Windows ANSI codepage -Encode operand according to the ANSI codepage (CP_ACP). This codec only -supports ``'strict'`` and ``'replace'`` error handlers to encode, and -``'strict'`` and ``'ignore'`` error handlers to decode. +Encode operand according to the ANSI codepage (CP_ACP). Availability: Windows only. +.. versionchanged:: 3.3 + Support any error handler. + .. versionchanged:: 3.2 Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used to encode, and ``'ignore'`` to decode. diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst index 945aa97..eb62968 100644 --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and codecs ------ +The :mod:`~encodings.mbcs` codec has been rewritten to correctly handle +``replace`` and ``ignore`` error handlers on all Windows versions. The +:mod:`~encodings.mbcs` codec now supports all error handlers, instead of +only ``replace`` to encode and ``ignore`` to decode. 
+ Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312', 'replace')`` now returns a ``\n`` after the replacement character. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index a1725e5..99ec44c 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( Py_ssize_t *consumed /* bytes consumed */ ); +PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( + int code_page, /* code page number */ + const char *string, /* encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( PyObject *unicode /* Unicode object */ ); @@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( #ifndef Py_LIMITED_API PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( const Py_UNICODE *data, /* Unicode char buffer */ - Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ const char *errors /* error handling */ ); #endif +PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( + int code_page, /* code page number */ + PyObject *unicode, /* Unicode object */ + const char *errors /* error handling */ + ); + #endif /* HAVE_MBCS */ /* --- Decimal Encoder ---------------------------------------------------- */ diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index e9ce95a..f714a44 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase): self.assertEqual(sout, b"\x80") +class CodePageTest(unittest.TestCase): + CP_UTF8 = 65001 + vista_or_later = (sys.getwindowsversion().major >= 6) + + def test_invalid_code_page(self): + self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') + 
self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') + self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a') + self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a') + + def test_code_page_name(self): + self.assertRaisesRegex(UnicodeEncodeError, 'cp932', + codecs.code_page_encode, 932, '\xff') + self.assertRaisesRegex(UnicodeDecodeError, 'cp932', + codecs.code_page_decode, 932, b'\x81\x00') + self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', + codecs.code_page_decode, self.CP_UTF8, b'\xff') + + def check_decode(self, cp, tests): + for raw, errors, expected in tests: + if expected is not None: + try: + decoded = codecs.code_page_decode(cp, raw, errors) + except UnicodeDecodeError as err: + self.fail('Unable to decode %a from "cp%s" with ' + 'errors=%r: %s' % (raw, cp, errors, err)) + self.assertEqual(decoded[0], expected, + '%a.decode("cp%s", %r)=%a != %a' + % (raw, cp, errors, decoded[0], expected)) + # assert 0 <= decoded[1] <= len(raw) + self.assertGreaterEqual(decoded[1], 0) + self.assertLessEqual(decoded[1], len(raw)) + else: + self.assertRaises(UnicodeDecodeError, + codecs.code_page_decode, cp, raw, errors) + + def check_encode(self, cp, tests): + for text, errors, expected in tests: + if expected is not None: + try: + encoded = codecs.code_page_encode(cp, text, errors) + except UnicodeEncodeError as err: + self.fail('Unable to encode %a to "cp%s" with ' + 'errors=%r: %s' % (text, cp, errors, err)) + self.assertEqual(encoded[0], expected, + '%a.encode("cp%s", %r)=%a != %a' + % (text, cp, errors, encoded[0], expected)) + self.assertEqual(encoded[1], len(text)) + else: + self.assertRaises(UnicodeEncodeError, + codecs.code_page_encode, cp, text, errors) + + def test_cp932(self): + self.check_encode(932, ( + ('abc', 'strict', b'abc'), + ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), + # not encodable + ('\xff', 'strict', None), + ('[\xff]', 'ignore', b'[]'), + ('[\xff]', 'replace', b'[y]'), + ('[\u20ac]', 'replace', 
b'[?]'), + )) + tests = [ + (b'abc', 'strict', 'abc'), + (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), + # invalid bytes + (b'\xff', 'strict', None), + (b'\xff', 'ignore', ''), + (b'\xff', 'replace', '\ufffd'), + (b'\x81\x00abc', 'strict', None), + (b'\x81\x00abc', 'ignore', '\x00abc'), + ] + if self.vista_or_later: + tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc')) + else: + tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc')) + self.check_decode(932, tests) + + def test_cp1252(self): + self.check_encode(1252, ( + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'\xe9\x80'), + ('\xff', 'strict', b'\xff'), + ('\u0141', 'strict', None), + ('\u0141', 'ignore', b''), + ('\u0141', 'replace', b'L'), + )) + self.check_decode(1252, ( + (b'abc', 'strict', 'abc'), + (b'\xe9\x80', 'strict', '\xe9\u20ac'), + (b'\xff', 'strict', '\xff'), + )) + + def test_cp_utf7(self): + cp = 65000 + self.check_encode(cp, ( + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'+AOkgrA-'), + ('\U0010ffff', 'strict', b'+2//f/w-'), + ('\udc80', 'strict', b'+3IA-'), + ('\ufffd', 'strict', b'+//0-'), + )) + self.check_decode(cp, ( + (b'abc', 'strict', 'abc'), + (b'+AOkgrA-', 'strict', '\xe9\u20ac'), + (b'+2//f/w-', 'strict', '\U0010ffff'), + (b'+3IA-', 'strict', '\udc80'), + (b'+//0-', 'strict', '\ufffd'), + # invalid bytes + (b'[+/]', 'strict', '[]'), + (b'[\xff]', 'strict', '[\xff]'), + )) + + def test_cp_utf8(self): + cp = self.CP_UTF8 + + tests = [ + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), + ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), + ] + if self.vista_or_later: + tests.append(('\udc80', 'strict', None)) + tests.append(('\udc80', 'ignore', b'')) + tests.append(('\udc80', 'replace', b'?')) + else: + tests.append(('\udc80', 'strict', b'\xed\xb2\x80')) + self.check_encode(cp, tests) + + tests = [ + (b'abc', 'strict', 'abc'), + (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), + (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), 
+ (b'\xef\xbf\xbd', 'strict', '\ufffd'), + (b'[\xc3\xa9]', 'strict', '[\xe9]'), + # invalid bytes + (b'[\xff]', 'strict', None), + (b'[\xff]', 'ignore', '[]'), + (b'[\xff]', 'replace', '[\ufffd]'), + ] + if self.vista_or_later: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', None), + (b'[\xed\xb2\x80]', 'ignore', '[]'), + (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), + )) + else: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), + )) + self.check_decode(cp, tests) + + def test_error_handlers(self): + self.check_encode(932, ( + ('\xff', 'backslashreplace', b'\\xff'), + ('\xff', 'xmlcharrefreplace', b'ÿ'), + )) + self.check_decode(932, ( + (b'\xff', 'surrogateescape', '\udcff'), + )) + if self.vista_or_later: + self.check_encode(self.CP_UTF8, ( + ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), + )) + + def test_multibyte_encoding(self): + self.check_decode(932, ( + (b'\x84\xe9\x80', 'ignore', '\u9a3e'), + (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'), + )) + self.check_decode(self.CP_UTF8, ( + (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), + (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), + )) + if self.vista_or_later: + self.check_encode(self.CP_UTF8, ( + ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), + ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), + )) + + def test_incremental(self): + decoded = codecs.code_page_decode(932, + b'\xe9\x80\xe9', 'strict', + False) + self.assertEqual(decoded, ('\u9a3e', 2)) + + decoded = codecs.code_page_decode(932, + b'\xe9\x80\xe9\x80', 'strict', + False) + self.assertEqual(decoded, ('\u9a3e\u9a3e', 4)) + + decoded = codecs.code_page_decode(932, + b'abc', 'strict', + False) + self.assertEqual(decoded, ('abc', 3)) + + def test_main(): support.run_unittest( UTF32Test, @@ -1772,6 +1969,7 @@ def test_main(): SurrogateEscapeTest, BomTest, TransformCodecTest, + CodePageTest, ) diff --git a/Misc/NEWS b/Misc/NEWS index 3d77bbf..ef69bf2 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 
+10,10 @@ What's New in Python 3.3 Alpha 1? Core and Builtins ----------------- +- Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore + error handlers on all Windows versions. The MBCS codec is now supporting all + error handlers, instead of only replace to encode and ignore to decode. + - Issue #13188: When called without an explicit traceback argument, generator.throw() now gets the traceback from the passed exception's ``__traceback__`` attribute. Patch by Petri Lehtinen. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 26c8788..be31fd2 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -612,6 +612,31 @@ mbcs_decode(PyObject *self, return codec_tuple(decoded, consumed); } +static PyObject * +code_page_decode(PyObject *self, + PyObject *args) +{ + Py_buffer pbuf; + const char *errors = NULL; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode", + &code_page, &pbuf, &errors, &final)) + return NULL; + consumed = pbuf.len; + + decoded = PyUnicode_DecodeCodePageStateful(code_page, + pbuf.buf, pbuf.len, errors, + final ? 
NULL : &consumed); + PyBuffer_Release(&pbuf); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + #endif /* HAVE_MBCS */ /* --- Encoder ------------------------------------------------------------ */ @@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self, return v; } +static PyObject * +code_page_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iO|z:code_page_encode", + &code_page, &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeCodePage(code_page, + str, + errors), + PyUnicode_GET_LENGTH(str)); + Py_DECREF(str); + return v; +} + #endif /* HAVE_MBCS */ /* --- Error handler registry --------------------------------------------- */ @@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = { #ifdef HAVE_MBCS {"mbcs_encode", mbcs_encode, METH_VARARGS}, {"mbcs_decode", mbcs_decode, METH_VARARGS}, + {"code_page_encode", code_page_encode, METH_VARARGS}, + {"code_page_decode", code_page_decode, METH_VARARGS}, #endif {"register_error", register_error, METH_VARARGS, register_error__doc__}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5f56cf7..9d11546 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -429,6 +429,10 @@ _PyUnicode_CheckConsistency(void *op, int check_content) } #endif +#ifdef HAVE_MBCS +static OSVERSIONINFOEX winver; +#endif + /* --- Bloom Filters ----------------------------------------------------- */ /* stuff to implement simple "bloom filters" for Unicode characters. 
@@ -6896,130 +6900,307 @@ PyUnicode_AsASCIIString(PyObject *unicode) #define NEED_RETRY #endif -/* XXX This code is limited to "true" double-byte encodings, as - a) it assumes an incomplete character consists of a single byte, and - b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte - encodings, see IsDBCSLeadByteEx documentation. */ +#ifndef WC_ERR_INVALID_CHARS +# define WC_ERR_INVALID_CHARS 0x0080 +#endif + +static char* +code_page_name(UINT code_page, PyObject **obj) +{ + *obj = NULL; + if (code_page == CP_ACP) + return "mbcs"; + if (code_page == CP_UTF7) + return "CP_UTF7"; + if (code_page == CP_UTF8) + return "CP_UTF8"; + + *obj = PyBytes_FromFormat("cp%u", code_page); + if (*obj == NULL) + return NULL; + return PyBytes_AS_STRING(*obj); +} static int -is_dbcs_lead_byte(const char *s, int offset) +is_dbcs_lead_byte(UINT code_page, const char *s, int offset) { const char *curr = s + offset; + const char *prev; - if (IsDBCSLeadByte(*curr)) { - const char *prev = CharPrev(s, curr); - return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); - } + if (!IsDBCSLeadByteEx(code_page, *curr)) + return 0; + + prev = CharPrevExA(code_page, s, curr, 0); + if (prev == curr) + return 1; + /* FIXME: This code is limited to "true" double-byte encodings, + as it assumes an incomplete character consists of a single + byte. */ + if (curr - prev == 2) + return 1; + if (!IsDBCSLeadByteEx(code_page, *prev)) + return 1; return 0; } +static DWORD +decode_code_page_flags(UINT code_page) +{ + if (code_page == CP_UTF7) { + /* The CP_UTF7 decoder only supports flags=0 */ + return 0; + } + else + return MB_ERR_INVALID_CHARS; +} + /* - * Decode MBCS string into unicode object. If 'final' is set, converts - * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. + * Decode a byte string from a Windows code page into unicode object in strict + * mode. 
+ * + * Returns consumed size if succeed, returns -2 on decode error, or raise a + * WindowsError and returns -1 on other error. */ static int -decode_mbcs(PyUnicodeObject **v, - const char *s, /* MBCS string */ - int size, /* sizeof MBCS string */ - int final, - const char *errors) +decode_code_page_strict(UINT code_page, + PyUnicodeObject **v, + const char *in, + int insize) { - Py_UNICODE *p; - Py_ssize_t n; - DWORD usize; - DWORD flags; + const DWORD flags = decode_code_page_flags(code_page); + Py_UNICODE *out; + DWORD outsize; - assert(size >= 0); + /* First get the size of the result */ + assert(insize > 0); + outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); + if (outsize <= 0) + goto error; - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) - flags = MB_ERR_INVALID_CHARS; - else if (strcmp(errors, "ignore")==0) - flags = 0; + if (*v == NULL) { + /* Create unicode object */ + *v = _PyUnicode_New(outsize); + if (*v == NULL) + return -1; + out = PyUnicode_AS_UNICODE(*v); + } else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; + /* Extend unicode object */ + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0) + return -1; + out = PyUnicode_AS_UNICODE(*v) + n; } - /* Skip trailing lead-byte unless 'final' is set */ - if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) - --size; + /* Do the conversion */ + outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); + if (outsize <= 0) + goto error; + return insize; - /* First get the size of the result */ - if (size > 0) { - usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); - if (usize==0) - goto mbcs_decode_error; - } else - usize = 0; +error: + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + return -2; + PyErr_SetFromWindowsErr(0); + return -1; +} + +/* + * Decode a byte string from a code page into unicode object 
with an error + * handler. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. + */ +static int +decode_code_page_errors(UINT code_page, + PyUnicodeObject **v, + const char *in, + int size, + const char *errors) +{ + const char *startin = in; + const char *endin = in + size; + const DWORD flags = decode_code_page_flags(code_page); + /* Ideally, we should get reason from FormatMessage. This is the Windows + 2000 English version of the message. */ + const char *reason = "No mapping for the Unicode character exists " + "in the target code page."; + /* each step cannot decode more than 1 character, but a character can be + represented as a surrogate pair */ + wchar_t buffer[2], *startout, *out; + int insize, outsize; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *encoding_obj = NULL; + char *encoding; + DWORD err; + int ret = -1; + + assert(size > 0); + + encoding = code_page_name(code_page, &encoding_obj); + if (encoding == NULL) + return -1; + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a + UnicodeDecodeError. 
*/ + make_decode_exception(&exc, encoding, in, size, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_CLEAR(exc); + } + goto error; + } if (*v == NULL) { /* Create unicode object */ - *v = _PyUnicode_New(usize); + if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { + PyErr_NoMemory(); + goto error; + } + *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); if (*v == NULL) - return -1; - n = 0; + goto error; + startout = PyUnicode_AS_UNICODE(*v); } else { /* Extend unicode object */ - n = PyUnicode_GET_SIZE(*v); - if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) - return -1; + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { + PyErr_NoMemory(); + goto error; + } + if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) + goto error; + startout = PyUnicode_AS_UNICODE(*v) + n; } - /* Do the conversion */ - if (usize > 0) { - p = PyUnicode_AS_UNICODE(*v) + n; - if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { - goto mbcs_decode_error; + /* Decode the byte string character per character */ + out = startout; + while (in < endin) + { + /* Decode a character */ + insize = 1; + do + { + outsize = MultiByteToWideChar(code_page, flags, + in, insize, + buffer, Py_ARRAY_LENGTH(buffer)); + if (outsize > 0) + break; + err = GetLastError(); + if (err != ERROR_NO_UNICODE_TRANSLATION + && err != ERROR_INSUFFICIENT_BUFFER) + { + PyErr_SetFromWindowsErr(0); + goto error; + } + insize++; + } + /* 4=maximum length of a UTF-8 sequence */ + while (insize <= 4 && (in + insize) <= endin); + + if (outsize <= 0) { + Py_ssize_t startinpos, endinpos, outpos; + + startinpos = in - startin; + endinpos = startinpos + 1; + outpos = out - PyUnicode_AS_UNICODE(*v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + encoding, reason, + &startin, &endin, &startinpos, &endinpos, &exc, &in, + v, &outpos, &out)) + { + goto error; + } + } + else { + 
in += insize; + memcpy(out, buffer, outsize * sizeof(wchar_t)); + out += outsize; } } - return size; -mbcs_decode_error: - /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then - we raise a UnicodeDecodeError - else it is a 'generic' - windows error - */ - if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { - /* Ideally, we should get reason from FormatMessage - this - is the Windows 2000 English version of the message - */ - PyObject *exc = NULL; - const char *reason = "No mapping for the Unicode character exists " - "in the target multi-byte code page."; - make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); - if (exc != NULL) { - PyCodec_StrictErrors(exc); - Py_DECREF(exc); + /* write a NUL character at the end */ + *out = 0; + + /* Extend unicode object */ + outsize = out - startout; + assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); + if (PyUnicode_Resize((PyObject**)v, outsize) < 0) + goto error; + ret = 0; + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return ret; +} + +/* + * Decode a byte string from a Windows code page into unicode object. If + * 'final' is set, converts trailing lead-byte too. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. 
+ */ +static int +decode_code_page(UINT code_page, + PyUnicodeObject **v, + const char *s, int size, + int final, const char *errors) +{ + int done; + + /* Skip trailing lead-byte unless 'final' is set */ + if (size == 0) { + if (*v == NULL) { + Py_INCREF(unicode_empty); + *v = (PyUnicodeObject*)unicode_empty; + if (*v == NULL) + return -1; } - } else { - PyErr_SetFromWindowsErrWithFilename(0, NULL); + return 0; } - return -1; + + if (!final && is_dbcs_lead_byte(code_page, s, size - 1)) + --size; + + done = decode_code_page_strict(code_page, v, s, size); + if (done == -2) + done = decode_code_page_errors(code_page, v, s, size, errors); + return done; } -PyObject * -PyUnicode_DecodeMBCSStateful(const char *s, - Py_ssize_t size, - const char *errors, - Py_ssize_t *consumed) +static PyObject * +decode_code_page_stateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { PyUnicodeObject *v = NULL; int done; + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + if (consumed) *consumed = 0; #ifdef NEED_RETRY retry: if (size > INT_MAX) - done = decode_mbcs(&v, s, INT_MAX, 0, errors); + done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors); else #endif - done = decode_mbcs(&v, s, (int)size, !consumed, errors); + done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors); if (done < 0) { Py_XDECREF(v); @@ -7036,6 +7217,7 @@ PyUnicode_DecodeMBCSStateful(const char *s, goto retry; } #endif + #ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); @@ -7047,6 +7229,25 @@ PyUnicode_DecodeMBCSStateful(const char *s, } PyObject * +PyUnicode_DecodeCodePageStateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_code_page_stateful(code_page, s, size, errors, consumed); +} + +PyObject * +PyUnicode_DecodeMBCSStateful(const char *s, + Py_ssize_t size, + const char *errors, 
+ Py_ssize_t *consumed) +{ + return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); +} + +PyObject * PyUnicode_DecodeMBCS(const char *s, Py_ssize_t size, const char *errors) @@ -7054,105 +7255,342 @@ PyUnicode_DecodeMBCS(const char *s, return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); } +static DWORD +encode_code_page_flags(UINT code_page, const char *errors) +{ + if (code_page == CP_UTF8) { + if (winver.dwMajorVersion >= 6) + /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista + and later */ + return WC_ERR_INVALID_CHARS; + else + /* CP_UTF8 only supports flags=0 on Windows older than Vista */ + return 0; + } + else if (code_page == CP_UTF7) { + /* CP_UTF7 only supports flags=0 */ + return 0; + } + else { + if (errors != NULL && strcmp(errors, "replace") == 0) + return 0; + else + return WC_NO_BEST_FIT_CHARS; + } +} + /* - * Convert unicode into string object (MBCS). - * Returns 0 if succeed, -1 otherwise. + * Encode a Unicode string to a Windows code page into a byte string in strict + * mode. + * + * Returns consumed characters if succeed, returns -2 on encode error, or raise + * a WindowsError and returns -1 on other error. 
*/ static int -encode_mbcs(PyObject **repr, - const Py_UNICODE *p, /* unicode */ - int size, /* size of unicode */ - const char* errors) +encode_code_page_strict(UINT code_page, PyObject **outbytes, + const Py_UNICODE *p, const int size, + const char* errors) { BOOL usedDefaultChar = FALSE; - BOOL *pusedDefaultChar; - int mbcssize; - Py_ssize_t n; + BOOL *pusedDefaultChar = &usedDefaultChar; + int outsize; PyObject *exc = NULL; - DWORD flags; + const DWORD flags = encode_code_page_flags(code_page, NULL); + char *out; - assert(size >= 0); + assert(size > 0); - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) { - flags = WC_NO_BEST_FIT_CHARS; + if (code_page != CP_UTF8 && code_page != CP_UTF7) pusedDefaultChar = &usedDefaultChar; - } else if (strcmp(errors, "replace")==0) { - flags = 0; + else pusedDefaultChar = NULL; - } else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; - } /* First get the size of the result */ - if (size > 0) { - mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, - NULL, pusedDefaultChar); - if (mbcssize == 0) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; - } - /* If we used a default char, then we failed! */ - if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; - } else { - mbcssize = 0; - } + outsize = WideCharToMultiByte(code_page, flags, + p, size, + NULL, 0, + NULL, pusedDefaultChar); + if (outsize <= 0) + goto error; + /* If we used a default char, then we failed! 
*/ + if (pusedDefaultChar && *pusedDefaultChar) + return -2; - if (*repr == NULL) { + if (*outbytes == NULL) { /* Create string object */ - *repr = PyBytes_FromStringAndSize(NULL, mbcssize); - if (*repr == NULL) + *outbytes = PyBytes_FromStringAndSize(NULL, outsize); + if (*outbytes == NULL) return -1; - n = 0; + out = PyBytes_AS_STRING(*outbytes); } else { /* Extend string object */ - n = PyBytes_Size(*repr); - if (_PyBytes_Resize(repr, n + mbcssize) < 0) + const Py_ssize_t n = PyBytes_Size(*outbytes); + if (outsize > PY_SSIZE_T_MAX - n) { + PyErr_NoMemory(); return -1; + } + if (_PyBytes_Resize(outbytes, n + outsize) < 0) + return -1; + out = PyBytes_AS_STRING(*outbytes) + n; } /* Do the conversion */ - if (size > 0) { - char *s = PyBytes_AS_STRING(*repr) + n; - if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, - NULL, pusedDefaultChar)) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; + outsize = WideCharToMultiByte(code_page, flags, + p, size, + out, outsize, + NULL, pusedDefaultChar); + if (outsize <= 0) + goto error; + if (pusedDefaultChar && *pusedDefaultChar) + return -2; + return 0; + +error: + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + return -2; + PyErr_SetFromWindowsErr(0); + return -1; +} + +/* + * Encode a Unicode string to a Windows code page into a byte string using a + * error handler. + * + * Returns consumed characters if succeed, or raise a WindowsError and returns + * -1 on other error. + */ +static int +encode_code_page_errors(UINT code_page, PyObject **outbytes, + const Py_UNICODE *in, const int insize, + const char* errors) +{ + const DWORD flags = encode_code_page_flags(code_page, errors); + const Py_UNICODE *startin = in; + const Py_UNICODE *endin = in + insize; + /* Ideally, we should get reason from FormatMessage. This is the Windows + 2000 English version of the message. 
*/ + const char *reason = "invalid character"; + /* 4=maximum length of a UTF-8 sequence */ + char buffer[4]; + BOOL usedDefaultChar = FALSE, *pusedDefaultChar; + Py_ssize_t outsize; + char *out; + int charsize; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *encoding_obj = NULL; + char *encoding; + int err; + Py_ssize_t startpos, newpos, newoutsize; + PyObject *rep; + int ret = -1; + + assert(insize > 0); + + encoding = code_page_name(code_page, &encoding_obj); + if (encoding == NULL) + return -1; + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, + then we raise a UnicodeEncodeError. */ + make_encode_exception(&exc, encoding, in, insize, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); } - if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; + Py_XDECREF(encoding_obj); + return -1; } - return 0; -mbcs_encode_error: - raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); + if (code_page != CP_UTF8 && code_page != CP_UTF7) + pusedDefaultChar = &usedDefaultChar; + else + pusedDefaultChar = NULL; + + if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { + PyErr_NoMemory(); + goto error; + } + outsize = insize * Py_ARRAY_LENGTH(buffer); + + if (*outbytes == NULL) { + /* Create string object */ + *outbytes = PyBytes_FromStringAndSize(NULL, outsize); + if (*outbytes == NULL) + goto error; + out = PyBytes_AS_STRING(*outbytes); + } + else { + /* Extend string object */ + Py_ssize_t n = PyBytes_Size(*outbytes); + if (n > PY_SSIZE_T_MAX - outsize) { + PyErr_NoMemory(); + goto error; + } + if (_PyBytes_Resize(outbytes, n + outsize) < 0) + goto error; + out = PyBytes_AS_STRING(*outbytes) + n; + } + + /* Encode the string character per character */ + while (in < endin) + { + if ((in + 2) <= endin + && 0xD800 <= in[0] && in[0] <= 0xDBFF + && 0xDC00 <= in[1] && in[1] <= 0xDFFF) + charsize = 2; + else + charsize = 1; + + 
outsize = WideCharToMultiByte(code_page, flags, + in, charsize, + buffer, Py_ARRAY_LENGTH(buffer), + NULL, pusedDefaultChar); + if (outsize > 0) { + if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) + { + in += charsize; + memcpy(out, buffer, outsize); + out += outsize; + continue; + } + } + else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { + PyErr_SetFromWindowsErr(0); + goto error; + } + + charsize = Py_MAX(charsize - 1, 1); + startpos = in - startin; + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, encoding, reason, + startin, insize, &exc, + startpos, startpos + charsize, &newpos); + if (rep == NULL) + goto error; + in = startin + newpos; + + if (PyBytes_Check(rep)) { + outsize = PyBytes_GET_SIZE(rep); + if (outsize != 1) { + Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); + newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + if (_PyBytes_Resize(outbytes, newoutsize) < 0) { + Py_DECREF(rep); + goto error; + } + out = PyBytes_AS_STRING(*outbytes) + offset; + } + memcpy(out, PyBytes_AS_STRING(rep), outsize); + out += outsize; + } + else { + Py_ssize_t i; + enum PyUnicode_Kind kind; + void *data; + + if (PyUnicode_READY(rep) < 0) { + Py_DECREF(rep); + goto error; + } + + outsize = PyUnicode_GET_LENGTH(rep); + if (outsize != 1) { + Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); + newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + if (_PyBytes_Resize(outbytes, newoutsize) < 0) { + Py_DECREF(rep); + goto error; + } + out = PyBytes_AS_STRING(*outbytes) + offset; + } + kind = PyUnicode_KIND(rep); + data = PyUnicode_DATA(rep); + for (i=0; i < outsize; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch > 127) { + raise_encode_exception(&exc, + encoding, + startin, insize, + startpos, startpos + charsize, + "unable to encode error handler result to ASCII"); + Py_DECREF(rep); + goto error; + } + *out = (unsigned char)ch; + out++; + } + } + Py_DECREF(rep); + } + /* write a NUL byte */ + *out = 0; + 
outsize = out - PyBytes_AS_STRING(*outbytes); + assert(outsize <= PyBytes_GET_SIZE(*outbytes)); + if (_PyBytes_Resize(outbytes, outsize) < 0) + goto error; + ret = 0; + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); Py_XDECREF(exc); - return -1; + return ret; } -PyObject * -PyUnicode_EncodeMBCS(const Py_UNICODE *p, - Py_ssize_t size, - const char *errors) +/* + * Encode a Unicode string to a Windows code page into a byte string. + * + * Returns consumed characters if succeed, or raise a WindowsError and returns + * -1 on other error. + */ +static int +encode_code_page_chunk(UINT code_page, PyObject **outbytes, + const Py_UNICODE *p, int size, + const char* errors) +{ + int done; + + if (size == 0) { + if (*outbytes == NULL) { + *outbytes = PyBytes_FromStringAndSize(NULL, 0); + if (*outbytes == NULL) + return -1; + } + return 0; + } + + done = encode_code_page_strict(code_page, outbytes, p, size, errors); + if (done == -2) + done = encode_code_page_errors(code_page, outbytes, p, size, errors); + return done; +} + +static PyObject * +encode_code_page(int code_page, + const Py_UNICODE *p, Py_ssize_t size, + const char *errors) { - PyObject *repr = NULL; + PyObject *outbytes = NULL; int ret; + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + #ifdef NEED_RETRY retry: if (size > INT_MAX) - ret = encode_mbcs(&repr, p, INT_MAX, errors); + ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors); else #endif - ret = encode_mbcs(&repr, p, (int)size, errors); + ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors); if (ret < 0) { - Py_XDECREF(repr); + Py_XDECREF(outbytes); return NULL; } @@ -7164,7 +7602,28 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p, } #endif - return repr; + return outbytes; +} + +PyObject * +PyUnicode_EncodeMBCS(const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + return encode_code_page(CP_ACP, p, size, errors); +} + +PyObject * 
+PyUnicode_EncodeCodePage(int code_page, + PyObject *unicode, + const char *errors) +{ + const Py_UNICODE *p; + Py_ssize_t size; + p = PyUnicode_AsUnicodeAndSize(unicode, &size); + if (p == NULL) + return NULL; + return encode_code_page(code_page, p, size, errors); } PyObject * @@ -13434,7 +13893,7 @@ PyTypeObject PyUnicode_Type = { /* Initialize the Unicode implementation */ -void _PyUnicode_Init(void) +int _PyUnicode_Init(void) { int i; @@ -13467,6 +13926,15 @@ void _PyUnicode_Init(void) Py_ARRAY_LENGTH(linebreak)); PyType_Ready(&EncodingMapType); + +#ifdef HAVE_MBCS + winver.dwOSVersionInfoSize = sizeof(winver); + if (!GetVersionEx((OSVERSIONINFO*)&winver)) { + PyErr_SetFromWindowsErr(0); + return -1; + } +#endif + return 0; } /* Finalize the Unicode implementation */ diff --git a/Python/pythonrun.c b/Python/pythonrun.c index a6e7c46..0f2f050 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -67,7 +67,7 @@ static void initsigs(void); static void call_py_exitfuncs(void); static void wait_for_thread_shutdown(void); static void call_ll_exitfuncs(void); -extern void _PyUnicode_Init(void); +extern int _PyUnicode_Init(void); extern void _PyUnicode_Fini(void); extern int _PyLong_Init(void); extern void PyLong_Fini(void); @@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs) Py_FatalError("Py_Initialize: can't make modules_reloading dictionary"); /* Init Unicode implementation; relies on the codec registry */ - _PyUnicode_Init(); + if (_PyUnicode_Init() < 0) + Py_FatalError("Py_Initialize: can't initialize unicode"); bimod = _PyBuiltin_Init(); if (bimod == NULL) -- cgit v0.12