summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/library/codecs.rst7
-rw-r--r--Doc/whatsnew/3.3.rst5
-rw-r--r--Include/unicodeobject.h16
-rw-r--r--Lib/test/test_codecs.py198
-rw-r--r--Misc/NEWS4
-rw-r--r--Modules/_codecsmodule.c50
-rw-r--r--Objects/unicodeobject.c750
-rw-r--r--Python/pythonrun.c5
8 files changed, 888 insertions, 147 deletions
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index 84593f2..2a7abf9 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1280,12 +1280,13 @@ functions can be used directly if desired.
.. module:: encodings.mbcs
:synopsis: Windows ANSI codepage
-Encode operand according to the ANSI codepage (CP_ACP). This codec only
-supports ``'strict'`` and ``'replace'`` error handlers to encode, and
-``'strict'`` and ``'ignore'`` error handlers to decode.
+Encode operand according to the ANSI codepage (CP_ACP).
Availability: Windows only.
+.. versionchanged:: 3.3
+ Support any error handler.
+
.. versionchanged:: 3.2
Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
to encode, and ``'ignore'`` to decode.
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
index 945aa97..eb62968 100644
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and
codecs
------
+The :mod:`~encodings.mbcs` codec has been rewritten to correctly handle the
+``replace`` and ``ignore`` error handlers on all Windows versions. The
+:mod:`~encodings.mbcs` codec now supports all error handlers, instead of
+only ``replace`` to encode and ``ignore`` to decode.
+
Multibyte CJK decoders now resynchronize faster. They only ignore the first
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
'replace')`` now returns a ``\n`` after the replacement character.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index a1725e5..99ec44c 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
Py_ssize_t *consumed /* bytes consumed */
);
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
+ int code_page, /* code page number */
+ const char *string, /* encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ Py_ssize_t *consumed /* bytes consumed */
+ );
+
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
PyObject *unicode /* Unicode object */
);
@@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
const Py_UNICODE *data, /* Unicode char buffer */
- Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
+ Py_ssize_t length, /* number of Py_UNICODE chars to encode */
const char *errors /* error handling */
);
#endif
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
+ int code_page, /* code page number */
+ PyObject *unicode, /* Unicode object */
+ const char *errors /* error handling */
+ );
+
#endif /* HAVE_MBCS */
/* --- Decimal Encoder ---------------------------------------------------- */
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index e9ce95a..f714a44 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase):
self.assertEqual(sout, b"\x80")
+@unittest.skipUnless(sys.platform == 'win32',
+                     'code pages are specific to Windows')
+class CodePageTest(unittest.TestCase):
+    """Tests for codecs.code_page_encode() and codecs.code_page_decode().
+
+    Each table entry used by check_encode()/check_decode() is a tuple
+    (input, errors, expected); expected=None means the call must raise.
+    """
+
+    CP_UTF8 = 65001
+    # sys.getwindowsversion() only exists on Windows. The class body is
+    # executed at import time (before the skip decorator is applied), so the
+    # call must be guarded or importing this module fails on other platforms.
+    vista_or_later = (sys.platform == 'win32'
+                      and sys.getwindowsversion().major >= 6)
+
+    def test_invalid_code_page(self):
+        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
+        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
+        self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
+        self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
+
+    def test_code_page_name(self):
+        # The encoding name reported in the exception identifies the code page
+        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
+            codecs.code_page_encode, 932, '\xff')
+        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
+            codecs.code_page_decode, 932, b'\x81\x00')
+        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
+            codecs.code_page_decode, self.CP_UTF8, b'\xff')
+
+    def check_decode(self, cp, tests):
+        """Decode every (raw, errors, expected) entry with code page cp."""
+        for raw, errors, expected in tests:
+            if expected is not None:
+                try:
+                    decoded = codecs.code_page_decode(cp, raw, errors)
+                except UnicodeDecodeError as err:
+                    self.fail('Unable to decode %a from "cp%s" with '
+                              'errors=%r: %s' % (raw, cp, errors, err))
+                self.assertEqual(decoded[0], expected,
+                    '%a.decode("cp%s", %r)=%a != %a'
+                    % (raw, cp, errors, decoded[0], expected))
+                # assert 0 <= decoded[1] <= len(raw)
+                self.assertGreaterEqual(decoded[1], 0)
+                self.assertLessEqual(decoded[1], len(raw))
+            else:
+                self.assertRaises(UnicodeDecodeError,
+                    codecs.code_page_decode, cp, raw, errors)
+
+    def check_encode(self, cp, tests):
+        """Encode every (text, errors, expected) entry with code page cp."""
+        for text, errors, expected in tests:
+            if expected is not None:
+                try:
+                    encoded = codecs.code_page_encode(cp, text, errors)
+                except UnicodeEncodeError as err:
+                    self.fail('Unable to encode %a to "cp%s" with '
+                              'errors=%r: %s' % (text, cp, errors, err))
+                self.assertEqual(encoded[0], expected,
+                    '%a.encode("cp%s", %r)=%a != %a'
+                    % (text, cp, errors, encoded[0], expected))
+                self.assertEqual(encoded[1], len(text))
+            else:
+                self.assertRaises(UnicodeEncodeError,
+                    codecs.code_page_encode, cp, text, errors)
+
+    def test_cp932(self):
+        self.check_encode(932, (
+            ('abc', 'strict', b'abc'),
+            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
+            # not encodable
+            ('\xff', 'strict', None),
+            ('[\xff]', 'ignore', b'[]'),
+            ('[\xff]', 'replace', b'[y]'),
+            ('[\u20ac]', 'replace', b'[?]'),
+        ))
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
+            # invalid bytes
+            (b'\xff', 'strict', None),
+            (b'\xff', 'ignore', ''),
+            (b'\xff', 'replace', '\ufffd'),
+            (b'\x81\x00abc', 'strict', None),
+            (b'\x81\x00abc', 'ignore', '\x00abc'),
+        ]
+        if self.vista_or_later:
+            tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc'))
+        else:
+            tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc'))
+        self.check_decode(932, tests)
+
+    def test_cp1252(self):
+        self.check_encode(1252, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
+            ('\xff', 'strict', b'\xff'),
+            ('\u0141', 'strict', None),
+            ('\u0141', 'ignore', b''),
+            ('\u0141', 'replace', b'L'),
+        ))
+        self.check_decode(1252, (
+            (b'abc', 'strict', 'abc'),
+            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
+            (b'\xff', 'strict', '\xff'),
+        ))
+
+    def test_cp_utf7(self):
+        cp = 65000
+        self.check_encode(cp, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
+            ('\U0010ffff', 'strict', b'+2//f/w-'),
+            ('\udc80', 'strict', b'+3IA-'),
+            ('\ufffd', 'strict', b'+//0-'),
+        ))
+        self.check_decode(cp, (
+            (b'abc', 'strict', 'abc'),
+            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
+            (b'+2//f/w-', 'strict', '\U0010ffff'),
+            (b'+3IA-', 'strict', '\udc80'),
+            (b'+//0-', 'strict', '\ufffd'),
+            # invalid bytes
+            (b'[+/]', 'strict', '[]'),
+            (b'[\xff]', 'strict', '[\xff]'),
+        ))
+
+    def test_cp_utf8(self):
+        cp = self.CP_UTF8
+
+        tests = [
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
+            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+        ]
+        if self.vista_or_later:
+            tests.append(('\udc80', 'strict', None))
+            tests.append(('\udc80', 'ignore', b''))
+            tests.append(('\udc80', 'replace', b'?'))
+        else:
+            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
+        self.check_encode(cp, tests)
+
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
+            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
+            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
+            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
+            # invalid bytes
+            (b'[\xff]', 'strict', None),
+            (b'[\xff]', 'ignore', '[]'),
+            (b'[\xff]', 'replace', '[\ufffd]'),
+        ]
+        if self.vista_or_later:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', None),
+                (b'[\xed\xb2\x80]', 'ignore', '[]'),
+                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
+            ))
+        else:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
+            ))
+        self.check_decode(cp, tests)
+
+    def test_error_handlers(self):
+        self.check_encode(932, (
+            ('\xff', 'backslashreplace', b'\\xff'),
+            ('\xff', 'xmlcharrefreplace', b'&#255;'),
+        ))
+        self.check_decode(932, (
+            (b'\xff', 'surrogateescape', '\udcff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
+            ))
+
+    def test_multibyte_encoding(self):
+        self.check_decode(932, (
+            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
+            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
+        ))
+        self.check_decode(self.CP_UTF8, (
+            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
+            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+            ))
+
+    def test_incremental(self):
+        # final=False: a trailing lead byte is left unconsumed
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e', 2))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9\x80', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'abc', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('abc', 3))
+
def test_main():
support.run_unittest(
UTF32Test,
@@ -1772,6 +1969,7 @@ def test_main():
SurrogateEscapeTest,
BomTest,
TransformCodecTest,
+ CodePageTest,
)
diff --git a/Misc/NEWS b/Misc/NEWS
index 3d77bbf..ef69bf2 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
+- Issue #12281: Rewrite the MBCS codec to correctly handle the replace and
+  ignore error handlers on all Windows versions. The MBCS codec now supports
+  all error handlers, instead of only replace to encode and ignore to decode.
+
- Issue #13188: When called without an explicit traceback argument,
generator.throw() now gets the traceback from the passed exception's
``__traceback__`` attribute. Patch by Petri Lehtinen.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index 26c8788..be31fd2 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -612,6 +612,31 @@ mbcs_decode(PyObject *self,
return codec_tuple(decoded, consumed);
}
+/* _codecs.code_page_decode(code_page, data[, errors[, final]])
+
+   Decode a bytes-like object with the given Windows code page and return
+   the (decoded, consumed) pair built by codec_tuple(). */
+static PyObject *
+code_page_decode(PyObject *self, PyObject *args)
+{
+    int cp;
+    Py_buffer view;
+    const char *errors = NULL;
+    int is_final = 0;
+    Py_ssize_t nconsumed;
+    Py_ssize_t *pconsumed;
+    PyObject *text;
+
+    if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
+                          &cp, &view, &errors, &is_final))
+        return NULL;
+
+    /* Default to "everything consumed"; the decoder only receives the
+       counter (and may update it) when this is not the final chunk. */
+    nconsumed = view.len;
+    pconsumed = is_final ? NULL : &nconsumed;
+
+    text = PyUnicode_DecodeCodePageStateful(cp, view.buf, view.len,
+                                            errors, pconsumed);
+    /* The buffer is released on both the success and the failure path */
+    PyBuffer_Release(&view);
+    if (text != NULL)
+        return codec_tuple(text, nconsumed);
+    return NULL;
+}
+
#endif /* HAVE_MBCS */
/* --- Encoder ------------------------------------------------------------ */
@@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self,
return v;
}
+/* _codecs.code_page_encode(code_page, str[, errors])
+
+   Encode a string with the given Windows code page and return the pair
+   built by codec_tuple() from the encoded bytes and the input length. */
+static PyObject *
+code_page_encode(PyObject *self, PyObject *args)
+{
+    int cp;
+    PyObject *input;
+    PyObject *text;
+    PyObject *encoded;
+    PyObject *result;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
+                          &cp, &input, &errors))
+        return NULL;
+
+    /* New reference to a unicode object; released before returning */
+    text = PyUnicode_FromObject(input);
+    if (text == NULL)
+        return NULL;
+
+    /* The encoder result is handed to codec_tuple() unchecked
+       (NOTE(review): presumably codec_tuple() propagates NULL -- confirm) */
+    encoded = PyUnicode_EncodeCodePage(cp, text, errors);
+    result = codec_tuple(encoded, PyUnicode_GET_LENGTH(text));
+    Py_DECREF(text);
+    return result;
+}
+
#endif /* HAVE_MBCS */
/* --- Error handler registry --------------------------------------------- */
@@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = {
#ifdef HAVE_MBCS
{"mbcs_encode", mbcs_encode, METH_VARARGS},
{"mbcs_decode", mbcs_decode, METH_VARARGS},
+ {"code_page_encode", code_page_encode, METH_VARARGS},
+ {"code_page_decode", code_page_decode, METH_VARARGS},
#endif
{"register_error", register_error, METH_VARARGS,
register_error__doc__},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5f56cf7..9d11546 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -429,6 +429,10 @@ _PyUnicode_CheckConsistency(void *op, int check_content)
}
#endif
+#ifdef HAVE_MBCS
+static OSVERSIONINFOEX winver;
+#endif
+
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -6896,130 +6900,307 @@ PyUnicode_AsASCIIString(PyObject *unicode)
#define NEED_RETRY
#endif
-/* XXX This code is limited to "true" double-byte encodings, as
- a) it assumes an incomplete character consists of a single byte, and
- b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
- encodings, see IsDBCSLeadByteEx documentation. */
+#ifndef WC_ERR_INVALID_CHARS
+# define WC_ERR_INVALID_CHARS 0x0080
+#endif
+
+/* Return the encoding name used in error messages for a code page.
+
+   CP_ACP, CP_UTF7 and CP_UTF8 map to static names and leave *obj NULL.
+   Any other code page is formatted as "cp<number>" into a new bytes object
+   stored in *obj; the returned pointer refers to that object's buffer, so
+   the caller has to keep *obj alive while using the name and release it
+   afterwards. Returns NULL (with *obj NULL) if the bytes object cannot be
+   created. */
+static char*
+code_page_name(UINT code_page, PyObject **obj)
+{
+    *obj = NULL;
+
+    switch (code_page) {
+    case CP_ACP:
+        return "mbcs";
+    case CP_UTF7:
+        return "CP_UTF7";
+    case CP_UTF8:
+        return "CP_UTF8";
+    default:
+        break;
+    }
+
+    *obj = PyBytes_FromFormat("cp%u", code_page);
+    return (*obj != NULL) ? PyBytes_AS_STRING(*obj) : NULL;
+}
static int
-is_dbcs_lead_byte(const char *s, int offset)
+is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
{
const char *curr = s + offset;
+ const char *prev;
- if (IsDBCSLeadByte(*curr)) {
- const char *prev = CharPrev(s, curr);
- return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
- }
+ if (!IsDBCSLeadByteEx(code_page, *curr))
+ return 0;
+
+ prev = CharPrevExA(code_page, s, curr, 0);
+ if (prev == curr)
+ return 1;
+ /* FIXME: This code is limited to "true" double-byte encodings,
+ as it assumes an incomplete character consists of a single
+ byte. */
+ if (curr - prev == 2)
+ return 1;
+ if (!IsDBCSLeadByteEx(code_page, *prev))
+ return 1;
return 0;
}
+/* Flags passed to MultiByteToWideChar() for the given code page. */
+static DWORD
+decode_code_page_flags(UINT code_page)
+{
+    /* The CP_UTF7 decoder only supports flags=0; every other code page is
+       decoded with MB_ERR_INVALID_CHARS. */
+    return (code_page == CP_UTF7) ? 0 : MB_ERR_INVALID_CHARS;
+}
+
/*
- * Decode MBCS string into unicode object. If 'final' is set, converts
- * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
+ * Decode a byte string from a Windows code page into unicode object in strict
+ * mode.
+ *
+ * Returns consumed size if succeed, returns -2 on decode error, or raise a
+ * WindowsError and returns -1 on other error.
*/
static int
-decode_mbcs(PyUnicodeObject **v,
- const char *s, /* MBCS string */
- int size, /* sizeof MBCS string */
- int final,
- const char *errors)
+decode_code_page_strict(UINT code_page,
+ PyUnicodeObject **v,
+ const char *in,
+ int insize)
{
- Py_UNICODE *p;
- Py_ssize_t n;
- DWORD usize;
- DWORD flags;
+ const DWORD flags = decode_code_page_flags(code_page);
+ Py_UNICODE *out;
+ DWORD outsize;
- assert(size >= 0);
+ /* First get the size of the result */
+ assert(insize > 0);
+ outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
+ if (outsize <= 0)
+ goto error;
- /* check and handle 'errors' arg */
- if (errors==NULL || strcmp(errors, "strict")==0)
- flags = MB_ERR_INVALID_CHARS;
- else if (strcmp(errors, "ignore")==0)
- flags = 0;
+ if (*v == NULL) {
+ /* Create unicode object */
+ *v = _PyUnicode_New(outsize);
+ if (*v == NULL)
+ return -1;
+ out = PyUnicode_AS_UNICODE(*v);
+ }
else {
- PyErr_Format(PyExc_ValueError,
- "mbcs encoding does not support errors='%s'",
- errors);
- return -1;
+ /* Extend unicode object */
+ Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+ if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
+ return -1;
+ out = PyUnicode_AS_UNICODE(*v) + n;
}
- /* Skip trailing lead-byte unless 'final' is set */
- if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
- --size;
+ /* Do the conversion */
+ outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
+ if (outsize <= 0)
+ goto error;
+ return insize;
- /* First get the size of the result */
- if (size > 0) {
- usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
- if (usize==0)
- goto mbcs_decode_error;
- } else
- usize = 0;
+error:
+ if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+ return -2;
+ PyErr_SetFromWindowsErr(0);
+ return -1;
+}
+
+/*
+ * Decode a byte string from a code page into unicode object with an error
+ * handler.
+ *
+ * decode_code_page() calls this after decode_code_page_strict() reported a
+ * decode error (-2).  With errors=NULL or "strict" a UnicodeDecodeError is
+ * raised immediately; otherwise the input is decoded character per character
+ * and the error handler is called for each undecodable byte.
+ *
+ * If *v is NULL a new unicode object is created, else the decoded text is
+ * appended to *v.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page_errors(UINT code_page,
+                        PyUnicodeObject **v,
+                        const char *in,
+                        int size,
+                        const char *errors)
+{
+    const char *startin = in;
+    const char *endin = in + size;
+    const DWORD flags = decode_code_page_flags(code_page);
+    /* Ideally, we should get reason from FormatMessage. This is the Windows
+       2000 English version of the message. */
+    const char *reason = "No mapping for the Unicode character exists "
+                         "in the target code page.";
+    /* each step cannot decode more than 1 character, but a character can be
+       represented as a surrogate pair */
+    wchar_t buffer[2], *startout, *out;
+    int insize, outsize;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *encoding_obj = NULL;
+    char *encoding;
+    DWORD err;
+    int ret = -1;
+
+    assert(size > 0);
+
+    encoding = code_page_name(code_page, &encoding_obj);
+    if (encoding == NULL)
+        return -1;
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
+           UnicodeDecodeError. */
+        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_CLEAR(exc);
+        }
+        goto error;
+    }
     if (*v == NULL) {
         /* Create unicode object */
-        *v = _PyUnicode_New(usize);
+        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
         if (*v == NULL)
-            return -1;
-        n = 0;
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v);
     }
     else {
         /* Extend unicode object */
-        n = PyUnicode_GET_SIZE(*v);
-        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
-            return -1;
+        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v) + n;
     }
-    /* Do the conversion */
-    if (usize > 0) {
-        p = PyUnicode_AS_UNICODE(*v) + n;
-        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
-            goto mbcs_decode_error;
+    /* Decode the byte string character per character */
+    out = startout;
+    while (in < endin)
+    {
+        /* Decode a character: retry with one more input byte until the
+           conversion succeeds or the sequence is known to be invalid */
+        insize = 1;
+        do
+        {
+            outsize = MultiByteToWideChar(code_page, flags,
+                                          in, insize,
+                                          buffer, Py_ARRAY_LENGTH(buffer));
+            if (outsize > 0)
+                break;
+            err = GetLastError();
+            if (err != ERROR_NO_UNICODE_TRANSLATION
+                && err != ERROR_INSUFFICIENT_BUFFER)
+            {
+                PyErr_SetFromWindowsErr(0);
+                goto error;
+            }
+            insize++;
+        }
+        /* 4=maximum length of a UTF-8 sequence */
+        while (insize <= 4 && (in + insize) <= endin);
+
+        if (outsize <= 0) {
+            Py_ssize_t startinpos, endinpos, outpos;
+
+            /* Report a single undecodable byte to the error handler; the
+               handler updates 'in' and 'out' (and may resize *v). */
+            startinpos = in - startin;
+            endinpos = startinpos + 1;
+            outpos = out - PyUnicode_AS_UNICODE(*v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    encoding, reason,
+                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
+                    v, &outpos, &out))
+            {
+                goto error;
+            }
+        }
+        else {
+            in += insize;
+            memcpy(out, buffer, outsize * sizeof(wchar_t));
+            out += outsize;
         }
     }
-    return size;
-mbcs_decode_error:
-    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
-       we raise a UnicodeDecodeError - else it is a 'generic'
-       windows error
-     */
-    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
-        /* Ideally, we should get reason from FormatMessage - this
-           is the Windows 2000 English version of the message
-        */
-        PyObject *exc = NULL;
-        const char *reason = "No mapping for the Unicode character exists "
-                             "in the target multi-byte code page.";
-        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
-        if (exc != NULL) {
-            PyCodec_StrictErrors(exc);
-            Py_DECREF(exc);
+    /* write a NUL character at the end */
+    *out = 0;
+
+    /* Shrink the unicode object to the characters actually written.  The
+       length is measured from the start of the object, not from startout:
+       when *v was extended the existing characters must be kept, and
+       startout is stale if the error handler reallocated *v. */
+    outsize = out - PyUnicode_AS_UNICODE(*v);
+    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
+    if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
+        goto error;
+    /* Report the consumed size, as decode_code_page_strict() does, so that
+       the caller can update its 'consumed' counter. */
+    ret = size;
+
+error:
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return ret;
+}
+
+/*
+ * Decode a byte string from a Windows code page into unicode object. If
+ * 'final' is set, converts trailing lead-byte too.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page(UINT code_page,
+ PyUnicodeObject **v,
+ const char *s, int size,
+ int final, const char *errors)
+{
+ int done;
+
+ /* Skip trailing lead-byte unless 'final' is set */
+ if (size == 0) {
+ if (*v == NULL) {
+ Py_INCREF(unicode_empty);
+ *v = (PyUnicodeObject*)unicode_empty;
+ if (*v == NULL)
+ return -1;
}
- } else {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return 0;
}
- return -1;
+
+ if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
+ --size;
+
+ done = decode_code_page_strict(code_page, v, s, size);
+ if (done == -2)
+ done = decode_code_page_errors(code_page, v, s, size, errors);
+ return done;
}
-PyObject *
-PyUnicode_DecodeMBCSStateful(const char *s,
- Py_ssize_t size,
- const char *errors,
- Py_ssize_t *consumed)
+static PyObject *
+decode_code_page_stateful(int code_page,
+ const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
{
PyUnicodeObject *v = NULL;
int done;
+ if (code_page < 0) {
+ PyErr_SetString(PyExc_ValueError, "invalid code page number");
+ return NULL;
+ }
+
if (consumed)
*consumed = 0;
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- done = decode_mbcs(&v, s, INT_MAX, 0, errors);
+ done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
else
#endif
- done = decode_mbcs(&v, s, (int)size, !consumed, errors);
+ done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
if (done < 0) {
Py_XDECREF(v);
@@ -7036,6 +7217,7 @@ PyUnicode_DecodeMBCSStateful(const char *s,
goto retry;
}
#endif
+
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&v)) {
Py_DECREF(v);
@@ -7047,6 +7229,25 @@ PyUnicode_DecodeMBCSStateful(const char *s,
}
PyObject *
+PyUnicode_DecodeCodePageStateful(int code_page,
+ const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ return decode_code_page_stateful(code_page, s, size, errors, consumed);
+}
+
+PyObject *
+PyUnicode_DecodeMBCSStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
+}
+
+PyObject *
PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
const char *errors)
@@ -7054,105 +7255,342 @@ PyUnicode_DecodeMBCS(const char *s,
return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
+/* Flags passed to WideCharToMultiByte() for the given code page and error
+   handler name (errors may be NULL). */
+static DWORD
+encode_code_page_flags(UINT code_page, const char *errors)
+{
+    switch (code_page) {
+    case CP_UTF7:
+        /* CP_UTF7 only supports flags=0 */
+        return 0;
+
+    case CP_UTF8:
+        /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista and later,
+           and only flags=0 on older Windows versions */
+        return (winver.dwMajorVersion >= 6) ? WC_ERR_INVALID_CHARS : 0;
+
+    default:
+        /* Only the "replace" error handler encodes with flags=0; any other
+           handler (or none) uses WC_NO_BEST_FIT_CHARS */
+        if (errors == NULL || strcmp(errors, "replace") != 0)
+            return WC_NO_BEST_FIT_CHARS;
+        return 0;
+    }
+}
+
/*
- * Convert unicode into string object (MBCS).
- * Returns 0 if succeed, -1 otherwise.
+ * Encode a Unicode string to a Windows code page into a byte string in strict
+ * mode.
+ *
+ * Returns consumed characters if succeed, returns -2 on encode error, or raise
+ * a WindowsError and returns -1 on other error.
*/
static int
-encode_mbcs(PyObject **repr,
- const Py_UNICODE *p, /* unicode */
- int size, /* size of unicode */
- const char* errors)
+encode_code_page_strict(UINT code_page, PyObject **outbytes,
+ const Py_UNICODE *p, const int size,
+ const char* errors)
{
BOOL usedDefaultChar = FALSE;
- BOOL *pusedDefaultChar;
- int mbcssize;
- Py_ssize_t n;
+ BOOL *pusedDefaultChar = &usedDefaultChar;
+ int outsize;
PyObject *exc = NULL;
- DWORD flags;
+ const DWORD flags = encode_code_page_flags(code_page, NULL);
+ char *out;
- assert(size >= 0);
+ assert(size > 0);
- /* check and handle 'errors' arg */
- if (errors==NULL || strcmp(errors, "strict")==0) {
- flags = WC_NO_BEST_FIT_CHARS;
+ if (code_page != CP_UTF8 && code_page != CP_UTF7)
pusedDefaultChar = &usedDefaultChar;
- } else if (strcmp(errors, "replace")==0) {
- flags = 0;
+ else
pusedDefaultChar = NULL;
- } else {
- PyErr_Format(PyExc_ValueError,
- "mbcs encoding does not support errors='%s'",
- errors);
- return -1;
- }
/* First get the size of the result */
- if (size > 0) {
- mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
- NULL, pusedDefaultChar);
- if (mbcssize == 0) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
- }
- /* If we used a default char, then we failed! */
- if (pusedDefaultChar && *pusedDefaultChar)
- goto mbcs_encode_error;
- } else {
- mbcssize = 0;
- }
+ outsize = WideCharToMultiByte(code_page, flags,
+ p, size,
+ NULL, 0,
+ NULL, pusedDefaultChar);
+ if (outsize <= 0)
+ goto error;
+ /* If we used a default char, then we failed! */
+ if (pusedDefaultChar && *pusedDefaultChar)
+ return -2;
- if (*repr == NULL) {
+ if (*outbytes == NULL) {
/* Create string object */
- *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
- if (*repr == NULL)
+ *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+ if (*outbytes == NULL)
return -1;
- n = 0;
+ out = PyBytes_AS_STRING(*outbytes);
}
else {
/* Extend string object */
- n = PyBytes_Size(*repr);
- if (_PyBytes_Resize(repr, n + mbcssize) < 0)
+ const Py_ssize_t n = PyBytes_Size(*outbytes);
+ if (outsize > PY_SSIZE_T_MAX - n) {
+ PyErr_NoMemory();
return -1;
+ }
+ if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+ return -1;
+ out = PyBytes_AS_STRING(*outbytes) + n;
}
/* Do the conversion */
- if (size > 0) {
- char *s = PyBytes_AS_STRING(*repr) + n;
- if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
- NULL, pusedDefaultChar)) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
+ outsize = WideCharToMultiByte(code_page, flags,
+ p, size,
+ out, outsize,
+ NULL, pusedDefaultChar);
+ if (outsize <= 0)
+ goto error;
+ if (pusedDefaultChar && *pusedDefaultChar)
+ return -2;
+ return 0;
+
+error:
+ if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+ return -2;
+ PyErr_SetFromWindowsErr(0);
+ return -1;
+}
+
+/*
+ * Encode a Unicode string to a Windows code page into a byte string using a
+ * error handler.
+ *
+ * Returns consumed characters if succeed, or raise a WindowsError and returns
+ * -1 on other error.
+ */
+static int
+encode_code_page_errors(UINT code_page, PyObject **outbytes,
+ const Py_UNICODE *in, const int insize,
+ const char* errors)
+{
+ const DWORD flags = encode_code_page_flags(code_page, errors);
+ const Py_UNICODE *startin = in;
+ const Py_UNICODE *endin = in + insize;
+ /* Ideally, we should get reason from FormatMessage. This is the Windows
+ 2000 English version of the message. */
+ const char *reason = "invalid character";
+ /* 4=maximum length of a UTF-8 sequence */
+ char buffer[4];
+ BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
+ Py_ssize_t outsize;
+ char *out;
+ int charsize;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ PyObject *encoding_obj = NULL;
+ char *encoding;
+ int err;
+ Py_ssize_t startpos, newpos, newoutsize;
+ PyObject *rep;
+ int ret = -1;
+
+ assert(insize > 0);
+
+ encoding = code_page_name(code_page, &encoding_obj);
+ if (encoding == NULL)
+ return -1;
+
+ if (errors == NULL || strcmp(errors, "strict") == 0) {
+ /* The last error was ERROR_NO_UNICODE_TRANSLATION,
+ then we raise a UnicodeEncodeError. */
+ make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_DECREF(exc);
}
- if (pusedDefaultChar && *pusedDefaultChar)
- goto mbcs_encode_error;
+ Py_XDECREF(encoding_obj);
+ return -1;
}
- return 0;
-mbcs_encode_error:
- raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
+ if (code_page != CP_UTF8 && code_page != CP_UTF7)
+ pusedDefaultChar = &usedDefaultChar;
+ else
+ pusedDefaultChar = NULL;
+
+ if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ outsize = insize * Py_ARRAY_LENGTH(buffer);
+
+ if (*outbytes == NULL) {
+ /* Create string object */
+ *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+ if (*outbytes == NULL)
+ goto error;
+ out = PyBytes_AS_STRING(*outbytes);
+ }
+ else {
+ /* Extend string object */
+ Py_ssize_t n = PyBytes_Size(*outbytes);
+ if (n > PY_SSIZE_T_MAX - outsize) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+ goto error;
+ out = PyBytes_AS_STRING(*outbytes) + n;
+ }
+
+ /* Encode the string character per character */
+ while (in < endin)
+ {
+ if ((in + 2) <= endin
+ && 0xD800 <= in[0] && in[0] <= 0xDBFF
+ && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
+ charsize = 2;
+ else
+ charsize = 1;
+
+ outsize = WideCharToMultiByte(code_page, flags,
+ in, charsize,
+ buffer, Py_ARRAY_LENGTH(buffer),
+ NULL, pusedDefaultChar);
+ if (outsize > 0) {
+ if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
+ {
+ in += charsize;
+ memcpy(out, buffer, outsize);
+ out += outsize;
+ continue;
+ }
+ }
+ else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+ PyErr_SetFromWindowsErr(0);
+ goto error;
+ }
+
+ charsize = Py_MAX(charsize - 1, 1);
+ startpos = in - startin;
+ rep = unicode_encode_call_errorhandler(
+ errors, &errorHandler, encoding, reason,
+ startin, insize, &exc,
+ startpos, startpos + charsize, &newpos);
+ if (rep == NULL)
+ goto error;
+ in = startin + newpos;
+
+ if (PyBytes_Check(rep)) {
+ outsize = PyBytes_GET_SIZE(rep);
+ if (outsize != 1) {
+ Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+ newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+ if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+ Py_DECREF(rep);
+ goto error;
+ }
+ out = PyBytes_AS_STRING(*outbytes) + offset;
+ }
+ memcpy(out, PyBytes_AS_STRING(rep), outsize);
+ out += outsize;
+ }
+ else {
+ Py_ssize_t i;
+ enum PyUnicode_Kind kind;
+ void *data;
+
+ if (PyUnicode_READY(rep) < 0) {
+ Py_DECREF(rep);
+ goto error;
+ }
+
+ outsize = PyUnicode_GET_LENGTH(rep);
+ if (outsize != 1) {
+ Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+ newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+ if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+ Py_DECREF(rep);
+ goto error;
+ }
+ out = PyBytes_AS_STRING(*outbytes) + offset;
+ }
+ kind = PyUnicode_KIND(rep);
+ data = PyUnicode_DATA(rep);
+ for (i=0; i < outsize; i++) {
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+ if (ch > 127) {
+ raise_encode_exception(&exc,
+ encoding,
+ startin, insize,
+ startpos, startpos + charsize,
+ "unable to encode error handler result to ASCII");
+ Py_DECREF(rep);
+ goto error;
+ }
+ *out = (unsigned char)ch;
+ out++;
+ }
+ }
+ Py_DECREF(rep);
+ }
+ /* write a NUL byte */
+ *out = 0;
+ outsize = out - PyBytes_AS_STRING(*outbytes);
+ assert(outsize <= PyBytes_GET_SIZE(*outbytes));
+ if (_PyBytes_Resize(outbytes, outsize) < 0)
+ goto error;
+ ret = 0;
+
+error:
+ Py_XDECREF(encoding_obj);
+ Py_XDECREF(errorHandler);
Py_XDECREF(exc);
- return -1;
+ return ret;
}
-PyObject *
-PyUnicode_EncodeMBCS(const Py_UNICODE *p,
- Py_ssize_t size,
- const char *errors)
+/*
+ * Encode a chunk of a Unicode string to a Windows code page, storing or
+ * appending the result in *outbytes.
+ *
+ * The chunk is first encoded with encode_code_page_strict(); if that reports
+ * an encode error (-2), encode_code_page_errors() is used instead.  An empty
+ * chunk only makes sure that *outbytes exists.
+ *
+ * Returns 0 on success, or -1 with an exception set on error.
+ */
+static int
+encode_code_page_chunk(UINT code_page, PyObject **outbytes,
+                       const Py_UNICODE *p, int size,
+                       const char* errors)
+{
+    int status;
+
+    if (size != 0) {
+        status = encode_code_page_strict(code_page, outbytes, p, size, errors);
+        if (status != -2)
+            return status;
+        /* -2: unencodable character, fall back to the error handler */
+        return encode_code_page_errors(code_page, outbytes, p, size, errors);
+    }
+
+    /* Nothing to encode: create an empty bytes object if there is none */
+    if (*outbytes != NULL)
+        return 0;
+    *outbytes = PyBytes_FromStringAndSize(NULL, 0);
+    return (*outbytes == NULL) ? -1 : 0;
+}
+
+static PyObject *
+encode_code_page(int code_page,
+ const Py_UNICODE *p, Py_ssize_t size,
+ const char *errors)
{
- PyObject *repr = NULL;
+ PyObject *outbytes = NULL;
int ret;
+ if (code_page < 0) {
+ PyErr_SetString(PyExc_ValueError, "invalid code page number");
+ return NULL;
+ }
+
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- ret = encode_mbcs(&repr, p, INT_MAX, errors);
+ ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
else
#endif
- ret = encode_mbcs(&repr, p, (int)size, errors);
+ ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
if (ret < 0) {
- Py_XDECREF(repr);
+ Py_XDECREF(outbytes);
return NULL;
}
@@ -7164,7 +7602,28 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p,
}
#endif
- return repr;
+ return outbytes;
+}
+
+/* Encode a Py_UNICODE buffer with the ANSI code page (CP_ACP); thin wrapper
+   around encode_code_page(). */
+PyObject *
+PyUnicode_EncodeMBCS(const Py_UNICODE *p, Py_ssize_t size, const char *errors)
+{
+    PyObject *encoded = encode_code_page(CP_ACP, p, size, errors);
+    return encoded;
+}
+
+/* Encode a unicode object with the given Windows code page.  Returns NULL
+   if the Py_UNICODE representation of the object cannot be obtained,
+   otherwise the result of encode_code_page(). */
+PyObject *
+PyUnicode_EncodeCodePage(int code_page, PyObject *unicode, const char *errors)
+{
+    Py_ssize_t length;
+    const Py_UNICODE *chars = PyUnicode_AsUnicodeAndSize(unicode, &length);
+
+    return (chars != NULL)
+        ? encode_code_page(code_page, chars, length, errors)
+        : NULL;
 }
PyObject *
@@ -13434,7 +13893,7 @@ PyTypeObject PyUnicode_Type = {
/* Initialize the Unicode implementation */
-void _PyUnicode_Init(void)
+int _PyUnicode_Init(void)
{
int i;
@@ -13467,6 +13926,15 @@ void _PyUnicode_Init(void)
Py_ARRAY_LENGTH(linebreak));
PyType_Ready(&EncodingMapType);
+
+#ifdef HAVE_MBCS
+ winver.dwOSVersionInfoSize = sizeof(winver);
+ if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
+ PyErr_SetFromWindowsErr(0);
+ return -1;
+ }
+#endif
+ return 0;
}
/* Finalize the Unicode implementation */
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index a6e7c46..0f2f050 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -67,7 +67,7 @@ static void initsigs(void);
static void call_py_exitfuncs(void);
static void wait_for_thread_shutdown(void);
static void call_ll_exitfuncs(void);
-extern void _PyUnicode_Init(void);
+extern int _PyUnicode_Init(void);
extern void _PyUnicode_Fini(void);
extern int _PyLong_Init(void);
extern void PyLong_Fini(void);
@@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs)
Py_FatalError("Py_Initialize: can't make modules_reloading dictionary");
/* Init Unicode implementation; relies on the codec registry */
- _PyUnicode_Init();
+ if (_PyUnicode_Init() < 0)
+ Py_FatalError("Py_Initialize: can't initialize unicode");
bimod = _PyBuiltin_Init();
if (bimod == NULL)