summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/library/codecs.rst17
-rw-r--r--Lib/ctypes/__init__.py2
-rw-r--r--Lib/test/test_codecs.py7
-rw-r--r--Misc/NEWS5
-rw-r--r--Objects/unicodeobject.c163
5 files changed, 149 insertions, 45 deletions
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index 13e86a2..853cc78 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1223,6 +1223,23 @@ functions can be used directly if desired.
Convert a label to Unicode, as specified in :rfc:`3490`.
+:mod:`encodings.mbcs` --- Windows ANSI codepage
+-----------------------------------------------
+
+.. module:: encodings.mbcs
+ :synopsis: Windows ANSI codepage
+
+Encode operand according to the ANSI codepage (CP_ACP). For encoding, this
+codec only supports the ``'strict'`` and ``'replace'`` error handlers; for
+decoding, it only supports the ``'strict'`` and ``'ignore'`` error handlers.
+
+Availability: Windows only.
+
+.. versionchanged:: 3.2
+ Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
+ to encode, and ``'ignore'`` to decode.
+
+
:mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
-------------------------------------------------------------
diff --git a/Lib/ctypes/__init__.py b/Lib/ctypes/__init__.py
index 8782db9..ce1d779 100644
--- a/Lib/ctypes/__init__.py
+++ b/Lib/ctypes/__init__.py
@@ -265,7 +265,7 @@ except ImportError:
pass
else:
if _os.name in ("nt", "ce"):
- set_conversion_mode("mbcs", "ignore")
+ set_conversion_mode("mbcs", "strict")
else:
set_conversion_mode("ascii", "strict")
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 911d58f..521cbce 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1358,11 +1358,6 @@ broken_incremental_coders = broken_unicode_with_streams + [
"idna",
]
-# The following encodings only support "strict" mode
-only_strict_mode = [
- "idna",
-]
-
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
s = "abc123" # all codecs should be able to encode these
@@ -1437,7 +1432,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
self.assertEqual(result, "")
- if encoding not in only_strict_mode:
+ if encoding not in ("idna", "mbcs"):
# check incremental decoder/encoder with errors argument
try:
encoder = codecs.getincrementalencoder(encoding)("ignore")
diff --git a/Misc/NEWS b/Misc/NEWS
index 7817276..e56711c 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
Core and Builtins
-----------------
+- Issue #850997: The mbcs encoding (Windows only) now honors the errors
+ argument: strict mode raises Unicode errors. The encoder only supports the
+ "strict" and "replace" error handlers; the decoder only supports the "strict"
+ and "ignore" error handlers.
+
- Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
formats if the string contains a null byte/character. Write unit tests for
string formats.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4153c25..83e0360 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1767,6 +1767,33 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
return 0;
}
+/* create or adjust a UnicodeDecodeError
+
+ If *exceptionObject is NULL, a new exception is created with
+ PyUnicodeDecodeError_Create() from encoding, input, length, startpos,
+ endpos and reason. Otherwise the existing exception is reused and only
+ its start, end and reason are updated.
+
+ Nothing is returned: on failure *exceptionObject is left NULL (an
+ existing object is released first), so callers must check it for NULL
+ after the call. */
+static void
+make_decode_exception(PyObject **exceptionObject,
+ const char *encoding,
+ const char *input, Py_ssize_t length,
+ Py_ssize_t startpos, Py_ssize_t endpos,
+ const char *reason)
+{
+ if (*exceptionObject == NULL) {
+ *exceptionObject = PyUnicodeDecodeError_Create(
+ encoding, input, length, startpos, endpos, reason);
+ }
+ else {
+ if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
+ goto onError;
+ }
+ return;
+
+onError:
+ /* one of the setters reported an error: drop the exception object and
+ reset the caller's pointer so the failure is visible as NULL */
+ Py_DECREF(*exceptionObject);
+ *exceptionObject = NULL;
+}
+
+
/* error handling callback helper:
build arguments, call the callback and check the arguments,
if no exception occurred, copy the replacement to the output
@@ -1800,20 +1827,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
goto onError;
}
- if (*exceptionObject == NULL) {
- *exceptionObject = PyUnicodeDecodeError_Create(
- encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
- if (*exceptionObject == NULL)
- goto onError;
- }
- else {
- if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
- goto onError;
- if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
- goto onError;
- if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
- goto onError;
- }
+ make_decode_exception(exceptionObject,
+ encoding,
+ *input, *inend - *input,
+ *startinpos, *endinpos,
+ reason);
+ if (*exceptionObject == NULL)
+ goto onError;
restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
if (restuple == NULL)
@@ -4552,32 +4572,46 @@ static int is_dbcs_lead_byte(const char *s, int offset)
static int decode_mbcs(PyUnicodeObject **v,
const char *s, /* MBCS string */
int size, /* sizeof MBCS string */
- int final)
+ int final,
+ const char *errors)
{
Py_UNICODE *p;
- Py_ssize_t n = 0;
- int usize = 0;
+ Py_ssize_t n;
+ DWORD usize;
+ DWORD flags;
assert(size >= 0);
+ /* check and handle 'errors' arg */
+ if (errors==NULL || strcmp(errors, "strict")==0)
+ flags = MB_ERR_INVALID_CHARS;
+ else if (strcmp(errors, "ignore")==0)
+ flags = 0;
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "mbcs encoding does not support errors='%s'",
+ errors);
+ return -1;
+ }
+
/* Skip trailing lead-byte unless 'final' is set */
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
--size;
/* First get the size of the result */
if (size > 0) {
- usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
- if (usize == 0) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
- }
- }
+ usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
+ if (usize==0)
+ goto mbcs_decode_error;
+ } else
+ usize = 0;
if (*v == NULL) {
/* Create unicode object */
*v = _PyUnicode_New(usize);
if (*v == NULL)
return -1;
+ n = 0;
}
else {
/* Extend unicode object */
@@ -4587,15 +4621,35 @@ static int decode_mbcs(PyUnicodeObject **v,
}
/* Do the conversion */
- if (size > 0) {
+ if (usize > 0) {
p = PyUnicode_AS_UNICODE(*v) + n;
- if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
+ if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
+ goto mbcs_decode_error;
}
}
-
return size;
+
+mbcs_decode_error:
+ /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
+ we raise a UnicodeDecodeError - else it is a 'generic'
+ windows error
+ */
+ if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
+ /* Ideally, we should get reason from FormatMessage - this
+ is the Windows 2000 English version of the message
+ */
+ PyObject *exc = NULL;
+ const char *reason = "No mapping for the Unicode character exists "
+ "in the target multi-byte code page.";
+ make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_DECREF(exc);
+ }
+ } else {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ }
+ return -1;
}
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
@@ -4612,10 +4666,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- done = decode_mbcs(&v, s, INT_MAX, 0);
+ done = decode_mbcs(&v, s, INT_MAX, 0, errors);
else
#endif
- done = decode_mbcs(&v, s, (int)size, !consumed);
+ done = decode_mbcs(&v, s, (int)size, !consumed, errors);
if (done < 0) {
Py_XDECREF(v);
@@ -4649,20 +4703,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s,
*/
static int encode_mbcs(PyObject **repr,
const Py_UNICODE *p, /* unicode */
- int size) /* size of unicode */
+ int size, /* size of unicode */
+ const char* errors)
{
- int mbcssize = 0;
- Py_ssize_t n = 0;
+ BOOL usedDefaultChar = FALSE;
+ BOOL *pusedDefaultChar;
+ int mbcssize;
+ Py_ssize_t n;
+ PyObject *exc = NULL;
+ DWORD flags;
assert(size >= 0);
+ /* check and handle 'errors' arg */
+ if (errors==NULL || strcmp(errors, "strict")==0) {
+ flags = WC_NO_BEST_FIT_CHARS;
+ pusedDefaultChar = &usedDefaultChar;
+ } else if (strcmp(errors, "replace")==0) {
+ flags = 0;
+ pusedDefaultChar = NULL;
+ } else {
+ PyErr_Format(PyExc_ValueError,
+ "mbcs encoding does not support errors='%s'",
+ errors);
+ return -1;
+ }
+
/* First get the size of the result */
if (size > 0) {
- mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+ mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
+ NULL, pusedDefaultChar);
if (mbcssize == 0) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
+ /* If we used a default char, then we failed! */
+ if (pusedDefaultChar && *pusedDefaultChar)
+ goto mbcs_encode_error;
+ } else {
+ mbcssize = 0;
}
if (*repr == NULL) {
@@ -4670,6 +4749,7 @@ static int encode_mbcs(PyObject **repr,
*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
if (*repr == NULL)
return -1;
+ n = 0;
}
else {
/* Extend string object */
@@ -4681,13 +4761,20 @@ static int encode_mbcs(PyObject **repr,
/* Do the conversion */
if (size > 0) {
char *s = PyBytes_AS_STRING(*repr) + n;
- if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+ if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
+ NULL, pusedDefaultChar)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
+ if (pusedDefaultChar && *pusedDefaultChar)
+ goto mbcs_encode_error;
}
-
return 0;
+
+mbcs_encode_error:
+ raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
+ Py_XDECREF(exc);
+ return -1;
}
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
@@ -4700,10 +4787,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- ret = encode_mbcs(&repr, p, INT_MAX);
+ ret = encode_mbcs(&repr, p, INT_MAX, errors);
else
#endif
- ret = encode_mbcs(&repr, p, (int)size);
+ ret = encode_mbcs(&repr, p, (int)size, errors);
if (ret < 0) {
Py_XDECREF(repr);