From 554f3f0081fc8f3561b031d957ffbab1034d8b83 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 16 Jun 2010 23:33:54 +0000 Subject: Issue #850997: mbcs encoding (Windows only) handles errors argument: strict mode raises unicode errors. The encoder only supports "strict" and "replace" error handlers, the decoder only supports "strict" and "ignore" error handlers. --- Doc/library/codecs.rst | 17 +++++ Lib/ctypes/__init__.py | 2 +- Lib/test/test_codecs.py | 7 +-- Misc/NEWS | 5 ++ Objects/unicodeobject.c | 163 +++++++++++++++++++++++++++++++++++++----------- 5 files changed, 149 insertions(+), 45 deletions(-) diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 13e86a2..853cc78 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1223,6 +1223,23 @@ functions can be used directly if desired. Convert a label to Unicode, as specified in :rfc:`3490`. +:mod:`encodings.mbcs` --- Windows ANSI codepage +----------------------------------------------- + +.. module:: encodings.mbcs + :synopsis: Windows ANSI codepage + +Encode operand according to the ANSI codepage (CP_ACP). This codec only +supports ``'strict'`` and ``'replace'`` error handlers to encode, and +``'strict'`` and ``'ignore'`` error handlers to decode. + +Availability: Windows only. + +.. versionchanged:: 3.2 + Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used + to encode, and ``'ignore'`` to decode. + + :mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature ------------------------------------------------------------- diff --git a/Lib/ctypes/__init__.py b/Lib/ctypes/__init__.py index 8782db9..ce1d779 100644 --- a/Lib/ctypes/__init__.py +++ b/Lib/ctypes/__init__.py @@ -265,7 +265,7 @@ except ImportError: pass else: if _os.name in ("nt", "ce"): - set_conversion_mode("mbcs", "ignore") + set_conversion_mode("mbcs", "strict") else: set_conversion_mode("ascii", "strict") diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 911d58f..521cbce 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1358,11 +1358,6 @@ broken_incremental_coders = broken_unicode_with_streams + [ "idna", ] -# The following encodings only support "strict" mode -only_strict_mode = [ - "idna", -] - class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): def test_basics(self): s = "abc123" # all codecs should be able to encode these @@ -1437,7 +1432,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding)) self.assertEqual(result, "") - if encoding not in only_strict_mode: + if encoding not in ("idna", "mbcs"): # check incremental decoder/encoder with errors argument try: encoder = codecs.getincrementalencoder(encoding)("ignore") diff --git a/Misc/NEWS b/Misc/NEWS index 7817276..e56711c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1? Core and Builtins ----------------- +- Issue #850997: mbcs encoding (Windows only) handles errors argument: strict + mode raises unicode errors. The encoder only supports "strict" and "replace" + error handlers, the decoder only supports "strict" and "ignore" error + handlers. + - Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z" formats if the string contains a null byte/character. Write unit tests for string formats. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4153c25..83e0360 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1767,6 +1767,33 @@ int PyUnicode_SetDefaultEncoding(const char *encoding) return 0; } +/* create or adjust a UnicodeDecodeError */ +static void +make_decode_exception(PyObject **exceptionObject, + const char *encoding, + const char *input, Py_ssize_t length, + Py_ssize_t startpos, Py_ssize_t endpos, + const char *reason) +{ + if (*exceptionObject == NULL) { + *exceptionObject = PyUnicodeDecodeError_Create( + encoding, input, length, startpos, endpos, reason); + } + else { + if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) + goto onError; + if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) + goto onError; + if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) + goto onError; + } + return; + +onError: + Py_DECREF(*exceptionObject); + *exceptionObject = NULL; +} + /* error handling callback helper: build arguments, call the callback and check the arguments, if no exception occurred, copy the replacement to the output @@ -1800,20 +1827,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler goto onError; } - if (*exceptionObject == NULL) { - *exceptionObject = PyUnicodeDecodeError_Create( - encoding, *input, *inend-*input, *startinpos, *endinpos, reason); - if (*exceptionObject == NULL) - goto onError; - } - else { - if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) - goto onError; - if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) - goto onError; - if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) - goto onError; - } + make_decode_exception(exceptionObject, + encoding, + *input, *inend - *input, + *startinpos, *endinpos, + reason); + if (*exceptionObject == NULL) + goto onError; restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); if (restuple == NULL) @@ -4552,32 +4572,46 @@ static int is_dbcs_lead_byte(const char *s, int offset) static int decode_mbcs(PyUnicodeObject **v, const char *s, /* MBCS string */ int size, /* sizeof MBCS string */ - int final) + int final, + const char *errors) { Py_UNICODE *p; - Py_ssize_t n = 0; - int usize = 0; + Py_ssize_t n; + DWORD usize; + DWORD flags; assert(size >= 0); + /* check and handle 'errors' arg */ + if (errors==NULL || strcmp(errors, "strict")==0) + flags = MB_ERR_INVALID_CHARS; + else if (strcmp(errors, "ignore")==0) + flags = 0; + else { + PyErr_Format(PyExc_ValueError, + "mbcs encoding does not support errors='%s'", + errors); + return -1; + } + /* Skip trailing lead-byte unless 'final' is set */ if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) --size; /* First get the size of the result */ if (size > 0) { - usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); - if (usize == 0) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; - } - } + usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); + if (usize==0) + goto mbcs_decode_error; + } else + usize = 0; if (*v == NULL) { /* Create unicode object */ *v = _PyUnicode_New(usize); if (*v == NULL) return -1; + n = 0; } else { /* Extend unicode object */ @@ -4587,15 +4621,35 @@ static int decode_mbcs(PyUnicodeObject **v, } /* Do the conversion */ - if (size > 0) { + if (usize > 0) { p = PyUnicode_AS_UNICODE(*v) + n; - if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; + if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { + goto mbcs_decode_error; } } - return size; + +mbcs_decode_error: + /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then + we raise a UnicodeDecodeError - else it is a 'generic' + windows error + */ + if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { + /* Ideally, we should get reason from FormatMessage - this + is the Windows 2000 English version of the message + */ + PyObject *exc = NULL; + const char *reason = "No mapping for the Unicode character exists " + "in the target multi-byte code page."; + make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); + } + } else { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + } + return -1; } PyObject *PyUnicode_DecodeMBCSStateful(const char *s, @@ -4612,10 +4666,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s, #ifdef NEED_RETRY retry: if (size > INT_MAX) - done = decode_mbcs(&v, s, INT_MAX, 0); + done = decode_mbcs(&v, s, INT_MAX, 0, errors); else #endif - done = decode_mbcs(&v, s, (int)size, !consumed); + done = decode_mbcs(&v, s, (int)size, !consumed, errors); if (done < 0) { Py_XDECREF(v); @@ -4649,20 +4703,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s, */ static int encode_mbcs(PyObject **repr, const Py_UNICODE *p, /* unicode */ - int size) /* size of unicode */ + int size, /* size of unicode */ + const char* errors) { - int mbcssize = 0; - Py_ssize_t n = 0; + BOOL usedDefaultChar = FALSE; + BOOL *pusedDefaultChar; + int mbcssize; + Py_ssize_t n; + PyObject *exc = NULL; + DWORD flags; assert(size >= 0); + /* check and handle 'errors' arg */ + if (errors==NULL || strcmp(errors, "strict")==0) { + flags = WC_NO_BEST_FIT_CHARS; + pusedDefaultChar = &usedDefaultChar; + } else if (strcmp(errors, "replace")==0) { + flags = 0; + pusedDefaultChar = NULL; + } else { + PyErr_Format(PyExc_ValueError, + "mbcs encoding does not support errors='%s'", + errors); + return -1; + } + /* First get the size of the result */ if (size > 0) { - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, + NULL, pusedDefaultChar); if (mbcssize == 0) { PyErr_SetFromWindowsErrWithFilename(0, NULL); return -1; } + /* If we used a default char, then we failed! */ + if (pusedDefaultChar && *pusedDefaultChar) + goto mbcs_encode_error; + } else { + mbcssize = 0; } if (*repr == NULL) { @@ -4670,6 +4749,7 @@ static int encode_mbcs(PyObject **repr, *repr = PyBytes_FromStringAndSize(NULL, mbcssize); if (*repr == NULL) return -1; + n = 0; } else { /* Extend string object */ @@ -4681,13 +4761,20 @@ static int encode_mbcs(PyObject **repr, /* Do the conversion */ if (size > 0) { char *s = PyBytes_AS_STRING(*repr) + n; - if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, + NULL, pusedDefaultChar)) { PyErr_SetFromWindowsErrWithFilename(0, NULL); return -1; } + if (pusedDefaultChar && *pusedDefaultChar) + goto mbcs_encode_error; } - return 0; + +mbcs_encode_error: + raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); + Py_XDECREF(exc); + return -1; } PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, @@ -4700,10 +4787,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, #ifdef NEED_RETRY retry: if (size > INT_MAX) - ret = encode_mbcs(&repr, p, INT_MAX); + ret = encode_mbcs(&repr, p, INT_MAX, errors); else #endif - ret = encode_mbcs(&repr, p, (int)size); + ret = encode_mbcs(&repr, p, (int)size, errors); if (ret < 0) { Py_XDECREF(repr); -- cgit v0.12