diff options
-rw-r--r-- | Doc/whatsnew/3.6.rst | 3 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 12 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 48 |
4 files changed, 57 insertions, 9 deletions
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index ca83ef9..24fd822 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -123,6 +123,9 @@ Optimizations * The UTF-8 encoder is now up to 75 times as fast for error handlers: ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``. +* The UTF-8 decoder is now up to 15 times as fast for error handlers: + ``ignore``, ``replace`` and ``surrogateescape``. + Build and C API Changes ======================= diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index bdc331e..7b6883f 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -788,6 +788,18 @@ class UTF8Test(ReadTest, unittest.TestCase): self.check_state_handling_decode(self.encoding, u, u.encode(self.encoding)) + def test_decode_error(self): + for data, error_handler, expected in ( + (b'[\x80\xff]', 'ignore', '[]'), + (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), + (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), + (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.decode(self.encoding, error_handler), + expected) + def test_lone_surrogates(self): super().test_lone_surrogates() # not sure if this is making sense for @@ -10,6 +10,9 @@ Release date: XXXX-XX-XX Core and Builtins ----------------- +* Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error + handlers: ``ignore``, ``replace`` and ``surrogateescape``. + - Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data. - Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bc98287..56614e6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, while (s < end) { Py_UCS4 ch; int kind = writer.kind; + if (kind == PyUnicode_1BYTE_KIND) { if (PyUnicode_IS_ASCII(writer.buffer)) ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); @@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s, continue; } - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "utf-8", errmsg, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_IGNORE: + s += (endinpos - startinpos); + break; + + case _Py_ERROR_REPLACE: + if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) + goto onError; + s += (endinpos - startinpos); + break; + + case _Py_ERROR_SURROGATEESCAPE: + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + for (Py_ssize_t i=startinpos; i<endinpos; i++) { + ch = (Py_UCS4)(unsigned char)(starts[i]); + PyUnicode_WRITE(writer.kind, writer.data, writer.pos, + ch + 0xdc00); + writer.pos++; + } + s += (endinpos - startinpos); + break; + + default: + if (unicode_decode_call_errorhandler_writer( + errors, &error_handler_obj, + "utf-8", errmsg, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &writer)) + goto onError; + } } End: if (consumed) *consumed = s - starts; - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); _PyUnicodeWriter_Dealloc(&writer); return NULL; |