summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/whatsnew/3.6.rst | 3
-rw-r--r-- Lib/test/test_codecs.py | 12
-rw-r--r-- Misc/NEWS | 3
-rw-r--r-- Objects/unicodeobject.c | 48
4 files changed, 57 insertions, 9 deletions
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst
index ca83ef9..24fd822 100644
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -123,6 +123,9 @@ Optimizations
* The UTF-8 encoder is now up to 75 times as fast for error handlers:
``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
+* The UTF-8 decoder is now up to 15 times as fast for error handlers:
+ ``ignore``, ``replace`` and ``surrogateescape``.
+
Build and C API Changes
=======================
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index bdc331e..7b6883f 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -788,6 +788,18 @@ class UTF8Test(ReadTest, unittest.TestCase):
self.check_state_handling_decode(self.encoding,
u, u.encode(self.encoding))
+ def test_decode_error(self):
+ for data, error_handler, expected in (
+ (b'[\x80\xff]', 'ignore', '[]'),
+ (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+ (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+ (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):
+ self.assertEqual(data.decode(self.encoding, error_handler),
+ expected)
+
def test_lone_surrogates(self):
super().test_lone_surrogates()
# not sure if this is making sense for
diff --git a/Misc/NEWS b/Misc/NEWS
index d809377..3991d6b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
Core and Builtins
-----------------
+* Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
+ handlers: ``ignore``, ``replace`` and ``surrogateescape``.
+
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
- Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index bc98287..56614e6 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *errmsg = "";
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0) {
if (consumed)
@@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
+
if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(writer.buffer))
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
continue;
}
- if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
- "utf-8", errmsg,
- &starts, &end, &startinpos, &endinpos, &exc, &s,
- &writer))
- goto onError;
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler) {
+ case _Py_ERROR_IGNORE:
+ s += (endinpos - startinpos);
+ break;
+
+ case _Py_ERROR_REPLACE:
+ if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+ goto onError;
+ s += (endinpos - startinpos);
+ break;
+
+ case _Py_ERROR_SURROGATEESCAPE:
+ if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+ goto onError;
+ for (Py_ssize_t i=startinpos; i<endinpos; i++) {
+ ch = (Py_UCS4)(unsigned char)(starts[i]);
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+ ch + 0xdc00);
+ writer.pos++;
+ }
+ s += (endinpos - startinpos);
+ break;
+
+ default:
+ if (unicode_decode_call_errorhandler_writer(
+ errors, &error_handler_obj,
+ "utf-8", errmsg,
+ &starts, &end, &startinpos, &endinpos, &exc, &s,
+ &writer))
+ goto onError;
+ }
}
End:
if (consumed)
*consumed = s - starts;
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
_PyUnicodeWriter_Dealloc(&writer);
return NULL;