diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2015-10-05 11:43:50 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2015-10-05 11:43:50 (GMT) |
commit | 1d65d9192dac57776693c55a9ccefbde2ca74c23 (patch) | |
tree | 260cc1acc8425fe62664da26bfdf30c4fa39b508 /Objects | |
parent | 7dbe6dd96393d713e405f80fa0eb8f9471c8276a (diff) | |
download | cpython-1d65d9192dac57776693c55a9ccefbde2ca74c23.zip cpython-1d65d9192dac57776693c55a9ccefbde2ca74c23.tar.gz cpython-1d65d9192dac57776693c55a9ccefbde2ca74c23.tar.bz2 |
Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
handlers: ``ignore``, ``replace`` and ``surrogateescape``.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 48 |
1 files changed, 39 insertions, 9 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bc98287..56614e6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, while (s < end) { Py_UCS4 ch; int kind = writer.kind; + if (kind == PyUnicode_1BYTE_KIND) { if (PyUnicode_IS_ASCII(writer.buffer)) ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); @@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s, continue; } - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "utf-8", errmsg, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_IGNORE: + s += (endinpos - startinpos); + break; + + case _Py_ERROR_REPLACE: + if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) + goto onError; + s += (endinpos - startinpos); + break; + + case _Py_ERROR_SURROGATEESCAPE: + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + for (Py_ssize_t i=startinpos; i<endinpos; i++) { + ch = (Py_UCS4)(unsigned char)(starts[i]); + PyUnicode_WRITE(writer.kind, writer.data, writer.pos, + ch + 0xdc00); + writer.pos++; + } + s += (endinpos - startinpos); + break; + + default: + if (unicode_decode_call_errorhandler_writer( + errors, &error_handler_obj, + "utf-8", errmsg, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &writer)) + goto onError; + } } End: if (consumed) *consumed = s - starts; - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); _PyUnicodeWriter_Dealloc(&writer); return NULL; |