From e64322e034b62ee13dada8800c1667a8cf6332f6 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 30 Oct 2012 23:12:47 +0100 Subject: Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster Patch written by Serhiy Storchaka. --- Doc/whatsnew/3.4.rst | 2 +- Misc/NEWS | 3 + Objects/unicodeobject.c | 142 +++++++++++++++++++++++------------------------- 3 files changed, 73 insertions(+), 74 deletions(-) diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst index 944d754..d260838 100644 --- a/Doc/whatsnew/3.4.rst +++ b/Doc/whatsnew/3.4.rst @@ -157,7 +157,7 @@ Optimizations Major performance enhancements have been added: -* None yet. +* The UTF-32 decoder is now 3x to 4x faster. Build and C API Changes diff --git a/Misc/NEWS b/Misc/NEWS index c101aa5..a6ad757 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1? Core and Builtins ----------------- +- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch + written by Serhiy Storchaka. + - Issue #16197: Update winreg docstrings and documentation to match code. Patch by Zachary Ware. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3e2e8e3..0f5bdfc 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t outpos; PyObject *unicode; const unsigned char *q, *e; - int bo = 0; /* assume native ordering by default */ + int le, bo = 0; /* assume native ordering by default */ const char *errmsg = ""; - /* Offsets from q for retrieving bytes in the right order. */ -#if PY_LITTLE_ENDIAN - int iorder[] = {0, 1, 2, 3}; -#else - int iorder[] = {3, 2, 1, 0}; -#endif PyObject *errorHandler = NULL; PyObject *exc = NULL; @@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s, byte order setting accordingly. In native mode, the leading BOM mark is skipped, in all other modes, it is copied to the output stream as-is (giving a ZWNBSP character). */ - if (bo == 0) { - if (size >= 4) { - const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | - (q[iorder[1]] << 8) | q[iorder[0]]; -#if PY_LITTLE_ENDIAN - if (bom == 0x0000FEFF) { - q += 4; - bo = -1; - } - else if (bom == 0xFFFE0000) { - q += 4; - bo = 1; - } -#else - if (bom == 0x0000FEFF) { - q += 4; - bo = 1; - } - else if (bom == 0xFFFE0000) { - q += 4; - bo = -1; - } -#endif + if (bo == 0 && size >= 4) { + Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; + if (bom == 0x0000FEFF) { + bo = -1; + q += 4; } + else if (bom == 0xFFFE0000) { + bo = 1; + q += 4; + } + if (byteorder) + *byteorder = bo; } - if (bo == -1) { - /* force LE */ - iorder[0] = 0; - iorder[1] = 1; - iorder[2] = 2; - iorder[3] = 3; - } - else if (bo == 1) { - /* force BE */ - iorder[0] = 3; - iorder[1] = 2; - iorder[2] = 1; - iorder[3] = 0; + if (q == e) { + if (consumed) + *consumed = size; + Py_INCREF(unicode_empty); + return unicode_empty; } - /* This might be one to much, because of a BOM */ - unicode = PyUnicode_New((size+3)/4, 127); +#ifdef WORDS_BIGENDIAN + le = bo < 0; +#else + le = bo <= 0; +#endif + + unicode = PyUnicode_New((e - q + 3) / 4, 127); if (!unicode) return NULL; - if (size == 0) - return unicode; + outpos = 0; + while (1) { + Py_UCS4 ch = 0; + Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode); + + if (e - q >= 4) { + enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + void *data = PyUnicode_DATA(unicode); + const unsigned char *last = e - 4; + if (le) { + do { + ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; + if (ch > maxch) + break; + PyUnicode_WRITE(kind, data, outpos++, ch); + q += 4; + } while (q <= last); + } + else { + do { + ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; + if (ch > maxch) + break; + PyUnicode_WRITE(kind, data, outpos++, ch); + q += 4; + } while (q <= last); + } + } - while (q < e) { - Py_UCS4 ch; - /* remaining bytes at the end? (size should be divisible by 4) */ - if (e-q<4) { - if (consumed) + if (ch <= maxch) { + if (q == e || consumed) break; + /* remaining bytes at the end? (size should be divisible by 4) */ errmsg = "truncated data"; - startinpos = ((const char *)q)-starts; - endinpos = ((const char *)e)-starts; - goto utf32Error; - /* The remaining input chars are ignored if the callback - chooses to skip the input */ + startinpos = ((const char *)q) - starts; + endinpos = ((const char *)e) - starts; } - ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | - (q[iorder[1]] << 8) | q[iorder[0]]; - - if (ch >= 0x110000) - { + else { + if (ch < 0x110000) { + if (unicode_putchar(&unicode, &outpos, ch) < 0) + goto onError; + q += 4; + continue; + } errmsg = "codepoint not in range(0x110000)"; - startinpos = ((const char *)q)-starts; - endinpos = startinpos+4; - goto utf32Error; + startinpos = ((const char *)q) - starts; + endinpos = startinpos + 4; } - if (unicode_putchar(&unicode, &outpos, ch) < 0) - goto onError; - q += 4; - continue; - utf32Error: + + /* The remaining input chars are ignored if the callback + chooses to skip the input */ if (unicode_decode_call_errorhandler( errors, &errorHandler, "utf32", errmsg, @@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s, goto onError; } - if (byteorder) - *byteorder = bo; - if (consumed) *consumed = (const char *)q-starts; -- cgit v0.12