diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2015-05-12 20:12:45 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2015-05-12 20:12:45 (GMT) |
commit | 0d4df752acbaf14164f1e8b2b95ebe3fe288bb82 (patch) | |
tree | 2b794981498b3a3d061f8c1dc42c9b9dbe354ce1 /Objects/unicodeobject.c | |
parent | fdba8381ffb02fd888d84cbcd3c5944ee88e72a2 (diff) | |
download | cpython-0d4df752acbaf14164f1e8b2b95ebe3fe288bb82.zip cpython-0d4df752acbaf14164f1e8b2b95ebe3fe288bb82.tar.gz cpython-0d4df752acbaf14164f1e8b2b95ebe3fe288bb82.tar.bz2 |
Issue #15027: The UTF-32 encoder is now 3x to 7x faster.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 102 |
1 file changed, 41 insertions, 61 deletions
```diff
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3225fb3..548cfff 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5051,32 +5051,22 @@ _PyUnicode_EncodeUTF32(PyObject *str,
                        const char *errors,
                        int byteorder)
 {
-    int kind;
-    void *data;
+    enum PyUnicode_Kind kind;
+    const void *data;
     Py_ssize_t len;
     PyObject *v;
-    unsigned char *p;
-    Py_ssize_t nsize, i;
-    /* Offsets from p for storing byte pairs in the right order. */
+    PY_UINT32_T *out;
 #if PY_LITTLE_ENDIAN
-    int iorder[] = {0, 1, 2, 3};
+    int native_ordering = byteorder <= 0;
 #else
-    int iorder[] = {3, 2, 1, 0};
+    int native_ordering = byteorder >= 0;
 #endif
     const char *encoding;
+    Py_ssize_t nsize, pos;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *rep = NULL;
 
-#define STORECHAR(CH)                           \
-    do {                                        \
-        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
-        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
-        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
-        p[iorder[0]] = (CH) & 0xff;             \
-        p += 4;                                 \
-    } while(0)
-
     if (!PyUnicode_Check(str)) {
         PyErr_BadArgument();
         return NULL;
@@ -5087,59 +5077,53 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     data = PyUnicode_DATA(str);
     len = PyUnicode_GET_LENGTH(str);
 
-    nsize = len + (byteorder == 0);
-    if (nsize > PY_SSIZE_T_MAX / 4)
+    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
         return PyErr_NoMemory();
+    nsize = len + (byteorder == 0);
     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
     if (v == NULL)
         return NULL;
 
-    p = (unsigned char *)PyBytes_AS_STRING(v);
+    /* output buffer is 4-bytes aligned */
+    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
+    out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
     if (byteorder == 0)
-        STORECHAR(0xFEFF);
+        *out++ = 0xFEFF;
     if (len == 0)
-        return v;
+        goto done;
 
-    if (byteorder == -1) {
-        /* force LE */
-        iorder[0] = 0;
-        iorder[1] = 1;
-        iorder[2] = 2;
-        iorder[3] = 3;
+    if (byteorder == -1)
         encoding = "utf-32-le";
-    }
-    else if (byteorder == 1) {
-        /* force BE */
-        iorder[0] = 3;
-        iorder[1] = 2;
-        iorder[2] = 1;
-        iorder[3] = 0;
+    else if (byteorder == 1)
         encoding = "utf-32-be";
-    }
     else
         encoding = "utf-32";
 
     if (kind == PyUnicode_1BYTE_KIND) {
-        for (i = 0; i < len; i++)
-            STORECHAR(PyUnicode_READ(kind, data, i));
-        return v;
+        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
+        goto done;
     }
 
-    for (i = 0; i < len;) {
+    pos = 0;
+    while (pos < len) {
         Py_ssize_t repsize, moreunits;
-        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        i++;
-        assert(ch <= MAX_UNICODE);
-        if (!Py_UNICODE_IS_SURROGATE(ch)) {
-            STORECHAR(ch);
-            continue;
+
+        if (kind == PyUnicode_2BYTE_KIND) {
+            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
+                                        &out, native_ordering);
         }
+        else {
+            assert(kind == PyUnicode_4BYTE_KIND);
+            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
+                                        &out, native_ordering);
+        }
+        if (pos == len)
+            break;
 
         rep = unicode_encode_call_errorhandler(
                 errors, &errorHandler,
                 encoding, "surrogates not allowed",
-                str, &exc, i-1, i, &i);
-
+                str, &exc, pos, pos + 1, &pos);
         if (!rep)
             goto error;
 
@@ -5147,7 +5131,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
             repsize = PyBytes_GET_SIZE(rep);
             if (repsize & 3) {
                 raise_encode_exception(&exc, encoding,
-                                       str, i - 1, i,
+                                       str, pos - 1, pos,
                                        "surrogates not allowed");
                 goto error;
             }
@@ -5160,7 +5144,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
             if (!PyUnicode_IS_ASCII(rep)) {
                 raise_encode_exception(&exc, encoding,
-                                       str, i - 1, i,
+                                       str, pos - 1, pos,
                                        "surrogates not allowed");
                 goto error;
             }
@@ -5168,7 +5152,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
 
         /* four bytes are reserved for each surrogate */
         if (moreunits > 1) {
-            Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
+            Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
             Py_ssize_t morebytes = 4 * (moreunits - 1);
             if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
                 /* integer overflow */
@@ -5177,20 +5161,16 @@ _PyUnicode_EncodeUTF32(PyObject *str,
             }
             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
                 goto error;
-            p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
+            out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
         }
 
         if (PyBytes_Check(rep)) {
-            Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
-            p += repsize;
+            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
+            out += moreunits;
         } else /* rep is unicode */ {
-            const Py_UCS1 *repdata;
             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
-            repdata = PyUnicode_1BYTE_DATA(rep);
-            while (repsize--) {
-                Py_UCS4 ch = *repdata++;
-                STORECHAR(ch);
-            }
+            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
+                                 &out, native_ordering);
         }
 
         Py_CLEAR(rep);
@@ -5199,11 +5179,12 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     /* Cut back to size actually needed. This is necessary for, for example,
     encoding of a string containing isolated surrogates and the 'ignore' handler
     is used. */
-    nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
+    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
     if (nsize != PyBytes_GET_SIZE(v))
       _PyBytes_Resize(&v, nsize);
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
+  done:
     return v;
   error:
     Py_XDECREF(rep);
@@ -5211,7 +5192,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     Py_XDECREF(exc);
     Py_XDECREF(v);
     return NULL;
-#undef STORECHAR
 }
 
 PyObject *
```