summary refs log tree commit diff stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2015-05-12 20:12:45 (GMT)
committer Serhiy Storchaka <storchaka@gmail.com> 2015-05-12 20:12:45 (GMT)
commit 0d4df752acbaf14164f1e8b2b95ebe3fe288bb82 (patch)
tree 2b794981498b3a3d061f8c1dc42c9b9dbe354ce1 /Objects/unicodeobject.c
parent fdba8381ffb02fd888d84cbcd3c5944ee88e72a2 (diff)
download cpython-0d4df752acbaf14164f1e8b2b95ebe3fe288bb82.zip
cpython-0d4df752acbaf14164f1e8b2b95ebe3fe288bb82.tar.gz
cpython-0d4df752acbaf14164f1e8b2b95ebe3fe288bb82.tar.bz2
Issue #15027: The UTF-32 encoder is now 3x to 7x faster.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 102
1 files changed, 41 insertions, 61 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3225fb3..548cfff 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5051,32 +5051,22 @@ _PyUnicode_EncodeUTF32(PyObject *str,
const char *errors,
int byteorder)
{
- int kind;
- void *data;
+ enum PyUnicode_Kind kind;
+ const void *data;
Py_ssize_t len;
PyObject *v;
- unsigned char *p;
- Py_ssize_t nsize, i;
- /* Offsets from p for storing byte pairs in the right order. */
+ PY_UINT32_T *out;
#if PY_LITTLE_ENDIAN
- int iorder[] = {0, 1, 2, 3};
+ int native_ordering = byteorder <= 0;
#else
- int iorder[] = {3, 2, 1, 0};
+ int native_ordering = byteorder >= 0;
#endif
const char *encoding;
+ Py_ssize_t nsize, pos;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *rep = NULL;
-#define STORECHAR(CH) \
- do { \
- p[iorder[3]] = ((CH) >> 24) & 0xff; \
- p[iorder[2]] = ((CH) >> 16) & 0xff; \
- p[iorder[1]] = ((CH) >> 8) & 0xff; \
- p[iorder[0]] = (CH) & 0xff; \
- p += 4; \
- } while(0)
-
if (!PyUnicode_Check(str)) {
PyErr_BadArgument();
return NULL;
@@ -5087,59 +5077,53 @@ _PyUnicode_EncodeUTF32(PyObject *str,
data = PyUnicode_DATA(str);
len = PyUnicode_GET_LENGTH(str);
- nsize = len + (byteorder == 0);
- if (nsize > PY_SSIZE_T_MAX / 4)
+ if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
return PyErr_NoMemory();
+ nsize = len + (byteorder == 0);
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
if (v == NULL)
return NULL;
- p = (unsigned char *)PyBytes_AS_STRING(v);
+ /* output buffer is 4-bytes aligned */
+ assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
+ out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
if (byteorder == 0)
- STORECHAR(0xFEFF);
+ *out++ = 0xFEFF;
if (len == 0)
- return v;
+ goto done;
- if (byteorder == -1) {
- /* force LE */
- iorder[0] = 0;
- iorder[1] = 1;
- iorder[2] = 2;
- iorder[3] = 3;
+ if (byteorder == -1)
encoding = "utf-32-le";
- }
- else if (byteorder == 1) {
- /* force BE */
- iorder[0] = 3;
- iorder[1] = 2;
- iorder[2] = 1;
- iorder[3] = 0;
+ else if (byteorder == 1)
encoding = "utf-32-be";
- }
else
encoding = "utf-32";
if (kind == PyUnicode_1BYTE_KIND) {
- for (i = 0; i < len; i++)
- STORECHAR(PyUnicode_READ(kind, data, i));
- return v;
+ ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
+ goto done;
}
- for (i = 0; i < len;) {
+ pos = 0;
+ while (pos < len) {
Py_ssize_t repsize, moreunits;
- Py_UCS4 ch = PyUnicode_READ(kind, data, i);
- i++;
- assert(ch <= MAX_UNICODE);
- if (!Py_UNICODE_IS_SURROGATE(ch)) {
- STORECHAR(ch);
- continue;
+
+ if (kind == PyUnicode_2BYTE_KIND) {
+ pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
+ &out, native_ordering);
}
+ else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
+ &out, native_ordering);
+ }
+ if (pos == len)
+ break;
rep = unicode_encode_call_errorhandler(
errors, &errorHandler,
encoding, "surrogates not allowed",
- str, &exc, i-1, i, &i);
-
+ str, &exc, pos, pos + 1, &pos);
if (!rep)
goto error;
@@ -5147,7 +5131,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
repsize = PyBytes_GET_SIZE(rep);
if (repsize & 3) {
raise_encode_exception(&exc, encoding,
- str, i - 1, i,
+ str, pos - 1, pos,
"surrogates not allowed");
goto error;
}
@@ -5160,7 +5144,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, encoding,
- str, i - 1, i,
+ str, pos - 1, pos,
"surrogates not allowed");
goto error;
}
@@ -5168,7 +5152,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
/* four bytes are reserved for each surrogate */
if (moreunits > 1) {
- Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
+ Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Py_ssize_t morebytes = 4 * (moreunits - 1);
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
/* integer overflow */
@@ -5177,20 +5161,16 @@ _PyUnicode_EncodeUTF32(PyObject *str,
}
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
goto error;
- p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
+ out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
}
if (PyBytes_Check(rep)) {
- Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
- p += repsize;
+ Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
+ out += moreunits;
} else /* rep is unicode */ {
- const Py_UCS1 *repdata;
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
- repdata = PyUnicode_1BYTE_DATA(rep);
- while (repsize--) {
- Py_UCS4 ch = *repdata++;
- STORECHAR(ch);
- }
+ ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
+ &out, native_ordering);
}
Py_CLEAR(rep);
@@ -5199,11 +5179,12 @@ _PyUnicode_EncodeUTF32(PyObject *str,
/* Cut back to size actually needed. This is necessary for, for example,
encoding of a string containing isolated surrogates and the 'ignore'
handler is used. */
- nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
+ nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
if (nsize != PyBytes_GET_SIZE(v))
_PyBytes_Resize(&v, nsize);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
+ done:
return v;
error:
Py_XDECREF(rep);
@@ -5211,7 +5192,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
Py_XDECREF(exc);
Py_XDECREF(v);
return NULL;
-#undef STORECHAR
}
PyObject *