summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorVictor Stinner <vstinner@wyplay.com>2011-10-05 11:50:52 (GMT)
committerVictor Stinner <vstinner@wyplay.com>2011-10-05 11:50:52 (GMT)
commit702c7343957ec4369e89e738fdd157a435f2461f (patch)
tree3c8c0a8ca100745be0280effcec83ab56c1b28fc /Objects
parent00b2c86d09dccf125fdf7108d3b749f767c277db (diff)
downloadcpython-702c7343957ec4369e89e738fdd157a435f2461f.zip
cpython-702c7343957ec4369e89e738fdd157a435f2461f.tar.gz
cpython-702c7343957ec4369e89e738fdd157a435f2461f.tar.bz2
Speedup the ASCII decoder
It is faster for long string and a little bit faster for short strings, benchmark on Linux 32 bits, Intel Core i5 @ 3.33GHz: ./python -m timeit 'x=b"a"' 'x.decode("ascii")' ./python -m timeit 'x=b"x"*80' 'x.decode("ascii")' ./python -m timeit 'x=b"abc"*4096' 'x.decode("ascii")' length | before | after -------+------------+----------- 1 | 0.234 usec | 0.229 usec 80 | 0.381 usec | 0.357 usec 12,288 | 11.2 usec | 3.01 usec
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c80
1 files changed, 53 insertions, 27 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index cd67f60..40b2a88 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1515,6 +1515,16 @@ PyUnicode_FromString(const char *u)
}
static PyObject*
+unicode_fromascii(const unsigned char* u, Py_ssize_t size)
+{
+ PyObject *res = PyUnicode_New(size, 127);
+ if (!res)
+ return NULL;
+ memcpy(PyUnicode_1BYTE_DATA(res), u, size);
+ return res;
+}
+
+static PyObject*
_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
{
PyObject *res;
@@ -6477,65 +6487,81 @@ PyUnicode_DecodeASCII(const char *s,
{
const char *starts = s;
PyUnicodeObject *v;
- Py_UNICODE *p;
+ Py_UNICODE *u;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
- unsigned char* d;
+ int has_error;
+ const unsigned char *p = (const unsigned char *)s;
+ const unsigned char *end = p + size;
+ const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- Py_ssize_t i;
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
- if (size == 1 && *(unsigned char*)s < 128)
- return PyUnicode_FromOrdinal(*(unsigned char*)s);
-
- /* Fast path. Assume the input actually *is* ASCII, and allocate
- a single-block Unicode object with that assumption. If there is
- an error, drop the object and start over. */
- v = (PyUnicodeObject*)PyUnicode_New(size, 127);
- if (v == NULL)
- goto onError;
- d = PyUnicode_1BYTE_DATA(v);
- for (i = 0; i < size; i++) {
- unsigned char ch = ((unsigned char*)s)[i];
- if (ch < 128)
- d[i] = ch;
- else
+ if (size == 1 && (unsigned char)s[0] < 128)
+ return get_latin1_char((unsigned char)s[0]);
+
+ has_error = 0;
+ while (p < end && !has_error) {
+ /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
+ an explanation. */
+ if (!((size_t) p & LONG_PTR_MASK)) {
+ /* Help register allocation */
+ register const unsigned char *_p = p;
+ while (_p < aligned_end) {
+ unsigned long value = *(unsigned long *) _p;
+ if (value & ASCII_CHAR_MASK) {
+ has_error = 1;
+ break;
+ }
+ _p += SIZEOF_LONG;
+ }
+ if (_p == end)
+ break;
+ if (has_error)
+ break;
+ p = _p;
+ }
+ if (*p & 0x80) {
+ has_error = 1;
break;
+ }
+ else {
+ ++p;
+ }
}
- if (i == size)
- return (PyObject*)v;
- Py_DECREF(v); /* start over */
+ if (!has_error)
+ return unicode_fromascii((const unsigned char *)s, size);
v = _PyUnicode_New(size);
if (v == NULL)
goto onError;
if (size == 0)
return (PyObject *)v;
- p = PyUnicode_AS_UNICODE(v);
+ u = PyUnicode_AS_UNICODE(v);
e = s + size;
while (s < e) {
register unsigned char c = (unsigned char)*s;
if (c < 128) {
- *p++ = c;
+ *u++ = c;
++s;
}
else {
startinpos = s-starts;
endinpos = startinpos + 1;
- outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
+ outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
- &v, &outpos, &p))
+ &v, &outpos, &u))
goto onError;
}
}
- if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
- if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
+ if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
+ if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);