summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAntoine Pitrou <solipsis@pitrou.net>2009-01-10 15:40:25 (GMT)
committerAntoine Pitrou <solipsis@pitrou.net>2009-01-10 15:40:25 (GMT)
commitab868311a5282f188a8cf831b021938420fee5c4 (patch)
tree3cc64bd3d02080c72cf32f088c490901c0054644
parentdd6351e6b1ec1b70b29e6b6f55014b6fad6b0571 (diff)
downloadcpython-ab868311a5282f188a8cf831b021938420fee5c4.zip
cpython-ab868311a5282f188a8cf831b021938420fee5c4.tar.gz
cpython-ab868311a5282f188a8cf831b021938420fee5c4.tar.bz2
Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
common cases are optimized thanks to a dedicated fast path and a moderate amount of loop unrolling. This will especially help text I/O (we already register a 30% speedup on large reads on the io-c branch).
-rw-r--r--Misc/NEWS4
-rw-r--r--Objects/unicodeobject.c231
2 files changed, 211 insertions, 24 deletions
diff --git a/Misc/NEWS b/Misc/NEWS
index 7832505..16e1edd 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 0
Core and Builtins
-----------------
+- Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
+ common cases are optimized thanks to a dedicated fast path and a moderate
+ amount of loop unrolling.
+
- Issue #4074: Change the criteria for doing a full garbage collection (i.e.
collecting the oldest generation) so that allocating lots of objects without
destroying them does not show quadratic performance. Based on a proposal by
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 8316e91..bc1612d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2001,6 +2001,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
+/* Mask to check or force alignment of a pointer to C 'long' boundaries */
+#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
+
+/* Mask to quickly check whether a C 'long' contains a
+ non-ASCII, UTF8-encoded char. */
+#if (SIZEOF_LONG == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080L
+#elif (SIZEOF_LONG == 4)
+# define ASCII_CHAR_MASK 0x80808080L
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
const char *errors,
@@ -2011,7 +2024,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
- const char *e;
+ const char *e, *aligned_end;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const char *errmsg = "";
@@ -2032,11 +2045,52 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
/* Unpack UTF-8 encoded data */
p = unicode->str;
e = s + size;
+ aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
while (s < e) {
Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
+ /* Fast path for runs of ASCII characters. Given that common UTF-8
+ input will consist of an overwhelming majority of ASCII
+ characters, we try to optimize for this case by checking
+ as many characters as a C 'long' can contain.
+ First, check if we can do an aligned read, as most CPUs have
+ a penalty for unaligned reads.
+ */
+ if (!((size_t) s & LONG_PTR_MASK)) {
+ /* Help register allocation */
+ register const char *_s = s;
+ register Py_UNICODE *_p = p;
+ while (_s < aligned_end) {
+ /* Read a whole long at a time (either 4 or 8 bytes),
+ and do a fast unrolled copy if it only contains ASCII
+ characters. */
+ unsigned long data = *(unsigned long *) _s;
+ if (data & ASCII_CHAR_MASK)
+ break;
+ _p[0] = (unsigned char) _s[0];
+ _p[1] = (unsigned char) _s[1];
+ _p[2] = (unsigned char) _s[2];
+ _p[3] = (unsigned char) _s[3];
+#if (SIZEOF_LONG == 8)
+ _p[4] = (unsigned char) _s[4];
+ _p[5] = (unsigned char) _s[5];
+ _p[6] = (unsigned char) _s[6];
+ _p[7] = (unsigned char) _s[7];
+#endif
+ _s += SIZEOF_LONG;
+ _p += SIZEOF_LONG;
+ }
+ s = _s;
+ p = _p;
+ if (s == e)
+ break;
+ ch = (unsigned char)*s;
+ }
+ }
+
+ if (ch < 0x80) {
*p++ = (Py_UNICODE)ch;
s++;
continue;
@@ -2169,6 +2223,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
&starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p))
goto onError;
+ aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
}
if (consumed)
*consumed = s-starts;
@@ -2188,6 +2243,9 @@ onError:
return NULL;
}
+#undef ASCII_CHAR_MASK
+
+
/* Allocation strategy: if the string is short, convert into a stack buffer
and allocate exactly as much space needed at the end. Else allocate the
maximum possible needed (4 result bytes per Unicode character), and return
@@ -2582,6 +2640,23 @@ PyUnicode_DecodeUTF16(const char *s,
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}
+/* Two masks for fast checking of whether a C 'long' may contain
+ UTF16-encoded surrogate characters. This is an efficient heuristic,
+ assuming that non-surrogate characters with a code point >= 0x8000 are
+ rare in most input.
+ FAST_CHAR_MASK is used when the input is in native byte ordering,
+ SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
+ */
+#if (SIZEOF_LONG == 8)
+# define FAST_CHAR_MASK 0x8000800080008000L
+# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
+#elif (SIZEOF_LONG == 4)
+# define FAST_CHAR_MASK 0x80008000L
+# define SWAPPED_FAST_CHAR_MASK 0x00800080L
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t size,
@@ -2595,8 +2670,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t outpos;
PyUnicodeObject *unicode;
Py_UNICODE *p;
- const unsigned char *q, *e;
+ const unsigned char *q, *e, *aligned_end;
int bo = 0; /* assume native ordering by default */
+ int native_ordering = 0;
const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
@@ -2618,7 +2694,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Unpack UTF-16 encoded data */
p = unicode->str;
q = (unsigned char *)s;
- e = q + size;
+ e = q + size - 1;
if (byteorder)
bo = *byteorder;
@@ -2662,20 +2738,78 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
ihi = 0;
ilo = 1;
}
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ native_ordering = ilo < ihi;
+#else
+ native_ordering = ilo > ihi;
+#endif
+ aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
while (q < e) {
Py_UNICODE ch;
- /* remaining bytes at the end? (size should be even) */
- if (e-q<2) {
- if (consumed)
- break;
- errmsg = "truncated data";
- startinpos = ((const char *)q)-starts;
- endinpos = ((const char *)e)-starts;
- goto utf16Error;
- /* The remaining input chars are ignored if the callback
- chooses to skip the input */
- }
+ /* First check for possible aligned read of a C 'long'. Unaligned
+ reads are more expensive, better to defer to another iteration. */
+ if (!((size_t) q & LONG_PTR_MASK)) {
+ /* Fast path for runs of non-surrogate chars. */
+ register const unsigned char *_q = q;
+ Py_UNICODE *_p = p;
+ if (native_ordering) {
+ /* Native ordering is simple: as long as the input cannot
+ possibly contain a surrogate char, do an unrolled copy
+ of several 16-bit code points to the target object.
+ The non-surrogate check is done on several input bytes
+ at a time (as many as a C 'long' can contain). */
+ while (_q < aligned_end) {
+ unsigned long data = * (unsigned long *) _q;
+ if (data & FAST_CHAR_MASK)
+ break;
+ _p[0] = ((unsigned short *) _q)[0];
+ _p[1] = ((unsigned short *) _q)[1];
+#if (SIZEOF_LONG == 8)
+ _p[2] = ((unsigned short *) _q)[2];
+ _p[3] = ((unsigned short *) _q)[3];
+#endif
+ _q += SIZEOF_LONG;
+ _p += SIZEOF_LONG / 2;
+ }
+ }
+ else {
+ /* Byteswapped ordering is similar, but we must decompose
+ the copy bytewise, and take care of zero'ing out the
+ upper bytes if the target object is in 32-bit units
+ (that is, in UCS-4 builds). */
+ while (_q < aligned_end) {
+ unsigned long data = * (unsigned long *) _q;
+ if (data & SWAPPED_FAST_CHAR_MASK)
+ break;
+ /* Zero upper bytes in UCS-4 builds */
+#if (Py_UNICODE_SIZE > 2)
+ _p[0] = 0;
+ _p[1] = 0;
+#if (SIZEOF_LONG == 8)
+ _p[2] = 0;
+ _p[3] = 0;
+#endif
+#endif
+ ((unsigned char *) _p)[1] = _q[0];
+ ((unsigned char *) _p)[0] = _q[1];
+ ((unsigned char *) _p)[1 + Py_UNICODE_SIZE] = _q[2];
+ ((unsigned char *) _p)[0 + Py_UNICODE_SIZE] = _q[3];
+#if (SIZEOF_LONG == 8)
+ ((unsigned char *) _p)[1 + 2 * Py_UNICODE_SIZE] = _q[4];
+ ((unsigned char *) _p)[0 + 2 * Py_UNICODE_SIZE] = _q[5];
+ ((unsigned char *) _p)[1 + 3 * Py_UNICODE_SIZE] = _q[6];
+ ((unsigned char *) _p)[0 + 3 * Py_UNICODE_SIZE] = _q[7];
+#endif
+ _q += SIZEOF_LONG;
+ _p += SIZEOF_LONG / 2;
+ }
+ }
+ p = _p;
+ q = _q;
+ if (q >= e)
+ break;
+ }
ch = (q[ihi] << 8) | q[ilo];
q += 2;
@@ -2686,10 +2820,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
}
/* UTF-16 code pair: */
- if (q >= e) {
+ if (q > e) {
errmsg = "unexpected end of data";
- startinpos = (((const char *)q)-2)-starts;
- endinpos = ((const char *)e)-starts;
+ startinpos = (((const char *)q) - 2) - starts;
+ endinpos = ((const char *)e) + 1 - starts;
goto utf16Error;
}
if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -2718,14 +2852,47 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Fall through to report the error */
utf16Error:
- outpos = p-PyUnicode_AS_UNICODE(unicode);
+ outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
- errors, &errorHandler,
- "utf16", errmsg,
- &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
- &unicode, &outpos, &p))
+ errors,
+ &errorHandler,
+ "utf16", errmsg,
+ &starts,
+ (const char **)&e,
+ &startinpos,
+ &endinpos,
+ &exc,
+ (const char **)&q,
+ &unicode,
+ &outpos,
+ &p))
goto onError;
}
+ /* remaining byte at the end? (size should be even) */
+ if (e == q) {
+ if (!consumed) {
+ errmsg = "truncated data";
+ startinpos = ((const char *)q) - starts;
+ endinpos = ((const char *)e) + 1 - starts;
+ outpos = p - PyUnicode_AS_UNICODE(unicode);
+ if (unicode_decode_call_errorhandler(
+ errors,
+ &errorHandler,
+ "utf16", errmsg,
+ &starts,
+ (const char **)&e,
+ &startinpos,
+ &endinpos,
+ &exc,
+ (const char **)&q,
+ &unicode,
+ &outpos,
+ &p))
+ goto onError;
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
+ }
+ }
if (byteorder)
*byteorder = bo;
@@ -2748,6 +2915,9 @@ onError:
return NULL;
}
+#undef FAST_CHAR_MASK
+#undef SWAPPED_FAST_CHAR_MASK
+
PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Py_ssize_t size,
@@ -3571,6 +3741,7 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
{
PyUnicodeObject *v;
Py_UNICODE *p;
+ const char *e, *unrolled_end;
/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
if (size == 1) {
@@ -3584,8 +3755,20 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
if (size == 0)
return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v);
- while (size-- > 0)
- *p++ = (unsigned char)*s++;
+ e = s + size;
+ /* Unrolling the copy makes it much faster by reducing the looping
+ overhead. This is similar to what many memcpy() implementations do. */
+ unrolled_end = e - 4;
+ while (s < unrolled_end) {
+ p[0] = (unsigned char) s[0];
+ p[1] = (unsigned char) s[1];
+ p[2] = (unsigned char) s[2];
+ p[3] = (unsigned char) s[3];
+ s += 4;
+ p += 4;
+ }
+ while (s < e)
+ *p++ = (unsigned char) *s++;
return (PyObject *)v;
onError: