summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author	Antoine Pitrou <solipsis@pitrou.net>	2011-11-21 19:39:13 (GMT)
committer	Antoine Pitrou <solipsis@pitrou.net>	2011-11-21 19:39:13 (GMT)
commit	0a3229de6b80cfa9e432ef5a9c72548569503075 (patch)
tree	da816b36e78da60f56980793b562c7d2d0cdd729
parent	7fe601c5bfa097b6c2aba78a3f517a1c4ded703d (diff)
downloadcpython-0a3229de6b80cfa9e432ef5a9c72548569503075.zip
cpython-0a3229de6b80cfa9e432ef5a9c72548569503075.tar.gz
cpython-0a3229de6b80cfa9e432ef5a9c72548569503075.tar.bz2
Issue #13417: speed up utf-8 decoding by around 2x for the non-fully-ASCII case.
This almost catches up with pre-PEP 393 performance, when decoding needed only one pass.
-rw-r--r--	Makefile.pre.in	1
-rw-r--r--	Objects/stringlib/codecs.h	156
-rw-r--r--	Objects/unicodeobject.c	228
3 files changed, 278 insertions(+), 107 deletions(-)
diff --git a/Makefile.pre.in b/Makefile.pre.in
index df1c0e7..dbf290e 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -645,6 +645,7 @@ BYTESTR_DEPS = \
UNICODE_DEPS = $(BYTESTR_DEPS) \
$(srcdir)/Objects/stringlib/asciilib.h \
+ $(srcdir)/Objects/stringlib/codecs.h \
$(srcdir)/Objects/stringlib/ucs1lib.h \
$(srcdir)/Objects/stringlib/ucs2lib.h \
$(srcdir)/Objects/stringlib/ucs4lib.h \
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
new file mode 100644
index 0000000..148bacb
--- /dev/null
+++ b/Objects/stringlib/codecs.h
@@ -0,0 +1,156 @@
+/* stringlib: codec implementations */
+
+#if STRINGLIB_IS_UNICODE
+
+/* Mask to check or force alignment of a pointer to C 'long' boundaries */
+#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
+
+/* Mask to quickly check whether a C 'long' contains a
+ non-ASCII, UTF8-encoded char. */
+#if (SIZEOF_LONG == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080L
+#elif (SIZEOF_LONG == 4)
+# define ASCII_CHAR_MASK 0x80808080L
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
+Py_LOCAL_INLINE(int)
+STRINGLIB(utf8_try_decode)(const char *start, const char *end,
+ STRINGLIB_CHAR *dest,
+ const char **src_pos, Py_ssize_t *dest_index)
+{
+ int ret;
+ Py_ssize_t n;
+ const char *s = start;
+ const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
+ STRINGLIB_CHAR *p = dest;
+
+ while (s < end) {
+ Py_UCS4 ch = (unsigned char)*s;
+
+ if (ch < 0x80) {
+ /* Fast path for runs of ASCII characters. Given that common UTF-8
+ input will consist of an overwhelming majority of ASCII
+ characters, we try to optimize for this case by checking
+ as many characters as a C 'long' can contain.
+ First, check if we can do an aligned read, as most CPUs have
+ a penalty for unaligned reads.
+ */
+ if (!((size_t) s & LONG_PTR_MASK)) {
+ /* Help register allocation */
+ register const char *_s = s;
+ register STRINGLIB_CHAR *_p = p;
+ while (_s < aligned_end) {
+ /* Read a whole long at a time (either 4 or 8 bytes),
+ and do a fast unrolled copy if it only contains ASCII
+ characters. */
+ unsigned long value = *(unsigned long *) _s;
+ if (value & ASCII_CHAR_MASK)
+ break;
+ _p[0] = _s[0];
+ _p[1] = _s[1];
+ _p[2] = _s[2];
+ _p[3] = _s[3];
+#if (SIZEOF_LONG == 8)
+ _p[4] = _s[4];
+ _p[5] = _s[5];
+ _p[6] = _s[6];
+ _p[7] = _s[7];
+#endif
+ _s += SIZEOF_LONG;
+ _p += SIZEOF_LONG;
+ }
+ s = _s;
+ p = _p;
+ if (s == end)
+ break;
+ ch = (unsigned char)*s;
+ }
+ }
+
+ if (ch < 0x80) {
+ s++;
+ *p++ = ch;
+ continue;
+ }
+
+ n = utf8_code_length[ch];
+
+ if (s + n > end) {
+ /* unexpected end of data: the caller will decide whether
+ it's an error or not */
+ goto _error;
+ }
+
+ switch (n) {
+ case 0:
+ /* invalid start byte */
+ goto _error;
+ case 1:
+ /* internal error */
+ goto _error;
+ case 2:
+ if ((s[1] & 0xc0) != 0x80)
+ /* invalid continuation byte */
+ goto _error;
+ ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+ assert ((ch > 0x007F) && (ch <= 0x07FF));
+ s += 2;
+ *p++ = ch;
+ break;
+
+ case 3:
+ /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
+ will result in surrogates in range d800-dfff. Surrogates are
+ not valid UTF-8 so they are rejected.
+ See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+ (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ ((unsigned char)s[0] == 0xE0 &&
+ (unsigned char)s[1] < 0xA0) ||
+ ((unsigned char)s[0] == 0xED &&
+ (unsigned char)s[1] > 0x9F)) {
+ /* invalid continuation byte */
+ goto _error;
+ }
+ ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+ assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+ s += 3;
+ *p++ = ch;
+ break;
+
+ case 4:
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ ((unsigned char)s[0] == 0xF0 &&
+ (unsigned char)s[1] < 0x90) ||
+ ((unsigned char)s[0] == 0xF4 &&
+ (unsigned char)s[1] > 0x8F)) {
+ /* invalid continuation byte */
+ goto _error;
+ }
+ ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+ ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+ assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+ s += 4;
+ *p++ = ch;
+ break;
+ }
+ }
+ ret = 0;
+ goto _ok;
+_error:
+ ret = -1;
+_ok:
+ *src_pos = s;
+ *dest_index = p - dest;
+ return ret;
+}
+
+#undef LONG_PTR_MASK
+#undef ASCII_CHAR_MASK
+
+#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 6307a98..9dedf0b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -523,6 +523,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
+#include "stringlib/undef.h"
/* --- Unicode Object ----------------------------------------------------- */
@@ -4190,6 +4191,18 @@ PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
+#include "stringlib/ucs1lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs2lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs4lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
@@ -4203,33 +4216,41 @@ PyUnicode_DecodeUTF8(const char *s,
# error C 'long' size should be either 4 or 8!
#endif
-/* Scans a UTF-8 string and returns the maximum character to be expected,
- the size of the decoded unicode string and if any major errors were
- encountered.
+/* Scans a UTF-8 string and returns the maximum character to be expected
+ and the size of the decoded unicode string.
- This function does check basic UTF-8 sanity, it does however NOT CHECK
- if the string contains surrogates, and if all continuation bytes are
- within the correct ranges, these checks are performed in
+ This function doesn't check for errors, these checks are performed in
PyUnicode_DecodeUTF8Stateful.
-
- If it sets has_errors to 1, it means the value of unicode_size and max_char
- will be bogus and you should not rely on useful information in them.
*/
static Py_UCS4
-utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
- Py_ssize_t *unicode_size, Py_ssize_t* consumed,
- int *has_errors)
+utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
+ Py_ssize_t *unicode_size)
{
- Py_ssize_t n;
Py_ssize_t char_count = 0;
- Py_UCS4 max_char = 127, new_max;
- Py_UCS4 upper_bound;
const unsigned char *p = (const unsigned char *)s;
const unsigned char *end = p + string_size;
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
- int err = 0;
- for (; p < end && !err; ++p, ++char_count) {
+ assert(unicode_size != NULL);
+
+ /* By having a cascade of independent loops which fallback onto each
+ other, we minimize the amount of work done in the average loop
+ iteration, and we also maximize the CPU's ability to predict
+ branches correctly (because a given condition will have always the
+ same boolean outcome except perhaps in the last iteration of the
+ corresponding loop).
+ In the general case this brings us rather close to decoding
+ performance pre-PEP 393, despite the two-pass decoding.
+
+ Note that the pure ASCII loop is not duplicated once a non-ASCII
+ character has been encountered. It is actually a pessimization (by
+ a significant factor) to use this loop on text with many non-ASCII
+ characters, and it is important to avoid bad performance on valid
+ utf-8 data (invalid utf-8 being a different can of worms).
+ */
+
+ /* ASCII */
+ for (; p < end; ++p) {
/* Only check value if it's not a ASCII char... */
if (*p < 0x80) {
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
@@ -4249,76 +4270,59 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
break;
}
}
- if (*p >= 0x80) {
- n = utf8_code_length[*p];
- new_max = max_char;
- switch (n) {
- /* invalid start byte */
- case 0:
- err = 1;
- break;
- case 2:
- /* Code points between 0x00FF and 0x07FF inclusive.
- Approximate the upper bound of the code point,
- if this flips over 255 we can be sure it will be more
- than 255 and the string will need 2 bytes per code coint,
- if it stays under or equal to 255, we can be sure 1 byte
- is enough.
- ((*p & 0b00011111) << 6) | 0b00111111 */
- upper_bound = ((*p & 0x1F) << 6) | 0x3F;
- if (max_char < upper_bound)
- new_max = upper_bound;
- /* Ensure we track at least that we left ASCII space. */
- if (new_max < 128)
- new_max = 128;
- break;
- case 3:
- /* Between 0x0FFF and 0xFFFF inclusive, so values are
- always > 255 and <= 65535 and will always need 2 bytes. */
- if (max_char < 65535)
- new_max = 65535;
- break;
- case 4:
- /* Code point will be above 0xFFFF for sure in this case. */
- new_max = 65537;
- break;
- /* Internal error, this should be caught by the first if */
- case 1:
- default:
- assert(0 && "Impossible case in utf8_max_char_and_size");
- err = 1;
- }
- /* Instead of number of overall bytes for this code point,
- n contains the number of following bytes: */
- --n;
- /* Check if the follow up chars are all valid continuation bytes */
- if (n >= 1) {
- const unsigned char *cont;
- if ((p + n) >= end) {
- if (consumed == 0)
- /* incomplete data, non-incremental decoding */
- err = 1;
- break;
- }
- for (cont = p + 1; cont <= (p + n); ++cont) {
- if ((*cont & 0xc0) != 0x80) {
- err = 1;
- break;
- }
- }
- p += n;
- }
- else
- err = 1;
- max_char = new_max;
- }
+ if (*p < 0x80)
+ ++char_count;
+ else
+ goto _ucs1loop;
+ }
+ *unicode_size = char_count;
+ return 127;
+
+_ucs1loop:
+ for (; p < end; ++p) {
+ if (*p < 0xc4)
+ char_count += ((*p & 0xc0) != 0x80);
+ else
+ goto _ucs2loop;
+ }
+ *unicode_size = char_count;
+ return 255;
+
+_ucs2loop:
+ for (; p < end; ++p) {
+ if (*p < 0xf0)
+ char_count += ((*p & 0xc0) != 0x80);
+ else
+ goto _ucs4loop;
+ }
+ *unicode_size = char_count;
+ return 65535;
+
+_ucs4loop:
+ for (; p < end; ++p) {
+ char_count += ((*p & 0xc0) != 0x80);
}
+ *unicode_size = char_count;
+ return 65537;
+}
- if (unicode_size)
- *unicode_size = char_count;
- if (has_errors)
- *has_errors = err;
- return max_char;
+/* Called when we encountered some error that wasn't detected in the original
+ scan, e.g. an encoded surrogate character. The original maxchar computation
+ may have been incorrect, so redo it. */
+static int
+refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
+{
+ PyObject *tmp;
+ Py_ssize_t k, maxchar;
+ for (k = 0, maxchar = 0; k < n; k++)
+ maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
+ tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
+ if (tmp == NULL)
+ return -1;
+ PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
+ Py_DECREF(*unicode);
+ *unicode = tmp;
+ return 0;
}
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
@@ -4361,35 +4365,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t i;
int kind;
void *data;
- int has_errors;
+ int has_errors = 0;
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)PyUnicode_New(0, 0);
}
- maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
- consumed, &has_errors);
- if (has_errors)
- /* maxchar and size computation might be incorrect;
- code below widens and resizes as necessary. */
- unicode = PyUnicode_New(size, 127);
- else
- unicode = PyUnicode_New(unicode_size, maxchar);
+ maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
+ /* In case of errors, maxchar and size computation might be incorrect;
+ code below refits and resizes as necessary. */
+ unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
- if (!has_errors && maxchar < 128 && size == unicode_size) {
+ if (maxchar < 128 && size == unicode_size) {
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
return unicode;
}
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
+
/* Unpack UTF-8 encoded data */
i = 0;
e = s + size;
+ switch (kind) {
+ case PyUnicode_1BYTE_KIND:
+ has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
+ break;
+ case PyUnicode_2BYTE_KIND:
+ has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
+ break;
+ case PyUnicode_4BYTE_KIND:
+ has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
+ break;
+ }
+ if (!has_errors) {
+ /* Ensure the unicode size calculation was correct */
+ assert(i == unicode_size);
+ assert(s == e);
+ if (consumed)
+ *consumed = s-starts;
+ return unicode;
+ }
+ /* Fall through to the generic decoding loop for the rest of
+ the string */
+ if (refit_partial_string(&unicode, kind, data, i) < 0)
+ goto onError;
+
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
while (s < e) {
@@ -4541,19 +4566,8 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
utf8Error:
if (!has_errors) {
- PyObject *tmp;
- Py_ssize_t k;
- /* We encountered some error that wasn't detected in the original scan,
- e.g. an encoded surrogate character. The original maxchar computation may
- have been incorrect, so redo it now. */
- for (k = 0, maxchar = 0; k < i; k++)
- maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
- tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
- if (tmp == NULL)
+ if (refit_partial_string(&unicode, kind, data, i) < 0)
goto onError;
- PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
- Py_DECREF(unicode);
- unicode = tmp;
has_errors = 1;
}
if (unicode_decode_call_errorhandler(