summary | refs | log | tree | commit | diff | stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author: Victor Stinner <victor.stinner@haypocalc.com> 2011-12-11 19:09:03 (GMT)
committer: Victor Stinner <victor.stinner@haypocalc.com> 2011-12-11 19:09:03 (GMT)
commit: 785938eebdcf39d6ee54005e38afc474fdb5e1ea (patch)
tree: 14c1699fca3f508d3f27b9a5af9bbc196fa67f14 /Objects/unicodeobject.c
parent: 84def3774d2079ea2a812e0220507ff0e27247e7 (diff)
download: cpython-785938eebdcf39d6ee54005e38afc474fdb5e1ea.zip
          cpython-785938eebdcf39d6ee54005e38afc474fdb5e1ea.tar.gz
          cpython-785938eebdcf39d6ee54005e38afc474fdb5e1ea.tar.bz2
Move the slowest UTF-8 decoder to its own subfunction
* Create decode_utf8_errors()
* Reuse unicode_fromascii()
* decode_utf8_errors() doesn't refit at the beginning
* Remove refit_partial_string(), use unicode_adjust_maxchar() instead
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 226
1 file changed, 98 insertions, 128 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 2846bd2..cc4da2a 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1784,7 +1784,7 @@ _PyUnicode_ClearStaticStrings()
static PyObject*
unicode_fromascii(const unsigned char* s, Py_ssize_t size)
{
- PyObject *res;
+ PyObject *unicode;
#ifdef Py_DEBUG
const unsigned char *p;
const unsigned char *end = s + size;
@@ -1794,11 +1794,12 @@ unicode_fromascii(const unsigned char* s, Py_ssize_t size)
#endif
if (size == 1)
return get_latin1_char(s[0]);
- res = PyUnicode_New(size, 127);
- if (!res)
+ unicode = PyUnicode_New(size, 127);
+ if (!unicode)
return NULL;
- memcpy(PyUnicode_1BYTE_DATA(res), s, size);
- return res;
+ memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
+ assert(_PyUnicode_CheckConsistency(unicode, 1));
+ return unicode;
}
static Py_UCS4
@@ -4320,126 +4321,38 @@ _ucs4loop:
return 65537;
}
-/* Called when we encountered some error that wasn't detected in the original
- scan, e.g. an encoded surrogate character. The original maxchar computation
- may have been incorrect, so redo it. */
-static int
-refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
-{
- PyObject *tmp;
- Py_ssize_t k;
- Py_UCS4 maxchar;
- for (k = 0, maxchar = 0; k < n; k++)
- maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
- tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
- if (tmp == NULL)
- return -1;
- PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
- Py_DECREF(*unicode);
- *unicode = tmp;
- return 0;
-}
-
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
- in case of errors. Implicit parameters: unicode, kind, data, has_errors,
- onError. Potential resizing overallocates, so the result needs to shrink
- at the end.
+ in case of errors. Implicit parameters: unicode, kind, data, onError.
+ Potential resizing overallocates, so the result needs to shrink at the end.
*/
-#define WRITE_MAYBE_FAIL(index, value) \
- do { \
- if (has_errors) { \
- Py_ssize_t pos = index; \
- if (pos > PyUnicode_GET_LENGTH(unicode) && \
- unicode_resize(&unicode, pos + pos/8) < 0) \
- goto onError; \
- if (unicode_putchar(&unicode, &pos, value) < 0) \
- goto onError; \
- } \
- else \
- PyUnicode_WRITE(kind, data, index, value); \
+#define WRITE_MAYBE_FAIL(index, value) \
+ do { \
+ Py_ssize_t pos = index; \
+ if (pos > PyUnicode_GET_LENGTH(unicode) && \
+ unicode_resize(&unicode, pos + pos/8) < 0) \
+ goto onError; \
+ if (unicode_putchar(&unicode, &pos, value) < 0) \
+ goto onError; \
} while (0)
PyObject *
-PyUnicode_DecodeUTF8Stateful(const char *s,
- Py_ssize_t size,
- const char *errors,
- Py_ssize_t *consumed)
+decode_utf8_errors(const char *starts,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed,
+ const char *s,
+ PyObject *unicode,
+ Py_ssize_t i)
{
- const char *starts = s;
int n;
int k;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
- const char *e, *aligned_end;
- PyObject *unicode;
+ const char *e = starts + size;
+ const char *aligned_end;
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- Py_UCS4 maxchar = 0;
- Py_ssize_t unicode_size;
- Py_ssize_t i;
- int kind;
- void *data;
- int has_errors = 0;
-
- if (size == 0) {
- if (consumed)
- *consumed = 0;
- return (PyObject *)PyUnicode_New(0, 0);
- }
- maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
- /* When the string is ASCII only, just use memcpy and return.
- unicode_size may be != size if there is an incomplete UTF-8
- sequence at the end of the ASCII block. */
- if (maxchar < 128 && size == unicode_size) {
- if (consumed)
- *consumed = size;
-
- if (size == 1)
- return get_latin1_char((unsigned char)s[0]);
-
- unicode = PyUnicode_New(unicode_size, maxchar);
- if (!unicode)
- return NULL;
- Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
- assert(_PyUnicode_CheckConsistency(unicode, 1));
- return unicode;
- }
-
- /* In case of errors, maxchar and size computation might be incorrect;
- code below refits and resizes as necessary. */
- unicode = PyUnicode_New(unicode_size, maxchar);
- if (!unicode)
- return NULL;
- kind = PyUnicode_KIND(unicode);
- data = PyUnicode_DATA(unicode);
-
- /* Unpack UTF-8 encoded data */
- i = 0;
- e = s + size;
- switch (kind) {
- case PyUnicode_1BYTE_KIND:
- has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
- break;
- case PyUnicode_2BYTE_KIND:
- has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
- break;
- case PyUnicode_4BYTE_KIND:
- has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
- break;
- }
- if (!has_errors) {
- /* Ensure the unicode size calculation was correct */
- assert(i == unicode_size);
- assert(s == e);
- if (consumed)
- *consumed = s-starts;
- return unicode;
- }
- /* Fall through to the generic decoding loop for the rest of
- the string */
- if (refit_partial_string(&unicode, kind, data, i) < 0)
- goto onError;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
@@ -4591,11 +4504,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
continue;
utf8Error:
- if (!has_errors) {
- if (refit_partial_string(&unicode, kind, data, i) < 0)
- goto onError;
- has_errors = 1;
- }
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf8", errmsg,
@@ -4604,22 +4512,18 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
goto onError;
/* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */
- data = PyUnicode_DATA(unicode);
- kind = PyUnicode_KIND(unicode);
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
}
- /* Ensure the unicode_size calculation above was correct: */
- assert(has_errors || i == unicode_size);
-
if (consumed)
*consumed = s-starts;
/* Adjust length and ready string when it contained errors and
is of the old resizable kind. */
- if (has_errors) {
- if (PyUnicode_Resize(&unicode, i) < 0)
- goto onError;
- }
+ if (unicode_resize(&unicode, i) < 0)
+ goto onError;
+ unicode_adjust_maxchar(&unicode);
+ if (unicode == NULL)
+ goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
@@ -4629,12 +4533,78 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
onError:
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
- Py_DECREF(unicode);
+ Py_XDECREF(unicode);
return NULL;
}
-
#undef WRITE_MAYBE_FAIL
+PyObject *
+PyUnicode_DecodeUTF8Stateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ Py_UCS4 maxchar = 0;
+ Py_ssize_t unicode_size;
+ int has_errors = 0;
+ PyObject *unicode;
+ int kind;
+ void *data;
+ const char *starts = s;
+ const char *e;
+ Py_ssize_t i;
+
+ if (size == 0) {
+ if (consumed)
+ *consumed = 0;
+ return (PyObject *)PyUnicode_New(0, 0);
+ }
+
+ maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
+
+ /* When the string is ASCII only, just use memcpy and return.
+ unicode_size may be != size if there is an incomplete UTF-8
+ sequence at the end of the ASCII block. */
+ if (maxchar < 128 && size == unicode_size) {
+ if (consumed)
+ *consumed = size;
+ return unicode_fromascii(s, size);
+ }
+
+ unicode = PyUnicode_New(unicode_size, maxchar);
+ if (!unicode)
+ return NULL;
+ kind = PyUnicode_KIND(unicode);
+ data = PyUnicode_DATA(unicode);
+
+ /* Unpack UTF-8 encoded data */
+ i = 0;
+ e = starts + size;
+ switch (kind) {
+ case PyUnicode_1BYTE_KIND:
+ has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
+ break;
+ case PyUnicode_2BYTE_KIND:
+ has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
+ break;
+ case PyUnicode_4BYTE_KIND:
+ has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
+ break;
+ }
+ if (!has_errors) {
+ /* Ensure the unicode size calculation was correct */
+ assert(i == unicode_size);
+ assert(s == e);
+ if (consumed)
+ *consumed = size;
+ return unicode;
+ }
+
+ /* In case of errors, maxchar and size computation might be incorrect;
+ code below refits and resizes as necessary. */
+ return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
+}
+
#ifdef __APPLE__
/* Simplified UTF-8 decoder using surrogateescape error handler,