diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2017-11-13 19:23:48 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-11-13 19:23:48 (GMT) |
commit | 9b6c60cbce4ac45e8ccd7934babff465e9769509 (patch) | |
tree | 973d37d42dfe1ce66303ad0cf658bb20870aa88e /Objects/unicodeobject.c | |
parent | ce12629c84400c52734859e43b2386deb2b6da12 (diff) | |
download | cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.zip cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.gz cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.bz2 |
bpo-31979: Simplify transforming decimals to ASCII (#4336)
in int(), float() and complex() parsers.
This also speeds up parsing non-ASCII numbers by around 20%.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 136 |
1 files changed, 32 insertions, 104 deletions
```diff
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index fdc3197..f6b2b65 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
 
 /* --- Unicode Object ----------------------------------------------------- */
 
-static PyObject *
-fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
-
 static inline Py_ssize_t
 findchar(const void *s, int kind,
          Py_ssize_t size, Py_UCS4 ch,
@@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
     return _PyUnicode_TranslateCharmap(str, mapping, errors);
 }
 
-static Py_UCS4
-fix_decimal_and_space_to_ascii(PyObject *self)
-{
-    /* No need to call PyUnicode_READY(self) because this function is only
-       called as a callback from fixup() which does it already. */
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
-    const int kind = PyUnicode_KIND(self);
-    void *data = PyUnicode_DATA(self);
-    Py_UCS4 maxchar = 127, ch, fixed;
-    int modified = 0;
-    Py_ssize_t i;
-
-    for (i = 0; i < len; ++i) {
-        ch = PyUnicode_READ(kind, data, i);
-        fixed = 0;
-        if (ch > 127) {
-            if (Py_UNICODE_ISSPACE(ch))
-                fixed = ' ';
-            else {
-                const int decimal = Py_UNICODE_TODECIMAL(ch);
-                if (decimal >= 0)
-                    fixed = '0' + decimal;
-            }
-            if (fixed != 0) {
-                modified = 1;
-                maxchar = Py_MAX(maxchar, fixed);
-                PyUnicode_WRITE(kind, data, i, fixed);
-            }
-            else
-                maxchar = Py_MAX(maxchar, ch);
-        }
-    }
-
-    return (modified) ? maxchar : 0;
-}
-
 PyObject *
 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
 {
@@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
     }
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
-    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
+    if (PyUnicode_IS_ASCII(unicode)) {
         /* If the string is already ASCII, just return the same string */
         Py_INCREF(unicode);
         return unicode;
     }
-    return fixup(unicode, fix_decimal_and_space_to_ascii);
+
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+    PyObject *result = PyUnicode_New(len, 127);
+    if (result == NULL) {
+        return NULL;
+    }
+
+    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
+    int kind = PyUnicode_KIND(unicode);
+    const void *data = PyUnicode_DATA(unicode);
+    Py_ssize_t i;
+    for (i = 0; i < len; ++i) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (ch < 127) {
+            out[i] = ch;
+        }
+        else if (Py_UNICODE_ISSPACE(ch)) {
+            out[i] = ' ';
+        }
+        else {
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal < 0) {
+                out[i] = '?';
+                _PyUnicode_LENGTH(result) = i + 1;
+                break;
+            }
+            out[i] = '0' + decimal;
+        }
+    }
+
+    return result;
 }
 
 PyObject *
@@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
     return tailmatch(str, substr, start, end, direction);
 }
 
-/* Apply fixfct filter to the Unicode object self and return a
-   reference to the modified object */
-
-static PyObject *
-fixup(PyObject *self,
-      Py_UCS4 (*fixfct)(PyObject *s))
-{
-    PyObject *u;
-    Py_UCS4 maxchar_old, maxchar_new = 0;
-    PyObject *v;
-
-    u = _PyUnicode_Copy(self);
-    if (u == NULL)
-        return NULL;
-    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
-
-    /* fix functions return the new maximum character in a string,
-       if the kind of the resulting unicode object does not change,
-       everything is fine. Otherwise we need to change the string kind
-       and re-run the fix function. */
-    maxchar_new = fixfct(u);
-
-    if (maxchar_new == 0) {
-        /* no changes */;
-        if (PyUnicode_CheckExact(self)) {
-            Py_DECREF(u);
-            Py_INCREF(self);
-            return self;
-        }
-        else
-            return u;
-    }
-
-    maxchar_new = align_maxchar(maxchar_new);
-
-    if (maxchar_new == maxchar_old)
-        return u;
-
-    /* In case the maximum character changed, we need to
-       convert the string to the new category. */
-    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
-    if (v == NULL) {
-        Py_DECREF(u);
-        return NULL;
-    }
-    if (maxchar_new > maxchar_old) {
-        /* If the maxchar increased so that the kind changed, not all
-           characters are representable anymore and we need to fix the
-           string again. This only happens in very few cases. */
-        _PyUnicode_FastCopyCharacters(v, 0,
-                                      self, 0, PyUnicode_GET_LENGTH(self));
-        maxchar_old = fixfct(v);
-        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
-    }
-    else {
-        _PyUnicode_FastCopyCharacters(v, 0,
-                                      u, 0, PyUnicode_GET_LENGTH(self));
-    }
-    Py_DECREF(u);
-    assert(_PyUnicode_CheckConsistency(v, 1));
-    return v;
-}
-
 static PyObject *
 ascii_upper_or_lower(PyObject *self, int lower)
 {
```