bpo-31979: Simplify transforming decimals to ASCII (#4336)

in int(), float() and complex() parsers. This also speeds up parsing non-ASCII numbers by around 20%.
author: Serhiy Storchaka <storchaka@gmail.com> 2017-11-13 19:23:48 (GMT)
committer: GitHub <noreply@github.com> 2017-11-13 19:23:48 (GMT)
commit: 9b6c60cbce4ac45e8ccd7934babff465e9769509 (patch)
tree: 973d37d42dfe1ce66303ad0cf658bb20870aa88e /Objects/unicodeobject.c
parent: ce12629c84400c52734859e43b2386deb2b6da12 (diff)
download: cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.zip
cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.gz
cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.bz2
1 files changed, 32 insertions, 104 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index fdc3197..f6b2b65 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
 
 /* --- Unicode Object ----------------------------------------------------- */
 
-static PyObject *
-fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
-
 static inline Py_ssize_t
 findchar(const void *s, int kind,
          Py_ssize_t size, Py_UCS4 ch,
@@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
     return _PyUnicode_TranslateCharmap(str, mapping, errors);
 }
 
-static Py_UCS4
-fix_decimal_and_space_to_ascii(PyObject *self)
-{
-    /* No need to call PyUnicode_READY(self) because this function is only
-       called as a callback from fixup() which does it already. */
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
-    const int kind = PyUnicode_KIND(self);
-    void *data = PyUnicode_DATA(self);
-    Py_UCS4 maxchar = 127, ch, fixed;
-    int modified = 0;
-    Py_ssize_t i;
-
-    for (i = 0; i < len; ++i) {
-        ch = PyUnicode_READ(kind, data, i);
-        fixed = 0;
-        if (ch > 127) {
-            if (Py_UNICODE_ISSPACE(ch))
-                fixed = ' ';
-            else {
-                const int decimal = Py_UNICODE_TODECIMAL(ch);
-                if (decimal >= 0)
-                    fixed = '0' + decimal;
-            }
-            if (fixed != 0) {
-                modified = 1;
-                maxchar = Py_MAX(maxchar, fixed);
-                PyUnicode_WRITE(kind, data, i, fixed);
-            }
-            else
-                maxchar = Py_MAX(maxchar, ch);
-        }
-    }
-
-    return (modified) ? maxchar : 0;
-}
-
 PyObject *
 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
 {
@@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
     }
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
-    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
+    if (PyUnicode_IS_ASCII(unicode)) {
         /* If the string is already ASCII, just return the same string */
         Py_INCREF(unicode);
         return unicode;
     }
-    return fixup(unicode, fix_decimal_and_space_to_ascii);
+
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+    PyObject *result = PyUnicode_New(len, 127);
+    if (result == NULL) {
+        return NULL;
+    }
+
+    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
+    int kind = PyUnicode_KIND(unicode);
+    const void *data = PyUnicode_DATA(unicode);
+    Py_ssize_t i;
+    for (i = 0; i < len; ++i) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (ch < 127) {
+            out[i] = ch;
+        }
+        else if (Py_UNICODE_ISSPACE(ch)) {
+            out[i] = ' ';
+        }
+        else {
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal < 0) {
+                out[i] = '?';
+                _PyUnicode_LENGTH(result) = i + 1;
+                break;
+            }
+            out[i] = '0' + decimal;
+        }
+    }
+
+    return result;
 }
 
 PyObject *
@@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
     return tailmatch(str, substr, start, end, direction);
 }
 
-/* Apply fixfct filter to the Unicode object self and return a
-   reference to the modified object */
-
-static PyObject *
-fixup(PyObject *self,
-      Py_UCS4 (*fixfct)(PyObject *s))
-{
-    PyObject *u;
-    Py_UCS4 maxchar_old, maxchar_new = 0;
-    PyObject *v;
-
-    u = _PyUnicode_Copy(self);
-    if (u == NULL)
-        return NULL;
-    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
-
-    /* fix functions return the new maximum character in a string,
-       if the kind of the resulting unicode object does not change,
-       everything is fine.  Otherwise we need to change the string kind
-       and re-run the fix function. */
-    maxchar_new = fixfct(u);
-
-    if (maxchar_new == 0) {
-        /* no changes */;
-        if (PyUnicode_CheckExact(self)) {
-            Py_DECREF(u);
-            Py_INCREF(self);
-            return self;
-        }
-        else
-            return u;
-    }
-
-    maxchar_new = align_maxchar(maxchar_new);
-
-    if (maxchar_new == maxchar_old)
-        return u;
-
-    /* In case the maximum character changed, we need to
-       convert the string to the new category. */
-    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
-    if (v == NULL) {
-        Py_DECREF(u);
-        return NULL;
-    }
-    if (maxchar_new > maxchar_old) {
-        /* If the maxchar increased so that the kind changed, not all
-           characters are representable anymore and we need to fix the
-           string again. This only happens in very few cases. */
-        _PyUnicode_FastCopyCharacters(v, 0,
-                                      self, 0, PyUnicode_GET_LENGTH(self));
-        maxchar_old = fixfct(v);
-        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
-    }
-    else {
-        _PyUnicode_FastCopyCharacters(v, 0,
-                                      u, 0, PyUnicode_GET_LENGTH(self));
-    }
-    Py_DECREF(u);
-    assert(_PyUnicode_CheckConsistency(v, 1));
-    return v;
-}
-
 static PyObject *
 ascii_upper_or_lower(PyObject *self, int lower)
 {
author	Serhiy Storchaka <storchaka@gmail.com>	2017-11-13 19:23:48 (GMT)
committer	GitHub <noreply@github.com>	2017-11-13 19:23:48 (GMT)
commit	9b6c60cbce4ac45e8ccd7934babff465e9769509 (patch)
tree	973d37d42dfe1ce66303ad0cf658bb20870aa88e /Objects/unicodeobject.c
parent	ce12629c84400c52734859e43b2386deb2b6da12 (diff)
download	cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.zip cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.gz cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.bz2