From 9b6c60cbce4ac45e8ccd7934babff465e9769509 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 13 Nov 2017 21:23:48 +0200 Subject: bpo-31979: Simplify transforming decimals to ASCII (#4336) in int(), float() and complex() parsers. This also speeds up parsing non-ASCII numbers by around 20%. --- Include/unicodeobject.h | 16 +++--- Lib/test/test_float.py | 2 +- Lib/test/test_unicode.py | 13 +++-- Objects/complexobject.c | 7 ++- Objects/floatobject.c | 7 ++- Objects/longobject.c | 21 ++++---- Objects/unicodeobject.c | 136 +++++++++++------------------------------------ 7 files changed, 63 insertions(+), 139 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index d2a8ec2..61e713b 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( #endif /* MS_WINDOWS */ +#ifndef Py_LIMITED_API /* --- Decimal Encoder ---------------------------------------------------- */ /* Takes a Unicode string holding a decimal value and writes it into @@ -1747,14 +1748,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( */ -#ifndef Py_LIMITED_API PyAPI_FUNC(int) PyUnicode_EncodeDecimal( Py_UNICODE *s, /* Unicode buffer */ Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ char *output, /* Output buffer; must have size >= length */ const char *errors /* error handling */ ) /* Py_DEPRECATED(3.3) */; -#endif /* Transforms code points that have decimal digit property to the corresponding ASCII digit code points. @@ -1762,19 +1761,18 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal( Returns a new Unicode string on success, NULL on failure. */ -#ifndef Py_LIMITED_API PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( Py_UNICODE *s, /* Unicode buffer */ Py_ssize_t length /* Number of Py_UNICODE chars to transform */ ) /* Py_DEPRECATED(3.3) */; -#endif -/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject - as argument instead of a raw buffer and length. 
This function additionally - transforms spaces to ASCII because this is what the callers in longobject, - floatobject, and complexobject did anyways. */ +/* Converts a Unicode object holding a decimal value to an ASCII string + for use in int, float and complex parsers. + Transforms code points that have decimal digit property to the + corresponding ASCII digit code points. Transforms spaces to ASCII. + Transforms code points starting from the first non-ASCII code point that + is neither a decimal digit nor a space to the end into '?'. */ -#ifndef Py_LIMITED_API PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( PyObject *unicode /* Unicode object */ ); diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py index a16c05c..17174dd 100644 --- a/Lib/test/test_float.py +++ b/Lib/test/test_float.py @@ -51,7 +51,7 @@ class GeneralFloatCases(unittest.TestCase): self.assertRaises(TypeError, float, {}) self.assertRaisesRegex(TypeError, "not 'dict'", float, {}) # Lone surrogate - self.assertRaises(UnicodeEncodeError, float, '\uD8F0') + self.assertRaises(ValueError, float, '\uD8F0') # check that we don't accept alternate exponent markers self.assertRaises(ValueError, float, "-1.7d29") self.assertRaises(ValueError, float, "3D-14") diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 341007b..2b77863 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -2068,11 +2068,14 @@ class UnicodeTest(string_tests.CommonTest, # Error handling (wrong arguments) self.assertRaises(TypeError, "hello".encode, 42, 42, 42) - # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII()) - self.assertRaises(UnicodeError, float, "\ud800") - self.assertRaises(UnicodeError, float, "\udf00") - self.assertRaises(UnicodeError, complex, "\ud800") - self.assertRaises(UnicodeError, complex, "\udf00") + # Error handling (lone surrogate in + # _PyUnicode_TransformDecimalAndSpaceToASCII()) + self.assertRaises(ValueError, int, "\ud800") + 
self.assertRaises(ValueError, int, "\udf00") + self.assertRaises(ValueError, float, "\ud800") + self.assertRaises(ValueError, float, "\udf00") + self.assertRaises(ValueError, complex, "\ud800") + self.assertRaises(ValueError, complex, "\udf00") def test_codecs(self): # Encoding diff --git a/Objects/complexobject.c b/Objects/complexobject.c index 4bcf2ce..2c886c7 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v) if (s_buffer == NULL) { return NULL; } + assert(PyUnicode_IS_ASCII(s_buffer)); + /* Simply get a pointer to existing ASCII characters. */ s = PyUnicode_AsUTF8AndSize(s_buffer, &len); - if (s == NULL) { - goto exit; - } + assert(s != NULL); } else { PyErr_Format(PyExc_TypeError, @@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v) result = _Py_string_to_number_with_underscores(s, len, "complex", v, type, complex_from_string_inner); - exit: Py_DECREF(s_buffer); return result; } diff --git a/Objects/floatobject.c b/Objects/floatobject.c index 8d7a55a..47a174c 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v) s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v); if (s_buffer == NULL) return NULL; + assert(PyUnicode_IS_ASCII(s_buffer)); + /* Simply get a pointer to existing ASCII characters. 
*/ s = PyUnicode_AsUTF8AndSize(s_buffer, &len); - if (s == NULL) { - Py_DECREF(s_buffer); - return NULL; - } + assert(s != NULL); } else if (PyBytes_Check(v)) { s = PyBytes_AS_STRING(v); diff --git a/Objects/longobject.c b/Objects/longobject.c index 6fd4feb..7437155 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -2509,21 +2509,18 @@ PyLong_FromUnicodeObject(PyObject *u, int base) asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u); if (asciidig == NULL) return NULL; + assert(PyUnicode_IS_ASCII(asciidig)); + /* Simply get a pointer to existing ASCII characters. */ buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen); - if (buffer == NULL) { - Py_DECREF(asciidig); - if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) - return NULL; - } - else { - result = PyLong_FromString(buffer, &end, base); - if (end == NULL || (result != NULL && end == buffer + buflen)) { - Py_DECREF(asciidig); - return result; - } + assert(buffer != NULL); + + result = PyLong_FromString(buffer, &end, base); + if (end == NULL || (result != NULL && end == buffer + buflen)) { Py_DECREF(asciidig); - Py_XDECREF(result); + return result; } + Py_DECREF(asciidig); + Py_XDECREF(result); PyErr_Format(PyExc_ValueError, "invalid literal for int() with base %d: %.200R", base, u); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fdc3197..f6b2b65 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj) /* --- Unicode Object ----------------------------------------------------- */ -static PyObject * -fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); - static inline Py_ssize_t findchar(const void *s, int kind, Py_ssize_t size, Py_UCS4 ch, @@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str, return _PyUnicode_TranslateCharmap(str, mapping, errors); } -static Py_UCS4 -fix_decimal_and_space_to_ascii(PyObject *self) -{ - /* No need to call PyUnicode_READY(self) because this function is only - called as a 
callback from fixup() which does it already. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - Py_UCS4 maxchar = 127, ch, fixed; - int modified = 0; - Py_ssize_t i; - - for (i = 0; i < len; ++i) { - ch = PyUnicode_READ(kind, data, i); - fixed = 0; - if (ch > 127) { - if (Py_UNICODE_ISSPACE(ch)) - fixed = ' '; - else { - const int decimal = Py_UNICODE_TODECIMAL(ch); - if (decimal >= 0) - fixed = '0' + decimal; - } - if (fixed != 0) { - modified = 1; - maxchar = Py_MAX(maxchar, fixed); - PyUnicode_WRITE(kind, data, i, fixed); - } - else - maxchar = Py_MAX(maxchar, ch); - } - } - - return (modified) ? maxchar : 0; -} - PyObject * _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) { @@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) } if (PyUnicode_READY(unicode) == -1) return NULL; - if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { + if (PyUnicode_IS_ASCII(unicode)) { /* If the string is already ASCII, just return the same string */ Py_INCREF(unicode); return unicode; } - return fixup(unicode, fix_decimal_and_space_to_ascii); + + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + PyObject *result = PyUnicode_New(len, 127); + if (result == NULL) { + return NULL; + } + + Py_UCS1 *out = PyUnicode_1BYTE_DATA(result); + int kind = PyUnicode_KIND(unicode); + const void *data = PyUnicode_DATA(unicode); + Py_ssize_t i; + for (i = 0; i < len; ++i) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch < 127) { + out[i] = ch; + } + else if (Py_UNICODE_ISSPACE(ch)) { + out[i] = ' '; + } + else { + int decimal = Py_UNICODE_TODECIMAL(ch); + if (decimal < 0) { + out[i] = '?'; + _PyUnicode_LENGTH(result) = i + 1; + break; + } + out[i] = '0' + decimal; + } + } + + return result; } PyObject * @@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str, return tailmatch(str, substr, start, end, direction); } -/* Apply fixfct filter to the Unicode object self 
and return a - reference to the modified object */ - -static PyObject * -fixup(PyObject *self, - Py_UCS4 (*fixfct)(PyObject *s)) -{ - PyObject *u; - Py_UCS4 maxchar_old, maxchar_new = 0; - PyObject *v; - - u = _PyUnicode_Copy(self); - if (u == NULL) - return NULL; - maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); - - /* fix functions return the new maximum character in a string, - if the kind of the resulting unicode object does not change, - everything is fine. Otherwise we need to change the string kind - and re-run the fix function. */ - maxchar_new = fixfct(u); - - if (maxchar_new == 0) { - /* no changes */; - if (PyUnicode_CheckExact(self)) { - Py_DECREF(u); - Py_INCREF(self); - return self; - } - else - return u; - } - - maxchar_new = align_maxchar(maxchar_new); - - if (maxchar_new == maxchar_old) - return u; - - /* In case the maximum character changed, we need to - convert the string to the new category. */ - v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); - if (v == NULL) { - Py_DECREF(u); - return NULL; - } - if (maxchar_new > maxchar_old) { - /* If the maxchar increased so that the kind changed, not all - characters are representable anymore and we need to fix the - string again. This only happens in very few cases. */ - _PyUnicode_FastCopyCharacters(v, 0, - self, 0, PyUnicode_GET_LENGTH(self)); - maxchar_old = fixfct(v); - assert(maxchar_old > 0 && maxchar_old <= maxchar_new); - } - else { - _PyUnicode_FastCopyCharacters(v, 0, - u, 0, PyUnicode_GET_LENGTH(self)); - } - Py_DECREF(u); - assert(_PyUnicode_CheckConsistency(v, 1)); - return v; -} - static PyObject * ascii_upper_or_lower(PyObject *self, int lower) { -- cgit v0.12