summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2017-11-13 19:23:48 (GMT)
committerGitHub <noreply@github.com>2017-11-13 19:23:48 (GMT)
commit9b6c60cbce4ac45e8ccd7934babff465e9769509 (patch)
tree973d37d42dfe1ce66303ad0cf658bb20870aa88e
parentce12629c84400c52734859e43b2386deb2b6da12 (diff)
downloadcpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.zip
cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.gz
cpython-9b6c60cbce4ac45e8ccd7934babff465e9769509.tar.bz2
bpo-31979: Simplify transforming decimals to ASCII (#4336)
in int(), float() and complex() parsers. This also speeds up parsing non-ASCII numbers by around 20%.
-rw-r--r--Include/unicodeobject.h16
-rw-r--r--Lib/test/test_float.py2
-rw-r--r--Lib/test/test_unicode.py13
-rw-r--r--Objects/complexobject.c7
-rw-r--r--Objects/floatobject.c7
-rw-r--r--Objects/longobject.c21
-rw-r--r--Objects/unicodeobject.c136
7 files changed, 63 insertions, 139 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index d2a8ec2..61e713b 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
#endif /* MS_WINDOWS */
+#ifndef Py_LIMITED_API
/* --- Decimal Encoder ---------------------------------------------------- */
/* Takes a Unicode string holding a decimal value and writes it into
@@ -1747,14 +1748,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
*/
-#ifndef Py_LIMITED_API
PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
char *output, /* Output buffer; must have size >= length */
const char *errors /* error handling */
) /* Py_DEPRECATED(3.3) */;
-#endif
/* Transforms code points that have decimal digit property to the
corresponding ASCII digit code points.
@@ -1762,19 +1761,18 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Returns a new Unicode string on success, NULL on failure.
*/
-#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length /* Number of Py_UNICODE chars to transform */
) /* Py_DEPRECATED(3.3) */;
-#endif
-/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
- as argument instead of a raw buffer and length. This function additionally
- transforms spaces to ASCII because this is what the callers in longobject,
- floatobject, and complexobject did anyways. */
+/* Coverts a Unicode object holding a decimal value to an ASCII string
+ for using in int, float and complex parsers.
+ Transforms code points that have decimal digit property to the
+ corresponding ASCII digit code points. Transforms spaces to ASCII.
+ Transforms code points starting from the first non-ASCII code point that
+ is neither a decimal digit nor a space to the end into '?'. */
-#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
PyObject *unicode /* Unicode object */
);
diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py
index a16c05c..17174dd 100644
--- a/Lib/test/test_float.py
+++ b/Lib/test/test_float.py
@@ -51,7 +51,7 @@ class GeneralFloatCases(unittest.TestCase):
self.assertRaises(TypeError, float, {})
self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
# Lone surrogate
- self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
+ self.assertRaises(ValueError, float, '\uD8F0')
# check that we don't accept alternate exponent markers
self.assertRaises(ValueError, float, "-1.7d29")
self.assertRaises(ValueError, float, "3D-14")
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 341007b..2b77863 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -2068,11 +2068,14 @@ class UnicodeTest(string_tests.CommonTest,
# Error handling (wrong arguments)
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
- # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
- self.assertRaises(UnicodeError, float, "\ud800")
- self.assertRaises(UnicodeError, float, "\udf00")
- self.assertRaises(UnicodeError, complex, "\ud800")
- self.assertRaises(UnicodeError, complex, "\udf00")
+ # Error handling (lone surrogate in
+ # _PyUnicode_TransformDecimalAndSpaceToASCII())
+ self.assertRaises(ValueError, int, "\ud800")
+ self.assertRaises(ValueError, int, "\udf00")
+ self.assertRaises(ValueError, float, "\ud800")
+ self.assertRaises(ValueError, float, "\udf00")
+ self.assertRaises(ValueError, complex, "\ud800")
+ self.assertRaises(ValueError, complex, "\udf00")
def test_codecs(self):
# Encoding
diff --git a/Objects/complexobject.c b/Objects/complexobject.c
index 4bcf2ce..2c886c7 100644
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
if (s_buffer == NULL) {
return NULL;
}
+ assert(PyUnicode_IS_ASCII(s_buffer));
+ /* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
- if (s == NULL) {
- goto exit;
- }
+ assert(s != NULL);
}
else {
PyErr_Format(PyExc_TypeError,
@@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
complex_from_string_inner);
- exit:
Py_DECREF(s_buffer);
return result;
}
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index 8d7a55a..47a174c 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v)
s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
if (s_buffer == NULL)
return NULL;
+ assert(PyUnicode_IS_ASCII(s_buffer));
+ /* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
- if (s == NULL) {
- Py_DECREF(s_buffer);
- return NULL;
- }
+ assert(s != NULL);
}
else if (PyBytes_Check(v)) {
s = PyBytes_AS_STRING(v);
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 6fd4feb..7437155 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -2509,21 +2509,18 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
if (asciidig == NULL)
return NULL;
+ assert(PyUnicode_IS_ASCII(asciidig));
+ /* Simply get a pointer to existing ASCII characters. */
buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
- if (buffer == NULL) {
- Py_DECREF(asciidig);
- if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
- return NULL;
- }
- else {
- result = PyLong_FromString(buffer, &end, base);
- if (end == NULL || (result != NULL && end == buffer + buflen)) {
- Py_DECREF(asciidig);
- return result;
- }
+ assert(buffer != NULL);
+
+ result = PyLong_FromString(buffer, &end, base);
+ if (end == NULL || (result != NULL && end == buffer + buflen)) {
Py_DECREF(asciidig);
- Py_XDECREF(result);
+ return result;
}
+ Py_DECREF(asciidig);
+ Py_XDECREF(result);
PyErr_Format(PyExc_ValueError,
"invalid literal for int() with base %d: %.200R",
base, u);
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index fdc3197..f6b2b65 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
/* --- Unicode Object ----------------------------------------------------- */
-static PyObject *
-fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
-
static inline Py_ssize_t
findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
@@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
return _PyUnicode_TranslateCharmap(str, mapping, errors);
}
-static Py_UCS4
-fix_decimal_and_space_to_ascii(PyObject *self)
-{
- /* No need to call PyUnicode_READY(self) because this function is only
- called as a callback from fixup() which does it already. */
- const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
- const int kind = PyUnicode_KIND(self);
- void *data = PyUnicode_DATA(self);
- Py_UCS4 maxchar = 127, ch, fixed;
- int modified = 0;
- Py_ssize_t i;
-
- for (i = 0; i < len; ++i) {
- ch = PyUnicode_READ(kind, data, i);
- fixed = 0;
- if (ch > 127) {
- if (Py_UNICODE_ISSPACE(ch))
- fixed = ' ';
- else {
- const int decimal = Py_UNICODE_TODECIMAL(ch);
- if (decimal >= 0)
- fixed = '0' + decimal;
- }
- if (fixed != 0) {
- modified = 1;
- maxchar = Py_MAX(maxchar, fixed);
- PyUnicode_WRITE(kind, data, i, fixed);
- }
- else
- maxchar = Py_MAX(maxchar, ch);
- }
- }
-
- return (modified) ? maxchar : 0;
-}
-
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
@@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
- if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
+ if (PyUnicode_IS_ASCII(unicode)) {
/* If the string is already ASCII, just return the same string */
Py_INCREF(unicode);
return unicode;
}
- return fixup(unicode, fix_decimal_and_space_to_ascii);
+
+ Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+ PyObject *result = PyUnicode_New(len, 127);
+ if (result == NULL) {
+ return NULL;
+ }
+
+ Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
+ int kind = PyUnicode_KIND(unicode);
+ const void *data = PyUnicode_DATA(unicode);
+ Py_ssize_t i;
+ for (i = 0; i < len; ++i) {
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+ if (ch < 127) {
+ out[i] = ch;
+ }
+ else if (Py_UNICODE_ISSPACE(ch)) {
+ out[i] = ' ';
+ }
+ else {
+ int decimal = Py_UNICODE_TODECIMAL(ch);
+ if (decimal < 0) {
+ out[i] = '?';
+ _PyUnicode_LENGTH(result) = i + 1;
+ break;
+ }
+ out[i] = '0' + decimal;
+ }
+ }
+
+ return result;
}
PyObject *
@@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
return tailmatch(str, substr, start, end, direction);
}
-/* Apply fixfct filter to the Unicode object self and return a
- reference to the modified object */
-
-static PyObject *
-fixup(PyObject *self,
- Py_UCS4 (*fixfct)(PyObject *s))
-{
- PyObject *u;
- Py_UCS4 maxchar_old, maxchar_new = 0;
- PyObject *v;
-
- u = _PyUnicode_Copy(self);
- if (u == NULL)
- return NULL;
- maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
-
- /* fix functions return the new maximum character in a string,
- if the kind of the resulting unicode object does not change,
- everything is fine. Otherwise we need to change the string kind
- and re-run the fix function. */
- maxchar_new = fixfct(u);
-
- if (maxchar_new == 0) {
- /* no changes */;
- if (PyUnicode_CheckExact(self)) {
- Py_DECREF(u);
- Py_INCREF(self);
- return self;
- }
- else
- return u;
- }
-
- maxchar_new = align_maxchar(maxchar_new);
-
- if (maxchar_new == maxchar_old)
- return u;
-
- /* In case the maximum character changed, we need to
- convert the string to the new category. */
- v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
- if (v == NULL) {
- Py_DECREF(u);
- return NULL;
- }
- if (maxchar_new > maxchar_old) {
- /* If the maxchar increased so that the kind changed, not all
- characters are representable anymore and we need to fix the
- string again. This only happens in very few cases. */
- _PyUnicode_FastCopyCharacters(v, 0,
- self, 0, PyUnicode_GET_LENGTH(self));
- maxchar_old = fixfct(v);
- assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
- }
- else {
- _PyUnicode_FastCopyCharacters(v, 0,
- u, 0, PyUnicode_GET_LENGTH(self));
- }
- Py_DECREF(u);
- assert(_PyUnicode_CheckConsistency(v, 1));
- return v;
-}
-
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{