summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/c-api/unicode.rst7
-rw-r--r--Include/unicodeobject.h11
-rw-r--r--Lib/test/test_complex.py3
-rw-r--r--Lib/test/test_float.py18
-rw-r--r--Lib/test/test_int.py15
-rw-r--r--Lib/test/test_unicode.py9
-rw-r--r--Misc/NEWS4
-rw-r--r--Objects/complexobject.c30
-rw-r--r--Objects/floatobject.c58
-rw-r--r--Objects/longobject.c31
-rw-r--r--Objects/unicodeobject.c35
11 files changed, 169 insertions, 52 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 4530422..9edbcbb 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -328,6 +328,13 @@ APIs:
Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two
arguments.
+.. c:function:: PyObject* PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t size)
+
+ Create a Unicode object by replacing all decimal digits in the
+ :c:type:`Py_UNICODE` buffer *s* of the given *size* by ASCII digits 0--9
+ according to their decimal value. Return *NULL* if an exception
+ occurs.
+
.. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 116bb82..abd286d 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1225,6 +1225,17 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
);
#endif
+/* Transforms code points that have decimal digit property to the
+ corresponding ASCII digit code points.
+
+ Returns a new Unicode string on success, NULL on failure.
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
+ Py_UNICODE *s, /* Unicode buffer */
+ Py_ssize_t length /* Number of Py_UNICODE chars to transform */
+ );
+
/* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using
diff --git a/Lib/test/test_complex.py b/Lib/test/test_complex.py
index cc21aa7..2352ef1 100644
--- a/Lib/test/test_complex.py
+++ b/Lib/test/test_complex.py
@@ -220,6 +220,7 @@ class ComplexTest(unittest.TestCase):
self.assertEqual(complex(NS(1+10j)), 1+10j)
self.assertRaises(TypeError, complex, OS(None))
self.assertRaises(TypeError, complex, NS(None))
+ self.assertRaises(TypeError, complex, {})
self.assertAlmostEqual(complex("1+10j"), 1+10j)
self.assertAlmostEqual(complex(10), 10+0j)
@@ -325,6 +326,8 @@ class ComplexTest(unittest.TestCase):
# check that complex accepts long unicode strings
self.assertEqual(type(complex("1"*500)), complex)
+ # check whitespace processing
+ self.assertEqual(complex('\N{EM SPACE}(\N{EN SPACE}1+1j ) '), 1+1j)
class EvilExc(Exception):
pass
diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py
index 0072133..9bcd63d 100644
--- a/Lib/test/test_float.py
+++ b/Lib/test/test_float.py
@@ -43,14 +43,30 @@ class GeneralFloatCases(unittest.TestCase):
self.assertRaises(ValueError, float, "+.inf")
self.assertRaises(ValueError, float, ".")
self.assertRaises(ValueError, float, "-.")
+ self.assertRaises(ValueError, float, b"-")
+ self.assertRaises(TypeError, float, {})
+ # Lone surrogate
+ self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
# check that we don't accept alternate exponent markers
self.assertRaises(ValueError, float, "-1.7d29")
self.assertRaises(ValueError, float, "3D-14")
- self.assertEqual(float(b" \u0663.\u0661\u0664 ".decode('raw-unicode-escape')), 3.14)
+ self.assertEqual(float(" \u0663.\u0661\u0664 "), 3.14)
+ self.assertEqual(float("\N{EM SPACE}3.14\N{EN SPACE}"), 3.14)
# extra long strings should not be a problem
float(b'.' + b'1'*1000)
float('.' + '1'*1000)
+ def test_error_message(self):
+ testlist = ('\xbd', '123\xbd', ' 123 456 ')
+ for s in testlist:
+ try:
+ float(s)
+ except ValueError as e:
+ self.assertIn(s.strip(), e.args[0])
+ else:
+ self.fail("Expected int(%r) to raise a ValueError", s)
+
+
@support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE')
def test_float_with_comma(self):
# set locale to something that doesn't use '.' for the decimal point
diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
index 86c4dd7..437e323 100644
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@@ -20,7 +20,8 @@ L = [
(' 1\02 ', ValueError),
('', ValueError),
(' ', ValueError),
- (' \t\t ', ValueError)
+ (' \t\t ', ValueError),
+ ("\u0200", ValueError)
]
class IntTestCases(unittest.TestCase):
@@ -35,6 +36,8 @@ class IntTestCases(unittest.TestCase):
self.assertEqual(int(3.5), 3)
self.assertEqual(int(-3.5), -3)
self.assertEqual(int("-3"), -3)
+ self.assertEqual(int(" -3 "), -3)
+ self.assertEqual(int("\N{EM SPACE}-3\N{EN SPACE}"), -3)
# Different base:
self.assertEqual(int("10",16), 16)
# Test conversion from strings and various anomalies
@@ -302,6 +305,16 @@ class IntTestCases(unittest.TestCase):
self.fail("Failed to raise TypeError with %s" %
((base, trunc_result_base),))
+ def test_error_message(self):
+ testlist = ('\xbd', '123\xbd', ' 123 456 ')
+ for s in testlist:
+ try:
+ int(s)
+ except ValueError as e:
+ self.assertIn(s.strip(), e.args[0])
+ else:
+ self.fail("Expected int(%r) to raise a ValueError", s)
+
def test_main():
run_unittest(IntTestCases)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index c5a0f80..2de9e7f 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1168,8 +1168,13 @@ class UnicodeTest(string_tests.CommonTest,
# Error handling (wrong arguments)
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
- # Error handling (PyUnicode_EncodeDecimal())
- self.assertRaises(UnicodeError, int, "\u0200")
+ # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
+ self.assertRaises(UnicodeError, int, "\ud800")
+ self.assertRaises(UnicodeError, int, "\udf00")
+ self.assertRaises(UnicodeError, float, "\ud800")
+ self.assertRaises(UnicodeError, float, "\udf00")
+ self.assertRaises(UnicodeError, complex, "\ud800")
+ self.assertRaises(UnicodeError, complex, "\udf00")
def test_codecs(self):
# Encoding
diff --git a/Misc/NEWS b/Misc/NEWS
index 59946bd..f53a486 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -222,6 +222,10 @@ Library
C-API
-----
+- Issue #10557: Added a new API function, PyUnicode_TransformDecimalToASCII(),
+ which transforms non-ASCII decimal digits in a Unicode string to their
+ ASCII equivalents.
+
- Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly
zero-initialize all fields, fixing compiler warnings seen when building
extension modules with gcc with "-Wmissing-field-initializers" (implied by
diff --git a/Objects/complexobject.c b/Objects/complexobject.c
index 5999796..ec529d5 100644
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -766,20 +766,26 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
char *end;
double x=0.0, y=0.0, z;
int got_bracket=0;
- char *s_buffer = NULL;
+ PyObject *s_buffer = NULL;
Py_ssize_t len;
if (PyUnicode_Check(v)) {
- s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v) + 1);
+ Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+ Py_UNICODE *bufptr;
+ s_buffer = PyUnicode_TransformDecimalToASCII(
+ PyUnicode_AS_UNICODE(v), buflen);
if (s_buffer == NULL)
- return PyErr_NoMemory();
- if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
- PyUnicode_GET_SIZE(v),
- s_buffer,
- NULL))
+ return NULL;
+ /* Replace non-ASCII whitespace with ' ' */
+ bufptr = PyUnicode_AS_UNICODE(s_buffer);
+ for (i = 0; i < buflen; i++) {
+ Py_UNICODE ch = bufptr[i];
+ if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+ bufptr[i] = ' ';
+ }
+ s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+ if (s == NULL)
goto error;
- s = s_buffer;
- len = strlen(s);
}
else if (PyObject_AsCharBuffer(v, &s, &len)) {
PyErr_SetString(PyExc_TypeError,
@@ -894,16 +900,14 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
if (s-start != len)
goto parse_error;
- if (s_buffer)
- PyMem_FREE(s_buffer);
+ Py_XDECREF(s_buffer);
return complex_subtype_from_doubles(type, x, y);
parse_error:
PyErr_SetString(PyExc_ValueError,
"complex() arg is a malformed string");
error:
- if (s_buffer)
- PyMem_FREE(s_buffer);
+ Py_XDECREF(s_buffer);
return NULL;
}
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index 4decb0b..8409f0a 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -174,22 +174,30 @@ PyFloat_FromString(PyObject *v)
{
const char *s, *last, *end;
double x;
- char buffer[256]; /* for errors */
- char *s_buffer = NULL;
+ PyObject *s_buffer = NULL;
Py_ssize_t len;
PyObject *result = NULL;
if (PyUnicode_Check(v)) {
- s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1);
+ Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+ Py_UNICODE *bufptr;
+ s_buffer = PyUnicode_TransformDecimalToASCII(
+ PyUnicode_AS_UNICODE(v), buflen);
if (s_buffer == NULL)
- return PyErr_NoMemory();
- if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
- PyUnicode_GET_SIZE(v),
- s_buffer,
- NULL))
- goto error;
- s = s_buffer;
- len = strlen(s);
+ return NULL;
+ /* Replace non-ASCII whitespace with ' ' */
+ bufptr = PyUnicode_AS_UNICODE(s_buffer);
+ for (i = 0; i < buflen; i++) {
+ Py_UNICODE ch = bufptr[i];
+ if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+ bufptr[i] = ' ';
+ }
+ s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+ if (s == NULL) {
+ Py_DECREF(s_buffer);
+ return NULL;
+ }
+ last = s + len;
}
else if (PyObject_AsCharBuffer(v, &s, &len)) {
PyErr_SetString(PyExc_TypeError,
@@ -197,29 +205,27 @@ PyFloat_FromString(PyObject *v)
return NULL;
}
last = s + len;
-
- while (Py_ISSPACE(*s))
+ /* strip space */
+ while (s < last && Py_ISSPACE(*s))
s++;
+ while (s < last - 1 && Py_ISSPACE(last[-1]))
+ last--;
/* We don't care about overflow or underflow. If the platform
* supports them, infinities and signed zeroes (on underflow) are
* fine. */
x = PyOS_string_to_double(s, (char **)&end, NULL);
- if (x == -1.0 && PyErr_Occurred())
- goto error;
- while (Py_ISSPACE(*end))
- end++;
- if (end == last)
- result = PyFloat_FromDouble(x);
- else {
- PyOS_snprintf(buffer, sizeof(buffer),
- "invalid literal for float(): %.200s", s);
- PyErr_SetString(PyExc_ValueError, buffer);
+ if (end != last) {
+ PyErr_Format(PyExc_ValueError,
+ "could not convert string to float: "
+ "%R", v);
result = NULL;
}
+ else if (x == -1.0 && PyErr_Occurred())
+ result = NULL;
+ else
+ result = PyFloat_FromDouble(x);
- error:
- if (s_buffer)
- PyMem_FREE(s_buffer);
+ Py_XDECREF(s_buffer);
return result;
}
diff --git a/Objects/longobject.c b/Objects/longobject.c
index e8a7284..534e52d 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -2133,17 +2133,34 @@ PyObject *
PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base)
{
PyObject *result;
- char *buffer = (char *)PyMem_MALLOC(length+1);
+ PyObject *asciidig;
+ char *buffer, *end;
+ Py_ssize_t i, buflen;
+ Py_UNICODE *ptr;
- if (buffer == NULL)
+ asciidig = PyUnicode_TransformDecimalToASCII(u, length);
+ if (asciidig == NULL)
return NULL;
-
- if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) {
- PyMem_FREE(buffer);
+ /* Replace non-ASCII whitespace with ' ' */
+ ptr = PyUnicode_AS_UNICODE(asciidig);
+ for (i = 0; i < length; i++) {
+ Py_UNICODE ch = ptr[i];
+ if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+ ptr[i] = ' ';
+ }
+ buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen);
+ if (buffer == NULL) {
+ Py_DECREF(asciidig);
return NULL;
}
- result = PyLong_FromString(buffer, NULL, base);
- PyMem_FREE(buffer);
+ result = PyLong_FromString(buffer, &end, base);
+ if (result != NULL && end != buffer + buflen) {
+ PyErr_SetString(PyExc_ValueError,
+ "null byte in argument for int()");
+ Py_DECREF(result);
+ result = NULL;
+ }
+ Py_DECREF(asciidig);
return result;
}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d3a2d1b..751da30 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6206,6 +6206,30 @@ PyObject *PyUnicode_Translate(PyObject *str,
return NULL;
}
+/* Build a new Unicode object from the `length` code units starting at `s`.
+   Every code unit above 127 for which Py_UNICODE_TODECIMAL() reports a
+   non-negative value is replaced by the ASCII digit '0' + value; all other
+   code units are copied unchanged.
+
+   Returns the new object, or NULL if the result could not be allocated. */
+PyObject *
+PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
+                                  Py_ssize_t length)
+{
+    PyObject *result;
+    Py_UNICODE *p; /* write pointer into result */
+    Py_ssize_t i;
+    /* Allocate the result and check it *before* touching its buffer;
+       copying first would dereference NULL on allocation failure. */
+    result = (PyObject *)_PyUnicode_New(length);
+    if (result == NULL)
+        return NULL;
+    /* Start from a verbatim copy, then patch the digits in place. */
+    p = PyUnicode_AS_UNICODE(result);
+    Py_UNICODE_COPY(p, s, length);
+    /* Iterate over code points */
+    for (i = 0; i < length; i++) {
+        Py_UNICODE ch = s[i];
+        if (ch > 127) {
+            /* ASCII input is already in its final form; only non-ASCII
+               code points can need mapping to '0'..'9'. */
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal >= 0)
+                p[i] = '0' + decimal;
+        }
+    }
+    return result;
+}
/* --- Decimal Encoder ---------------------------------------------------- */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
@@ -8967,6 +8991,13 @@ unicode_freelistsize(PyUnicodeObject *self)
{
return PyLong_FromLong(numfree);
}
+
+/* Method wrapper: run PyUnicode_TransformDecimalToASCII() over self's own
+   Py_UNICODE buffer and hand back whatever it returns. */
+static PyObject *
+unicode__decimal2ascii(PyObject *self)
+{
+    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
+    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);
+    return PyUnicode_TransformDecimalToASCII(chars, nchars);
+}
#endif
PyDoc_STRVAR(startswith__doc__,
@@ -9108,7 +9139,6 @@ unicode_getnewargs(PyUnicodeObject *v)
return Py_BuildValue("(u#)", v->str, v->length);
}
-
static PyMethodDef unicode_methods[] = {
/* Order is according to common usage: often used methods should
@@ -9170,8 +9200,9 @@ static PyMethodDef unicode_methods[] = {
#endif
#if 0
- /* This one is just used for debugging the implementation. */
+ /* These methods are just used for debugging the implementation. */
{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
+ {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif
{"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},