Issue #10557: Fixed error messages from float() and other numeric types.

Added a new API function, PyUnicode_TransformDecimalToASCII(), which transforms non-ASCII decimal digits in a Unicode string to their ASCII equivalents.
author: Alexander Belopolsky <alexander.belopolsky@gmail.com> 2010-12-04 03:38:46 (GMT)
committer: Alexander Belopolsky <alexander.belopolsky@gmail.com> 2010-12-04 03:38:46 (GMT)
commit: 942af5a9a45b7b4976bea2e794eccaaf2b3b5c09 (patch)
tree: f621bdffa16dd0b04d7bf60d6a32f198fc7b3ec8
parent: 36526bf3d95763afa6d4efe402b8840b1532d637 (diff)
download: cpython-942af5a9a45b7b4976bea2e794eccaaf2b3b5c09.zip
cpython-942af5a9a45b7b4976bea2e794eccaaf2b3b5c09.tar.gz
cpython-942af5a9a45b7b4976bea2e794eccaaf2b3b5c09.tar.bz2
11 files changed, 169 insertions, 52 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 4530422..9edbcbb 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -328,6 +328,13 @@ APIs:
    Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two
    arguments.
 
+.. c:function:: PyObject* PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t size)
+
+   Create a Unicode object by replacing all decimal digits in the
+   :c:type:`Py_UNICODE` buffer of the given *size* by ASCII digits 0--9
+   according to their decimal value.  Return *NULL* if an exception
+   occurs.
+
 
 .. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode)
 
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 116bb82..abd286d 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1225,6 +1225,17 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
     );
 #endif
 
+/* Transforms code points that have decimal digit property to the
+   corresponding ASCII digit code points.
+
+   Returns a new Unicode string on success, NULL on failure.
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
+    Py_UNICODE *s,              /* Unicode buffer */
+    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
+    );
+
 /* --- File system encoding ---------------------------------------------- */
 
 /* ParseTuple converter: encode str objects to bytes using
diff --git a/Lib/test/test_complex.py b/Lib/test/test_complex.py
index cc21aa7..2352ef1 100644
--- a/Lib/test/test_complex.py
+++ b/Lib/test/test_complex.py
@@ -220,6 +220,7 @@ class ComplexTest(unittest.TestCase):
         self.assertEqual(complex(NS(1+10j)), 1+10j)
         self.assertRaises(TypeError, complex, OS(None))
         self.assertRaises(TypeError, complex, NS(None))
+        self.assertRaises(TypeError, complex, {})
 
         self.assertAlmostEqual(complex("1+10j"), 1+10j)
         self.assertAlmostEqual(complex(10), 10+0j)
@@ -325,6 +326,8 @@ class ComplexTest(unittest.TestCase):
 
         # check that complex accepts long unicode strings
         self.assertEqual(type(complex("1"*500)), complex)
+        # check whitespace processing
+        self.assertEqual(complex('\N{EM SPACE}(\N{EN SPACE}1+1j ) '), 1+1j)
 
         class EvilExc(Exception):
             pass
diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py
index 0072133..9bcd63d 100644
--- a/Lib/test/test_float.py
+++ b/Lib/test/test_float.py
@@ -43,14 +43,30 @@ class GeneralFloatCases(unittest.TestCase):
         self.assertRaises(ValueError, float, "+.inf")
         self.assertRaises(ValueError, float, ".")
         self.assertRaises(ValueError, float, "-.")
+        self.assertRaises(ValueError, float, b"-")
+        self.assertRaises(TypeError, float, {})
+        # Lone surrogate
+        self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
         # check that we don't accept alternate exponent markers
         self.assertRaises(ValueError, float, "-1.7d29")
         self.assertRaises(ValueError, float, "3D-14")
-        self.assertEqual(float(b"  \u0663.\u0661\u0664  ".decode('raw-unicode-escape')), 3.14)
+        self.assertEqual(float("  \u0663.\u0661\u0664  "), 3.14)
+        self.assertEqual(float("\N{EM SPACE}3.14\N{EN SPACE}"), 3.14)
         # extra long strings should not be a problem
         float(b'.' + b'1'*1000)
         float('.' + '1'*1000)
 
+    def test_error_message(self):
+        testlist = ('\xbd', '123\xbd', '  123 456  ')  # all invalid floats
+        for s in testlist:
+            try:
+                float(s)
+            except ValueError as e:
+                self.assertIn(s.strip(), e.args[0])  # message quotes input
+            else:
+                self.fail("Expected float(%r) to raise a ValueError" % (s,))
+
+
     @support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE')
     def test_float_with_comma(self):
         # set locale to something that doesn't use '.' for the decimal point
diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
index 86c4dd7..437e323 100644
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@@ -20,7 +20,8 @@ L = [
         ('  1\02  ', ValueError),
         ('', ValueError),
         (' ', ValueError),
-        ('  \t\t  ', ValueError)
+        ('  \t\t  ', ValueError),
+        ("\u0200", ValueError)
 ]
 
 class IntTestCases(unittest.TestCase):
@@ -35,6 +36,8 @@ class IntTestCases(unittest.TestCase):
         self.assertEqual(int(3.5), 3)
         self.assertEqual(int(-3.5), -3)
         self.assertEqual(int("-3"), -3)
+        self.assertEqual(int(" -3 "), -3)
+        self.assertEqual(int("\N{EM SPACE}-3\N{EN SPACE}"), -3)
         # Different base:
         self.assertEqual(int("10",16), 16)
         # Test conversion from strings and various anomalies
@@ -302,6 +305,16 @@ class IntTestCases(unittest.TestCase):
                     self.fail("Failed to raise TypeError with %s" %
                               ((base, trunc_result_base),))
 
+    def test_error_message(self):
+        testlist = ('\xbd', '123\xbd', '  123 456  ')  # all invalid ints
+        for s in testlist:
+            try:
+                int(s)
+            except ValueError as e:
+                self.assertIn(s.strip(), e.args[0])  # message quotes input
+            else:
+                self.fail("Expected int(%r) to raise a ValueError" % (s,))
+
 def test_main():
     run_unittest(IntTestCases)
 
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index c5a0f80..2de9e7f 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1168,8 +1168,13 @@ class UnicodeTest(string_tests.CommonTest,
         # Error handling (wrong arguments)
         self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
 
-        # Error handling (PyUnicode_EncodeDecimal())
-        self.assertRaises(UnicodeError, int, "\u0200")
+        # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
+        self.assertRaises(UnicodeError, int, "\ud800")
+        self.assertRaises(UnicodeError, int, "\udf00")
+        self.assertRaises(UnicodeError, float, "\ud800")
+        self.assertRaises(UnicodeError, float, "\udf00")
+        self.assertRaises(UnicodeError, complex, "\ud800")
+        self.assertRaises(UnicodeError, complex, "\udf00")
 
     def test_codecs(self):
         # Encoding
diff --git a/Misc/NEWS b/Misc/NEWS
index 59946bd..f53a486 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -222,6 +222,10 @@ Library
 C-API
 -----
 
+- Issue #10557: Added a new API function, PyUnicode_TransformDecimalToASCII(),
+  which transforms non-ASCII decimal digits in a Unicode string to their
+  ASCII equivalents. 
+
 - Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly
   zero-initialize all fields, fixing compiler warnings seen when building
   extension modules with gcc with "-Wmissing-field-initializers" (implied by
diff --git a/Objects/complexobject.c b/Objects/complexobject.c
index 5999796..ec529d5 100644
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -766,20 +766,26 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
     char *end;
     double x=0.0, y=0.0, z;
     int got_bracket=0;
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
     Py_ssize_t len;
 
     if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v) + 1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+        Py_UNICODE *bufptr;
+        s_buffer = PyUnicode_TransformDecimalToASCII(
+            PyUnicode_AS_UNICODE(v), buflen);
         if (s_buffer == NULL)
-            return PyErr_NoMemory();
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
-                                    PyUnicode_GET_SIZE(v),
-                                    s_buffer,
-                                    NULL))
+            return NULL;
+        /* Replace non-ASCII whitespace with ' ' */
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
+        for (i = 0; i < buflen; i++) {
+            Py_UNICODE ch = bufptr[i];
+            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+                bufptr[i] = ' ';
+        }
+        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+        if (s == NULL)
             goto error;
-        s = s_buffer;
-        len = strlen(s);
     }
     else if (PyObject_AsCharBuffer(v, &s, &len)) {
         PyErr_SetString(PyExc_TypeError,
@@ -894,16 +900,14 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
     if (s-start != len)
         goto parse_error;
 
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
     return complex_subtype_from_doubles(type, x, y);
 
   parse_error:
     PyErr_SetString(PyExc_ValueError,
                     "complex() arg is a malformed string");
   error:
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
     return NULL;
 }
 
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index 4decb0b..8409f0a 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -174,22 +174,30 @@ PyFloat_FromString(PyObject *v)
 {
     const char *s, *last, *end;
     double x;
-    char buffer[256]; /* for errors */
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
     Py_ssize_t len;
     PyObject *result = NULL;
 
     if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+        Py_UNICODE *bufptr;
+        s_buffer = PyUnicode_TransformDecimalToASCII(
+            PyUnicode_AS_UNICODE(v), buflen);
         if (s_buffer == NULL)
-            return PyErr_NoMemory();
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
-                                    PyUnicode_GET_SIZE(v),
-                                    s_buffer,
-                                    NULL))
-            goto error;
-        s = s_buffer;
-        len = strlen(s);
+            return NULL;
+        /* Replace non-ASCII whitespace with ' ' */
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
+        for (i = 0; i < buflen; i++) {
+            Py_UNICODE ch = bufptr[i];
+            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+                bufptr[i] = ' ';
+        }
+        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+        if (s == NULL) {
+            Py_DECREF(s_buffer);
+            return NULL;
+        }
+        last = s + len;
     }
     else if (PyObject_AsCharBuffer(v, &s, &len)) {
         PyErr_SetString(PyExc_TypeError,
@@ -197,29 +205,27 @@ PyFloat_FromString(PyObject *v)
         return NULL;
     }
     last = s + len;
-
-    while (Py_ISSPACE(*s))
+    /* strip space */
+    while (s < last && Py_ISSPACE(*s))
         s++;
+    while (s < last - 1 && Py_ISSPACE(last[-1]))
+        last--;
     /* We don't care about overflow or underflow.  If the platform
      * supports them, infinities and signed zeroes (on underflow) are
      * fine. */
     x = PyOS_string_to_double(s, (char **)&end, NULL);
-    if (x == -1.0 && PyErr_Occurred())
-        goto error;
-    while (Py_ISSPACE(*end))
-        end++;
-    if (end == last)
-        result = PyFloat_FromDouble(x);
-    else {
-        PyOS_snprintf(buffer, sizeof(buffer),
-                      "invalid literal for float(): %.200s", s);
-        PyErr_SetString(PyExc_ValueError, buffer);
+    if (end != last) {
+        PyErr_Format(PyExc_ValueError,
+                     "could not convert string to float: "
+                     "%R", v);
         result = NULL;
     }
+    else if (x == -1.0 && PyErr_Occurred())
+        result = NULL;
+    else
+        result = PyFloat_FromDouble(x);
 
-  error:
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
     return result;
 }
 
diff --git a/Objects/longobject.c b/Objects/longobject.c
index e8a7284..534e52d 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -2133,17 +2133,34 @@ PyObject *
 PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base)
 {
     PyObject *result;
-    char *buffer = (char *)PyMem_MALLOC(length+1);
+    PyObject *asciidig;
+    char *buffer, *end;
+    Py_ssize_t i, buflen;
+    Py_UNICODE *ptr;
 
-    if (buffer == NULL)
+    asciidig = PyUnicode_TransformDecimalToASCII(u, length);
+    if (asciidig == NULL)
         return NULL;
-
-    if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) {
-        PyMem_FREE(buffer);
+    /* Replace non-ASCII whitespace with ' ' */
+    ptr = PyUnicode_AS_UNICODE(asciidig);
+    for (i = 0; i < length; i++) {
+      Py_UNICODE ch = ptr[i];
+      if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+        ptr[i] = ' ';
+    }
+    buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen);
+    if (buffer == NULL) {
+        Py_DECREF(asciidig);
         return NULL;
     }
-    result = PyLong_FromString(buffer, NULL, base);
-    PyMem_FREE(buffer);
+    result = PyLong_FromString(buffer, &end, base);
+    if (result != NULL && end != buffer + buflen) {
+        PyErr_SetString(PyExc_ValueError,
+                        "null byte in argument for int()");
+        Py_DECREF(result);
+        result = NULL;
+    }
+    Py_DECREF(asciidig);
     return result;
 }
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d3a2d1b..751da30 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6206,6 +6206,30 @@ PyObject *PyUnicode_Translate(PyObject *str,
     return NULL;
 }
 
+PyObject *
+PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
+                                  Py_ssize_t length)
+{
+    PyObject *result;
+    Py_UNICODE *p; /* write pointer into result */
+    Py_ssize_t i;
+    /* Allocate the copy; return NULL before touching it if that failed */
+    result = (PyObject *)_PyUnicode_New(length);
+    if (result == NULL)
+        return NULL;
+    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
+    p = PyUnicode_AS_UNICODE(result);
+    /* Non-ASCII decimal digits become '0'..'9'; the rest stays as copied */
+    for (i = 0; i < length; i++) {
+        Py_UNICODE ch = s[i];
+        if (ch > 127) {
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal >= 0)
+                p[i] = '0' + decimal;
+        }
+    }
+    return result;
+}
 /* --- Decimal Encoder ---------------------------------------------------- */
 
 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
@@ -8967,6 +8991,13 @@ unicode_freelistsize(PyUnicodeObject *self)
 {
     return PyLong_FromLong(numfree);
 }
+
+static PyObject *
+unicode__decimal2ascii(PyObject *self)
+{
+    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
+                                             PyUnicode_GET_SIZE(self));
+}
 #endif
 
 PyDoc_STRVAR(startswith__doc__,
@@ -9108,7 +9139,6 @@ unicode_getnewargs(PyUnicodeObject *v)
     return Py_BuildValue("(u#)", v->str, v->length);
 }
 
-
 static PyMethodDef unicode_methods[] = {
 
     /* Order is according to common usage: often used methods should
@@ -9170,8 +9200,9 @@ static PyMethodDef unicode_methods[] = {
 #endif
 
 #if 0
-    /* This one is just used for debugging the implementation. */
+    /* These methods are just used for debugging the implementation. */
     {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
+    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
 #endif
 
     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
author	Alexander Belopolsky <alexander.belopolsky@gmail.com>	2010-12-04 03:38:46 (GMT)
committer	Alexander Belopolsky <alexander.belopolsky@gmail.com>	2010-12-04 03:38:46 (GMT)
commit	942af5a9a45b7b4976bea2e794eccaaf2b3b5c09 (patch)
tree	f621bdffa16dd0b04d7bf60d6a32f198fc7b3ec8
parent	36526bf3d95763afa6d4efe402b8840b1532d637 (diff)
download	cpython-942af5a9a45b7b4976bea2e794eccaaf2b3b5c09.zip cpython-942af5a9a45b7b4976bea2e794eccaaf2b3b5c09.tar.gz cpython-942af5a9a45b7b4976bea2e794eccaaf2b3b5c09.tar.bz2