From ed4c130d3dd7ab0f0b142d36ee49b74df57d182e Mon Sep 17 00:00:00 2001
From: Benjamin Peterson <benjamin@python.org>
Date: Mon, 29 Sep 2014 18:18:57 -0400
Subject: clean up overflow handling in unicode_decode_call_errorhandler and
 unicode_encode_ucs1 (closes #22518)

---
 Objects/unicodeobject.c | 69 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f52ee92..bdb14d7 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1510,9 +1510,15 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
        when there are no errors in the rest of the string) */
     repptr = PyUnicode_AS_UNICODE(repunicode);
     repsize = PyUnicode_GET_SIZE(repunicode);
-    requiredsize = *outpos + repsize + insize-newpos;
+    requiredsize = *outpos;
+    if (requiredsize > PY_SSIZE_T_MAX - repsize)
+        goto overflow;
+    requiredsize += repsize;
+    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
+        goto overflow;
+    requiredsize += insize - newpos;
     if (requiredsize > outsize) {
-        if (requiredsize<2*outsize)
+        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
             requiredsize = 2*outsize;
         if (_PyUnicode_Resize(output, requiredsize) < 0)
             goto onError;
@@ -1529,6 +1535,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
   onError:
     Py_XDECREF(restuple);
     return res;
+
+  overflow:
+    PyErr_SetString(PyExc_OverflowError,
+                    "decoded result is too long for a Python string");
+    goto onError;
 }
 
 /* --- UTF-7 Codec -------------------------------------------------------- */
@@ -3646,7 +3657,7 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
             const Py_UNICODE *collstart = p;
             const Py_UNICODE *collend = p;
             /* find all unecodable characters */
-            while ((collend < endp) && ((*collend)>=limit))
+            while ((collend < endp) && ((*collend) >= limit))
                 ++collend;
             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
             if (known_errorHandler==-1) {
@@ -3666,34 +3677,41 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                 goto onError;
             case 2: /* replace */
-                while (collstart++<collend)
+                while (collstart++ < collend)
                     *str++ = '?'; /* fall through */
             case 3: /* ignore */
                 p = collend;
                 break;
             case 4: /* xmlcharrefreplace */
-                respos = str-PyString_AS_STRING(res);
+                respos = str - PyString_AS_STRING(res);
                 /* determine replacement size (temporarily (mis)uses p) */
-                for (p = collstart, repsize = 0; p < collend;) {
+                requiredsize = respos;
+                for (p = collstart; p < collend;) {
                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    Py_ssize_t incr;
                     if (ch < 10)
-                        repsize += 2+1+1;
+                        incr = 2+1+1;
                     else if (ch < 100)
-                        repsize += 2+2+1;
+                        incr = 2+2+1;
                     else if (ch < 1000)
-                        repsize += 2+3+1;
+                        incr = 2+3+1;
                     else if (ch < 10000)
-                        repsize += 2+4+1;
+                        incr = 2+4+1;
                     else if (ch < 100000)
-                        repsize += 2+5+1;
+                        incr = 2+5+1;
                     else if (ch < 1000000)
-                        repsize += 2+6+1;
+                        incr = 2+6+1;
                     else
-                        repsize += 2+7+1;
+                        incr = 2+7+1;
+                    if (requiredsize > PY_SSIZE_T_MAX - incr)
+                        goto overflow;
+                    requiredsize += incr;
                 }
-                requiredsize = respos+repsize+(endp-collend);
+                if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
+                    goto overflow;
+                requiredsize += endp - collend;
                 if (requiredsize > ressize) {
-                    if (requiredsize<2*ressize)
+                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
                         requiredsize = 2*ressize;
                     if (_PyString_Resize(&res, requiredsize))
                         goto onError;
@@ -3716,11 +3734,16 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                 /* need more space? (at least enough for what we have+the
                    replacement+the rest of the string, so we won't have to
                    check space for encodable characters) */
-                respos = str-PyString_AS_STRING(res);
+                respos = str - PyString_AS_STRING(res);
                 repsize = PyUnicode_GET_SIZE(repunicode);
-                requiredsize = respos+repsize+(endp-collend);
+                if (respos > PY_SSIZE_T_MAX - repsize)
+                    goto overflow;
+                requiredsize = respos + repsize;
+                if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
+                    goto overflow;
+                requiredsize += endp - collend;
                 if (requiredsize > ressize) {
-                    if (requiredsize<2*ressize)
+                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
                         requiredsize = 2*ressize;
                     if (_PyString_Resize(&res, requiredsize)) {
                         Py_DECREF(repunicode);
@@ -3731,7 +3754,7 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                 }
                 /* check if there is anything unencodable in the replacement
                    and copy it to the output */
-                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
+                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
                     c = *uni2;
                     if (c >= limit) {
                         raise_encode_exception(&exc, encoding, startp, size,
@@ -3747,14 +3770,18 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
         }
     }
     /* Resize if we allocated to much */
-    respos = str-PyString_AS_STRING(res);
-    if (respos<ressize)
+    respos = str - PyString_AS_STRING(res);
+    if (respos < ressize)
         /* If this falls res will be NULL */
         _PyString_Resize(&res, respos);
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
     return res;
 
+  overflow:
+    PyErr_SetString(PyExc_OverflowError,
+                    "encoded result is too long for a Python string");
+
   onError:
     Py_XDECREF(res);
     Py_XDECREF(errorHandler);
-- 
cgit v0.12