Patch #1541585: fix buffer overrun when performing repr() on a unicode
string in a build with wide unicode (UCS-4) support.

This code could be improved, so add an XXX comment.
author: Neal Norwitz <nnorwitz@gmail.com> 2006-08-21 22:21:19 (GMT)
committer: Neal Norwitz <nnorwitz@gmail.com> 2006-08-21 22:21:19 (GMT)
commit: 17753ecbfabc224080ef3e3e0801b7514ef3b455 (patch)
tree: 5b82040f4a72260e43278a3fe6eebc78f03a40d6 /Objects
parent: 0c6ae5bad473c57b3d5b2a3c71b26792e63df14c (diff)
download: cpython-17753ecbfabc224080ef3e3e0801b7514ef3b455.zip
cpython-17753ecbfabc224080ef3e3e0801b7514ef3b455.tar.gz
cpython-17753ecbfabc224080ef3e3e0801b7514ef3b455.tar.bz2
1 files changed, 29 insertions, 12 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d93f780..20daf66 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2040,7 +2040,32 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
 
     static const char *hexdigit = "0123456789abcdef";
 
-    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
+    /* XXX(nnorwitz): rather than over-allocating, it would be
+       better to choose a different scheme.  Perhaps scan the
+       first N-chars of the string and allocate based on that size.
+    */
+    /* Initial allocation is based on the longest-possible unichr
+       escape.
+
+       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
+       unichr, so in this case it's the longest unichr escape. In
+       narrow (UTF-16) builds this is five chars per source unichr
+       since there are two unichrs in the surrogate pair, so in narrow
+       (UTF-16) builds it's not the longest unichr escape.
+
+       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
+       so in the narrow (UTF-16) build case it's the longest unichr
+       escape.
+    */
+
+    repr = PyString_FromStringAndSize(NULL,
+        2
+#ifdef Py_UNICODE_WIDE
+        + 10*size
+#else
+        + 6*size
+#endif
+        + 1);
     if (repr == NULL)
         return NULL;
 
@@ -2065,15 +2090,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
 #ifdef Py_UNICODE_WIDE
         /* Map 21-bit characters to '\U00xxxxxx' */
         else if (ch >= 0x10000) {
-	    Py_ssize_t offset = p - PyString_AS_STRING(repr);
-
-	    /* Resize the string if necessary */
-	    if (offset + 12 > PyString_GET_SIZE(repr)) {
-		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
-		    return NULL;
-		p = PyString_AS_STRING(repr) + offset;
-	    }
-
             *p++ = '\\';
             *p++ = 'U';
             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
@@ -2086,8 +2102,8 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
             *p++ = hexdigit[ch & 0x0000000F];
 	    continue;
         }
-#endif
-	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
+#else
+	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
 	else if (ch >= 0xD800 && ch < 0xDC00) {
 	    Py_UNICODE ch2;
 	    Py_UCS4 ucs;
@@ -2112,6 +2128,7 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
 	    s--;
 	    size++;
 	}
+#endif
 
         /* Map 16-bit characters to '\uxxxx' */
         if (ch >= 256) {
author	Neal Norwitz <nnorwitz@gmail.com>	2006-08-21 22:21:19 (GMT)
committer	Neal Norwitz <nnorwitz@gmail.com>	2006-08-21 22:21:19 (GMT)
commit	17753ecbfabc224080ef3e3e0801b7514ef3b455 (patch)
tree	5b82040f4a72260e43278a3fe6eebc78f03a40d6 /Objects
parent	0c6ae5bad473c57b3d5b2a3c71b26792e63df14c (diff)
download	cpython-17753ecbfabc224080ef3e3e0801b7514ef3b455.zip cpython-17753ecbfabc224080ef3e3e0801b7514ef3b455.tar.gz cpython-17753ecbfabc224080ef3e3e0801b7514ef3b455.tar.bz2