From 25a4b29c95fd5dddbfef985c7fd7b26417a4b87b Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 6 Oct 2011 12:31:55 +0200 Subject: str.replace() avoids memory when it's possible --- Objects/unicodeobject.c | 102 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 18 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a64f795..3a0f468 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1741,6 +1741,63 @@ PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) } } +/* Ensure that a string uses the most efficient storage, if it is not the + case: create a new string with of the right kind. Write NULL into *p_unicode + on error. */ +void +unicode_adjust_maxchar(PyObject **p_unicode) +{ + PyObject *unicode, *copy; + Py_UCS4 max_char; + Py_ssize_t i, len; + unsigned int kind; + + assert(p_unicode != NULL); + unicode = *p_unicode; + assert(PyUnicode_IS_READY(unicode)); + if (PyUnicode_IS_ASCII(unicode)) + return; + + len = PyUnicode_GET_LENGTH(unicode); + kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); + for (i = 0; i < len; i++) { + if (u[i] & 0x80) + return; + } + max_char = 127; + } + else if (kind == PyUnicode_2BYTE_KIND) { + const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); + max_char = 0; + for (i = 0; i < len; i++) { + if (u[i] > max_char) { + max_char = u[i]; + if (max_char >= 256) + return; + } + } + } + else { + assert(kind == PyUnicode_4BYTE_KIND); + const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); + max_char = 0; + for (i = 0; i < len; i++) { + if (u[i] > max_char) { + max_char = u[i]; + if (max_char >= 0x10000) + return; + } + } + } + assert(max_char > PyUnicode_MAX_CHAR_VALUE(unicode)); + copy = PyUnicode_New(len, max_char); + copy_characters(copy, 0, unicode, 0, len); + Py_DECREF(unicode); + *p_unicode = copy; +} + PyObject* PyUnicode_Copy(PyObject *unicode) { @@ -9573,14 +9630,16 @@ replace(PyObject *self, PyObject *str1, PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); } if (mayshrink) { - PyObject *tmp = u; - u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), - PyUnicode_GET_LENGTH(tmp)); - Py_DECREF(tmp); + unicode_adjust_maxchar(&u); + if (u == NULL) + goto error; } } else { int rkind = skind; char *res; + PyObject *rstr; + Py_UCS4 maxchar; + if (kind1 < rkind) { /* widen substring */ buf1 = _PyUnicode_AsKind(str1, rkind); @@ -9607,11 +9666,13 @@ replace(PyObject *self, PyObject *str1, if (!buf1) goto error; release1 = 1; } - res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); - if (!res) { - PyErr_NoMemory(); + maxchar = PyUnicode_MAX_CHAR_VALUE(self); + maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2)); + rstr = PyUnicode_New(slen, maxchar); + if (!rstr) goto error; - } + res = PyUnicode_DATA(rstr); + memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); /* change everything in-place, starting with this one */ memcpy(res + PyUnicode_KIND_SIZE(rkind, i), @@ -9631,16 +9692,19 @@ replace(PyObject *self, PyObject *str1, i += len1; } - u = PyUnicode_FromKindAndData(rkind, res, slen); - PyMem_Free(res); - if (!u) goto error; + u = rstr; + unicode_adjust_maxchar(&u); + if (!u) + goto error; } } else { Py_ssize_t n, i, j, ires; Py_ssize_t product, new_size; int rkind = skind; + PyObject *rstr; char *res; + Py_UCS4 maxchar; if (kind1 < rkind) { buf1 = _PyUnicode_AsKind(str1, rkind); @@ -9679,9 +9743,12 @@ replace(PyObject *self, PyObject *str1, "replace string is too long"); goto error; } - res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); - if (!res) + maxchar = PyUnicode_MAX_CHAR_VALUE(self); + maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2)); + rstr = PyUnicode_New(new_size, maxchar); + if (!rstr) goto error; + res = PyUnicode_DATA(rstr); ires = i = 0; if (len1 > 0) { while (n-- > 0) { @@ -9731,11 +9798,10 @@ replace(PyObject *self, PyObject *str1, sbuf + PyUnicode_KIND_SIZE(rkind, i), PyUnicode_KIND_SIZE(rkind, slen-i)); } - if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2)) - u = unicode_fromascii((unsigned char*)res, new_size); - else - u = PyUnicode_FromKindAndData(rkind, res, new_size); - PyMem_Free(res); + u = rstr; + unicode_adjust_maxchar(&u); + if (u == NULL) + goto error; } if (srelease) PyMem_FREE(sbuf); -- cgit v0.12