summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Antoine Pitrou <solipsis@pitrou.net> 2011-10-06 23:54:09 (GMT)
committer Antoine Pitrou <solipsis@pitrou.net> 2011-10-06 23:54:09 (GMT)
commit 5c0ba36d5ff8d7693c5014b8696e2fab7468afd4 (patch)
tree 15f3a3ba8907c01b8f95248a4d310cb9fe354e2a
parent 438818b4b090e88e5ae2e19da127d17cf2bf1de2 (diff)
download cpython-5c0ba36d5ff8d7693c5014b8696e2fab7468afd4.zip
cpython-5c0ba36d5ff8d7693c5014b8696e2fab7468afd4.tar.gz
cpython-5c0ba36d5ff8d7693c5014b8696e2fab7468afd4.tar.bz2
Fix massive slowdown in string formatting with the % operator
-rw-r--r-- Objects/unicodeobject.c 241
1 files changed, 127 insertions, 114 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3cc190e..caad326 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12693,17 +12693,13 @@ formatlong(PyObject *val, int flags, int prec, int type)
return result;
}
-static int
-formatchar(Py_UCS4 *buf,
- size_t buflen,
- PyObject *v)
+static Py_UCS4
+formatchar(PyObject *v)
{
/* presume that the buffer is at least 3 characters long */
if (PyUnicode_Check(v)) {
if (PyUnicode_GET_LENGTH(v) == 1) {
- buf[0] = PyUnicode_READ_CHAR(v, 0);
- buf[1] = '\0';
- return 1;
+ return PyUnicode_READ_CHAR(v, 0);
}
goto onError;
}
@@ -12717,38 +12713,45 @@ formatchar(Py_UCS4 *buf,
if (x < 0 || x > 0x10ffff) {
PyErr_SetString(PyExc_OverflowError,
"%c arg not in range(0x110000)");
- return -1;
+ return (Py_UCS4) -1;
}
- buf[0] = (Py_UCS4) x;
- buf[1] = '\0';
- return 1;
+ return (Py_UCS4) x;
}
onError:
PyErr_SetString(PyExc_TypeError,
"%c requires int or char");
- return -1;
+ return (Py_UCS4) -1;
}
-/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
- FORMATBUFLEN is the length of the buffer in which chars are formatted.
-*/
-#define FORMATBUFLEN (size_t)10
-
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
void *fmt;
int fmtkind;
PyObject *result;
- Py_UCS4 *res, *res0;
- Py_UCS4 max;
int kind;
- Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
+ int r;
+ Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
int args_owned = 0;
PyObject *dict = NULL;
+ PyObject *temp = NULL;
+ PyObject *second = NULL;
PyUnicodeObject *uformat;
+ _PyAccu acc;
+ static PyObject *plus, *minus, *blank, *zero, *percent;
+
+ if (!plus && !(plus = get_latin1_char('+')))
+ return NULL;
+ if (!minus && !(minus = get_latin1_char('-')))
+ return NULL;
+ if (!blank && !(blank = get_latin1_char(' ')))
+ return NULL;
+ if (!zero && !(zero = get_latin1_char('0')))
+ return NULL;
+ if (!percent && !(percent = get_latin1_char('%')))
+ return NULL;
if (format == NULL || args == NULL) {
PyErr_BadInternalCall();
@@ -12757,18 +12760,13 @@ PyUnicode_Format(PyObject *format, PyObject *args)
uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
if (uformat == NULL || PyUnicode_READY(uformat) == -1)
return NULL;
+ if (_PyAccu_Init(&acc))
+ goto onError;
fmt = PyUnicode_DATA(uformat);
fmtkind = PyUnicode_KIND(uformat);
fmtcnt = PyUnicode_GET_LENGTH(uformat);
fmtpos = 0;
- reslen = rescnt = fmtcnt + 100;
- res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
- if (res0 == NULL) {
- PyErr_NoMemory();
- goto onError;
- }
-
if (PyTuple_Check(args)) {
arglen = PyTuple_Size(args);
argidx = 0;
@@ -12783,18 +12781,21 @@ PyUnicode_Format(PyObject *format, PyObject *args)
while (--fmtcnt >= 0) {
if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
- if (--rescnt < 0) {
- rescnt = fmtcnt + 100;
- reslen += rescnt;
- res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
- if (res0 == NULL){
- PyErr_NoMemory();
- goto onError;
- }
- res = res0 + reslen - rescnt;
- --rescnt;
- }
- *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
+ PyObject *nonfmt;
+ Py_ssize_t nonfmtpos;
+ nonfmtpos = fmtpos++;
+ while (fmtcnt >= 0 &&
+ PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
+ fmtpos++;
+ fmtcnt--;
+ }
+ nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
+ if (nonfmt == NULL)
+ goto onError;
+ r = _PyAccu_Accumulate(&acc, nonfmt);
+ Py_DECREF(nonfmt);
+ if (r)
+ goto onError;
}
else {
/* Got a format specifier */
@@ -12802,15 +12803,12 @@ PyUnicode_Format(PyObject *format, PyObject *args)
Py_ssize_t width = -1;
int prec = -1;
Py_UCS4 c = '\0';
- Py_UCS4 fill;
+ Py_UCS4 fill, sign;
int isnumok;
PyObject *v = NULL;
- PyObject *temp = NULL;
- void *pbuf;
- Py_ssize_t pindex;
- Py_UNICODE sign;
- Py_ssize_t len, len1;
- Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
+ void *pbuf = NULL;
+ Py_ssize_t pindex, len;
+ PyObject *signobj = NULL, *fillobj = NULL;
fmtpos++;
if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
@@ -12955,15 +12953,12 @@ PyUnicode_Format(PyObject *format, PyObject *args)
}
sign = 0;
fill = ' ';
+ fillobj = blank;
switch (c) {
case '%':
- pbuf = formatbuf;
- kind = PyUnicode_4BYTE_KIND;
- /* presume that buffer length is at least 1 */
- PyUnicode_WRITE(kind, pbuf, 0, '%');
- len = 1;
- break;
+ _PyAccu_Accumulate(&acc, percent);
+ continue;
case 's':
case 'r':
@@ -13045,8 +13040,10 @@ PyUnicode_Format(PyObject *format, PyObject *args)
"not %.200s", (char)c, Py_TYPE(v)->tp_name);
goto onError;
}
- if (flags & F_ZERO)
+ if (flags & F_ZERO) {
fill = '0';
+ fillobj = zero;
+ }
break;
case 'e':
@@ -13066,17 +13063,25 @@ PyUnicode_Format(PyObject *format, PyObject *args)
kind = PyUnicode_KIND(temp);
len = PyUnicode_GET_LENGTH(temp);
sign = 1;
- if (flags & F_ZERO)
+ if (flags & F_ZERO) {
fill = '0';
+ fillobj = zero;
+ }
break;
case 'c':
- pbuf = formatbuf;
- kind = PyUnicode_4BYTE_KIND;
- len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
- if (len < 0)
+ {
+ Py_UCS4 ch = formatchar(v);
+ if (ch == (Py_UCS4) -1)
+ goto onError;
+ temp = _PyUnicode_FromUCS4(&ch, 1);
+ if (temp == NULL)
goto onError;
+ pbuf = PyUnicode_DATA(temp);
+ kind = PyUnicode_KIND(temp);
+ len = PyUnicode_GET_LENGTH(temp);
break;
+ }
default:
PyErr_Format(PyExc_ValueError,
@@ -13090,90 +13095,105 @@ PyUnicode_Format(PyObject *format, PyObject *args)
/* pbuf is initialized here. */
pindex = 0;
if (sign) {
- if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
- PyUnicode_READ(kind, pbuf, pindex) == '+') {
- sign = PyUnicode_READ(kind, pbuf, pindex++);
+ if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
+ signobj = minus;
len--;
+ pindex++;
+ }
+ else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
+ signobj = plus;
+ len--;
+ pindex++;
}
else if (flags & F_SIGN)
- sign = '+';
+ signobj = plus;
else if (flags & F_BLANK)
- sign = ' ';
+ signobj = blank;
else
sign = 0;
}
if (width < len)
width = len;
- if (rescnt - (sign != 0) < width) {
- reslen -= rescnt;
- rescnt = width + fmtcnt + 100;
- reslen += rescnt;
- if (reslen < 0) {
- Py_XDECREF(temp);
- PyErr_NoMemory();
- goto onError;
- }
- res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
- if (res0 == 0) {
- PyErr_NoMemory();
- Py_XDECREF(temp);
- goto onError;
- }
- res = res0 + reslen - rescnt;
- }
if (sign) {
- if (fill != ' ')
- *res++ = sign;
- rescnt--;
+ if (fill != ' ') {
+ assert(signobj != NULL);
+ if (_PyAccu_Accumulate(&acc, signobj))
+ goto onError;
+ }
if (width > len)
width--;
}
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
- assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
+ assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
if (fill != ' ') {
- *res++ = PyUnicode_READ(kind, pbuf, pindex++);
- *res++ = PyUnicode_READ(kind, pbuf, pindex++);
+ second = get_latin1_char(
+ PyUnicode_READ(kind, pbuf, pindex + 1));
+ pindex += 2;
+ if (second == NULL ||
+ _PyAccu_Accumulate(&acc, zero) ||
+ _PyAccu_Accumulate(&acc, second))
+ goto onError;
+ Py_CLEAR(second);
}
- rescnt -= 2;
width -= 2;
if (width < 0)
width = 0;
len -= 2;
}
if (width > len && !(flags & F_LJUST)) {
+ assert(fillobj != NULL);
do {
- --rescnt;
- *res++ = fill;
+ if (_PyAccu_Accumulate(&acc, fillobj))
+ goto onError;
} while (--width > len);
}
if (fill == ' ') {
- if (sign)
- *res++ = sign;
+ if (sign) {
+ assert(signobj != NULL);
+ if (_PyAccu_Accumulate(&acc, signobj))
+ goto onError;
+ }
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
- *res++ = PyUnicode_READ(kind, pbuf, pindex++);
- *res++ = PyUnicode_READ(kind, pbuf, pindex++);
+ second = get_latin1_char(
+ PyUnicode_READ(kind, pbuf, pindex + 1));
+ pindex += 2;
+ if (second == NULL ||
+ _PyAccu_Accumulate(&acc, zero) ||
+ _PyAccu_Accumulate(&acc, second))
+ goto onError;
+ Py_CLEAR(second);
}
}
/* Copy all characters, preserving len */
- len1 = len;
- while (len1--) {
- *res++ = PyUnicode_READ(kind, pbuf, pindex++);
- rescnt--;
+ if (temp != NULL) {
+ assert(pbuf == PyUnicode_DATA(temp));
+ v = PyUnicode_Substring(temp, pindex, pindex + len);
+ }
+ else {
+ const char *p = (const char *) pbuf;
+ assert(pbuf != NULL);
+ p = p + PyUnicode_KIND_SIZE(kind, pindex);
+ v = PyUnicode_FromKindAndData(kind, p, len);
}
+ if (v == NULL)
+ goto onError;
+ r = _PyAccu_Accumulate(&acc, v);
+ Py_DECREF(v);
+ if (r)
+ goto onError;
while (--width >= len) {
- --rescnt;
- *res++ = ' ';
+ if (_PyAccu_Accumulate(&acc, blank))
+ goto onError;
}
if (dict && (argidx < arglen) && c != '%') {
PyErr_SetString(PyExc_TypeError,
"not all arguments converted during string formatting");
- Py_XDECREF(temp);
goto onError;
}
- Py_XDECREF(temp);
+ Py_CLEAR(temp);
} /* '%' */
} /* until end */
if (argidx < arglen && !dict) {
@@ -13182,27 +13202,20 @@ PyUnicode_Format(PyObject *format, PyObject *args)
goto onError;
}
-
- for (max=0, res = res0; res < res0+reslen-rescnt; res++)
- if (*res > max)
- max = *res;
- result = PyUnicode_New(reslen - rescnt, max);
- if (!result)
- goto onError;
- kind = PyUnicode_KIND(result);
- for (res = res0; res < res0+reslen-rescnt; res++)
- PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
- PyMem_Free(res0);
+ result = _PyAccu_Finish(&acc);
if (args_owned) {
Py_DECREF(args);
}
Py_DECREF(uformat);
- assert(_PyUnicode_CheckConsistency(result, 1));
+ Py_XDECREF(temp);
+ Py_XDECREF(second);
return (PyObject *)result;
onError:
- PyMem_Free(res0);
Py_DECREF(uformat);
+ Py_XDECREF(temp);
+ Py_XDECREF(second);
+ _PyAccu_Destroy(&acc);
if (args_owned) {
Py_DECREF(args);
}