summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2012-05-03 11:10:40 (GMT)
committerVictor Stinner <victor.stinner@gmail.com>2012-05-03 11:10:40 (GMT)
commitf2c76aa6cb35ffc556058812daa2123487ac019b (patch)
tree1cda221a5c3fcef223b3af567415f2d728e326ba /Objects/unicodeobject.c
parentac20f463da361647789a758c9ae45a4e77f03ebd (diff)
downloadcpython-f2c76aa6cb35ffc556058812daa2123487ac019b.zip
cpython-f2c76aa6cb35ffc556058812daa2123487ac019b.tar.gz
cpython-f2c76aa6cb35ffc556058812daa2123487ac019b.tar.bz2
Issue #14687: str%tuple now uses an optimistic "unicode writer" instead of an
accumulator. Directly write characters into the output (don't use a temporary list): resize and widen the string on demand.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c266
1 files changed, 166 insertions, 100 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 47cbea6..2c308bc 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -10074,7 +10074,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
switch ((kind)) { \
case PyUnicode_1BYTE_KIND: { \
unsigned char * to_ = (unsigned char *)((data)) + (start); \
- memset(to_, (unsigned char)value, length); \
+ memset(to_, (unsigned char)value, (length)); \
break; \
} \
case PyUnicode_2BYTE_KIND: { \
@@ -13655,56 +13655,133 @@ formatchar(PyObject *v)
return (Py_UCS4) -1;
}
-static int
-repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
-{
- int r;
- assert(count > 0);
- assert(PyUnicode_Check(obj));
- if (count > 5) {
- PyObject *repeated = unicode_repeat(obj, count);
- if (repeated == NULL)
- return -1;
- r = _PyAccu_Accumulate(acc, repeated);
- Py_DECREF(repeated);
- return r;
+struct unicode_writer_t {
+ PyObject *buffer;
+ void *data;
+ enum PyUnicode_Kind kind;
+ Py_UCS4 maxchar;
+ Py_ssize_t length;
+ Py_ssize_t pos;
+};
+
+Py_LOCAL_INLINE(void)
+unicode_writer_update(struct unicode_writer_t *writer)
+{
+ writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+ writer->data = PyUnicode_DATA(writer->buffer);
+ writer->kind = PyUnicode_KIND(writer->buffer);
+}
+
+Py_LOCAL_INLINE(int)
+unicode_writer_init(struct unicode_writer_t *writer,
+ Py_ssize_t length, Py_UCS4 maxchar)
+{
+ writer->pos = 0;
+ writer->length = length;
+ writer->buffer = PyUnicode_New(writer->length, maxchar);
+ if (writer->buffer == NULL)
+ return -1;
+ unicode_writer_update(writer);
+ return 0;
+}
+
+Py_LOCAL_INLINE(int)
+unicode_writer_prepare(struct unicode_writer_t *writer,
+ Py_ssize_t length, Py_UCS4 maxchar)
+{
+ Py_ssize_t newlen;
+
+ if (length > PY_SSIZE_T_MAX - writer->pos) {
+ PyErr_NoMemory();
+ return -1;
}
- else {
- do {
- if (_PyAccu_Accumulate(acc, obj))
- return -1;
- } while (--count);
+ newlen = writer->pos + length;
+
+ if (newlen > writer->length && maxchar > writer->maxchar) {
+ PyObject *newbuffer;
+
+ /* overallocate 25% to limit the number of resize */
+ if (newlen > PY_SSIZE_T_MAX - newlen / 4)
+ writer->length = newlen;
+ else
+ writer->length = newlen + newlen / 4;
+
+ /* resize + widen */
+ newbuffer = PyUnicode_New(writer->length, maxchar);
+ if (newbuffer == NULL)
+ return -1;
+ PyUnicode_CopyCharacters(newbuffer, 0,
+ writer->buffer, 0, writer->pos);
+ Py_DECREF(writer->buffer);
+ writer->buffer = newbuffer;
+ unicode_writer_update(writer);
return 0;
}
+ if (newlen > writer->length) {
+ /* overallocate 25% to limit the number of resize */
+ if (newlen > PY_SSIZE_T_MAX - newlen / 4)
+ writer->length = newlen;
+ else
+ writer->length = newlen + newlen / 4;
+ if (PyUnicode_Resize(&writer->buffer, writer->length) < 0)
+ return -1;
+ unicode_writer_update(writer);
+ }
+ if (maxchar > writer->maxchar) {
+ if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
+ return -1;
+ unicode_writer_update(writer);
+ }
+ return 0;
+}
+
+Py_LOCAL_INLINE(int)
+unicode_writer_write_str(
+ struct unicode_writer_t *writer,
+ PyObject *str, Py_ssize_t start, Py_ssize_t length)
+{
+ Py_UCS4 maxchar;
+ maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
+ if (unicode_writer_prepare(writer, length, maxchar) == -1)
+ return -1;
+ assert((writer->pos + length) <= writer->length);
+ copy_characters(writer->buffer, writer->pos,
+ str, start, length);
+ writer->pos += length;
+ return 0;
+}
+
+Py_LOCAL_INLINE(int)
+unicode_writer_write_char(
+ struct unicode_writer_t *writer,
+ Py_UCS4 ch)
+{
+ if (unicode_writer_prepare(writer, 1, ch) == -1)
+ return -1;
+ assert((writer->pos + 1) <= writer->length);
+ PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
+ writer->pos += 1;
+ return 0;
+}
+
+Py_LOCAL_INLINE(void)
+unicode_writer_dealloc(struct unicode_writer_t *writer)
+{
+ Py_CLEAR(writer->buffer);
}
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
- void *fmt;
- int fmtkind;
- PyObject *result;
- int kind;
- int r;
Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
int args_owned = 0;
PyObject *dict = NULL;
PyObject *temp = NULL;
PyObject *second = NULL;
PyObject *uformat;
- _PyAccu acc;
- static PyObject *plus, *minus, *blank, *zero, *percent;
-
- if (!plus && !(plus = get_latin1_char('+')))
- return NULL;
- if (!minus && !(minus = get_latin1_char('-')))
- return NULL;
- if (!blank && !(blank = get_latin1_char(' ')))
- return NULL;
- if (!zero && !(zero = get_latin1_char('0')))
- return NULL;
- if (!percent && !(percent = get_latin1_char('%')))
- return NULL;
+ void *fmt;
+ enum PyUnicode_Kind kind, fmtkind;
+ struct unicode_writer_t writer;
if (format == NULL || args == NULL) {
PyErr_BadInternalCall();
@@ -13715,13 +13792,15 @@ PyUnicode_Format(PyObject *format, PyObject *args)
return NULL;
if (PyUnicode_READY(uformat) == -1)
Py_DECREF(uformat);
- if (_PyAccu_Init(&acc))
- goto onError;
+
fmt = PyUnicode_DATA(uformat);
fmtkind = PyUnicode_KIND(uformat);
fmtcnt = PyUnicode_GET_LENGTH(uformat);
fmtpos = 0;
+ if (unicode_writer_init(&writer, fmtcnt + 100, 127) < 0)
+ goto onError;
+
if (PyTuple_Check(args)) {
arglen = PyTuple_Size(args);
argidx = 0;
@@ -13736,7 +13815,6 @@ PyUnicode_Format(PyObject *format, PyObject *args)
while (--fmtcnt >= 0) {
if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
- PyObject *nonfmt;
Py_ssize_t nonfmtpos;
nonfmtpos = fmtpos++;
while (fmtcnt >= 0 &&
@@ -13744,12 +13822,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)
fmtpos++;
fmtcnt--;
}
- nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
- if (nonfmt == NULL)
- goto onError;
- r = _PyAccu_Accumulate(&acc, nonfmt);
- Py_DECREF(nonfmt);
- if (r)
+ if (fmtcnt < 0)
+ fmtpos--;
+ if (unicode_writer_write_str(&writer, uformat, nonfmtpos, fmtpos - nonfmtpos) < 0)
goto onError;
}
else {
@@ -13758,12 +13833,13 @@ PyUnicode_Format(PyObject *format, PyObject *args)
Py_ssize_t width = -1;
int prec = -1;
Py_UCS4 c = '\0';
- Py_UCS4 fill, sign;
+ Py_UCS4 fill;
+ int sign;
+ Py_UCS4 signchar;
int isnumok;
PyObject *v = NULL;
void *pbuf = NULL;
Py_ssize_t pindex, len;
- PyObject *signobj = NULL, *fillobj = NULL;
fmtpos++;
c = PyUnicode_READ(fmtkind, fmt, fmtpos);
@@ -13906,7 +13982,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
}
if (c == '%') {
- _PyAccu_Accumulate(&acc, percent);
+ if (unicode_writer_write_char(&writer, '%') < 0)
+ goto onError;
continue;
}
@@ -13916,8 +13993,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
goto onError;
sign = 0;
+ signchar = '\0';
fill = ' ';
- fillobj = blank;
switch (c) {
case 's':
@@ -13972,10 +14049,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
"not %.200s", (char)c, Py_TYPE(v)->tp_name);
goto onError;
}
- if (flags & F_ZERO) {
+ if (flags & F_ZERO)
fill = '0';
- fillobj = zero;
- }
break;
case 'e':
@@ -13985,10 +14060,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
case 'g':
case 'G':
sign = 1;
- if (flags & F_ZERO) {
+ if (flags & F_ZERO)
fill = '0';
- fillobj = zero;
- }
temp = formatfloat(v, flags, prec, c);
break;
@@ -14029,20 +14102,16 @@ PyUnicode_Format(PyObject *format, PyObject *args)
/* pbuf is initialized here. */
pindex = 0;
if (sign) {
- if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
- signobj = minus;
- len--;
- pindex++;
- }
- else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
- signobj = plus;
+ Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
+ if (ch == '-' || ch == '+') {
+ signchar = ch;
len--;
pindex++;
}
else if (flags & F_SIGN)
- signobj = plus;
+ signchar = '+';
else if (flags & F_BLANK)
- signobj = blank;
+ signchar = ' ';
else
sign = 0;
}
@@ -14050,8 +14119,7 @@ PyUnicode_Format(PyObject *format, PyObject *args)
width = len;
if (sign) {
if (fill != ' ') {
- assert(signobj != NULL);
- if (_PyAccu_Accumulate(&acc, signobj))
+ if (unicode_writer_write_char(&writer, signchar) < 0)
goto onError;
}
if (width > len)
@@ -14061,14 +14129,12 @@ PyUnicode_Format(PyObject *format, PyObject *args)
assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
if (fill != ' ') {
- second = get_latin1_char(
- PyUnicode_READ(kind, pbuf, pindex + 1));
- pindex += 2;
- if (second == NULL ||
- _PyAccu_Accumulate(&acc, zero) ||
- _PyAccu_Accumulate(&acc, second))
+ if (unicode_writer_prepare(&writer, 2, 127) < 0)
goto onError;
- Py_CLEAR(second);
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
+ writer.pos += 2;
+ pindex += 2;
}
width -= 2;
if (width < 0)
@@ -14076,45 +14142,43 @@ PyUnicode_Format(PyObject *format, PyObject *args)
len -= 2;
}
if (width > len && !(flags & F_LJUST)) {
- assert(fillobj != NULL);
- if (repeat_accumulate(&acc, fillobj, width - len))
+ Py_ssize_t sublen;
+ sublen = width - len;
+ if (unicode_writer_prepare(&writer, sublen, fill) < 0)
goto onError;
+ FILL(writer.kind, writer.data, fill, writer.pos, sublen);
+ writer.pos += sublen;
width = len;
}
if (fill == ' ') {
if (sign) {
- assert(signobj != NULL);
- if (_PyAccu_Accumulate(&acc, signobj))
+ if (unicode_writer_write_char(&writer, signchar) < 0)
goto onError;
}
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
- second = get_latin1_char(
- PyUnicode_READ(kind, pbuf, pindex + 1));
- pindex += 2;
- if (second == NULL ||
- _PyAccu_Accumulate(&acc, zero) ||
- _PyAccu_Accumulate(&acc, second))
+
+ if (unicode_writer_prepare(&writer, 2, 127) < 0)
goto onError;
- Py_CLEAR(second);
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
+ writer.pos += 2;
+
+ pindex += 2;
}
}
+
/* Copy all characters, preserving len */
- if (pindex == 0 && len == PyUnicode_GET_LENGTH(temp)) {
- r = _PyAccu_Accumulate(&acc, temp);
- }
- else {
- v = PyUnicode_Substring(temp, pindex, pindex + len);
- if (v == NULL)
+ if (unicode_writer_write_str(&writer, temp, pindex, len) < 0)
+ goto onError;
+ if (width > len) {
+ Py_ssize_t sublen = width - len;
+ if (unicode_writer_prepare(&writer, sublen, ' ') < 0)
goto onError;
- r = _PyAccu_Accumulate(&acc, v);
- Py_DECREF(v);
+ FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
+ writer.pos += sublen;
}
- if (r)
- goto onError;
- if (width > len && repeat_accumulate(&acc, blank, width - len))
- goto onError;
if (dict && (argidx < arglen) && c != '%') {
PyErr_SetString(PyExc_TypeError,
"not all arguments converted during string formatting");
@@ -14129,20 +14193,22 @@ PyUnicode_Format(PyObject *format, PyObject *args)
goto onError;
}
- result = _PyAccu_Finish(&acc);
+ if (PyUnicode_Resize(&writer.buffer, writer.pos) < 0)
+ goto onError;
+
if (args_owned) {
Py_DECREF(args);
}
Py_DECREF(uformat);
Py_XDECREF(temp);
Py_XDECREF(second);
- return result;
+ return writer.buffer;
onError:
Py_DECREF(uformat);
Py_XDECREF(temp);
Py_XDECREF(second);
- _PyAccu_Destroy(&acc);
+ unicode_writer_dealloc(&writer);
if (args_owned) {
Py_DECREF(args);
}