diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2012-05-07 10:47:02 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2012-05-07 10:47:02 (GMT) |
commit | 202fdca133ce8f5b0c37cca1353070e0721c688d (patch) | |
tree | 7e6f1c58ca7b836f8fb8132dea7f85b08d403894 | |
parent | 9fad1604110cd7a0bb32792aa6d6c6a63018d51e (diff) | |
download | cpython-202fdca133ce8f5b0c37cca1353070e0721c688d.zip cpython-202fdca133ce8f5b0c37cca1353070e0721c688d.tar.gz cpython-202fdca133ce8f5b0c37cca1353070e0721c688d.tar.bz2 |
Close #14716: str.format() now uses the new "unicode writer" API instead of the
PyAccu API. For example, this makes str.format() 25% to 30% faster on Linux.
-rw-r--r-- | Objects/stringlib/unicode_format.h | 60 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 258 |
2 files changed, 148 insertions, 170 deletions
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h index 6807088..85a29f5 100644 --- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/unicode_format.h @@ -2,8 +2,6 @@ unicode_format.h -- implementation of str.format(). */ -#include "accu.h" - /* Defines for more efficiently reallocating the string buffer */ #define INITIAL_SIZE_INCREMENT 100 #define SIZE_MULTIPLIER 2 @@ -112,33 +110,6 @@ autonumber_state_error(AutoNumberState state, int field_name_is_empty) /************************************************************************/ -/*********** Output string management functions ****************/ -/************************************************************************/ - -/* - output_data dumps characters into our output string - buffer. - - In some cases, it has to reallocate the string. - - It returns a status: 0 for a failed reallocation, - 1 for success. -*/ -static int -output_data(_PyAccu *acc, PyObject *s, Py_ssize_t start, Py_ssize_t end) -{ - PyObject *substring; - int r; - - substring = PyUnicode_Substring(s, start, end); - if (substring == NULL) - return 0; - r = _PyAccu_Accumulate(acc, substring); - Py_DECREF(substring); - return r == 0; -} - -/************************************************************************/ /*********** Format string parsing -- integers and identifiers *********/ /************************************************************************/ @@ -523,7 +494,7 @@ error: appends to the output. 
*/ static int -render_field(PyObject *fieldobj, SubString *format_spec, _PyAccu *acc) +render_field(PyObject *fieldobj, SubString *format_spec, unicode_writer_t *writer) { int ok = 0; PyObject *result = NULL; @@ -566,7 +537,8 @@ render_field(PyObject *fieldobj, SubString *format_spec, _PyAccu *acc) goto done; assert(PyUnicode_Check(result)); - ok = output_data(acc, result, 0, PyUnicode_GET_LENGTH(result)); + + ok = (unicode_writer_write_str(writer, result, 0, PyUnicode_GET_LENGTH(result)) == 0); done: Py_XDECREF(format_spec_object); Py_XDECREF(result); @@ -831,7 +803,7 @@ do_conversion(PyObject *obj, Py_UCS4 conversion) static int output_markup(SubString *field_name, SubString *format_spec, int format_spec_needs_expanding, Py_UCS4 conversion, - _PyAccu *acc, PyObject *args, PyObject *kwargs, + unicode_writer_t *writer, PyObject *args, PyObject *kwargs, int recursion_depth, AutoNumber *auto_number) { PyObject *tmp = NULL; @@ -872,7 +844,7 @@ output_markup(SubString *field_name, SubString *format_spec, else actual_format_spec = format_spec; - if (render_field(fieldobj, actual_format_spec, acc) == 0) + if (render_field(fieldobj, actual_format_spec, writer) == 0) goto done; result = 1; @@ -892,7 +864,7 @@ done: */ static int do_markup(SubString *input, PyObject *args, PyObject *kwargs, - _PyAccu *acc, int recursion_depth, AutoNumber *auto_number) + unicode_writer_t *writer, int recursion_depth, AutoNumber *auto_number) { MarkupIterator iter; int format_spec_needs_expanding; @@ -902,17 +874,21 @@ do_markup(SubString *input, PyObject *args, PyObject *kwargs, SubString field_name; SubString format_spec; Py_UCS4 conversion; + int err; MarkupIterator_init(&iter, input->str, input->start, input->end); while ((result = MarkupIterator_next(&iter, &literal, &field_present, &field_name, &format_spec, &conversion, &format_spec_needs_expanding)) == 2) { - if (!output_data(acc, literal.str, literal.start, literal.end)) + err = unicode_writer_write_str(writer, + literal.str, 
literal.start, + literal.end - literal.start); + if (err == -1) return 0; if (field_present) if (!output_markup(&field_name, &format_spec, - format_spec_needs_expanding, conversion, acc, + format_spec_needs_expanding, conversion, writer, args, kwargs, recursion_depth, auto_number)) return 0; } @@ -928,7 +904,8 @@ static PyObject * build_string(SubString *input, PyObject *args, PyObject *kwargs, int recursion_depth, AutoNumber *auto_number) { - _PyAccu acc; + unicode_writer_t writer; + Py_ssize_t initlen; /* check the recursion level */ if (recursion_depth <= 0) { @@ -937,16 +914,17 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs, return NULL; } - if (_PyAccu_Init(&acc)) + initlen = PyUnicode_GET_LENGTH(input->str) + 100; + if (unicode_writer_init(&writer, initlen, 127) == -1) return NULL; - if (!do_markup(input, args, kwargs, &acc, recursion_depth, + if (!do_markup(input, args, kwargs, &writer, recursion_depth, auto_number)) { - _PyAccu_Destroy(&acc); + unicode_writer_dealloc(&writer); return NULL; } - return _PyAccu_Finish(&acc); + return unicode_writer_finish(&writer); } /************************************************************************/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 129a5fc..0722312 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13200,6 +13200,135 @@ unicode_endswith(PyObject *self, return PyBool_FromLong(result); } +typedef struct { + PyObject *buffer; + void *data; + enum PyUnicode_Kind kind; + Py_UCS4 maxchar; + Py_ssize_t pos; +} unicode_writer_t; + +Py_LOCAL_INLINE(void) +unicode_writer_update(unicode_writer_t *writer) +{ + writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); + writer->data = PyUnicode_DATA(writer->buffer); + writer->kind = PyUnicode_KIND(writer->buffer); +} + +Py_LOCAL(int) +unicode_writer_init(unicode_writer_t *writer, + Py_ssize_t length, Py_UCS4 maxchar) +{ + writer->pos = 0; + writer->buffer = PyUnicode_New(length, maxchar); + if 
(writer->buffer == NULL) + return -1; + unicode_writer_update(writer); + return 0; +} + +Py_LOCAL_INLINE(int) +unicode_writer_prepare(unicode_writer_t *writer, + Py_ssize_t length, Py_UCS4 maxchar) +{ + Py_ssize_t newlen; + PyObject *newbuffer; + + if (length > PY_SSIZE_T_MAX - writer->pos) { + PyErr_NoMemory(); + return -1; + } + newlen = writer->pos + length; + + if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) { + /* overallocate 25% to limit the number of resize */ + if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) + newlen += newlen / 4; + + if (maxchar > writer->maxchar) { + /* resize + widen */ + newbuffer = PyUnicode_New(newlen, maxchar); + if (newbuffer == NULL) + return -1; + PyUnicode_CopyCharacters(newbuffer, 0, + writer->buffer, 0, writer->pos); + Py_DECREF(writer->buffer); + } + else { + newbuffer = resize_compact(writer->buffer, newlen); + if (newbuffer == NULL) + return -1; + } + writer->buffer = newbuffer; + unicode_writer_update(writer); + } + else if (maxchar > writer->maxchar) { + if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0) + return -1; + unicode_writer_update(writer); + } + return 0; +} + +Py_LOCAL_INLINE(int) +unicode_writer_write_str( + unicode_writer_t *writer, + PyObject *str, Py_ssize_t start, Py_ssize_t length) +{ + Py_UCS4 maxchar; + + assert(str != NULL); + assert(PyUnicode_Check(str)); + if (PyUnicode_READY(str) == -1) + return -1; + + assert(0 <= start); + assert(0 <= length); + assert(start + length <= PyUnicode_GET_LENGTH(str)); + if (length == 0) + return 0; + + maxchar = _PyUnicode_FindMaxChar(str, start, start + length); + if (unicode_writer_prepare(writer, length, maxchar) == -1) + return -1; + + assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer)); + copy_characters(writer->buffer, writer->pos, + str, start, length); + writer->pos += length; + return 0; +} + +Py_LOCAL_INLINE(int) +unicode_writer_write_char( + unicode_writer_t *writer, + Py_UCS4 ch) +{ + if (unicode_writer_prepare(writer, 1, ch) 
== -1) + return -1; + assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer)); + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); + writer->pos += 1; + return 0; +} + +Py_LOCAL(PyObject *) +unicode_writer_finish(unicode_writer_t *writer) +{ + if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) { + Py_DECREF(writer->buffer); + return NULL; + } + return writer->buffer; +} + +Py_LOCAL(void) +unicode_writer_dealloc(unicode_writer_t *writer) +{ + Py_CLEAR(writer->buffer); +} + #include "stringlib/unicode_format.h" PyDoc_STRVAR(format__doc__, @@ -13649,135 +13778,6 @@ formatchar(PyObject *v) return (Py_UCS4) -1; } -typedef struct { - PyObject *buffer; - void *data; - enum PyUnicode_Kind kind; - Py_UCS4 maxchar; - Py_ssize_t pos; -} unicode_writer_t; - -Py_LOCAL_INLINE(void) -unicode_writer_update(unicode_writer_t *writer) -{ - writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); - writer->data = PyUnicode_DATA(writer->buffer); - writer->kind = PyUnicode_KIND(writer->buffer); -} - -Py_LOCAL(int) -unicode_writer_init(unicode_writer_t *writer, - Py_ssize_t length, Py_UCS4 maxchar) -{ - writer->pos = 0; - writer->buffer = PyUnicode_New(length, maxchar); - if (writer->buffer == NULL) - return -1; - unicode_writer_update(writer); - return 0; -} - -Py_LOCAL_INLINE(int) -unicode_writer_prepare(unicode_writer_t *writer, - Py_ssize_t length, Py_UCS4 maxchar) -{ - Py_ssize_t newlen; - PyObject *newbuffer; - - if (length > PY_SSIZE_T_MAX - writer->pos) { - PyErr_NoMemory(); - return -1; - } - newlen = writer->pos + length; - - if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) { - /* overallocate 25% to limit the number of resize */ - if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) - newlen += newlen / 4; - - if (maxchar > writer->maxchar) { - /* resize + widen */ - newbuffer = PyUnicode_New(newlen, maxchar); - if (newbuffer == NULL) - return -1; - PyUnicode_CopyCharacters(newbuffer, 0, - writer->buffer, 0, writer->pos); - Py_DECREF(writer->buffer); - 
} - else { - newbuffer = resize_compact(writer->buffer, newlen); - if (newbuffer == NULL) - return -1; - } - writer->buffer = newbuffer; - unicode_writer_update(writer); - } - else if (maxchar > writer->maxchar) { - if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0) - return -1; - unicode_writer_update(writer); - } - return 0; -} - -Py_LOCAL_INLINE(int) -unicode_writer_write_str( - unicode_writer_t *writer, - PyObject *str, Py_ssize_t start, Py_ssize_t length) -{ - Py_UCS4 maxchar; - - assert(str != NULL); - assert(PyUnicode_Check(str)); - if (PyUnicode_READY(str) == -1) - return -1; - - assert(0 <= start); - assert(0 <= length); - assert(start + length <= PyUnicode_GET_LENGTH(str)); - if (length == 0) - return 0; - - maxchar = _PyUnicode_FindMaxChar(str, start, start + length); - if (unicode_writer_prepare(writer, length, maxchar) == -1) - return -1; - - assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer)); - copy_characters(writer->buffer, writer->pos, - str, start, length); - writer->pos += length; - return 0; -} - -Py_LOCAL_INLINE(int) -unicode_writer_write_char( - unicode_writer_t *writer, - Py_UCS4 ch) -{ - if (unicode_writer_prepare(writer, 1, ch) == -1) - return -1; - assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer)); - PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); - writer->pos += 1; - return 0; -} - -Py_LOCAL(PyObject *) -unicode_writer_finish(unicode_writer_t *writer) -{ - if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) { - Py_DECREF(writer->buffer); - return NULL; - } - return writer->buffer; -} - -Py_LOCAL(void) -unicode_writer_dealloc(unicode_writer_t *writer) -{ - Py_CLEAR(writer->buffer); -} - PyObject * PyUnicode_Format(PyObject *format, PyObject *args) { |