From 8f674ccd6442fd4d415f2d9d1ad8b09c1b4f9f30 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 17 Apr 2013 23:02:17 +0200 Subject: Close #17694: Add minimum length to _PyUnicodeWriter * Add also min_char attribute to _PyUnicodeWriter structure (currently unused) * _PyUnicodeWriter_Init() has no more argument (except the writer itself): min_length and overallocate must be set explicitly * In error handlers, only enable overallocation if the replacement string is longer than 1 character * CJK decoders don't use overallocation anymore * Set min_length, instead of preallocating memory using _PyUnicodeWriter_Prepare(), in many decoders * _PyUnicode_DecodeUnicodeInternal() checks for integer overflow --- Include/unicodeobject.h | 20 ++++--- Modules/cjkcodecs/multibytecodec.c | 9 +-- Objects/complexobject.c | 2 +- Objects/floatobject.c | 2 +- Objects/longobject.c | 2 +- Objects/stringlib/unicode_format.h | 6 +- Objects/unicodeobject.c | 111 +++++++++++++++++++------------------ 7 files changed, 81 insertions(+), 71 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index d613311..ed7db28 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -898,22 +898,28 @@ typedef struct { Py_UCS4 maxchar; Py_ssize_t size; Py_ssize_t pos; - /* minimum length of the buffer when overallocation is enabled, - see _PyUnicodeWriter_Init() */ + + /* minimum number of allocated characters (default: 0) */ Py_ssize_t min_length; + + /* minimum character (default: 127, ASCII) */ + Py_UCS4 min_char; + + /* If non-zero, overallocate the buffer by 25% (default: 0). */ unsigned char overallocate; + /* If readonly is 1, buffer is a shared string (cannot be modified) and size is set to 0. */ unsigned char readonly; } _PyUnicodeWriter ; /* Initialize a Unicode writer. - - If min_length is greater than zero, _PyUnicodeWriter_Prepare() - overallocates the buffer and min_length is the minimum length in characters - of the buffer. */ + * + * By default, the minimum buffer size is 0 character and overallocation is + * disabled. Set min_length, min_char and overallocate attributes to control + * the allocation of the buffer. */ PyAPI_FUNC(void) -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length); +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer); /* Prepare the buffer to write 'length' characters with the specified maximum character. diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 4c865ec..33bd779 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -633,7 +633,8 @@ MultibyteCodec_Decode(MultibyteCodecObject *self, return make_tuple(PyUnicode_New(0, 0), 0); } - _PyUnicodeWriter_Init(&buf.writer, datalen); + _PyUnicodeWriter_Init(&buf.writer); + buf.writer.min_length = datalen; buf.excobj = NULL; buf.inbuf = buf.inbuf_top = (unsigned char *)data; buf.inbuf_end = buf.inbuf_top + datalen; @@ -839,7 +840,7 @@ decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data, { buf->inbuf = buf->inbuf_top = (const unsigned char *)data; buf->inbuf_end = buf->inbuf_top + size; - _PyUnicodeWriter_Init(&buf->writer, size); + buf->writer.min_length += size; return 0; } @@ -1037,7 +1038,7 @@ mbidecoder_decode(MultibyteIncrementalDecoderObject *self, data = pdata.buf; size = pdata.len; - _PyUnicodeWriter_Init(&buf.writer, 1); + _PyUnicodeWriter_Init(&buf.writer); buf.excobj = NULL; origpending = self->pendingsize; @@ -1241,7 +1242,7 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self, if (sizehint == 0) return PyUnicode_New(0, 0); - _PyUnicodeWriter_Init(&buf.writer, 1); + _PyUnicodeWriter_Init(&buf.writer); buf.excobj = NULL; cres = NULL; diff --git a/Objects/complexobject.c b/Objects/complexobject.c index 355b063..54838cc 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -705,7 +705,7 @@ complex__format__(PyObject* self, PyObject* args) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyComplex_FormatAdvancedWriter( &writer, self, diff --git a/Objects/floatobject.c b/Objects/floatobject.c index b571ca8..c54c8e1 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -1711,7 +1711,7 @@ float__format__(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyFloat_FormatAdvancedWriter( &writer, self, diff --git a/Objects/longobject.c b/Objects/longobject.c index cdaea02..2b04804 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4379,7 +4379,7 @@ long__format__(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyLong_FormatAdvancedWriter( &writer, self, diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h index 2f58946..9429169 100644 --- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/unicode_format.h @@ -906,7 +906,6 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs, int recursion_depth, AutoNumber *auto_number) { _PyUnicodeWriter writer; - Py_ssize_t minlen; /* check the recursion level */ if (recursion_depth <= 0) { @@ -915,8 +914,9 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs, return NULL; } - minlen = PyUnicode_GET_LENGTH(input->str) + 100; - _PyUnicodeWriter_Init(&writer, minlen); + _PyUnicodeWriter_Init(&writer); + writer.overallocate = 1; + writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100; if (!do_markup(input, args, kwargs, &writer, recursion_depth, auto_number)) { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 748fcc7..c4157d8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2665,7 +2665,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) const char *f; _PyUnicodeWriter writer; - _PyUnicodeWriter_Init(&writer, strlen(format) + 100); + _PyUnicodeWriter_Init(&writer); + writer.min_length = strlen(format) + 100; + writer.overallocate = 1; /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). Copy it to be able to pass a reference to a subfunction. */ @@ -4117,7 +4119,10 @@ unicode_decode_call_errorhandler_writer( goto onError; } - writer->overallocate = 1; + if (PyUnicode_READY(repunicode) < 0) + goto onError; + if (PyUnicode_GET_LENGTH(repunicode) > 1) + writer->overallocate = 1; if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) return @@ -4256,9 +4261,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s, } /* Start off assuming it's all ASCII. Widen later as necessary. */ - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; shiftOutStart = 0; e = s + size; @@ -4655,7 +4659,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, return get_latin1_char((unsigned char)s[0]); } - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; @@ -4910,7 +4914,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, le = bo <= 0; #endif - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1) goto onError; @@ -5149,7 +5153,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, /* Note: size will always be longer than the resulting Unicode character count */ - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1) goto onError; @@ -5420,11 +5424,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s, and we determined it's exact size (common case) or it contains \x, \u, ... escape sequences. then we create a legacy wchar string and resize it at the end of this function. */ - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (len > 0) { - if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) - goto onError; - assert(writer.kind == PyUnicode_1BYTE_KIND); + writer.min_length = len; } else { /* Escaped strings will always be longer than the resulting @@ -5432,8 +5434,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s, length after conversion to the true value. (but if the error callback returns a long replacement string we'll have to allocate more space) */ - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + writer.min_length = size; } if (size == 0) @@ -5461,10 +5462,6 @@ PyUnicode_DecodeUnicodeEscape(const char *s, if (s > end) c = '\0'; /* Invalid after \ */ - /* The only case in which i == ascii_length is a backslash - followed by a newline. */ - assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n')); - switch (c) { /* \x escapes */ @@ -5787,9 +5784,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Unicode string, so we start with size here and then reduce the length after conversion to the true value. (But decoding error handler might have to resize the string) */ - _PyUnicodeWriter_Init(&writer, 1); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; end = s + size; while (s < end) { @@ -5982,12 +5978,14 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, if (size == 0) _Py_RETURN_UNICODE_EMPTY(); - /* XXX overflow detection missing */ - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1) + _PyUnicodeWriter_Init(&writer); + if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { + PyErr_NoMemory(); goto onError; - end = s + size; + } + writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; + end = s + size; while (s < end) { Py_UNICODE uch; Py_UCS4 ch; @@ -6429,9 +6427,9 @@ PyUnicode_DecodeASCII(const char *s, if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]); - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) + return NULL; e = s + size; data = writer.data; @@ -7280,7 +7278,7 @@ PyUnicode_DecodeCharmap(const char *s, if (size == 0) _Py_RETURN_UNICODE_EMPTY(); - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; @@ -7312,7 +7310,7 @@ PyUnicode_DecodeCharmap(const char *s, ch = *s; x = mapdata_ucs1[ch]; if (x > maxchar) { - if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1) + if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1) goto onError; maxchar = writer.maxchar; outdata = (Py_UCS1 *)writer.data; @@ -12841,21 +12839,27 @@ unicode_endswith(PyObject *self, Py_LOCAL_INLINE(void) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) { - writer->size = PyUnicode_GET_LENGTH(writer->buffer); + if (!writer->readonly) + writer->size = PyUnicode_GET_LENGTH(writer->buffer); + else { + /* Copy-on-write mode: set buffer size to 0 so + * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on + * next write. */ + writer->size = 0; + } writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); writer->data = PyUnicode_DATA(writer->buffer); writer->kind = PyUnicode_KIND(writer->buffer); } void -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { memset(writer, 0, sizeof(*writer)); #ifdef Py_DEBUG writer->kind = 5; /* invalid kind */ #endif - writer->min_length = Py_MAX(min_length, 100); - writer->overallocate = (min_length > 0); + writer->min_char = 127; } int @@ -12873,29 +12877,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, } newlen = writer->pos + length; + maxchar = MAX_MAXCHAR(maxchar, writer->min_char); + if (writer->buffer == NULL) { - if (writer->overallocate) { + assert(!writer->readonly); + if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { /* overallocate 25% to limit the number of resize */ - if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) - newlen += newlen / 4; - if (newlen < writer->min_length) - newlen = writer->min_length; + newlen += newlen / 4; } + if (newlen < writer->min_length) + newlen = writer->min_length; + writer->buffer = PyUnicode_New(newlen, maxchar); if (writer->buffer == NULL) return -1; - _PyUnicodeWriter_Update(writer); - return 0; } - - if (newlen > writer->size) { - if (writer->overallocate) { + else if (newlen > writer->size) { + if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { /* overallocate 25% to limit the number of resize */ - if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) - newlen += newlen / 4; - if (newlen < writer->min_length) - newlen = writer->min_length; + newlen += newlen / 4; } + if (newlen < writer->min_length) + newlen = writer->min_length; if (maxchar > writer->maxchar || writer->readonly) { /* resize + widen */ @@ -12913,7 +12916,6 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, return -1; } writer->buffer = newbuffer; - _PyUnicodeWriter_Update(writer); } else if (maxchar > writer->maxchar) { assert(!writer->readonly); @@ -12924,8 +12926,8 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, writer->buffer, 0, writer->pos); Py_DECREF(writer->buffer); writer->buffer = newbuffer; - _PyUnicodeWriter_Update(writer); } + _PyUnicodeWriter_Update(writer); return 0; } @@ -12959,11 +12961,10 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) maxchar = PyUnicode_MAX_CHAR_VALUE(str); if (maxchar > writer->maxchar || len > writer->size - writer->pos) { if (writer->buffer == NULL && !writer->overallocate) { + writer->readonly = 1; Py_INCREF(str); writer->buffer = str; _PyUnicodeWriter_Update(writer); - writer->readonly = 1; - writer->size = 0; writer->pos += len; return 0; } @@ -13080,7 +13081,7 @@ unicode__format__(PyObject* self, PyObject* args) if (PyUnicode_READY(self) == -1) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyUnicode_FormatAdvancedWriter(&writer, self, format_spec, 0, PyUnicode_GET_LENGTH(format_spec)); @@ -14164,7 +14165,9 @@ PyUnicode_Format(PyObject *format, PyObject *args) ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); ctx.fmtpos = 0; - _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100); + _PyUnicodeWriter_Init(&ctx.writer); + ctx.writer.min_length = ctx.fmtcnt + 100; + ctx.writer.overallocate = 1; if (PyTuple_Check(args)) { ctx.arglen = PyTuple_Size(args); -- cgit v0.12