diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2013-04-17 21:02:17 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2013-04-17 21:02:17 (GMT) |
commit | 8f674ccd6442fd4d415f2d9d1ad8b09c1b4f9f30 (patch) | |
tree | 8c66aa5789027ee1ed14128aa6ad6d85e1b076a7 /Objects | |
parent | e84a51c38edc35e4c1e1df6b28b1e3be95ba8d00 (diff) | |
download | cpython-8f674ccd6442fd4d415f2d9d1ad8b09c1b4f9f30.zip cpython-8f674ccd6442fd4d415f2d9d1ad8b09c1b4f9f30.tar.gz cpython-8f674ccd6442fd4d415f2d9d1ad8b09c1b4f9f30.tar.bz2 |
Close #17694: Add minimum length to _PyUnicodeWriter
* Add also min_char attribute to _PyUnicodeWriter structure (currently unused)
* _PyUnicodeWriter_Init() has no more argument (except the writer itself):
min_length and overallocate must be set explicitly
* In error handlers, only enable overallocation if the replacement string
is longer than 1 character
* CJK decoders don't use overallocation anymore
* Set min_length, instead of preallocating memory using
_PyUnicodeWriter_Prepare(), in many decoders
* _PyUnicode_DecodeUnicodeInternal() checks for integer overflow
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/complexobject.c | 2 | ||||
-rw-r--r-- | Objects/floatobject.c | 2 | ||||
-rw-r--r-- | Objects/longobject.c | 2 | ||||
-rw-r--r-- | Objects/stringlib/unicode_format.h | 6 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 111 |
5 files changed, 63 insertions, 60 deletions
diff --git a/Objects/complexobject.c b/Objects/complexobject.c index 355b063..54838cc 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -705,7 +705,7 @@ complex__format__(PyObject* self, PyObject* args) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyComplex_FormatAdvancedWriter( &writer, self, diff --git a/Objects/floatobject.c b/Objects/floatobject.c index b571ca8..c54c8e1 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -1711,7 +1711,7 @@ float__format__(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyFloat_FormatAdvancedWriter( &writer, self, diff --git a/Objects/longobject.c b/Objects/longobject.c index cdaea02..2b04804 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4379,7 +4379,7 @@ long__format__(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyLong_FormatAdvancedWriter( &writer, self, diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h index 2f58946..9429169 100644 --- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/unicode_format.h @@ -906,7 +906,6 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs, int recursion_depth, AutoNumber *auto_number) { _PyUnicodeWriter writer; - Py_ssize_t minlen; /* check the recursion level */ if (recursion_depth <= 0) { @@ -915,8 +914,9 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs, return NULL; } - minlen = PyUnicode_GET_LENGTH(input->str) + 100; - _PyUnicodeWriter_Init(&writer, minlen); + _PyUnicodeWriter_Init(&writer); + writer.overallocate = 1; + writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100; if (!do_markup(input, args, kwargs, &writer, recursion_depth, auto_number)) { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 748fcc7..c4157d8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2665,7 +2665,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) const char *f; _PyUnicodeWriter writer; - _PyUnicodeWriter_Init(&writer, strlen(format) + 100); + _PyUnicodeWriter_Init(&writer); + writer.min_length = strlen(format) + 100; + writer.overallocate = 1; /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). Copy it to be able to pass a reference to a subfunction. */ @@ -4117,7 +4119,10 @@ unicode_decode_call_errorhandler_writer( goto onError; } - writer->overallocate = 1; + if (PyUnicode_READY(repunicode) < 0) + goto onError; + if (PyUnicode_GET_LENGTH(repunicode) > 1) + writer->overallocate = 1; if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) return @@ -4256,9 +4261,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s, } /* Start off assuming it's all ASCII. Widen later as necessary. */ - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; shiftOutStart = 0; e = s + size; @@ -4655,7 +4659,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, return get_latin1_char((unsigned char)s[0]); } - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; @@ -4910,7 +4914,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, le = bo <= 0; #endif - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1) goto onError; @@ -5149,7 +5153,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, /* Note: size will always be longer than the resulting Unicode character count */ - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1) goto onError; @@ -5420,11 +5424,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s, and we determined it's exact size (common case) or it contains \x, \u, ... escape sequences. then we create a legacy wchar string and resize it at the end of this function. */ - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (len > 0) { - if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) - goto onError; - assert(writer.kind == PyUnicode_1BYTE_KIND); + writer.min_length = len; } else { /* Escaped strings will always be longer than the resulting @@ -5432,8 +5434,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s, length after conversion to the true value. (but if the error callback returns a long replacement string we'll have to allocate more space) */ - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + writer.min_length = size; } if (size == 0) @@ -5461,10 +5462,6 @@ PyUnicode_DecodeUnicodeEscape(const char *s, if (s > end) c = '\0'; /* Invalid after \ */ - /* The only case in which i == ascii_length is a backslash - followed by a newline. */ - assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n')); - switch (c) { /* \x escapes */ @@ -5787,9 +5784,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Unicode string, so we start with size here and then reduce the length after conversion to the true value. (But decoding error handler might have to resize the string) */ - _PyUnicodeWriter_Init(&writer, 1); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; end = s + size; while (s < end) { @@ -5982,12 +5978,14 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, if (size == 0) _Py_RETURN_UNICODE_EMPTY(); - /* XXX overflow detection missing */ - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1) + _PyUnicodeWriter_Init(&writer); + if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { + PyErr_NoMemory(); goto onError; - end = s + size; + } + writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; + end = s + size; while (s < end) { Py_UNICODE uch; Py_UCS4 ch; @@ -6429,9 +6427,9 @@ PyUnicode_DecodeASCII(const char *s, if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]); - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) + return NULL; e = s + size; data = writer.data; @@ -7280,7 +7278,7 @@ PyUnicode_DecodeCharmap(const char *s, if (size == 0) _Py_RETURN_UNICODE_EMPTY(); - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; @@ -7312,7 +7310,7 @@ PyUnicode_DecodeCharmap(const char *s, ch = *s; x = mapdata_ucs1[ch]; if (x > maxchar) { - if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1) + if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1) goto onError; maxchar = writer.maxchar; outdata = (Py_UCS1 *)writer.data; @@ -12841,21 +12839,27 @@ unicode_endswith(PyObject *self, Py_LOCAL_INLINE(void) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) { - writer->size = PyUnicode_GET_LENGTH(writer->buffer); + if (!writer->readonly) + writer->size = PyUnicode_GET_LENGTH(writer->buffer); + else { + /* Copy-on-write mode: set buffer size to 0 so + * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on + * next write. */ + writer->size = 0; + } writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); writer->data = PyUnicode_DATA(writer->buffer); writer->kind = PyUnicode_KIND(writer->buffer); } void -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { memset(writer, 0, sizeof(*writer)); #ifdef Py_DEBUG writer->kind = 5; /* invalid kind */ #endif - writer->min_length = Py_MAX(min_length, 100); - writer->overallocate = (min_length > 0); + writer->min_char = 127; } int @@ -12873,29 +12877,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, } newlen = writer->pos + length; + maxchar = MAX_MAXCHAR(maxchar, writer->min_char); + if (writer->buffer == NULL) { - if (writer->overallocate) { + assert(!writer->readonly); + if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { /* overallocate 25% to limit the number of resize */ - if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) - newlen += newlen / 4; - if (newlen < writer->min_length) - newlen = writer->min_length; + newlen += newlen / 4; } + if (newlen < writer->min_length) + newlen = writer->min_length; + writer->buffer = PyUnicode_New(newlen, maxchar); if (writer->buffer == NULL) return -1; - _PyUnicodeWriter_Update(writer); - return 0; } - - if (newlen > writer->size) { - if (writer->overallocate) { + else if (newlen > writer->size) { + if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { /* overallocate 25% to limit the number of resize */ - if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) - newlen += newlen / 4; - if (newlen < writer->min_length) - newlen = writer->min_length; + newlen += newlen / 4; } + if (newlen < writer->min_length) + newlen = writer->min_length; if (maxchar > writer->maxchar || writer->readonly) { /* resize + widen */ @@ -12913,7 +12916,6 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, return -1; } writer->buffer = newbuffer; - _PyUnicodeWriter_Update(writer); } else if (maxchar > writer->maxchar) { assert(!writer->readonly); @@ -12924,8 +12926,8 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, writer->buffer, 0, writer->pos); Py_DECREF(writer->buffer); writer->buffer = newbuffer; - _PyUnicodeWriter_Update(writer); } + _PyUnicodeWriter_Update(writer); return 0; } @@ -12959,11 +12961,10 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) maxchar = PyUnicode_MAX_CHAR_VALUE(str); if (maxchar > writer->maxchar || len > writer->size - writer->pos) { if (writer->buffer == NULL && !writer->overallocate) { + writer->readonly = 1; Py_INCREF(str); writer->buffer = str; _PyUnicodeWriter_Update(writer); - writer->readonly = 1; - writer->size = 0; writer->pos += len; return 0; } @@ -13080,7 +13081,7 @@ unicode__format__(PyObject* self, PyObject* args) if (PyUnicode_READY(self) == -1) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyUnicode_FormatAdvancedWriter(&writer, self, format_spec, 0, PyUnicode_GET_LENGTH(format_spec)); @@ -14164,7 +14165,9 @@ PyUnicode_Format(PyObject *format, PyObject *args) ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); ctx.fmtpos = 0; - _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100); + _PyUnicodeWriter_Init(&ctx.writer); + ctx.writer.min_length = ctx.fmtcnt + 100; + ctx.writer.overallocate = 1; if (PyTuple_Check(args)) { ctx.arglen = PyTuple_Size(args); |