diff options
Diffstat (limited to 'Objects')
36 files changed, 3867 insertions, 3979 deletions
diff --git a/Objects/abstract.c b/Objects/abstract.c index a0362e7..3e1ff97 100644 --- a/Objects/abstract.c +++ b/Objects/abstract.c @@ -141,8 +141,11 @@ PyObject_GetItem(PyObject *o, PyObject *key) return null_error(); m = o->ob_type->tp_as_mapping; - if (m && m->mp_subscript) - return m->mp_subscript(o, key); + if (m && m->mp_subscript) { + PyObject *item = m->mp_subscript(o, key); + assert((item != NULL) ^ (PyErr_Occurred() != NULL)); + return item; + } if (o->ob_type->tp_as_sequence) { if (PyIndex_Check(key)) { @@ -1544,8 +1547,10 @@ PySequence_GetItem(PyObject *s, Py_ssize_t i) if (i < 0) { if (m->sq_length) { Py_ssize_t l = (*m->sq_length)(s); - if (l < 0) + if (l < 0) { + assert(PyErr_Occurred()); return NULL; + } i += l; } } diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index 277be59..b7dfd6f 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -279,31 +279,6 @@ PyByteArray_Concat(PyObject *a, PyObject *b) return (PyObject *)result; } -static PyObject * -bytearray_format(PyByteArrayObject *self, PyObject *args) -{ - PyObject *bytes_in, *bytes_out, *res; - char *bytestring; - - if (self == NULL || !PyByteArray_Check(self) || args == NULL) { - PyErr_BadInternalCall(); - return NULL; - } - bytestring = PyByteArray_AS_STRING(self); - bytes_in = PyBytes_FromString(bytestring); - if (bytes_in == NULL) - return NULL; - bytes_out = _PyBytes_Format(bytes_in, args); - Py_DECREF(bytes_in); - if (bytes_out == NULL) - return NULL; - res = PyByteArray_FromObject(bytes_out); - Py_DECREF(bytes_out); - if (res == NULL) - return NULL; - return res; -} - /* Functions stuffed into the type object */ static Py_ssize_t @@ -1122,161 +1097,27 @@ bytearray_dealloc(PyByteArrayObject *self) #include "stringlib/transmogrify.h" -/* The following Py_LOCAL_INLINE and Py_LOCAL functions -were copied from the old char* style string object. 
*/ - -/* helper macro to fixup start/end slice values */ -#define ADJUST_INDICES(start, end, len) \ - if (end > len) \ - end = len; \ - else if (end < 0) { \ - end += len; \ - if (end < 0) \ - end = 0; \ - } \ - if (start < 0) { \ - start += len; \ - if (start < 0) \ - start = 0; \ - } - -Py_LOCAL_INLINE(Py_ssize_t) -bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir) -{ - PyObject *subobj; - char byte; - Py_buffer subbuf; - const char *sub; - Py_ssize_t len, sub_len; - Py_ssize_t start=0, end=PY_SSIZE_T_MAX; - Py_ssize_t res; - - if (!stringlib_parse_args_finds_byte("find/rfind/index/rindex", - args, &subobj, &byte, &start, &end)) - return -2; - - if (subobj) { - if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0) - return -2; - - sub = subbuf.buf; - sub_len = subbuf.len; - } - else { - sub = &byte; - sub_len = 1; - } - len = PyByteArray_GET_SIZE(self); - - ADJUST_INDICES(start, end, len); - if (end - start < sub_len) - res = -1; - else if (sub_len == 1 -#ifndef HAVE_MEMRCHR - && dir > 0 -#endif - ) { - unsigned char needle = *sub; - int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH; - res = stringlib_fastsearch_memchr_1char( - PyByteArray_AS_STRING(self) + start, end - start, - needle, needle, mode); - if (res >= 0) - res += start; - } - else { - if (dir > 0) - res = stringlib_find_slice( - PyByteArray_AS_STRING(self), len, - sub, sub_len, start, end); - else - res = stringlib_rfind_slice( - PyByteArray_AS_STRING(self), len, - sub, sub_len, start, end); - } - - if (subobj) - PyBuffer_Release(&subbuf); - - return res; -} - -PyDoc_STRVAR(find__doc__, -"B.find(sub[, start[, end]]) -> int\n\ -\n\ -Return the lowest index in B where subsection sub is found,\n\ -such that sub is contained within B[start,end]. 
Optional\n\ -arguments start and end are interpreted as in slice notation.\n\ -\n\ -Return -1 on failure."); - static PyObject * bytearray_find(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t result = bytearray_find_internal(self, args, +1); - if (result == -2) - return NULL; - return PyLong_FromSsize_t(result); + return _Py_bytes_find(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args); } -PyDoc_STRVAR(count__doc__, -"B.count(sub[, start[, end]]) -> int\n\ -\n\ -Return the number of non-overlapping occurrences of subsection sub in\n\ -bytes B[start:end]. Optional arguments start and end are interpreted\n\ -as in slice notation."); - static PyObject * bytearray_count(PyByteArrayObject *self, PyObject *args) { - PyObject *sub_obj; - const char *str = PyByteArray_AS_STRING(self), *sub; - Py_ssize_t sub_len; - char byte; - Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; - - Py_buffer vsub; - PyObject *count_obj; - - if (!stringlib_parse_args_finds_byte("count", args, &sub_obj, &byte, - &start, &end)) - return NULL; - - if (sub_obj) { - if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0) - return NULL; - - sub = vsub.buf; - sub_len = vsub.len; - } - else { - sub = &byte; - sub_len = 1; - } - - ADJUST_INDICES(start, end, PyByteArray_GET_SIZE(self)); - - count_obj = PyLong_FromSsize_t( - stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX) - ); - - if (sub_obj) - PyBuffer_Release(&vsub); - - return count_obj; + return _Py_bytes_count(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args); } /*[clinic input] bytearray.clear - self: self(type="PyByteArrayObject *") - Remove all items from the bytearray. 
[clinic start generated code]*/ static PyObject * bytearray_clear_impl(PyByteArrayObject *self) -/*[clinic end generated code: output=85c2fe6aede0956c input=e524fd330abcdc18]*/ +/*[clinic end generated code: output=85c2fe6aede0956c input=ed6edae9de447ac4]*/ { if (PyByteArray_Resize((PyObject *)self, 0) < 0) return NULL; @@ -1286,236 +1127,57 @@ bytearray_clear_impl(PyByteArrayObject *self) /*[clinic input] bytearray.copy - self: self(type="PyByteArrayObject *") - Return a copy of B. [clinic start generated code]*/ static PyObject * bytearray_copy_impl(PyByteArrayObject *self) -/*[clinic end generated code: output=68cfbcfed484c132 input=6d5d2975aa0f33f3]*/ +/*[clinic end generated code: output=68cfbcfed484c132 input=6597b0c01bccaa9e]*/ { return PyByteArray_FromStringAndSize(PyByteArray_AS_STRING((PyObject *)self), PyByteArray_GET_SIZE(self)); } -PyDoc_STRVAR(index__doc__, -"B.index(sub[, start[, end]]) -> int\n\ -\n\ -Like B.find() but raise ValueError when the subsection is not found."); - static PyObject * bytearray_index(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t result = bytearray_find_internal(self, args, +1); - if (result == -2) - return NULL; - if (result == -1) { - PyErr_SetString(PyExc_ValueError, - "subsection not found"); - return NULL; - } - return PyLong_FromSsize_t(result); + return _Py_bytes_index(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args); } - -PyDoc_STRVAR(rfind__doc__, -"B.rfind(sub[, start[, end]]) -> int\n\ -\n\ -Return the highest index in B where subsection sub is found,\n\ -such that sub is contained within B[start,end]. 
Optional\n\ -arguments start and end are interpreted as in slice notation.\n\ -\n\ -Return -1 on failure."); - static PyObject * bytearray_rfind(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t result = bytearray_find_internal(self, args, -1); - if (result == -2) - return NULL; - return PyLong_FromSsize_t(result); + return _Py_bytes_rfind(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args); } - -PyDoc_STRVAR(rindex__doc__, -"B.rindex(sub[, start[, end]]) -> int\n\ -\n\ -Like B.rfind() but raise ValueError when the subsection is not found."); - static PyObject * bytearray_rindex(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t result = bytearray_find_internal(self, args, -1); - if (result == -2) - return NULL; - if (result == -1) { - PyErr_SetString(PyExc_ValueError, - "subsection not found"); - return NULL; - } - return PyLong_FromSsize_t(result); + return _Py_bytes_rindex(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args); } - static int bytearray_contains(PyObject *self, PyObject *arg) { - Py_ssize_t ival = PyNumber_AsSsize_t(arg, PyExc_ValueError); - if (ival == -1 && PyErr_Occurred()) { - Py_buffer varg; - Py_ssize_t pos; - PyErr_Clear(); - if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0) - return -1; - pos = stringlib_find(PyByteArray_AS_STRING(self), Py_SIZE(self), - varg.buf, varg.len, 0); - PyBuffer_Release(&varg); - return pos >= 0; - } - if (ival < 0 || ival >= 256) { - PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)"); - return -1; - } - - return memchr(PyByteArray_AS_STRING(self), (int) ival, Py_SIZE(self)) != NULL; + return _Py_bytes_contains(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), arg); } - -/* Matches the end (direction >= 0) or start (direction < 0) of self - * against substr, using the start and end arguments. Returns - * -1 on error, 0 if not found and 1 if found. 
- */ -Py_LOCAL(int) -_bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start, - Py_ssize_t end, int direction) -{ - Py_ssize_t len = PyByteArray_GET_SIZE(self); - const char* str; - Py_buffer vsubstr; - int rv = 0; - - str = PyByteArray_AS_STRING(self); - - if (PyObject_GetBuffer(substr, &vsubstr, PyBUF_SIMPLE) != 0) - return -1; - - ADJUST_INDICES(start, end, len); - - if (direction < 0) { - /* startswith */ - if (start+vsubstr.len > len) { - goto done; - } - } else { - /* endswith */ - if (end-start < vsubstr.len || start > len) { - goto done; - } - - if (end-vsubstr.len > start) - start = end - vsubstr.len; - } - if (end-start >= vsubstr.len) - rv = ! memcmp(str+start, vsubstr.buf, vsubstr.len); - -done: - PyBuffer_Release(&vsubstr); - return rv; -} - - -PyDoc_STRVAR(startswith__doc__, -"B.startswith(prefix[, start[, end]]) -> bool\n\ -\n\ -Return True if B starts with the specified prefix, False otherwise.\n\ -With optional start, test B beginning at that position.\n\ -With optional end, stop comparing B at that position.\n\ -prefix can also be a tuple of bytes to try."); - static PyObject * bytearray_startswith(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t start = 0; - Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *subobj; - int result; - - if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) - return NULL; - if (PyTuple_Check(subobj)) { - Py_ssize_t i; - for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - result = _bytearray_tailmatch(self, - PyTuple_GET_ITEM(subobj, i), - start, end, -1); - if (result == -1) - return NULL; - else if (result) { - Py_RETURN_TRUE; - } - } - Py_RETURN_FALSE; - } - result = _bytearray_tailmatch(self, subobj, start, end, -1); - if (result == -1) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, "startswith first arg must be bytes " - "or a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name); - return NULL; - } - else - return 
PyBool_FromLong(result); + return _Py_bytes_startswith(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args); } -PyDoc_STRVAR(endswith__doc__, -"B.endswith(suffix[, start[, end]]) -> bool\n\ -\n\ -Return True if B ends with the specified suffix, False otherwise.\n\ -With optional start, test B beginning at that position.\n\ -With optional end, stop comparing B at that position.\n\ -suffix can also be a tuple of bytes to try."); - static PyObject * bytearray_endswith(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t start = 0; - Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *subobj; - int result; - - if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) - return NULL; - if (PyTuple_Check(subobj)) { - Py_ssize_t i; - for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - result = _bytearray_tailmatch(self, - PyTuple_GET_ITEM(subobj, i), - start, end, +1); - if (result == -1) - return NULL; - else if (result) { - Py_RETURN_TRUE; - } - } - Py_RETURN_FALSE; - } - result = _bytearray_tailmatch(self, subobj, start, end, +1); - if (result == -1) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, "endswith first arg must be bytes or " - "a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name); - return NULL; - } - else - return PyBool_FromLong(result); + return _Py_bytes_endswith(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args); } /*[clinic input] bytearray.translate - self: self(type="PyByteArrayObject *") table: object Translation table, which must be a bytes object of length 256. [ @@ -1532,7 +1194,7 @@ The remaining characters are mapped through the given translation table. 
static PyObject * bytearray_translate_impl(PyByteArrayObject *self, PyObject *table, int group_right_1, PyObject *deletechars) -/*[clinic end generated code: output=2bebc86a9a1ff083 input=b749ad85f4860824]*/ +/*[clinic end generated code: output=2bebc86a9a1ff083 input=846a01671bccc1c5]*/ { char *input, *output; const char *table_chars; @@ -1575,7 +1237,7 @@ bytearray_translate_impl(PyByteArrayObject *self, PyObject *table, result = PyByteArray_FromStringAndSize((char *)NULL, inlen); if (result == NULL) goto done; - output_start = output = PyByteArray_AsString(result); + output_start = output = PyByteArray_AS_STRING(result); input = PyByteArray_AS_STRING(input_obj); if (vdel.len == 0 && table_chars != NULL) { @@ -1645,493 +1307,6 @@ bytearray_maketrans_impl(Py_buffer *frm, Py_buffer *to) } -/* find and count characters and substrings */ - -#define findchar(target, target_len, c) \ - ((char *)memchr((const void *)(target), c, target_len)) - - -/* Bytes ops must return a string, create a copy */ -Py_LOCAL(PyByteArrayObject *) -return_self(PyByteArrayObject *self) -{ - /* always return a new bytearray */ - return (PyByteArrayObject *)PyByteArray_FromStringAndSize( - PyByteArray_AS_STRING(self), - PyByteArray_GET_SIZE(self)); -} - -Py_LOCAL_INLINE(Py_ssize_t) -countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount) -{ - Py_ssize_t count=0; - const char *start=target; - const char *end=target+target_len; - - while ( (start=findchar(start, end-start, c)) != NULL ) { - count++; - if (count >= maxcount) - break; - start += 1; - } - return count; -} - - -/* Algorithms for different cases of string replacement */ - -/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ -Py_LOCAL(PyByteArrayObject *) -replace_interleave(PyByteArrayObject *self, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - char *self_s, *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, i; - PyByteArrayObject *result; - - self_len = 
PyByteArray_GET_SIZE(self); - - /* 1 at the end plus 1 after every character; - count = min(maxcount, self_len + 1) */ - if (maxcount <= self_len) - count = maxcount; - else - /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */ - count = self_len + 1; - - /* Check for overflow */ - /* result_len = count * to_len + self_len; */ - assert(count > 0); - if (to_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, - "replace string is too long"); - return NULL; - } - result_len = count * to_len + self_len; - - if (! (result = (PyByteArrayObject *) - PyByteArray_FromStringAndSize(NULL, result_len)) ) - return NULL; - - self_s = PyByteArray_AS_STRING(self); - result_s = PyByteArray_AS_STRING(result); - - /* TODO: special case single character, which doesn't need memcpy */ - - /* Lay the first one down (guaranteed this will occur) */ - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - count -= 1; - - for (i=0; i<count; i++) { - *result_s++ = *self_s++; - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - } - - /* Copy the rest of the original string */ - Py_MEMCPY(result_s, self_s, self_len-i); - - return result; -} - -/* Special case for deleting a single character */ -/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ -Py_LOCAL(PyByteArrayObject *) -replace_delete_single_character(PyByteArrayObject *self, - char from_c, Py_ssize_t maxcount) -{ - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyByteArrayObject *result; - - self_len = PyByteArray_GET_SIZE(self); - self_s = PyByteArray_AS_STRING(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - return return_self(self); - } - - result_len = self_len - count; /* from_len == 1 */ - assert(result_len>=0); - - if ( (result = (PyByteArrayObject *) - PyByteArray_FromStringAndSize(NULL, result_len)) == NULL) - return NULL; - result_s = PyByteArray_AS_STRING(result); - - 
start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end-start, from_c); - if (next == NULL) - break; - Py_MEMCPY(result_s, start, next-start); - result_s += (next-start); - start = next+1; - } - Py_MEMCPY(result_s, start, end-start); - - return result; -} - -/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ - -Py_LOCAL(PyByteArrayObject *) -replace_delete_substring(PyByteArrayObject *self, - const char *from_s, Py_ssize_t from_len, - Py_ssize_t maxcount) -{ - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count, offset; - PyByteArrayObject *result; - - self_len = PyByteArray_GET_SIZE(self); - self_s = PyByteArray_AS_STRING(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches */ - return return_self(self); - } - - result_len = self_len - (count * from_len); - assert (result_len>=0); - - if ( (result = (PyByteArrayObject *) - PyByteArray_FromStringAndSize(NULL, result_len)) == NULL ) - return NULL; - - result_s = PyByteArray_AS_STRING(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end-start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start + offset; - - Py_MEMCPY(result_s, start, next-start); - - result_s += (next-start); - start = next+from_len; - } - Py_MEMCPY(result_s, start, end-start); - return result; -} - -/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ -Py_LOCAL(PyByteArrayObject *) -replace_single_character_in_place(PyByteArrayObject *self, - char from_c, char to_c, - Py_ssize_t maxcount) -{ - char *self_s, *result_s, *start, *end, *next; - Py_ssize_t self_len; - PyByteArrayObject *result; - - /* The result string will be the same size */ - self_s = PyByteArray_AS_STRING(self); - self_len = PyByteArray_GET_SIZE(self); - - next = findchar(self_s, self_len, from_c); - - if (next == NULL) { - /* No 
matches; return the original bytes */ - return return_self(self); - } - - /* Need to make a new bytes */ - result = (PyByteArrayObject *) PyByteArray_FromStringAndSize(NULL, self_len); - if (result == NULL) - return NULL; - result_s = PyByteArray_AS_STRING(result); - Py_MEMCPY(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + (next-self_s); - *start = to_c; - start++; - end = result_s + self_len; - - while (--maxcount > 0) { - next = findchar(start, end-start, from_c); - if (next == NULL) - break; - *next = to_c; - start = next+1; - } - - return result; -} - -/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ -Py_LOCAL(PyByteArrayObject *) -replace_substring_in_place(PyByteArrayObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - char *result_s, *start, *end; - char *self_s; - Py_ssize_t self_len, offset; - PyByteArrayObject *result; - - /* The result bytes will be the same size */ - - self_s = PyByteArray_AS_STRING(self); - self_len = PyByteArray_GET_SIZE(self); - - offset = stringlib_find(self_s, self_len, - from_s, from_len, - 0); - if (offset == -1) { - /* No matches; return the original bytes */ - return return_self(self); - } - - /* Need to make a new bytes */ - result = (PyByteArrayObject *) PyByteArray_FromStringAndSize(NULL, self_len); - if (result == NULL) - return NULL; - result_s = PyByteArray_AS_STRING(result); - Py_MEMCPY(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + offset; - Py_MEMCPY(start, to_s, from_len); - start += from_len; - end = result_s + self_len; - - while ( --maxcount > 0) { - offset = stringlib_find(start, end-start, - from_s, from_len, - 0); - if (offset==-1) - break; - Py_MEMCPY(start+offset, to_s, from_len); - start += offset+from_len; - } - - return result; -} - -/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ 
-Py_LOCAL(PyByteArrayObject *) -replace_single_character(PyByteArrayObject *self, - char from_c, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyByteArrayObject *result; - - self_s = PyByteArray_AS_STRING(self); - self_len = PyByteArray_GET_SIZE(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* use the difference between current and new, hence the "-1" */ - /* result_len = self_len + count * (to_len-1) */ - assert(count > 0); - if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); - return NULL; - } - result_len = self_len + count * (to_len - 1); - - if ( (result = (PyByteArrayObject *) - PyByteArray_FromStringAndSize(NULL, result_len)) == NULL) - return NULL; - result_s = PyByteArray_AS_STRING(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end-start, from_c); - if (next == NULL) - break; - - if (next == start) { - /* replace with the 'to' */ - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start += 1; - } else { - /* copy the unchanged old then the 'to' */ - Py_MEMCPY(result_s, start, next-start); - result_s += (next-start); - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start = next+1; - } - } - /* Copy the remainder of the remaining bytes */ - Py_MEMCPY(result_s, start, end-start); - - return result; -} - -/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ -Py_LOCAL(PyByteArrayObject *) -replace_substring(PyByteArrayObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count, offset; - 
PyByteArrayObject *result; - - self_s = PyByteArray_AS_STRING(self); - self_len = PyByteArray_GET_SIZE(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* Check for overflow */ - /* result_len = self_len + count * (to_len-from_len) */ - assert(count > 0); - if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); - return NULL; - } - result_len = self_len + count * (to_len - from_len); - - if ( (result = (PyByteArrayObject *) - PyByteArray_FromStringAndSize(NULL, result_len)) == NULL) - return NULL; - result_s = PyByteArray_AS_STRING(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end-start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start+offset; - if (next == start) { - /* replace with the 'to' */ - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start += from_len; - } else { - /* copy the unchanged old then the 'to' */ - Py_MEMCPY(result_s, start, next-start); - result_s += (next-start); - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start = next+from_len; - } - } - /* Copy the remainder of the remaining bytes */ - Py_MEMCPY(result_s, start, end-start); - - return result; -} - - -Py_LOCAL(PyByteArrayObject *) -replace(PyByteArrayObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - if (maxcount < 0) { - maxcount = PY_SSIZE_T_MAX; - } else if (maxcount == 0 || PyByteArray_GET_SIZE(self) == 0) { - /* nothing to do; return the original bytes */ - return return_self(self); - } - - if (maxcount == 0 || - (from_len == 0 && to_len == 0)) { - /* nothing to do; return the original bytes */ - return return_self(self); - } - - /* Handle zero-length special cases */ - - if (from_len == 0) { - 
/* insert the 'to' bytes everywhere. */ - /* >>> "Python".replace("", ".") */ - /* '.P.y.t.h.o.n.' */ - return replace_interleave(self, to_s, to_len, maxcount); - } - - /* Except for "".replace("", "A") == "A" there is no way beyond this */ - /* point for an empty self bytes to generate a non-empty bytes */ - /* Special case so the remaining code always gets a non-empty bytes */ - if (PyByteArray_GET_SIZE(self) == 0) { - return return_self(self); - } - - if (to_len == 0) { - /* delete all occurrences of 'from' bytes */ - if (from_len == 1) { - return replace_delete_single_character( - self, from_s[0], maxcount); - } else { - return replace_delete_substring(self, from_s, from_len, maxcount); - } - } - - /* Handle special case where both bytes have the same length */ - - if (from_len == to_len) { - if (from_len == 1) { - return replace_single_character_in_place( - self, - from_s[0], - to_s[0], - maxcount); - } else { - return replace_substring_in_place( - self, from_s, from_len, to_s, to_len, maxcount); - } - } - - /* Otherwise use the more generic algorithms */ - if (from_len == 1) { - return replace_single_character(self, from_s[0], - to_s, to_len, maxcount); - } else { - /* len('from')>=2, len('to')>=1 */ - return replace_substring(self, from_s, from_len, to_s, to_len, maxcount); - } -} - - /*[clinic input] bytearray.replace @@ -2153,9 +1328,9 @@ bytearray_replace_impl(PyByteArrayObject *self, Py_buffer *old, Py_buffer *new, Py_ssize_t count) /*[clinic end generated code: output=d39884c4dc59412a input=aa379d988637c7fb]*/ { - return (PyObject *)replace((PyByteArrayObject *) self, - old->buf, old->len, - new->buf, new->len, count); + return stringlib_replace((PyObject *)self, + (const char *)old->buf, old->len, + (const char *)new->buf, new->len, count); } /*[clinic input] @@ -2203,7 +1378,6 @@ bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, /*[clinic input] bytearray.partition - self: self(type="PyByteArrayObject *") sep: object / @@ -2219,7 +1393,7 @@ 
bytearray object and two empty bytearray objects. static PyObject * bytearray_partition(PyByteArrayObject *self, PyObject *sep) -/*[clinic end generated code: output=45d2525ddd35f957 input=7d7fe37b1696d506]*/ +/*[clinic end generated code: output=45d2525ddd35f957 input=86f89223892b70b5]*/ { PyObject *bytesep, *result; @@ -2241,7 +1415,6 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep) /*[clinic input] bytearray.rpartition - self: self(type="PyByteArrayObject *") sep: object / @@ -2257,7 +1430,7 @@ objects and the original bytearray object. static PyObject * bytearray_rpartition(PyByteArrayObject *self, PyObject *sep) -/*[clinic end generated code: output=440de3c9426115e8 input=9b8cd540c1b75853]*/ +/*[clinic end generated code: output=440de3c9426115e8 input=5f4094f2de87c8f3]*/ { PyObject *bytesep, *result; @@ -2315,14 +1488,12 @@ bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, /*[clinic input] bytearray.reverse - self: self(type="PyByteArrayObject *") - Reverse the order of the values in B in place. [clinic start generated code]*/ static PyObject * bytearray_reverse_impl(PyByteArrayObject *self) -/*[clinic end generated code: output=9f7616f29ab309d3 input=7933a499b8597bd1]*/ +/*[clinic end generated code: output=9f7616f29ab309d3 input=543356319fc78557]*/ { char swap, *head, *tail; Py_ssize_t i, j, n = Py_SIZE(self); @@ -2351,7 +1522,6 @@ class bytesvalue_converter(CConverter): /*[clinic input] bytearray.insert - self: self(type="PyByteArrayObject *") index: Py_ssize_t The index where the value is to be inserted. item: bytesvalue @@ -2363,7 +1533,7 @@ Insert a single item into the bytearray before the given index. 
static PyObject * bytearray_insert_impl(PyByteArrayObject *self, Py_ssize_t index, int item) -/*[clinic end generated code: output=76c775a70e7b07b7 input=833766836ba30e1e]*/ +/*[clinic end generated code: output=76c775a70e7b07b7 input=b2b5d07e9de6c070]*/ { Py_ssize_t n = Py_SIZE(self); char *buf; @@ -2393,7 +1563,6 @@ bytearray_insert_impl(PyByteArrayObject *self, Py_ssize_t index, int item) /*[clinic input] bytearray.append - self: self(type="PyByteArrayObject *") item: bytesvalue The item to be appended. / @@ -2403,7 +1572,7 @@ Append a single item to the end of the bytearray. static PyObject * bytearray_append_impl(PyByteArrayObject *self, int item) -/*[clinic end generated code: output=a154e19ed1886cb6 input=ae56ea87380407cc]*/ +/*[clinic end generated code: output=a154e19ed1886cb6 input=20d6bec3d1340593]*/ { Py_ssize_t n = Py_SIZE(self); @@ -2423,7 +1592,6 @@ bytearray_append_impl(PyByteArrayObject *self, int item) /*[clinic input] bytearray.extend - self: self(type="PyByteArrayObject *") iterable_of_ints: object The iterable of items to append. / @@ -2433,7 +1601,7 @@ Append all the items from the iterator or sequence to the end of the bytearray. static PyObject * bytearray_extend(PyByteArrayObject *self, PyObject *iterable_of_ints) -/*[clinic end generated code: output=98155dbe249170b1 input=ce83a5d75b70d850]*/ +/*[clinic end generated code: output=98155dbe249170b1 input=c617b3a93249ba28]*/ { PyObject *it, *item, *bytearray_obj; Py_ssize_t buf_size = 0, len = 0; @@ -2508,7 +1676,6 @@ bytearray_extend(PyByteArrayObject *self, PyObject *iterable_of_ints) /*[clinic input] bytearray.pop - self: self(type="PyByteArrayObject *") index: Py_ssize_t = -1 The index from where to remove the item. -1 (the default value) means remove the last item. @@ -2521,7 +1688,7 @@ If no index argument is given, will pop the last item. 
static PyObject * bytearray_pop_impl(PyByteArrayObject *self, Py_ssize_t index) -/*[clinic end generated code: output=e0ccd401f8021da8 input=0797e6c0ca9d5a85]*/ +/*[clinic end generated code: output=e0ccd401f8021da8 input=3591df2d06c0d237]*/ { int value; Py_ssize_t n = Py_SIZE(self); @@ -2553,7 +1720,6 @@ bytearray_pop_impl(PyByteArrayObject *self, Py_ssize_t index) /*[clinic input] bytearray.remove - self: self(type="PyByteArrayObject *") value: bytesvalue The value to remove. / @@ -2563,20 +1729,20 @@ Remove the first occurrence of a value in the bytearray. static PyObject * bytearray_remove_impl(PyByteArrayObject *self, int value) -/*[clinic end generated code: output=d659e37866709c13 input=47560b11fd856c24]*/ +/*[clinic end generated code: output=d659e37866709c13 input=121831240cd51ddf]*/ { - Py_ssize_t n = Py_SIZE(self); + Py_ssize_t where, n = Py_SIZE(self); char *buf = PyByteArray_AS_STRING(self); - char *where = memchr(buf, value, n); - if (!where) { + where = stringlib_find_char(buf, n, value); + if (where < 0) { PyErr_SetString(PyExc_ValueError, "value not found in bytearray"); return NULL; } if (!_canresize(self)) return NULL; - memmove(where, where + 1, buf + n - where); + memmove(buf + where, buf + where + 1, n - where); if (PyByteArray_Resize((PyObject *)self, n - 1) < 0) return NULL; @@ -2586,8 +1752,8 @@ bytearray_remove_impl(PyByteArrayObject *self, int value) /* XXX These two helpers could be optimized if argsize == 1 */ static Py_ssize_t -lstrip_helper(char *myptr, Py_ssize_t mysize, - void *argptr, Py_ssize_t argsize) +lstrip_helper(const char *myptr, Py_ssize_t mysize, + const void *argptr, Py_ssize_t argsize) { Py_ssize_t i = 0; while (i < mysize && memchr(argptr, (unsigned char) myptr[i], argsize)) @@ -2596,8 +1762,8 @@ lstrip_helper(char *myptr, Py_ssize_t mysize, } static Py_ssize_t -rstrip_helper(char *myptr, Py_ssize_t mysize, - void *argptr, Py_ssize_t argsize) +rstrip_helper(const char *myptr, Py_ssize_t mysize, + const void *argptr, 
Py_ssize_t argsize) { Py_ssize_t i = mysize - 1; while (i >= 0 && memchr(argptr, (unsigned char) myptr[i], argsize)) @@ -2798,22 +1964,6 @@ bytearray_splitlines_impl(PyByteArrayObject *self, int keepends) ); } -static int -hex_digit_to_int(Py_UCS4 c) -{ - if (c >= 128) - return -1; - if (Py_ISDIGIT(c)) - return c - '0'; - else { - if (Py_ISUPPER(c)) - c = Py_TOLOWER(c); - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - } - return -1; -} - /*[clinic input] @classmethod bytearray.fromhex @@ -2832,48 +1982,7 @@ static PyObject * bytearray_fromhex_impl(PyObject*cls, PyObject *string) /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/ { - PyObject *newbytes; - char *buf; - Py_ssize_t hexlen, byteslen, i, j; - int top, bot; - void *data; - unsigned int kind; - - assert(PyUnicode_Check(string)); - if (PyUnicode_READY(string)) - return NULL; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); - hexlen = PyUnicode_GET_LENGTH(string); - - byteslen = hexlen/2; /* This overestimates if there are spaces */ - newbytes = PyByteArray_FromStringAndSize(NULL, byteslen); - if (!newbytes) - return NULL; - buf = PyByteArray_AS_STRING(newbytes); - for (i = j = 0; i < hexlen; i += 2) { - /* skip over spaces in the input */ - while (PyUnicode_READ(kind, data, i) == ' ') - i++; - if (i >= hexlen) - break; - top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); - bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); - if (top == -1 || bot == -1) { - PyErr_Format(PyExc_ValueError, - "non-hexadecimal number found in " - "fromhex() arg at position %zd", i); - goto error; - } - buf[j++] = (top << 4) + bot; - } - if (PyByteArray_Resize(newbytes, j) < 0) - goto error; - return newbytes; - - error: - Py_DECREF(newbytes); - return NULL; + return _PyBytes_FromHex(string, 1); } PyDoc_STRVAR(hex__doc__, @@ -2928,14 +2037,12 @@ _common_reduce(PyByteArrayObject *self, int proto) /*[clinic input] bytearray.__reduce__ as bytearray_reduce - self: 
self(type="PyByteArrayObject *") - Return state information for pickling. [clinic start generated code]*/ static PyObject * bytearray_reduce_impl(PyByteArrayObject *self) -/*[clinic end generated code: output=52bf304086464cab input=fbb07de4d102a03a]*/ +/*[clinic end generated code: output=52bf304086464cab input=44b5737ada62dd3f]*/ { return _common_reduce(self, 2); } @@ -2943,7 +2050,6 @@ bytearray_reduce_impl(PyByteArrayObject *self) /*[clinic input] bytearray.__reduce_ex__ as bytearray_reduce_ex - self: self(type="PyByteArrayObject *") proto: int = 0 / @@ -2952,7 +2058,7 @@ Return state information for pickling. static PyObject * bytearray_reduce_ex_impl(PyByteArrayObject *self, int proto) -/*[clinic end generated code: output=52eac33377197520 input=0e091a42ca6dbd91]*/ +/*[clinic end generated code: output=52eac33377197520 input=f129bc1a1aa151ee]*/ { return _common_reduce(self, proto); } @@ -2960,14 +2066,12 @@ bytearray_reduce_ex_impl(PyByteArrayObject *self, int proto) /*[clinic input] bytearray.__sizeof__ as bytearray_sizeof - self: self(type="PyByteArrayObject *") - Returns the size of the bytearray object in memory, in bytes. 
[clinic start generated code]*/ static PyObject * bytearray_sizeof_impl(PyByteArrayObject *self) -/*[clinic end generated code: output=738abdd17951c427 input=6b23d305362b462b]*/ +/*[clinic end generated code: output=738abdd17951c427 input=e27320fd98a4bc5a]*/ { Py_ssize_t res; @@ -3008,19 +2112,22 @@ bytearray_methods[] = { BYTEARRAY_APPEND_METHODDEF {"capitalize", (PyCFunction)stringlib_capitalize, METH_NOARGS, _Py_capitalize__doc__}, - {"center", (PyCFunction)stringlib_center, METH_VARARGS, center__doc__}, + {"center", (PyCFunction)stringlib_center, METH_VARARGS, _Py_center__doc__}, BYTEARRAY_CLEAR_METHODDEF BYTEARRAY_COPY_METHODDEF - {"count", (PyCFunction)bytearray_count, METH_VARARGS, count__doc__}, + {"count", (PyCFunction)bytearray_count, METH_VARARGS, + _Py_count__doc__}, BYTEARRAY_DECODE_METHODDEF - {"endswith", (PyCFunction)bytearray_endswith, METH_VARARGS, endswith__doc__}, + {"endswith", (PyCFunction)bytearray_endswith, METH_VARARGS, + _Py_endswith__doc__}, {"expandtabs", (PyCFunction)stringlib_expandtabs, METH_VARARGS | METH_KEYWORDS, - expandtabs__doc__}, + _Py_expandtabs__doc__}, BYTEARRAY_EXTEND_METHODDEF - {"find", (PyCFunction)bytearray_find, METH_VARARGS, find__doc__}, + {"find", (PyCFunction)bytearray_find, METH_VARARGS, + _Py_find__doc__}, BYTEARRAY_FROMHEX_METHODDEF {"hex", (PyCFunction)bytearray_hex, METH_NOARGS, hex__doc__}, - {"index", (PyCFunction)bytearray_index, METH_VARARGS, index__doc__}, + {"index", (PyCFunction)bytearray_index, METH_VARARGS, _Py_index__doc__}, BYTEARRAY_INSERT_METHODDEF {"isalnum", (PyCFunction)stringlib_isalnum, METH_NOARGS, _Py_isalnum__doc__}, @@ -3037,7 +2144,7 @@ bytearray_methods[] = { {"isupper", (PyCFunction)stringlib_isupper, METH_NOARGS, _Py_isupper__doc__}, BYTEARRAY_JOIN_METHODDEF - {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__}, + {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, _Py_ljust__doc__}, {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__}, 
BYTEARRAY_LSTRIP_METHODDEF BYTEARRAY_MAKETRANS_METHODDEF @@ -3046,23 +2153,23 @@ bytearray_methods[] = { BYTEARRAY_REMOVE_METHODDEF BYTEARRAY_REPLACE_METHODDEF BYTEARRAY_REVERSE_METHODDEF - {"rfind", (PyCFunction)bytearray_rfind, METH_VARARGS, rfind__doc__}, - {"rindex", (PyCFunction)bytearray_rindex, METH_VARARGS, rindex__doc__}, - {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__}, + {"rfind", (PyCFunction)bytearray_rfind, METH_VARARGS, _Py_rfind__doc__}, + {"rindex", (PyCFunction)bytearray_rindex, METH_VARARGS, _Py_rindex__doc__}, + {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, _Py_rjust__doc__}, BYTEARRAY_RPARTITION_METHODDEF BYTEARRAY_RSPLIT_METHODDEF BYTEARRAY_RSTRIP_METHODDEF BYTEARRAY_SPLIT_METHODDEF BYTEARRAY_SPLITLINES_METHODDEF {"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS , - startswith__doc__}, + _Py_startswith__doc__}, BYTEARRAY_STRIP_METHODDEF {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, BYTEARRAY_TRANSLATE_METHODDEF {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__}, - {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__}, + {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, _Py_zfill__doc__}, {NULL} }; @@ -3071,7 +2178,7 @@ bytearray_mod(PyObject *v, PyObject *w) { if (!PyByteArray_Check(v)) Py_RETURN_NOTIMPLEMENTED; - return bytearray_format((PyByteArrayObject *)v, w); + return _PyBytes_FormatEx(PyByteArray_AS_STRING(v), PyByteArray_GET_SIZE(v), w, 1); } static PyNumberMethods bytearray_as_number = { diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index a299915..fe666c6 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -277,7 +277,7 @@ Return a titlecased version of B, i.e. 
ASCII words start with uppercase\n\ characters, all remaining cased characters have lowercase."); void -_Py_bytes_title(char *result, char *s, Py_ssize_t len) +_Py_bytes_title(char *result, const char *s, Py_ssize_t len) { Py_ssize_t i; int previous_is_cased = 0; @@ -306,7 +306,7 @@ Return a copy of B with only its first character capitalized (ASCII)\n\ and the rest lower-cased."); void -_Py_bytes_capitalize(char *result, char *s, Py_ssize_t len) +_Py_bytes_capitalize(char *result, const char *s, Py_ssize_t len) { Py_ssize_t i; @@ -336,7 +336,7 @@ Return a copy of B with uppercase ASCII characters converted\n\ to lowercase ASCII and vice versa."); void -_Py_bytes_swapcase(char *result, char *s, Py_ssize_t len) +_Py_bytes_swapcase(char *result, const char *s, Py_ssize_t len) { Py_ssize_t i; @@ -387,3 +387,427 @@ _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to) return res; } + +#define FASTSEARCH fastsearch +#define STRINGLIB(F) stringlib_##F +#define STRINGLIB_CHAR char +#define STRINGLIB_SIZEOF_CHAR 1 + +#include "stringlib/fastsearch.h" +#include "stringlib/count.h" +#include "stringlib/find.h" + +/* +Wraps stringlib_parse_args_finds() and additionally checks whether the +first argument is an integer in range(0, 256). + +If this is the case, writes the integer value to the byte parameter +and sets subobj to NULL. Otherwise, sets the first argument to subobj +and doesn't touch byte. The other parameters are similar to those of +stringlib_parse_args_finds(). 
+*/ + +Py_LOCAL_INLINE(int) +parse_args_finds_byte(const char *function_name, PyObject *args, + PyObject **subobj, char *byte, + Py_ssize_t *start, Py_ssize_t *end) +{ + PyObject *tmp_subobj; + Py_ssize_t ival; + PyObject *err; + + if(!stringlib_parse_args_finds(function_name, args, &tmp_subobj, + start, end)) + return 0; + + if (!PyNumber_Check(tmp_subobj)) { + *subobj = tmp_subobj; + return 1; + } + + ival = PyNumber_AsSsize_t(tmp_subobj, PyExc_OverflowError); + if (ival == -1) { + err = PyErr_Occurred(); + if (err && !PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) { + PyErr_Clear(); + *subobj = tmp_subobj; + return 1; + } + } + + if (ival < 0 || ival > 255) { + PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)"); + return 0; + } + + *subobj = NULL; + *byte = (char)ival; + return 1; +} + +/* helper macro to fixup start/end slice values */ +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } + +Py_LOCAL_INLINE(Py_ssize_t) +find_internal(const char *str, Py_ssize_t len, + const char *function_name, PyObject *args, int dir) +{ + PyObject *subobj; + char byte; + Py_buffer subbuf; + const char *sub; + Py_ssize_t sub_len; + Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; + Py_ssize_t res; + + if (!parse_args_finds_byte(function_name, args, + &subobj, &byte, &start, &end)) + return -2; + + if (subobj) { + if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0) + return -2; + + sub = subbuf.buf; + sub_len = subbuf.len; + } + else { + sub = &byte; + sub_len = 1; + } + + ADJUST_INDICES(start, end, len); + if (end - start < sub_len) + res = -1; + else if (sub_len == 1) { + if (dir > 0) + res = stringlib_find_char( + str + start, end - start, + *sub); + else + res = stringlib_rfind_char( + str + start, end - start, + *sub); + if (res >= 0) + res += start; + } + else { + if (dir 
> 0) + res = stringlib_find_slice( + str, len, + sub, sub_len, start, end); + else + res = stringlib_rfind_slice( + str, len, + sub, sub_len, start, end); + } + + if (subobj) + PyBuffer_Release(&subbuf); + + return res; +} + +PyDoc_STRVAR_shared(_Py_find__doc__, +"B.find(sub[, start[, end]]) -> int\n\ +\n\ +Return the lowest index in B where subsection sub is found,\n\ +such that sub is contained within B[start,end]. Optional\n\ +arguments start and end are interpreted as in slice notation.\n\ +\n\ +Return -1 on failure."); + +PyObject * +_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *args) +{ + Py_ssize_t result = find_internal(str, len, "find", args, +1); + if (result == -2) + return NULL; + return PyLong_FromSsize_t(result); +} + +PyDoc_STRVAR_shared(_Py_index__doc__, +"B.index(sub[, start[, end]]) -> int\n\ +\n\ +Like B.find() but raise ValueError when the subsection is not found."); + +PyObject * +_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *args) +{ + Py_ssize_t result = find_internal(str, len, "index", args, +1); + if (result == -2) + return NULL; + if (result == -1) { + PyErr_SetString(PyExc_ValueError, + "subsection not found"); + return NULL; + } + return PyLong_FromSsize_t(result); +} + +PyDoc_STRVAR_shared(_Py_rfind__doc__, +"B.rfind(sub[, start[, end]]) -> int\n\ +\n\ +Return the highest index in B where subsection sub is found,\n\ +such that sub is contained within B[start,end]. 
Optional\n\ +arguments start and end are interpreted as in slice notation.\n\ +\n\ +Return -1 on failure."); + +PyObject * +_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *args) +{ + Py_ssize_t result = find_internal(str, len, "rfind", args, -1); + if (result == -2) + return NULL; + return PyLong_FromSsize_t(result); +} + +PyDoc_STRVAR_shared(_Py_rindex__doc__, +"B.rindex(sub[, start[, end]]) -> int\n\ +\n\ +Like B.rfind() but raise ValueError when the subsection is not found."); + +PyObject * +_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *args) +{ + Py_ssize_t result = find_internal(str, len, "rindex", args, -1); + if (result == -2) + return NULL; + if (result == -1) { + PyErr_SetString(PyExc_ValueError, + "subsection not found"); + return NULL; + } + return PyLong_FromSsize_t(result); +} + +PyDoc_STRVAR_shared(_Py_count__doc__, +"B.count(sub[, start[, end]]) -> int\n\ +\n\ +Return the number of non-overlapping occurrences of subsection sub in\n\ +bytes B[start:end]. 
Optional arguments start and end are interpreted\n\ +as in slice notation."); + +PyObject * +_Py_bytes_count(const char *str, Py_ssize_t len, PyObject *args) +{ + PyObject *sub_obj; + const char *sub; + Py_ssize_t sub_len; + char byte; + Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; + + Py_buffer vsub; + PyObject *count_obj; + + if (!parse_args_finds_byte("count", args, + &sub_obj, &byte, &start, &end)) + return NULL; + + if (sub_obj) { + if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0) + return NULL; + + sub = vsub.buf; + sub_len = vsub.len; + } + else { + sub = &byte; + sub_len = 1; + } + + ADJUST_INDICES(start, end, len); + + count_obj = PyLong_FromSsize_t( + stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX) + ); + + if (sub_obj) + PyBuffer_Release(&vsub); + + return count_obj; +} + +int +_Py_bytes_contains(const char *str, Py_ssize_t len, PyObject *arg) +{ + Py_ssize_t ival = PyNumber_AsSsize_t(arg, PyExc_ValueError); + if (ival == -1 && PyErr_Occurred()) { + Py_buffer varg; + Py_ssize_t pos; + PyErr_Clear(); + if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0) + return -1; + pos = stringlib_find(str, len, + varg.buf, varg.len, 0); + PyBuffer_Release(&varg); + return pos >= 0; + } + if (ival < 0 || ival >= 256) { + PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)"); + return -1; + } + + return memchr(str, (int) ival, len) != NULL; +} + + +/* Matches the end (direction >= 0) or start (direction < 0) of the buffer + * against substr, using the start and end arguments. Returns + * -1 on error, 0 if not found and 1 if found. 
+ */ +Py_LOCAL(int) +tailmatch(const char *str, Py_ssize_t len, PyObject *substr, + Py_ssize_t start, Py_ssize_t end, int direction) +{ + Py_buffer sub_view = {NULL, NULL}; + const char *sub; + Py_ssize_t slen; + + if (PyBytes_Check(substr)) { + sub = PyBytes_AS_STRING(substr); + slen = PyBytes_GET_SIZE(substr); + } + else { + if (PyObject_GetBuffer(substr, &sub_view, PyBUF_SIMPLE) != 0) + return -1; + sub = sub_view.buf; + slen = sub_view.len; + } + + ADJUST_INDICES(start, end, len); + + if (direction < 0) { + /* startswith */ + if (start + slen > len) + goto notfound; + } else { + /* endswith */ + if (end - start < slen || start > len) + goto notfound; + + if (end - slen > start) + start = end - slen; + } + if (end - start < slen) + goto notfound; + if (memcmp(str + start, sub, slen) != 0) + goto notfound; + + PyBuffer_Release(&sub_view); + return 1; + +notfound: + PyBuffer_Release(&sub_view); + return 0; +} + +Py_LOCAL(PyObject *) +_Py_bytes_tailmatch(const char *str, Py_ssize_t len, + const char *function_name, PyObject *args, + int direction) +{ + Py_ssize_t start = 0; + Py_ssize_t end = PY_SSIZE_T_MAX; + PyObject *subobj; + int result; + + if (!stringlib_parse_args_finds(function_name, args, &subobj, &start, &end)) + return NULL; + if (PyTuple_Check(subobj)) { + Py_ssize_t i; + for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + result = tailmatch(str, len, PyTuple_GET_ITEM(subobj, i), + start, end, direction); + if (result == -1) + return NULL; + else if (result) { + Py_RETURN_TRUE; + } + } + Py_RETURN_FALSE; + } + result = tailmatch(str, len, subobj, start, end, direction); + if (result == -1) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + PyErr_Format(PyExc_TypeError, + "%s first arg must be bytes or a tuple of bytes, " + "not %s", + function_name, Py_TYPE(subobj)->tp_name); + return NULL; + } + else + return PyBool_FromLong(result); +} + +PyDoc_STRVAR_shared(_Py_startswith__doc__, +"B.startswith(prefix[, start[, end]]) -> bool\n\ +\n\ +Return True if B 
starts with the specified prefix, False otherwise.\n\ +With optional start, test B beginning at that position.\n\ +With optional end, stop comparing B at that position.\n\ +prefix can also be a tuple of bytes to try."); + +PyObject * +_Py_bytes_startswith(const char *str, Py_ssize_t len, PyObject *args) +{ + return _Py_bytes_tailmatch(str, len, "startswith", args, -1); +} + +PyDoc_STRVAR_shared(_Py_endswith__doc__, +"B.endswith(suffix[, start[, end]]) -> bool\n\ +\n\ +Return True if B ends with the specified suffix, False otherwise.\n\ +With optional start, test B beginning at that position.\n\ +With optional end, stop comparing B at that position.\n\ +suffix can also be a tuple of bytes to try."); + +PyObject * +_Py_bytes_endswith(const char *str, Py_ssize_t len, PyObject *args) +{ + return _Py_bytes_tailmatch(str, len, "endswith", args, +1); +} + +PyDoc_STRVAR_shared(_Py_expandtabs__doc__, +"B.expandtabs(tabsize=8) -> copy of B\n\ +\n\ +Return a copy of B where all tab characters are expanded using spaces.\n\ +If tabsize is not given, a tab size of 8 characters is assumed."); + +PyDoc_STRVAR_shared(_Py_ljust__doc__, +"B.ljust(width[, fillchar]) -> copy of B\n" +"\n" +"Return B left justified in a string of length width. Padding is\n" +"done using the specified fill character (default is a space)."); + +PyDoc_STRVAR_shared(_Py_rjust__doc__, +"B.rjust(width[, fillchar]) -> copy of B\n" +"\n" +"Return B right justified in a string of length width. Padding is\n" +"done using the specified fill character (default is a space)"); + +PyDoc_STRVAR_shared(_Py_center__doc__, +"B.center(width[, fillchar]) -> copy of B\n" +"\n" +"Return B centered in a string of length width. Padding is\n" +"done using the specified fill character (default is a space)."); + +PyDoc_STRVAR_shared(_Py_zfill__doc__, +"B.zfill(width) -> copy of B\n" +"\n" +"Pad a numeric string B with zeros on the left, to fill a field\n" +"of the specified width. 
B is never truncated."); + diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 495c3eb..aeddf53 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -9,9 +9,9 @@ #include <stddef.h> /*[clinic input] -class bytes "PyBytesObject*" "&PyBytes_Type" +class bytes "PyBytesObject *" "&PyBytes_Type" [clinic start generated code]*/ -/*[clinic end generated code: output=da39a3ee5e6b4b0d input=1a1d9102afc1b00c]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=7a238f965d64892b]*/ #include "clinic/bytesobject.c.h" @@ -30,6 +30,10 @@ static PyBytesObject *nullstring; */ #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1) +/* Forward declaration */ +Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer, + char *str); + /* For PyBytes_FromString(), the parameter `str' points to a null-terminated string containing exactly `size' bytes. @@ -174,190 +178,184 @@ PyBytes_FromString(const char *str) PyObject * PyBytes_FromFormatV(const char *format, va_list vargs) { - va_list count; - Py_ssize_t n = 0; - const char* f; char *s; - PyObject* string; + const char *f; + const char *p; + Py_ssize_t prec; + int longflag; + int size_tflag; + /* Longest 64-bit formatted numbers: + - "18446744073709551615\0" (21 bytes) + - "-9223372036854775808\0" (21 bytes) + Decimal takes the most space (it isn't enough for octal.) + + Longest 64-bit pointer representation: + "0xffffffffffffffff\0" (19 bytes). 
*/ + char buffer[21]; + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + + s = _PyBytesWriter_Alloc(&writer, strlen(format)); + if (s == NULL) + return NULL; + writer.overallocate = 1; + +#define WRITE_BYTES(str) \ + do { \ + s = _PyBytesWriter_WriteBytes(&writer, s, (str), strlen(str)); \ + if (s == NULL) \ + goto error; \ + } while (0) - Py_VA_COPY(count, vargs); - /* step 1: figure out how large a buffer we need */ for (f = format; *f; f++) { - if (*f == '%') { - const char* p = f; - while (*++f && *f != '%' && !Py_ISALPHA(*f)) - ; - - /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since - * they don't affect the amount of space we reserve. - */ - if ((*f == 'l' || *f == 'z') && - (f[1] == 'd' || f[1] == 'u')) - ++f; - - switch (*f) { - case 'c': - { - int c = va_arg(count, int); - if (c < 0 || c > 255) { - PyErr_SetString(PyExc_OverflowError, - "PyBytes_FromFormatV(): %c format " - "expects an integer in range [0; 255]"); - return NULL; - } - n++; - break; + if (*f != '%') { + *s++ = *f; + continue; + } + + p = f++; + + /* ignore the width (ex: 10 in "%10s") */ + while (Py_ISDIGIT(*f)) + f++; + + /* parse the precision (ex: 10 in "%.10s") */ + prec = 0; + if (*f == '.') { + f++; + for (; Py_ISDIGIT(*f); f++) { + prec = (prec * 10) + (*f - '0'); } - case '%': - n++; - break; - case 'd': case 'u': case 'i': case 'x': - (void) va_arg(count, int); - /* 20 bytes is enough to hold a 64-bit - integer. Decimal takes the most space. - This isn't enough for octal. */ - n += 20; - break; - case 's': - s = va_arg(count, char*); - n += strlen(s); - break; - case 'p': - (void) va_arg(count, int); - /* maximum 64-bit pointer representation: - * 0xffffffffffffffff - * so 19 characters is enough. - * XXX I count 18 -- what's the extra for? - */ - n += 19; - break; - default: - /* if we stumble upon an unknown - formatting code, copy the rest of - the format string to the output - string. 
(we cannot just skip the - code, since there's no way to know - what's in the argument list) */ - n += strlen(p); - goto expand; + } + + while (*f && *f != '%' && !Py_ISALPHA(*f)) + f++; + + /* handle the long flag ('l'), but only for %ld and %lu. + others can be added when necessary. */ + longflag = 0; + if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { + longflag = 1; + ++f; + } + + /* handle the size_t flag ('z'). */ + size_tflag = 0; + if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { + size_tflag = 1; + ++f; + } + + /* substract bytes preallocated for the format string + (ex: 2 for "%s") */ + writer.min_size -= (f - p + 1); + + switch (*f) { + case 'c': + { + int c = va_arg(vargs, int); + if (c < 0 || c > 255) { + PyErr_SetString(PyExc_OverflowError, + "PyBytes_FromFormatV(): %c format " + "expects an integer in range [0; 255]"); + goto error; } - } else - n++; - } - expand: - /* step 2: fill the buffer */ - /* Since we've analyzed how much space we need for the worst case, - use sprintf directly instead of the slower PyOS_snprintf. 
*/ - string = PyBytes_FromStringAndSize(NULL, n); - if (!string) - return NULL; + writer.min_size++; + *s++ = (unsigned char)c; + break; + } - s = PyBytes_AsString(string); + case 'd': + if (longflag) + sprintf(buffer, "%ld", va_arg(vargs, long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "d", + va_arg(vargs, Py_ssize_t)); + else + sprintf(buffer, "%d", va_arg(vargs, int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; - for (f = format; *f; f++) { - if (*f == '%') { - const char* p = f++; + case 'u': + if (longflag) + sprintf(buffer, "%lu", + va_arg(vargs, unsigned long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", + va_arg(vargs, size_t)); + else + sprintf(buffer, "%u", + va_arg(vargs, unsigned int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; + + case 'i': + sprintf(buffer, "%i", va_arg(vargs, int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; + + case 'x': + sprintf(buffer, "%x", va_arg(vargs, int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; + + case 's': + { Py_ssize_t i; - int longflag = 0; - int size_tflag = 0; - /* parse the width.precision part (we're only - interested in the precision value, if any) */ - n = 0; - while (Py_ISDIGIT(*f)) - n = (n*10) + *f++ - '0'; - if (*f == '.') { - f++; - n = 0; - while (Py_ISDIGIT(*f)) - n = (n*10) + *f++ - '0'; - } - while (*f && *f != '%' && !Py_ISALPHA(*f)) - f++; - /* handle the long flag, but only for %ld and %lu. - others can be added when necessary. */ - if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { - longflag = 1; - ++f; - } - /* handle the size_t flag. 
*/ - if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { - size_tflag = 1; - ++f; - } - switch (*f) { - case 'c': - { - int c = va_arg(vargs, int); - /* c has been checked for overflow in the first step */ - *s++ = (unsigned char)c; - break; + p = va_arg(vargs, const char*); + i = strlen(p); + if (prec > 0 && i > prec) + i = prec; + s = _PyBytesWriter_WriteBytes(&writer, s, p, i); + if (s == NULL) + goto error; + break; + } + + case 'p': + sprintf(buffer, "%p", va_arg(vargs, void*)); + assert(strlen(buffer) < sizeof(buffer)); + /* %p is ill-defined: ensure leading 0x. */ + if (buffer[1] == 'X') + buffer[1] = 'x'; + else if (buffer[1] != 'x') { + memmove(buffer+2, buffer, strlen(buffer)+1); + buffer[0] = '0'; + buffer[1] = 'x'; } - case 'd': - if (longflag) - sprintf(s, "%ld", va_arg(vargs, long)); - else if (size_tflag) - sprintf(s, "%" PY_FORMAT_SIZE_T "d", - va_arg(vargs, Py_ssize_t)); - else - sprintf(s, "%d", va_arg(vargs, int)); - s += strlen(s); - break; - case 'u': - if (longflag) - sprintf(s, "%lu", - va_arg(vargs, unsigned long)); - else if (size_tflag) - sprintf(s, "%" PY_FORMAT_SIZE_T "u", - va_arg(vargs, size_t)); - else - sprintf(s, "%u", - va_arg(vargs, unsigned int)); - s += strlen(s); - break; - case 'i': - sprintf(s, "%i", va_arg(vargs, int)); - s += strlen(s); - break; - case 'x': - sprintf(s, "%x", va_arg(vargs, int)); - s += strlen(s); - break; - case 's': - p = va_arg(vargs, char*); - i = strlen(p); - if (n > 0 && i > n) - i = n; - Py_MEMCPY(s, p, i); - s += i; - break; - case 'p': - sprintf(s, "%p", va_arg(vargs, void*)); - /* %p is ill-defined: ensure leading 0x. 
*/ - if (s[1] == 'X') - s[1] = 'x'; - else if (s[1] != 'x') { - memmove(s+2, s, strlen(s)+1); - s[0] = '0'; - s[1] = 'x'; - } - s += strlen(s); - break; - case '%': - *s++ = '%'; - break; - default: - strcpy(s, p); - s += strlen(s); - goto end; + WRITE_BYTES(buffer); + break; + + case '%': + writer.min_size++; + *s++ = '%'; + break; + + default: + if (*f == 0) { + /* fix min_size if we reached the end of the format string */ + writer.min_size++; } - } else - *s++ = *f; + + /* invalid format string: copy unformatted string and exit */ + WRITE_BYTES(p); + return _PyBytesWriter_Finish(&writer, s); + } } - end: - _PyBytes_Resize(&string, s - PyBytes_AS_STRING(string)); - return string; +#undef WRITE_BYTES + + return _PyBytesWriter_Finish(&writer, s); + + error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -409,12 +407,14 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) /* Returns a new reference to a PyBytes object, or NULL on failure. */ -static PyObject * -formatfloat(PyObject *v, int flags, int prec, int type) +static char* +formatfloat(PyObject *v, int flags, int prec, int type, + PyObject **p_result, _PyBytesWriter *writer, char *str) { char *p; PyObject *result; double x; + size_t len; x = PyFloat_AsDouble(v); if (x == -1.0 && PyErr_Occurred()) { @@ -431,9 +431,22 @@ formatfloat(PyObject *v, int flags, int prec, int type) if (p == NULL) return NULL; - result = PyBytes_FromStringAndSize(p, strlen(p)); + + len = strlen(p); + if (writer != NULL) { + str = _PyBytesWriter_Prepare(writer, str, len); + if (str == NULL) + return NULL; + Py_MEMCPY(str, p, len); + PyMem_Free(p); + str += len; + return str; + } + + result = PyBytes_FromStringAndSize(p, len); PyMem_Free(p); - return result; + *p_result = result; + return str; } static PyObject * @@ -473,11 +486,11 @@ formatlong(PyObject *v, int flags, int prec, int type) static int byte_converter(PyObject *arg, char *p) { - if (PyBytes_Check(arg) && PyBytes_Size(arg) == 1) { + if 
(PyBytes_Check(arg) && PyBytes_GET_SIZE(arg) == 1) { *p = PyBytes_AS_STRING(arg)[0]; return 1; } - else if (PyByteArray_Check(arg) && PyByteArray_Size(arg) == 1) { + else if (PyByteArray_Check(arg) && PyByteArray_GET_SIZE(arg) == 1) { *p = PyByteArray_AS_STRING(arg)[0]; return 1; } @@ -557,36 +570,36 @@ format_obj(PyObject *v, const char **pbuf, Py_ssize_t *plen) return NULL; } -/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) - - FORMATBUFLEN is the length of the buffer in which the ints & - chars are formatted. XXX This is a magic number. Each formatting - routine does bounds checking to ensure no overflow, but a better - solution may be to malloc a buffer of appropriate size for each - format. For now, the current solution is sufficient. -*/ -#define FORMATBUFLEN (size_t)120 +/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) */ PyObject * -_PyBytes_Format(PyObject *format, PyObject *args) +_PyBytes_FormatEx(const char *format, Py_ssize_t format_len, + PyObject *args, int use_bytearray) { - char *fmt, *res; + const char *fmt; + char *res; Py_ssize_t arglen, argidx; - Py_ssize_t reslen, rescnt, fmtcnt; + Py_ssize_t fmtcnt; int args_owned = 0; - PyObject *result; PyObject *dict = NULL; - if (format == NULL || !PyBytes_Check(format) || args == NULL) { + _PyBytesWriter writer; + + if (args == NULL) { PyErr_BadInternalCall(); return NULL; } - fmt = PyBytes_AS_STRING(format); - fmtcnt = PyBytes_GET_SIZE(format); - reslen = rescnt = fmtcnt + 100; - result = PyBytes_FromStringAndSize((char *)NULL, reslen); - if (result == NULL) + fmt = format; + fmtcnt = format_len; + + _PyBytesWriter_Init(&writer); + writer.use_bytearray = use_bytearray; + + res = _PyBytesWriter_Alloc(&writer, fmtcnt); + if (res == NULL) return NULL; - res = PyBytes_AsString(result); + if (!use_bytearray) + writer.overallocate = 1; + if (PyTuple_Check(args)) { arglen = PyTuple_GET_SIZE(args); argidx = 0; @@ -600,18 +613,23 @@ _PyBytes_Format(PyObject *format, 
PyObject *args) !PyByteArray_Check(args)) { dict = args; } + while (--fmtcnt >= 0) { if (*fmt != '%') { - if (--rescnt < 0) { - rescnt = fmtcnt + 100; - reslen += rescnt; - if (_PyBytes_Resize(&result, reslen)) - return NULL; - res = PyBytes_AS_STRING(result) - + reslen - rescnt; - --rescnt; - } - *res++ = *fmt++; + Py_ssize_t len; + char *pos; + + pos = strchr(fmt + 1, '%'); + if (pos != NULL) + len = pos - fmt; + else + len = format_len - (fmt - format); + assert(len != 0); + + Py_MEMCPY(res, fmt, len); + res += len; + fmt += len; + fmtcnt -= (len - 1); } else { /* Got a format specifier */ @@ -626,10 +644,14 @@ _PyBytes_Format(PyObject *format, PyObject *args) int sign; Py_ssize_t len = 0; char onechar; /* For byte_converter() */ + Py_ssize_t alloc; +#ifdef Py_DEBUG + char *before; +#endif fmt++; if (*fmt == '(') { - char *keystart; + const char *keystart; Py_ssize_t keylen; PyObject *key; int pcount = 1; @@ -673,6 +695,8 @@ _PyBytes_Format(PyObject *format, PyObject *args) arglen = -1; argidx = -2; } + + /* Parse flags. Example: "%+i" => flags=F_SIGN. */ while (--fmtcnt >= 0) { switch (c = *fmt++) { case '-': flags |= F_LJUST; continue; @@ -683,6 +707,8 @@ _PyBytes_Format(PyObject *format, PyObject *args) } break; } + + /* Parse width. Example: "%10s" => width=10 */ if (c == '*') { v = getnextarg(args, arglen, &argidx); if (v == NULL) @@ -717,6 +743,8 @@ _PyBytes_Format(PyObject *format, PyObject *args) width = width*10 + (c - '0'); } } + + /* Parse precision. 
Example: "%.3f" => prec=3 */ if (c == '.') { prec = 0; if (--fmtcnt >= 0) @@ -771,13 +799,19 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (v == NULL) goto error; } + + if (fmtcnt < 0) { + /* last writer: disable writer overallocation */ + writer.overallocate = 0; + } + sign = 0; fill = ' '; switch (c) { case '%': - pbuf = "%"; - len = 1; - break; + *res++ = '%'; + continue; + case 'r': // %r is only for 2/3 code; 3 only code should use %a case 'a': @@ -790,6 +824,7 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (prec >= 0 && len > prec) len = prec; break; + case 's': // %s is only for 2/3 code; 3 only code should use %b case 'b': @@ -799,12 +834,49 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (prec >= 0 && len > prec) len = prec; break; + case 'i': case 'd': case 'u': case 'o': case 'x': case 'X': + if (PyLong_CheckExact(v) + && width == -1 && prec == -1 + && !(flags & (F_SIGN | F_BLANK)) + && c != 'X') + { + /* Fast path */ + int alternate = flags & F_ALT; + int base; + + switch(c) + { + default: + assert(0 && "'type' not in [diuoxX]"); + case 'd': + case 'i': + case 'u': + base = 10; + break; + case 'o': + base = 8; + break; + case 'x': + case 'X': + base = 16; + break; + } + + /* Fast path */ + writer.min_size -= 2; /* size preallocated for "%d" */ + res = _PyLong_FormatBytesWriter(&writer, res, + v, base, alternate); + if (res == NULL) + goto error; + continue; + } + temp = formatlong(v, flags, prec, c); if (!temp) goto error; @@ -815,14 +887,25 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (flags & F_ZERO) fill = '0'; break; + case 'e': case 'E': case 'f': case 'F': case 'g': case 'G': - temp = formatfloat(v, flags, prec, c); - if (temp == NULL) + if (width == -1 && prec == -1 + && !(flags & (F_SIGN | F_BLANK))) + { + /* Fast path */ + writer.min_size -= 2; /* size preallocated for "%f" */ + res = formatfloat(v, flags, prec, c, NULL, &writer, res); + if (res == NULL) + goto error; + continue; + } + + if (!formatfloat(v, 
flags, prec, c, &temp, NULL, res)) goto error; pbuf = PyBytes_AS_STRING(temp); len = PyBytes_GET_SIZE(temp); @@ -830,21 +913,28 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (flags & F_ZERO) fill = '0'; break; + case 'c': pbuf = &onechar; len = byte_converter(v, &onechar); if (!len) goto error; + if (width == -1) { + /* Fast path */ + *res++ = onechar; + continue; + } break; + default: PyErr_Format(PyExc_ValueError, "unsupported format character '%c' (0x%x) " "at index %zd", c, c, - (Py_ssize_t)(fmt - 1 - - PyBytes_AsString(format))); + (Py_ssize_t)(fmt - 1 - format)); goto error; } + if (sign) { if (*pbuf == '-' || *pbuf == '+') { sign = *pbuf++; @@ -859,29 +949,31 @@ _PyBytes_Format(PyObject *format, PyObject *args) } if (width < len) width = len; - if (rescnt - (sign != 0) < width) { - reslen -= rescnt; - rescnt = width + fmtcnt + 100; - reslen += rescnt; - if (reslen < 0) { - Py_DECREF(result); - Py_XDECREF(temp); - return PyErr_NoMemory(); - } - if (_PyBytes_Resize(&result, reslen)) { - Py_XDECREF(temp); - return NULL; - } - res = PyBytes_AS_STRING(result) - + reslen - rescnt; + + alloc = width; + if (sign != 0 && len == width) + alloc++; + /* 2: size preallocated for %s */ + if (alloc > 2) { + res = _PyBytesWriter_Prepare(&writer, res, alloc - 2); + if (res == NULL) + goto error; } +#ifdef Py_DEBUG + before = res; +#endif + + /* Write the sign if needed */ if (sign) { if (fill != ' ') *res++ = sign; - rescnt--; if (width > len) width--; } + + /* Write the numeric prefix for "x", "X" and "o" formats + if the alternate form is used. + For example, write "0x" for the "%#x" format. 
*/ if ((flags & F_ALT) && (c == 'x' || c == 'X')) { assert(pbuf[0] == '0'); assert(pbuf[1] == c); @@ -889,18 +981,21 @@ _PyBytes_Format(PyObject *format, PyObject *args) *res++ = *pbuf++; *res++ = *pbuf++; } - rescnt -= 2; width -= 2; if (width < 0) width = 0; len -= 2; } + + /* Pad left with the fill character if needed */ if (width > len && !(flags & F_LJUST)) { - do { - --rescnt; - *res++ = fill; - } while (--width > len); + memset(res, fill, width - len); + res += (width - len); + width = len; } + + /* If padding with spaces: write sign if needed and/or numeric + prefix if the alternate form is used */ if (fill == ' ') { if (sign) *res++ = sign; @@ -912,13 +1007,17 @@ _PyBytes_Format(PyObject *format, PyObject *args) *res++ = *pbuf++; } } + + /* Copy bytes */ Py_MEMCPY(res, pbuf, len); res += len; - rescnt -= len; - while (--width >= len) { - --rescnt; - *res++ = ' '; + + /* Pad right with the fill character if needed */ + if (width > len) { + memset(res, ' ', width - len); + res += (width - len); } + if (dict && (argidx < arglen) && c != '%') { PyErr_SetString(PyExc_TypeError, "not all arguments converted during bytes formatting"); @@ -926,22 +1025,31 @@ _PyBytes_Format(PyObject *format, PyObject *args) goto error; } Py_XDECREF(temp); + +#ifdef Py_DEBUG + /* check that we computed the exact size for this write */ + assert((res - before) == alloc); +#endif } /* '%' */ + + /* If overallocation was disabled, ensure that it was the last + write. 
Otherwise, we missed an optimization */ + assert(writer.overallocate || fmtcnt < 0 || use_bytearray); } /* until end */ + if (argidx < arglen && !dict) { PyErr_SetString(PyExc_TypeError, "not all arguments converted during bytes formatting"); goto error; } + if (args_owned) { Py_DECREF(args); } - if (_PyBytes_Resize(&result, reslen - rescnt)) - return NULL; - return result; + return _PyBytesWriter_Finish(&writer, res); error: - Py_DECREF(result); + _PyBytesWriter_Dealloc(&writer); if (args_owned) { Py_DECREF(args); } @@ -961,6 +1069,42 @@ bytes_dealloc(PyObject *op) the string is UTF-8 encoded and should be re-encoded in the specified encoding. */ +static char * +_PyBytes_DecodeEscapeRecode(const char **s, const char *end, + const char *errors, const char *recode_encoding, + _PyBytesWriter *writer, char *p) +{ + PyObject *u, *w; + const char* t; + + t = *s; + /* Decode non-ASCII bytes as UTF-8. */ + while (t < end && (*t & 0x80)) + t++; + u = PyUnicode_DecodeUTF8(*s, t - *s, errors); + if (u == NULL) + return NULL; + + /* Recode them in target encoding. */ + w = PyUnicode_AsEncodedString(u, recode_encoding, errors); + Py_DECREF(u); + if (w == NULL) + return NULL; + assert(PyBytes_Check(w)); + + /* Append bytes to output buffer. */ + writer->min_size--; /* substract 1 preallocated byte */ + p = _PyBytesWriter_WriteBytes(writer, p, + PyBytes_AS_STRING(w), + PyBytes_GET_SIZE(w)); + Py_DECREF(w); + if (p == NULL) + return NULL; + + *s = t; + return p; +} + PyObject *PyBytes_DecodeEscape(const char *s, Py_ssize_t len, const char *errors, @@ -968,54 +1112,42 @@ PyObject *PyBytes_DecodeEscape(const char *s, const char *recode_encoding) { int c; - char *p, *buf; + char *p; const char *end; - PyObject *v; - Py_ssize_t newlen = recode_encoding ? 
4*len:len; - v = PyBytes_FromStringAndSize((char *)NULL, newlen); - if (v == NULL) + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) return NULL; - p = buf = PyBytes_AsString(v); + writer.overallocate = 1; + end = s + len; while (s < end) { if (*s != '\\') { non_esc: - if (recode_encoding && (*s & 0x80)) { - PyObject *u, *w; - char *r; - const char* t; - Py_ssize_t rn; - t = s; - /* Decode non-ASCII bytes as UTF-8. */ - while (t < end && (*t & 0x80)) t++; - u = PyUnicode_DecodeUTF8(s, t - s, errors); - if(!u) goto failed; - - /* Recode them in target encoding. */ - w = PyUnicode_AsEncodedString( - u, recode_encoding, errors); - Py_DECREF(u); - if (!w) goto failed; - - /* Append bytes to output buffer. */ - assert(PyBytes_Check(w)); - r = PyBytes_AS_STRING(w); - rn = PyBytes_GET_SIZE(w); - Py_MEMCPY(p, r, rn); - p += rn; - Py_DECREF(w); - s = t; - } else { + if (!(recode_encoding && (*s & 0x80))) { *p++ = *s++; } + else { + /* non-ASCII character and need to recode */ + p = _PyBytes_DecodeEscapeRecode(&s, end, + errors, recode_encoding, + &writer, p); + if (p == NULL) + goto failed; + } continue; } + s++; - if (s==end) { + if (s == end) { PyErr_SetString(PyExc_ValueError, "Trailing \\ in string"); goto failed; } + switch (*s++) { /* XXX This assumes ASCII! 
*/ case '\n': break; @@ -1040,28 +1172,18 @@ PyObject *PyBytes_DecodeEscape(const char *s, *p++ = c; break; case 'x': - if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) { - unsigned int x = 0; - c = Py_CHARMASK(*s); - s++; - if (Py_ISDIGIT(c)) - x = c - '0'; - else if (Py_ISLOWER(c)) - x = 10 + c - 'a'; - else - x = 10 + c - 'A'; - x = x << 4; - c = Py_CHARMASK(*s); - s++; - if (Py_ISDIGIT(c)) - x += c - '0'; - else if (Py_ISLOWER(c)) - x += 10 + c - 'a'; - else - x += 10 + c - 'A'; - *p++ = x; - break; + if (s+1 < end) { + int digit1, digit2; + digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])]; + digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])]; + if (digit1 < 16 && digit2 < 16) { + *p++ = (unsigned char)((digit1 << 4) + digit2); + s += 2; + break; + } } + /* invalid hexadecimal digits */ + if (!errors || strcmp(errors, "strict") == 0) { PyErr_Format(PyExc_ValueError, "invalid \\x escape at position %d", @@ -1083,6 +1205,7 @@ PyObject *PyBytes_DecodeEscape(const char *s, if (s < end && Py_ISXDIGIT(s[0])) s++; /* and a hexdigit */ break; + default: *p++ = '\\'; s--; @@ -1090,11 +1213,11 @@ PyObject *PyBytes_DecodeEscape(const char *s, UTF-8 bytes may follow. 
*/ } } - if (p-buf < newlen) - _PyBytes_Resize(&v, p - buf); - return v; + + return _PyBytesWriter_Finish(&writer, p); + failed: - Py_DECREF(v); + _PyBytesWriter_Dealloc(&writer); return NULL; } @@ -1365,24 +1488,7 @@ bytes_repeat(PyBytesObject *a, Py_ssize_t n) static int bytes_contains(PyObject *self, PyObject *arg) { - Py_ssize_t ival = PyNumber_AsSsize_t(arg, PyExc_ValueError); - if (ival == -1 && PyErr_Occurred()) { - Py_buffer varg; - Py_ssize_t pos; - PyErr_Clear(); - if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0) - return -1; - pos = stringlib_find(PyBytes_AS_STRING(self), Py_SIZE(self), - varg.buf, varg.len, 0); - PyBuffer_Release(&varg); - return pos >= 0; - } - if (ival < 0 || ival >= 256) { - PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)"); - return -1; - } - - return memchr(PyBytes_AS_STRING(self), (int) ival, Py_SIZE(self)) != NULL; + return _Py_bytes_contains(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), arg); } static PyObject * @@ -1629,8 +1735,8 @@ Return a list of the sections in the bytes, using sep as the delimiter. [clinic start generated code]*/ static PyObject * -bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=8bde44dacb36ef2e input=8b809b39074abbfa]*/ +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) +/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; @@ -1654,7 +1760,6 @@ bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit) /*[clinic input] bytes.partition - self: self(type="PyBytesObject *") sep: Py_buffer / @@ -1670,7 +1775,7 @@ object and two empty bytes objects. 
static PyObject * bytes_partition_impl(PyBytesObject *self, Py_buffer *sep) -/*[clinic end generated code: output=f532b392a17ff695 input=bc855dc63ca949de]*/ +/*[clinic end generated code: output=f532b392a17ff695 input=61cca95519406099]*/ { return stringlib_partition( (PyObject*) self, @@ -1682,7 +1787,6 @@ bytes_partition_impl(PyBytesObject *self, Py_buffer *sep) /*[clinic input] bytes.rpartition - self: self(type="PyBytesObject *") sep: Py_buffer / @@ -1698,7 +1802,7 @@ objects and the original bytes object. static PyObject * bytes_rpartition_impl(PyBytesObject *self, Py_buffer *sep) -/*[clinic end generated code: output=191b114cbb028e50 input=6588fff262a9170e]*/ +/*[clinic end generated code: output=191b114cbb028e50 input=67f689e63a62d478]*/ { return stringlib_rpartition( (PyObject*) self, @@ -1716,8 +1820,8 @@ Splitting is done starting at the end of the bytes and working to the front. [clinic start generated code]*/ static PyObject * -bytes_rsplit_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=0b6570b977911d88 input=0f86c9f28f7d7b7b]*/ +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) +/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; @@ -1755,8 +1859,8 @@ Example: b'.'.join([b'ab', b'pq', b'rs']) -> b'ab.pq.rs'. 
[clinic start generated code]*/ static PyObject * -bytes_join(PyBytesObject*self, PyObject *iterable_of_bytes) -/*[clinic end generated code: output=634aff14764ff997 input=7fe377b95bd549d2]*/ +bytes_join(PyBytesObject *self, PyObject *iterable_of_bytes) +/*[clinic end generated code: output=a046f379f626f6f8 input=7fe377b95bd549d2]*/ { return stringlib_bytes_join((PyObject*)self, iterable_of_bytes); } @@ -1769,158 +1873,30 @@ _PyBytes_Join(PyObject *sep, PyObject *x) return bytes_join((PyBytesObject*)sep, x); } -/* helper macro to fixup start/end slice values */ -#define ADJUST_INDICES(start, end, len) \ - if (end > len) \ - end = len; \ - else if (end < 0) { \ - end += len; \ - if (end < 0) \ - end = 0; \ - } \ - if (start < 0) { \ - start += len; \ - if (start < 0) \ - start = 0; \ - } - -Py_LOCAL_INLINE(Py_ssize_t) -bytes_find_internal(PyBytesObject *self, PyObject *args, int dir) -{ - PyObject *subobj; - char byte; - Py_buffer subbuf; - const char *sub; - Py_ssize_t len, sub_len; - Py_ssize_t start=0, end=PY_SSIZE_T_MAX; - Py_ssize_t res; - - if (!stringlib_parse_args_finds_byte("find/rfind/index/rindex", - args, &subobj, &byte, &start, &end)) - return -2; - - if (subobj) { - if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0) - return -2; - - sub = subbuf.buf; - sub_len = subbuf.len; - } - else { - sub = &byte; - sub_len = 1; - } - len = PyBytes_GET_SIZE(self); - - ADJUST_INDICES(start, end, len); - if (end - start < sub_len) - res = -1; - else if (sub_len == 1 -#ifndef HAVE_MEMRCHR - && dir > 0 -#endif - ) { - unsigned char needle = *sub; - int mode = (dir > 0) ? 
FAST_SEARCH : FAST_RSEARCH; - res = stringlib_fastsearch_memchr_1char( - PyBytes_AS_STRING(self) + start, end - start, - needle, needle, mode); - if (res >= 0) - res += start; - } - else { - if (dir > 0) - res = stringlib_find_slice( - PyBytes_AS_STRING(self), len, - sub, sub_len, start, end); - else - res = stringlib_rfind_slice( - PyBytes_AS_STRING(self), len, - sub, sub_len, start, end); - } - - if (subobj) - PyBuffer_Release(&subbuf); - - return res; -} - - -PyDoc_STRVAR(find__doc__, -"B.find(sub[, start[, end]]) -> int\n\ -\n\ -Return the lowest index in B where substring sub is found,\n\ -such that sub is contained within B[start:end]. Optional\n\ -arguments start and end are interpreted as in slice notation.\n\ -\n\ -Return -1 on failure."); - static PyObject * bytes_find(PyBytesObject *self, PyObject *args) { - Py_ssize_t result = bytes_find_internal(self, args, +1); - if (result == -2) - return NULL; - return PyLong_FromSsize_t(result); + return _Py_bytes_find(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args); } - -PyDoc_STRVAR(index__doc__, -"B.index(sub[, start[, end]]) -> int\n\ -\n\ -Like B.find() but raise ValueError when the substring is not found."); - static PyObject * bytes_index(PyBytesObject *self, PyObject *args) { - Py_ssize_t result = bytes_find_internal(self, args, +1); - if (result == -2) - return NULL; - if (result == -1) { - PyErr_SetString(PyExc_ValueError, - "substring not found"); - return NULL; - } - return PyLong_FromSsize_t(result); + return _Py_bytes_index(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args); } -PyDoc_STRVAR(rfind__doc__, -"B.rfind(sub[, start[, end]]) -> int\n\ -\n\ -Return the highest index in B where substring sub is found,\n\ -such that sub is contained within B[start:end]. 
Optional\n\ -arguments start and end are interpreted as in slice notation.\n\ -\n\ -Return -1 on failure."); - static PyObject * bytes_rfind(PyBytesObject *self, PyObject *args) { - Py_ssize_t result = bytes_find_internal(self, args, -1); - if (result == -2) - return NULL; - return PyLong_FromSsize_t(result); + return _Py_bytes_rfind(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args); } -PyDoc_STRVAR(rindex__doc__, -"B.rindex(sub[, start[, end]]) -> int\n\ -\n\ -Like B.rfind() but raise ValueError when the substring is not found."); - static PyObject * bytes_rindex(PyBytesObject *self, PyObject *args) { - Py_ssize_t result = bytes_find_internal(self, args, -1); - if (result == -2) - return NULL; - if (result == -1) { - PyErr_SetString(PyExc_ValueError, - "substring not found"); - return NULL; - } - return PyLong_FromSsize_t(result); + return _Py_bytes_rindex(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args); } @@ -2007,7 +1983,6 @@ do_argstrip(PyBytesObject *self, int striptype, PyObject *bytes) /*[clinic input] bytes.strip - self: self(type="PyBytesObject *") bytes: object = None / @@ -2018,7 +1993,7 @@ If the argument is omitted or None, strip leading and trailing ASCII whitespace. static PyObject * bytes_strip_impl(PyBytesObject *self, PyObject *bytes) -/*[clinic end generated code: output=c7c228d3bd104a1b input=37daa5fad1395d95]*/ +/*[clinic end generated code: output=c7c228d3bd104a1b input=8a354640e4e0b3ef]*/ { return do_argstrip(self, BOTHSTRIP, bytes); } @@ -2026,7 +2001,6 @@ bytes_strip_impl(PyBytesObject *self, PyObject *bytes) /*[clinic input] bytes.lstrip - self: self(type="PyBytesObject *") bytes: object = None / @@ -2037,7 +2011,7 @@ If the argument is omitted or None, strip leading ASCII whitespace. 
static PyObject * bytes_lstrip_impl(PyBytesObject *self, PyObject *bytes) -/*[clinic end generated code: output=28602e586f524e82 input=88811b09dfbc2988]*/ +/*[clinic end generated code: output=28602e586f524e82 input=9baff4398c3f6857]*/ { return do_argstrip(self, LEFTSTRIP, bytes); } @@ -2045,7 +2019,6 @@ bytes_lstrip_impl(PyBytesObject *self, PyObject *bytes) /*[clinic input] bytes.rstrip - self: self(type="PyBytesObject *") bytes: object = None / @@ -2056,64 +2029,22 @@ If the argument is omitted or None, strip trailing ASCII whitespace. static PyObject * bytes_rstrip_impl(PyBytesObject *self, PyObject *bytes) -/*[clinic end generated code: output=547e3815c95447da input=8f93c9cd361f0140]*/ +/*[clinic end generated code: output=547e3815c95447da input=b78af445c727e32b]*/ { return do_argstrip(self, RIGHTSTRIP, bytes); } -PyDoc_STRVAR(count__doc__, -"B.count(sub[, start[, end]]) -> int\n\ -\n\ -Return the number of non-overlapping occurrences of substring sub in\n\ -string B[start:end]. Optional arguments start and end are interpreted\n\ -as in slice notation."); - static PyObject * bytes_count(PyBytesObject *self, PyObject *args) { - PyObject *sub_obj; - const char *str = PyBytes_AS_STRING(self), *sub; - Py_ssize_t sub_len; - char byte; - Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; - - Py_buffer vsub; - PyObject *count_obj; - - if (!stringlib_parse_args_finds_byte("count", args, &sub_obj, &byte, - &start, &end)) - return NULL; - - if (sub_obj) { - if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0) - return NULL; - - sub = vsub.buf; - sub_len = vsub.len; - } - else { - sub = &byte; - sub_len = 1; - } - - ADJUST_INDICES(start, end, PyBytes_GET_SIZE(self)); - - count_obj = PyLong_FromSsize_t( - stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX) - ); - - if (sub_obj) - PyBuffer_Release(&vsub); - - return count_obj; + return _Py_bytes_count(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args); } /*[clinic input] bytes.translate - self: 
self(type="PyBytesObject *") table: object Translation table, which must be a bytes object of length 256. [ @@ -2130,7 +2061,7 @@ The remaining characters are mapped through the given translation table. static PyObject * bytes_translate_impl(PyBytesObject *self, PyObject *table, int group_right_1, PyObject *deletechars) -/*[clinic end generated code: output=233df850eb50bf8d input=d8fa5519d7cc4be7]*/ +/*[clinic end generated code: output=233df850eb50bf8d input=ca20edf39d780d49]*/ { char *input, *output; Py_buffer table_view = {NULL, NULL}; @@ -2191,7 +2122,7 @@ bytes_translate_impl(PyBytesObject *self, PyObject *table, int group_right_1, PyBuffer_Release(&table_view); return NULL; } - output_start = output = PyBytes_AsString(result); + output_start = output = PyBytes_AS_STRING(result); input = PyBytes_AS_STRING(input_obj); if (dellen == 0 && table_chars != NULL) { @@ -2267,498 +2198,6 @@ bytes_maketrans_impl(Py_buffer *frm, Py_buffer *to) return _Py_bytes_maketrans(frm, to); } -/* find and count characters and substrings */ - -#define findchar(target, target_len, c) \ - ((char *)memchr((const void *)(target), c, target_len)) - -/* String ops must return a string. 
*/ -/* If the object is subclass of string, create a copy */ -Py_LOCAL(PyBytesObject *) -return_self(PyBytesObject *self) -{ - if (PyBytes_CheckExact(self)) { - Py_INCREF(self); - return self; - } - return (PyBytesObject *)PyBytes_FromStringAndSize( - PyBytes_AS_STRING(self), - PyBytes_GET_SIZE(self)); -} - -Py_LOCAL_INLINE(Py_ssize_t) -countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount) -{ - Py_ssize_t count=0; - const char *start=target; - const char *end=target+target_len; - - while ( (start=findchar(start, end-start, c)) != NULL ) { - count++; - if (count >= maxcount) - break; - start += 1; - } - return count; -} - - -/* Algorithms for different cases of string replacement */ - -/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ -Py_LOCAL(PyBytesObject *) -replace_interleave(PyBytesObject *self, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - char *self_s, *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, i; - PyBytesObject *result; - - self_len = PyBytes_GET_SIZE(self); - - /* 1 at the end plus 1 after every character; - count = min(maxcount, self_len + 1) */ - if (maxcount <= self_len) - count = maxcount; - else - /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */ - count = self_len + 1; - - /* Check for overflow */ - /* result_len = count * to_len + self_len; */ - assert(count > 0); - if (to_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, - "replacement bytes are too long"); - return NULL; - } - result_len = count * to_len + self_len; - - if (! 
(result = (PyBytesObject *) - PyBytes_FromStringAndSize(NULL, result_len)) ) - return NULL; - - self_s = PyBytes_AS_STRING(self); - result_s = PyBytes_AS_STRING(result); - - /* TODO: special case single character, which doesn't need memcpy */ - - /* Lay the first one down (guaranteed this will occur) */ - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - count -= 1; - - for (i=0; i<count; i++) { - *result_s++ = *self_s++; - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - } - - /* Copy the rest of the original string */ - Py_MEMCPY(result_s, self_s, self_len-i); - - return result; -} - -/* Special case for deleting a single character */ -/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ -Py_LOCAL(PyBytesObject *) -replace_delete_single_character(PyBytesObject *self, - char from_c, Py_ssize_t maxcount) -{ - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyBytesObject *result; - - self_len = PyBytes_GET_SIZE(self); - self_s = PyBytes_AS_STRING(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - return return_self(self); - } - - result_len = self_len - count; /* from_len == 1 */ - assert(result_len>=0); - - if ( (result = (PyBytesObject *) - PyBytes_FromStringAndSize(NULL, result_len)) == NULL) - return NULL; - result_s = PyBytes_AS_STRING(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end-start, from_c); - if (next == NULL) - break; - Py_MEMCPY(result_s, start, next-start); - result_s += (next-start); - start = next+1; - } - Py_MEMCPY(result_s, start, end-start); - - return result; -} - -/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ - -Py_LOCAL(PyBytesObject *) -replace_delete_substring(PyBytesObject *self, - const char *from_s, Py_ssize_t from_len, - Py_ssize_t maxcount) { - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count, 
offset; - PyBytesObject *result; - - self_len = PyBytes_GET_SIZE(self); - self_s = PyBytes_AS_STRING(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches */ - return return_self(self); - } - - result_len = self_len - (count * from_len); - assert (result_len>=0); - - if ( (result = (PyBytesObject *) - PyBytes_FromStringAndSize(NULL, result_len)) == NULL ) - return NULL; - - result_s = PyBytes_AS_STRING(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end-start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start + offset; - - Py_MEMCPY(result_s, start, next-start); - - result_s += (next-start); - start = next+from_len; - } - Py_MEMCPY(result_s, start, end-start); - return result; -} - -/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ -Py_LOCAL(PyBytesObject *) -replace_single_character_in_place(PyBytesObject *self, - char from_c, char to_c, - Py_ssize_t maxcount) -{ - char *self_s, *result_s, *start, *end, *next; - Py_ssize_t self_len; - PyBytesObject *result; - - /* The result string will be the same size */ - self_s = PyBytes_AS_STRING(self); - self_len = PyBytes_GET_SIZE(self); - - next = findchar(self_s, self_len, from_c); - - if (next == NULL) { - /* No matches; return the original string */ - return return_self(self); - } - - /* Need to make a new string */ - result = (PyBytesObject *) PyBytes_FromStringAndSize(NULL, self_len); - if (result == NULL) - return NULL; - result_s = PyBytes_AS_STRING(result); - Py_MEMCPY(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + (next-self_s); - *start = to_c; - start++; - end = result_s + self_len; - - while (--maxcount > 0) { - next = findchar(start, end-start, from_c); - if (next == NULL) - break; - *next = to_c; - start = next+1; - } - - return result; -} - -/* len(self)>=1, len(from)==len(to)>=2, 
maxcount>=1 */ -Py_LOCAL(PyBytesObject *) -replace_substring_in_place(PyBytesObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - char *result_s, *start, *end; - char *self_s; - Py_ssize_t self_len, offset; - PyBytesObject *result; - - /* The result string will be the same size */ - - self_s = PyBytes_AS_STRING(self); - self_len = PyBytes_GET_SIZE(self); - - offset = stringlib_find(self_s, self_len, - from_s, from_len, - 0); - if (offset == -1) { - /* No matches; return the original string */ - return return_self(self); - } - - /* Need to make a new string */ - result = (PyBytesObject *) PyBytes_FromStringAndSize(NULL, self_len); - if (result == NULL) - return NULL; - result_s = PyBytes_AS_STRING(result); - Py_MEMCPY(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + offset; - Py_MEMCPY(start, to_s, from_len); - start += from_len; - end = result_s + self_len; - - while ( --maxcount > 0) { - offset = stringlib_find(start, end-start, - from_s, from_len, - 0); - if (offset==-1) - break; - Py_MEMCPY(start+offset, to_s, from_len); - start += offset+from_len; - } - - return result; -} - -/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ -Py_LOCAL(PyBytesObject *) -replace_single_character(PyBytesObject *self, - char from_c, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyBytesObject *result; - - self_s = PyBytes_AS_STRING(self); - self_len = PyBytes_GET_SIZE(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* use the difference between current and new, hence the "-1" */ - /* result_len = self_len + count * (to_len-1) */ - assert(count > 0); - if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) { 
- PyErr_SetString(PyExc_OverflowError, - "replacement bytes are too long"); - return NULL; - } - result_len = self_len + count * (to_len - 1); - - if ( (result = (PyBytesObject *) - PyBytes_FromStringAndSize(NULL, result_len)) == NULL) - return NULL; - result_s = PyBytes_AS_STRING(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end-start, from_c); - if (next == NULL) - break; - - if (next == start) { - /* replace with the 'to' */ - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start += 1; - } else { - /* copy the unchanged old then the 'to' */ - Py_MEMCPY(result_s, start, next-start); - result_s += (next-start); - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start = next+1; - } - } - /* Copy the remainder of the remaining string */ - Py_MEMCPY(result_s, start, end-start); - - return result; -} - -/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ -Py_LOCAL(PyBytesObject *) -replace_substring(PyBytesObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) { - char *self_s, *result_s; - char *start, *next, *end; - Py_ssize_t self_len, result_len; - Py_ssize_t count, offset; - PyBytesObject *result; - - self_s = PyBytes_AS_STRING(self); - self_len = PyBytes_GET_SIZE(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* Check for overflow */ - /* result_len = self_len + count * (to_len-from_len) */ - assert(count > 0); - if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, - "replacement bytes are too long"); - return NULL; - } - result_len = self_len + count * (to_len-from_len); - - if ( (result = (PyBytesObject *) - PyBytes_FromStringAndSize(NULL, result_len)) == NULL) - return NULL; - result_s = PyBytes_AS_STRING(result); - - start = 
self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end-start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start+offset; - if (next == start) { - /* replace with the 'to' */ - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start += from_len; - } else { - /* copy the unchanged old then the 'to' */ - Py_MEMCPY(result_s, start, next-start); - result_s += (next-start); - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - start = next+from_len; - } - } - /* Copy the remainder of the remaining string */ - Py_MEMCPY(result_s, start, end-start); - - return result; -} - - -Py_LOCAL(PyBytesObject *) -replace(PyBytesObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - if (maxcount < 0) { - maxcount = PY_SSIZE_T_MAX; - } else if (maxcount == 0 || PyBytes_GET_SIZE(self) == 0) { - /* nothing to do; return the original string */ - return return_self(self); - } - - if (maxcount == 0 || - (from_len == 0 && to_len == 0)) { - /* nothing to do; return the original string */ - return return_self(self); - } - - /* Handle zero-length special cases */ - - if (from_len == 0) { - /* insert the 'to' string everywhere. */ - /* >>> "Python".replace("", ".") */ - /* '.P.y.t.h.o.n.' 
*/ - return replace_interleave(self, to_s, to_len, maxcount); - } - - /* Except for "".replace("", "A") == "A" there is no way beyond this */ - /* point for an empty self string to generate a non-empty string */ - /* Special case so the remaining code always gets a non-empty string */ - if (PyBytes_GET_SIZE(self) == 0) { - return return_self(self); - } - - if (to_len == 0) { - /* delete all occurrences of 'from' string */ - if (from_len == 1) { - return replace_delete_single_character( - self, from_s[0], maxcount); - } else { - return replace_delete_substring(self, from_s, - from_len, maxcount); - } - } - - /* Handle special case where both strings have the same length */ - - if (from_len == to_len) { - if (from_len == 1) { - return replace_single_character_in_place( - self, - from_s[0], - to_s[0], - maxcount); - } else { - return replace_substring_in_place( - self, from_s, from_len, to_s, to_len, - maxcount); - } - } - - /* Otherwise use the more generic algorithms */ - if (from_len == 1) { - return replace_single_character(self, from_s[0], - to_s, to_len, maxcount); - } else { - /* len('from')>=2, len('to')>=1 */ - return replace_substring(self, from_s, from_len, to_s, to_len, - maxcount); - } -} - /*[clinic input] bytes.replace @@ -2777,156 +2216,28 @@ replaced. 
[clinic start generated code]*/ static PyObject * -bytes_replace_impl(PyBytesObject*self, Py_buffer *old, Py_buffer *new, +bytes_replace_impl(PyBytesObject *self, Py_buffer *old, Py_buffer *new, Py_ssize_t count) -/*[clinic end generated code: output=403dc9d7a83c5a1d input=b2fbbf0bf04de8e5]*/ +/*[clinic end generated code: output=994fa588b6b9c104 input=b2fbbf0bf04de8e5]*/ { - return (PyObject *)replace((PyBytesObject *) self, - (const char *)old->buf, old->len, - (const char *)new->buf, new->len, count); + return stringlib_replace((PyObject *)self, + (const char *)old->buf, old->len, + (const char *)new->buf, new->len, count); } /** End DALKE **/ -/* Matches the end (direction >= 0) or start (direction < 0) of self - * against substr, using the start and end arguments. Returns - * -1 on error, 0 if not found and 1 if found. - */ -Py_LOCAL(int) -_bytes_tailmatch(PyBytesObject *self, PyObject *substr, Py_ssize_t start, - Py_ssize_t end, int direction) -{ - Py_ssize_t len = PyBytes_GET_SIZE(self); - Py_ssize_t slen; - Py_buffer sub_view = {NULL, NULL}; - const char* sub; - const char* str; - - if (PyBytes_Check(substr)) { - sub = PyBytes_AS_STRING(substr); - slen = PyBytes_GET_SIZE(substr); - } - else { - if (PyObject_GetBuffer(substr, &sub_view, PyBUF_SIMPLE) != 0) - return -1; - sub = sub_view.buf; - slen = sub_view.len; - } - str = PyBytes_AS_STRING(self); - - ADJUST_INDICES(start, end, len); - - if (direction < 0) { - /* startswith */ - if (start+slen > len) - goto notfound; - } else { - /* endswith */ - if (end-start < slen || start > len) - goto notfound; - - if (end-slen > start) - start = end - slen; - } - if (end-start < slen) - goto notfound; - if (memcmp(str+start, sub, slen) != 0) - goto notfound; - - PyBuffer_Release(&sub_view); - return 1; - -notfound: - PyBuffer_Release(&sub_view); - return 0; -} - - -PyDoc_STRVAR(startswith__doc__, -"B.startswith(prefix[, start[, end]]) -> bool\n\ -\n\ -Return True if B starts with the specified prefix, False 
otherwise.\n\ -With optional start, test B beginning at that position.\n\ -With optional end, stop comparing B at that position.\n\ -prefix can also be a tuple of bytes to try."); static PyObject * bytes_startswith(PyBytesObject *self, PyObject *args) { - Py_ssize_t start = 0; - Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *subobj; - int result; - - if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) - return NULL; - if (PyTuple_Check(subobj)) { - Py_ssize_t i; - for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - result = _bytes_tailmatch(self, - PyTuple_GET_ITEM(subobj, i), - start, end, -1); - if (result == -1) - return NULL; - else if (result) { - Py_RETURN_TRUE; - } - } - Py_RETURN_FALSE; - } - result = _bytes_tailmatch(self, subobj, start, end, -1); - if (result == -1) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, "startswith first arg must be bytes " - "or a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name); - return NULL; - } - else - return PyBool_FromLong(result); + return _Py_bytes_startswith(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args); } - -PyDoc_STRVAR(endswith__doc__, -"B.endswith(suffix[, start[, end]]) -> bool\n\ -\n\ -Return True if B ends with the specified suffix, False otherwise.\n\ -With optional start, test B beginning at that position.\n\ -With optional end, stop comparing B at that position.\n\ -suffix can also be a tuple of bytes to try."); - static PyObject * bytes_endswith(PyBytesObject *self, PyObject *args) { - Py_ssize_t start = 0; - Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *subobj; - int result; - - if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) - return NULL; - if (PyTuple_Check(subobj)) { - Py_ssize_t i; - for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - result = _bytes_tailmatch(self, - PyTuple_GET_ITEM(subobj, i), - start, end, +1); - if (result == -1) - return NULL; - else if (result) { - Py_RETURN_TRUE; - } - } - Py_RETURN_FALSE; 
- } - result = _bytes_tailmatch(self, subobj, start, end, +1); - if (result == -1) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, "endswith first arg must be bytes or " - "a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name); - return NULL; - } - else - return PyBool_FromLong(result); + return _Py_bytes_endswith(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args); } @@ -2946,9 +2257,9 @@ Decode the bytes using the codec registered for encoding. [clinic start generated code]*/ static PyObject * -bytes_decode_impl(PyBytesObject*self, const char *encoding, +bytes_decode_impl(PyBytesObject *self, const char *encoding, const char *errors) -/*[clinic end generated code: output=2d2016ff8e0bb176 input=958174769d2a40ca]*/ +/*[clinic end generated code: output=5649a53dde27b314 input=958174769d2a40ca]*/ { return PyUnicode_FromEncodedObject((PyObject*)self, encoding, errors); } @@ -2966,8 +2277,8 @@ true. [clinic start generated code]*/ static PyObject * -bytes_splitlines_impl(PyBytesObject*self, int keepends) -/*[clinic end generated code: output=995c3598f7833cad input=7f4aac67144f9944]*/ +bytes_splitlines_impl(PyBytesObject *self, int keepends) +/*[clinic end generated code: output=3484149a5d880ffb input=7f4aac67144f9944]*/ { return stringlib_splitlines( (PyObject*) self, PyBytes_AS_STRING(self), @@ -2975,22 +2286,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends) ); } -static int -hex_digit_to_int(Py_UCS4 c) -{ - if (c >= 128) - return -1; - if (Py_ISDIGIT(c)) - return c - '0'; - else { - if (Py_ISUPPER(c)) - c = Py_TOLOWER(c); - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - } - return -1; -} - /*[clinic input] @classmethod bytes.fromhex @@ -3008,47 +2303,83 @@ static PyObject * bytes_fromhex_impl(PyTypeObject *type, PyObject *string) /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/ { - PyObject *newstring; + return _PyBytes_FromHex(string, 0); +} + +PyObject* +_PyBytes_FromHex(PyObject 
*string, int use_bytearray) +{ char *buf; - Py_ssize_t hexlen, byteslen, i, j; - int top, bot; - void *data; - unsigned int kind; + Py_ssize_t hexlen, invalid_char; + unsigned int top, bot; + Py_UCS1 *str, *end; + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + writer.use_bytearray = use_bytearray; assert(PyUnicode_Check(string)); if (PyUnicode_READY(string)) return NULL; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); hexlen = PyUnicode_GET_LENGTH(string); - byteslen = hexlen/2; /* This overestimates if there are spaces */ - newstring = PyBytes_FromStringAndSize(NULL, byteslen); - if (!newstring) + if (!PyUnicode_IS_ASCII(string)) { + void *data = PyUnicode_DATA(string); + unsigned int kind = PyUnicode_KIND(string); + Py_ssize_t i; + + /* search for the first non-ASCII character */ + for (i = 0; i < hexlen; i++) { + if (PyUnicode_READ(kind, data, i) >= 128) + break; + } + invalid_char = i; + goto error; + } + + assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND); + str = PyUnicode_1BYTE_DATA(string); + + /* This overestimates if there are spaces */ + buf = _PyBytesWriter_Alloc(&writer, hexlen / 2); + if (buf == NULL) return NULL; - buf = PyBytes_AS_STRING(newstring); - for (i = j = 0; i < hexlen; i += 2) { + + end = str + hexlen; + while (str < end) { /* skip over spaces in the input */ - while (PyUnicode_READ(kind, data, i) == ' ') - i++; - if (i >= hexlen) - break; - top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); - bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); - if (top == -1 || bot == -1) { - PyErr_Format(PyExc_ValueError, - "non-hexadecimal number found in " - "fromhex() arg at position %zd", i); + if (*str == ' ') { + do { + str++; + } while (*str == ' '); + if (str >= end) + break; + } + + top = _PyLong_DigitValue[*str]; + if (top >= 16) { + invalid_char = str - PyUnicode_1BYTE_DATA(string); + goto error; + } + str++; + + bot = _PyLong_DigitValue[*str]; + if (bot >= 16) { + invalid_char = str - 
PyUnicode_1BYTE_DATA(string); goto error; } - buf[j++] = (top << 4) + bot; + str++; + + *buf++ = (unsigned char)((top << 4) + bot); } - if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0) - goto error; - return newstring; + + return _PyBytesWriter_Finish(&writer, buf); error: - Py_XDECREF(newstring); + PyErr_Format(PyExc_ValueError, + "non-hexadecimal number found in " + "fromhex() arg at position %zd", invalid_char); + _PyBytesWriter_Dealloc(&writer); return NULL; } @@ -3078,17 +2409,20 @@ bytes_methods[] = { {"__getnewargs__", (PyCFunction)bytes_getnewargs, METH_NOARGS}, {"capitalize", (PyCFunction)stringlib_capitalize, METH_NOARGS, _Py_capitalize__doc__}, - {"center", (PyCFunction)stringlib_center, METH_VARARGS, center__doc__}, - {"count", (PyCFunction)bytes_count, METH_VARARGS, count__doc__}, + {"center", (PyCFunction)stringlib_center, METH_VARARGS, + _Py_center__doc__}, + {"count", (PyCFunction)bytes_count, METH_VARARGS, + _Py_count__doc__}, BYTES_DECODE_METHODDEF {"endswith", (PyCFunction)bytes_endswith, METH_VARARGS, - endswith__doc__}, + _Py_endswith__doc__}, {"expandtabs", (PyCFunction)stringlib_expandtabs, METH_VARARGS | METH_KEYWORDS, - expandtabs__doc__}, - {"find", (PyCFunction)bytes_find, METH_VARARGS, find__doc__}, + _Py_expandtabs__doc__}, + {"find", (PyCFunction)bytes_find, METH_VARARGS, + _Py_find__doc__}, BYTES_FROMHEX_METHODDEF {"hex", (PyCFunction)bytes_hex, METH_NOARGS, hex__doc__}, - {"index", (PyCFunction)bytes_index, METH_VARARGS, index__doc__}, + {"index", (PyCFunction)bytes_index, METH_VARARGS, _Py_index__doc__}, {"isalnum", (PyCFunction)stringlib_isalnum, METH_NOARGS, _Py_isalnum__doc__}, {"isalpha", (PyCFunction)stringlib_isalpha, METH_NOARGS, @@ -3104,38 +2438,40 @@ bytes_methods[] = { {"isupper", (PyCFunction)stringlib_isupper, METH_NOARGS, _Py_isupper__doc__}, BYTES_JOIN_METHODDEF - {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__}, + {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, _Py_ljust__doc__}, 
{"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__}, BYTES_LSTRIP_METHODDEF BYTES_MAKETRANS_METHODDEF BYTES_PARTITION_METHODDEF BYTES_REPLACE_METHODDEF - {"rfind", (PyCFunction)bytes_rfind, METH_VARARGS, rfind__doc__}, - {"rindex", (PyCFunction)bytes_rindex, METH_VARARGS, rindex__doc__}, - {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__}, + {"rfind", (PyCFunction)bytes_rfind, METH_VARARGS, _Py_rfind__doc__}, + {"rindex", (PyCFunction)bytes_rindex, METH_VARARGS, _Py_rindex__doc__}, + {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, _Py_rjust__doc__}, BYTES_RPARTITION_METHODDEF BYTES_RSPLIT_METHODDEF BYTES_RSTRIP_METHODDEF BYTES_SPLIT_METHODDEF BYTES_SPLITLINES_METHODDEF {"startswith", (PyCFunction)bytes_startswith, METH_VARARGS, - startswith__doc__}, + _Py_startswith__doc__}, BYTES_STRIP_METHODDEF {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, BYTES_TRANSLATE_METHODDEF {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__}, - {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__}, + {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, _Py_zfill__doc__}, {NULL, NULL} /* sentinel */ }; static PyObject * -bytes_mod(PyObject *v, PyObject *w) +bytes_mod(PyObject *self, PyObject *arg) { - if (!PyBytes_Check(v)) + if (!PyBytes_Check(self)) { Py_RETURN_NOTIMPLEMENTED; - return _PyBytes_Format(v, w); + } + return _PyBytes_FormatEx(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), + arg, 0); } static PyNumberMethods bytes_as_number = { @@ -3244,108 +2580,93 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return PyBytes_FromObject(x); } -PyObject * -PyBytes_FromObject(PyObject *x) +static PyObject* +_PyBytes_FromBuffer(PyObject *x) { - PyObject *new, *it; - Py_ssize_t i, size; + PyObject *new; + Py_buffer view; - if (x == NULL) { - PyErr_BadInternalCall(); + if (PyObject_GetBuffer(x, 
&view, PyBUF_FULL_RO) < 0) return NULL; - } - if (PyBytes_CheckExact(x)) { - Py_INCREF(x); - return x; - } + new = PyBytes_FromStringAndSize(NULL, view.len); + if (!new) + goto fail; + if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval, + &view, view.len, 'C') < 0) + goto fail; + PyBuffer_Release(&view); + return new; - /* Use the modern buffer interface */ - if (PyObject_CheckBuffer(x)) { - Py_buffer view; - if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0) - return NULL; - new = PyBytes_FromStringAndSize(NULL, view.len); - if (!new) - goto fail; - if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval, - &view, view.len, 'C') < 0) - goto fail; - PyBuffer_Release(&view); - return new; - fail: - Py_XDECREF(new); - PyBuffer_Release(&view); - return NULL; - } - if (PyUnicode_Check(x)) { - PyErr_SetString(PyExc_TypeError, - "cannot convert unicode object to bytes"); - return NULL; - } +fail: + Py_XDECREF(new); + PyBuffer_Release(&view); + return NULL; +} - if (PyList_CheckExact(x)) { - new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x)); - if (new == NULL) - return NULL; - for (i = 0; i < Py_SIZE(x); i++) { - Py_ssize_t value = PyNumber_AsSsize_t( - PyList_GET_ITEM(x, i), PyExc_ValueError); - if (value == -1 && PyErr_Occurred()) { - Py_DECREF(new); - return NULL; - } - if (value < 0 || value >= 256) { - PyErr_SetString(PyExc_ValueError, - "bytes must be in range(0, 256)"); - Py_DECREF(new); - return NULL; - } - ((PyBytesObject *)new)->ob_sval[i] = (char) value; - } - return new; - } - if (PyTuple_CheckExact(x)) { - new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x)); - if (new == NULL) - return NULL; - for (i = 0; i < Py_SIZE(x); i++) { - Py_ssize_t value = PyNumber_AsSsize_t( - PyTuple_GET_ITEM(x, i), PyExc_ValueError); - if (value == -1 && PyErr_Occurred()) { - Py_DECREF(new); - return NULL; - } - if (value < 0 || value >= 256) { - PyErr_SetString(PyExc_ValueError, - "bytes must be in range(0, 256)"); - Py_DECREF(new); - return NULL; - } - ((PyBytesObject 
*)new)->ob_sval[i] = (char) value; - } - return new; - } +#define _PyBytes_FROM_LIST_BODY(x, GET_ITEM) \ + do { \ + PyObject *bytes; \ + Py_ssize_t i; \ + Py_ssize_t value; \ + char *str; \ + PyObject *item; \ + \ + bytes = PyBytes_FromStringAndSize(NULL, Py_SIZE(x)); \ + if (bytes == NULL) \ + return NULL; \ + str = ((PyBytesObject *)bytes)->ob_sval; \ + \ + for (i = 0; i < Py_SIZE(x); i++) { \ + item = GET_ITEM((x), i); \ + value = PyNumber_AsSsize_t(item, PyExc_ValueError); \ + if (value == -1 && PyErr_Occurred()) \ + goto error; \ + \ + if (value < 0 || value >= 256) { \ + PyErr_SetString(PyExc_ValueError, \ + "bytes must be in range(0, 256)"); \ + goto error; \ + } \ + *str++ = (char) value; \ + } \ + return bytes; \ + \ + error: \ + Py_DECREF(bytes); \ + return NULL; \ + } while (0) + +static PyObject* +_PyBytes_FromList(PyObject *x) +{ + _PyBytes_FROM_LIST_BODY(x, PyList_GET_ITEM); +} + +static PyObject* +_PyBytes_FromTuple(PyObject *x) +{ + _PyBytes_FROM_LIST_BODY(x, PyTuple_GET_ITEM); +} + +static PyObject * +_PyBytes_FromIterator(PyObject *it, PyObject *x) +{ + char *str; + Py_ssize_t i, size; + _PyBytesWriter writer; /* For iterator version, create a string object and resize as needed */ size = PyObject_LengthHint(x, 64); if (size == -1 && PyErr_Occurred()) return NULL; - /* Allocate an extra byte to prevent PyBytes_FromStringAndSize() from - returning a shared empty bytes string. This required because we - want to call _PyBytes_Resize() the returned object, which we can - only do on bytes objects with refcount == 1. 
*/ - if (size == 0) - size = 1; - new = PyBytes_FromStringAndSize(NULL, size); - if (new == NULL) - return NULL; - assert(Py_REFCNT(new) == 1); - /* Get the iterator */ - it = PyObject_GetIter(x); - if (it == NULL) - goto error; + _PyBytesWriter_Init(&writer); + str = _PyBytesWriter_Alloc(&writer, size); + if (str == NULL) + return NULL; + writer.overallocate = 1; + size = writer.allocated; /* Run the iterator to exhaustion */ for (i = 0; ; i++) { @@ -3375,21 +2696,58 @@ PyBytes_FromObject(PyObject *x) /* Append the byte */ if (i >= size) { - size = 2 * size + 1; - if (_PyBytes_Resize(&new, size) < 0) - goto error; + str = _PyBytesWriter_Resize(&writer, str, size+1); + if (str == NULL) + return NULL; + size = writer.allocated; } - ((PyBytesObject *)new)->ob_sval[i] = (char) value; + *str++ = (char) value; } - _PyBytes_Resize(&new, i); - /* Clean up and return success */ - Py_DECREF(it); - return new; + return _PyBytesWriter_Finish(&writer, str); error: - Py_XDECREF(it); - Py_XDECREF(new); + _PyBytesWriter_Dealloc(&writer); + return NULL; +} + +PyObject * +PyBytes_FromObject(PyObject *x) +{ + PyObject *it, *result; + + if (x == NULL) { + PyErr_BadInternalCall(); + return NULL; + } + + if (PyBytes_CheckExact(x)) { + Py_INCREF(x); + return x; + } + + /* Use the modern buffer interface */ + if (PyObject_CheckBuffer(x)) + return _PyBytes_FromBuffer(x); + + if (PyList_CheckExact(x)) + return _PyBytes_FromList(x); + + if (PyTuple_CheckExact(x)) + return _PyBytes_FromTuple(x); + + if (!PyUnicode_Check(x)) { + it = PyObject_GetIter(x); + if (it != NULL) { + result = _PyBytes_FromIterator(it, x); + Py_DECREF(it); + return result; + } + } + + PyErr_Format(PyExc_TypeError, + "cannot convert '%.200s' object to bytes", + x->ob_type->tp_name); return NULL; } @@ -3740,3 +3098,282 @@ bytes_iter(PyObject *seq) _PyObject_GC_TRACK(it); return (PyObject *)it; } + + +/* _PyBytesWriter API */ + +#ifdef MS_WINDOWS + /* On Windows, overallocate by 50% is the best factor */ +# define 
OVERALLOCATE_FACTOR 2 +#else + /* On Linux, overallocate by 25% is the best factor */ +# define OVERALLOCATE_FACTOR 4 +#endif + +void +_PyBytesWriter_Init(_PyBytesWriter *writer) +{ + /* Set all attributes before small_buffer to 0 */ + memset(writer, 0, offsetof(_PyBytesWriter, small_buffer)); +#ifdef Py_DEBUG + memset(writer->small_buffer, 0xCB, sizeof(writer->small_buffer)); +#endif +} + +void +_PyBytesWriter_Dealloc(_PyBytesWriter *writer) +{ + Py_CLEAR(writer->buffer); +} + +Py_LOCAL_INLINE(char*) +_PyBytesWriter_AsString(_PyBytesWriter *writer) +{ + if (writer->use_small_buffer) { + assert(writer->buffer == NULL); + return writer->small_buffer; + } + else if (writer->use_bytearray) { + assert(writer->buffer != NULL); + return PyByteArray_AS_STRING(writer->buffer); + } + else { + assert(writer->buffer != NULL); + return PyBytes_AS_STRING(writer->buffer); + } +} + +Py_LOCAL_INLINE(Py_ssize_t) +_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str) +{ + char *start = _PyBytesWriter_AsString(writer); + assert(str != NULL); + assert(str >= start); + assert(str - start <= writer->allocated); + return str - start; +} + +Py_LOCAL_INLINE(void) +_PyBytesWriter_CheckConsistency(_PyBytesWriter *writer, char *str) +{ +#ifdef Py_DEBUG + char *start, *end; + + if (writer->use_small_buffer) { + assert(writer->buffer == NULL); + } + else { + assert(writer->buffer != NULL); + if (writer->use_bytearray) + assert(PyByteArray_CheckExact(writer->buffer)); + else + assert(PyBytes_CheckExact(writer->buffer)); + assert(Py_REFCNT(writer->buffer) == 1); + } + + if (writer->use_bytearray) { + /* bytearray has its own overallocation algorithm, + writer overallocation must be disabled */ + assert(!writer->overallocate); + } + + assert(0 <= writer->allocated); + assert(0 <= writer->min_size && writer->min_size <= writer->allocated); + /* the last byte must always be null */ + start = _PyBytesWriter_AsString(writer); + assert(start[writer->allocated] == 0); + + end = start + 
writer->allocated; + assert(str != NULL); + assert(start <= str && str <= end); +#endif +} + +void* +_PyBytesWriter_Resize(_PyBytesWriter *writer, void *str, Py_ssize_t size) +{ + Py_ssize_t allocated, pos; + + _PyBytesWriter_CheckConsistency(writer, str); + assert(writer->allocated < size); + + allocated = size; + if (writer->overallocate + && allocated <= (PY_SSIZE_T_MAX - allocated / OVERALLOCATE_FACTOR)) { + /* overallocate to limit the number of realloc() */ + allocated += allocated / OVERALLOCATE_FACTOR; + } + + pos = _PyBytesWriter_GetSize(writer, str); + if (!writer->use_small_buffer) { + if (writer->use_bytearray) { + if (PyByteArray_Resize(writer->buffer, allocated)) + goto error; + /* writer->allocated can be smaller than writer->buffer->ob_alloc, + but we cannot use ob_alloc because bytes may need to be moved + to use the whole buffer. bytearray uses an internal optimization + to avoid moving or copying bytes when bytes are removed at the + beginning (ex: del bytearray[:1]). 
*/ + } + else { + if (_PyBytes_Resize(&writer->buffer, allocated)) + goto error; + } + } + else { + /* convert from stack buffer to bytes object buffer */ + assert(writer->buffer == NULL); + + if (writer->use_bytearray) + writer->buffer = PyByteArray_FromStringAndSize(NULL, allocated); + else + writer->buffer = PyBytes_FromStringAndSize(NULL, allocated); + if (writer->buffer == NULL) + goto error; + + if (pos != 0) { + char *dest; + if (writer->use_bytearray) + dest = PyByteArray_AS_STRING(writer->buffer); + else + dest = PyBytes_AS_STRING(writer->buffer); + Py_MEMCPY(dest, + writer->small_buffer, + pos); + } + + writer->use_small_buffer = 0; +#ifdef Py_DEBUG + memset(writer->small_buffer, 0xDB, sizeof(writer->small_buffer)); +#endif + } + writer->allocated = allocated; + + str = _PyBytesWriter_AsString(writer) + pos; + _PyBytesWriter_CheckConsistency(writer, str); + return str; + +error: + _PyBytesWriter_Dealloc(writer); + return NULL; +} + +void* +_PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size) +{ + Py_ssize_t new_min_size; + + _PyBytesWriter_CheckConsistency(writer, str); + assert(size >= 0); + + if (size == 0) { + /* nothing to do */ + return str; + } + + if (writer->min_size > PY_SSIZE_T_MAX - size) { + PyErr_NoMemory(); + _PyBytesWriter_Dealloc(writer); + return NULL; + } + new_min_size = writer->min_size + size; + + if (new_min_size > writer->allocated) + str = _PyBytesWriter_Resize(writer, str, new_min_size); + + writer->min_size = new_min_size; + return str; +} + +/* Allocate the buffer to write size bytes. + Return the pointer to the beginning of buffer data. + Raise an exception and return NULL on error. 
*/ +void* +_PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size) +{ + /* ensure that _PyBytesWriter_Alloc() is only called once */ + assert(writer->min_size == 0 && writer->buffer == NULL); + assert(size >= 0); + + writer->use_small_buffer = 1; +#ifdef Py_DEBUG + writer->allocated = sizeof(writer->small_buffer) - 1; + /* In debug mode, don't use the full small buffer because it is less + efficient than bytes and bytearray objects to detect buffer underflow + and buffer overflow. Use 10 bytes of the small buffer to test also + code using the smaller buffer in debug mode. + + Don't modify the _PyBytesWriter structure (use a shorter small buffer) + in debug mode to also be able to detect stack overflow when running + tests in debug mode. The _PyBytesWriter is large (more than 512 bytes), + if Py_EnterRecursiveCall() is not used in deep C callback, we may hit a + stack overflow. */ + writer->allocated = Py_MIN(writer->allocated, 10); + /* _PyBytesWriter_CheckConsistency() requires the last byte to be 0, + to detect buffer overflow */ + writer->small_buffer[writer->allocated] = 0; +#else + writer->allocated = sizeof(writer->small_buffer); +#endif + return _PyBytesWriter_Prepare(writer, writer->small_buffer, size); +} + +PyObject * +_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str) +{ + Py_ssize_t size; + PyObject *result; + + _PyBytesWriter_CheckConsistency(writer, str); + + size = _PyBytesWriter_GetSize(writer, str); + if (size == 0 && !writer->use_bytearray) { + Py_CLEAR(writer->buffer); + /* Get the empty byte string singleton */ + result = PyBytes_FromStringAndSize(NULL, 0); + } + else if (writer->use_small_buffer) { + if (writer->use_bytearray) { + result = PyByteArray_FromStringAndSize(writer->small_buffer, size); + } + else { + result = PyBytes_FromStringAndSize(writer->small_buffer, size); + } + } + else { + result = writer->buffer; + writer->buffer = NULL; + + if (size != writer->allocated) { + if (writer->use_bytearray) { + if 
(PyByteArray_Resize(result, size)) { + Py_DECREF(result); + return NULL; + } + } + else { + if (_PyBytes_Resize(&result, size)) { + assert(result == NULL); + return NULL; + } + } + } + } + return result; +} + +void* +_PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *ptr, + const void *bytes, Py_ssize_t size) +{ + char *str = (char *)ptr; + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + Py_MEMCPY(str, bytes, size); + str += size; + + return str; +} diff --git a/Objects/clinic/bytesobject.c.h b/Objects/clinic/bytesobject.c.h index 5a1a5e9..95ce817 100644 --- a/Objects/clinic/bytesobject.c.h +++ b/Objects/clinic/bytesobject.c.h @@ -20,10 +20,10 @@ PyDoc_STRVAR(bytes_split__doc__, {"split", (PyCFunction)bytes_split, METH_VARARGS|METH_KEYWORDS, bytes_split__doc__}, static PyObject * -bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit); +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit); static PyObject * -bytes_split(PyBytesObject*self, PyObject *args, PyObject *kwargs) +bytes_split(PyBytesObject *self, PyObject *args, PyObject *kwargs) { PyObject *return_value = NULL; static char *_keywords[] = {"sep", "maxsplit", NULL}; @@ -133,10 +133,10 @@ PyDoc_STRVAR(bytes_rsplit__doc__, {"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS|METH_KEYWORDS, bytes_rsplit__doc__}, static PyObject * -bytes_rsplit_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit); +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit); static PyObject * -bytes_rsplit(PyBytesObject*self, PyObject *args, PyObject *kwargs) +bytes_rsplit(PyBytesObject *self, PyObject *args, PyObject *kwargs) { PyObject *return_value = NULL; static char *_keywords[] = {"sep", "maxsplit", NULL}; @@ -359,11 +359,11 @@ PyDoc_STRVAR(bytes_replace__doc__, {"replace", (PyCFunction)bytes_replace, METH_VARARGS, bytes_replace__doc__}, static PyObject * -bytes_replace_impl(PyBytesObject*self, Py_buffer *old, 
Py_buffer *new, +bytes_replace_impl(PyBytesObject *self, Py_buffer *old, Py_buffer *new, Py_ssize_t count); static PyObject * -bytes_replace(PyBytesObject*self, PyObject *args) +bytes_replace(PyBytesObject *self, PyObject *args) { PyObject *return_value = NULL; Py_buffer old = {NULL, NULL}; @@ -405,11 +405,11 @@ PyDoc_STRVAR(bytes_decode__doc__, {"decode", (PyCFunction)bytes_decode, METH_VARARGS|METH_KEYWORDS, bytes_decode__doc__}, static PyObject * -bytes_decode_impl(PyBytesObject*self, const char *encoding, +bytes_decode_impl(PyBytesObject *self, const char *encoding, const char *errors); static PyObject * -bytes_decode(PyBytesObject*self, PyObject *args, PyObject *kwargs) +bytes_decode(PyBytesObject *self, PyObject *args, PyObject *kwargs) { PyObject *return_value = NULL; static char *_keywords[] = {"encoding", "errors", NULL}; @@ -438,10 +438,10 @@ PyDoc_STRVAR(bytes_splitlines__doc__, {"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS|METH_KEYWORDS, bytes_splitlines__doc__}, static PyObject * -bytes_splitlines_impl(PyBytesObject*self, int keepends); +bytes_splitlines_impl(PyBytesObject *self, int keepends); static PyObject * -bytes_splitlines(PyBytesObject*self, PyObject *args, PyObject *kwargs) +bytes_splitlines(PyBytesObject *self, PyObject *args, PyObject *kwargs) { PyObject *return_value = NULL; static char *_keywords[] = {"keepends", NULL}; @@ -484,4 +484,4 @@ bytes_fromhex(PyTypeObject *type, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=bd0ce8f25d7e18f4 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=d0e9f5a1c0682910 input=a9049054013a1b77]*/ diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 964ae62..f089f75 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -694,7 +694,8 @@ PyCode_Addr2Line(PyCodeObject *co, int addrq) addr += *p++; if (addr > addrq) break; - line += *p++; + line += (signed char)*p; + p++; } return line; } @@ -729,17 +730,19 @@ 
_PyCode_CheckLineNumber(PyCodeObject* co, int lasti, PyAddrPair *bounds) if (addr + *p > lasti) break; addr += *p++; - if (*p) + if ((signed char)*p) bounds->ap_lower = addr; - line += *p++; + line += (signed char)*p; + p++; --size; } if (size > 0) { while (--size >= 0) { addr += *p++; - if (*p++) + if ((signed char)*p) break; + p++; } bounds->ap_upper = addr; } diff --git a/Objects/descrobject.c b/Objects/descrobject.c index da68e3b..4bc73b9 100644 --- a/Objects/descrobject.c +++ b/Objects/descrobject.c @@ -22,7 +22,7 @@ descr_name(PyDescrObject *descr) } static PyObject * -descr_repr(PyDescrObject *descr, char *format) +descr_repr(PyDescrObject *descr, const char *format) { PyObject *name = NULL; if (descr->d_name != NULL && PyUnicode_Check(descr->d_name)) diff --git a/Objects/dictobject.c b/Objects/dictobject.c index d774586..31c45ef 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -324,7 +324,7 @@ static PyDictKeysObject *new_keys_object(Py_ssize_t size) assert(size >= PyDict_MINSIZE_SPLIT); assert(IS_POWER_OF_2(size)); - dk = PyMem_MALLOC(sizeof(PyDictKeysObject) + + dk = PyObject_MALLOC(sizeof(PyDictKeysObject) + sizeof(PyDictKeyEntry) * (size-1)); if (dk == NULL) { PyErr_NoMemory(); @@ -353,7 +353,7 @@ free_keys_object(PyDictKeysObject *keys) Py_XDECREF(entries[i].me_key); Py_XDECREF(entries[i].me_value); } - PyMem_FREE(keys); + PyObject_FREE(keys); } #define new_values(size) PyMem_NEW(PyObject *, size) @@ -964,7 +964,7 @@ dictresize(PyDictObject *mp, Py_ssize_t minused) } } assert(oldkeys->dk_refcnt == 1); - DK_DEBUG_DECREF PyMem_FREE(oldkeys); + DK_DEBUG_DECREF PyObject_FREE(oldkeys); } return 0; } @@ -1163,39 +1163,42 @@ _PyDict_GetItemIdWithError(PyObject *dp, struct _Py_Identifier *key) return PyDict_GetItemWithError(dp, kv); } -/* Fast version of global value lookup. +/* Fast version of global value lookup (LOAD_GLOBAL). * Lookup in globals, then builtins. 
+ * + * Raise an exception and return NULL if an error occurred (ex: computing the + * key hash failed, key comparison failed, ...). Return NULL if the key doesn't + * exist. Return the value if the key exists. */ PyObject * _PyDict_LoadGlobal(PyDictObject *globals, PyDictObject *builtins, PyObject *key) { - PyObject *x; - if (PyUnicode_CheckExact(key)) { - PyObject **value_addr; - Py_hash_t hash = ((PyASCIIObject *)key)->hash; - if (hash != -1) { - PyDictKeyEntry *e; - e = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr); - if (e == NULL) { - return NULL; - } - x = *value_addr; - if (x != NULL) - return x; - e = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr); - if (e == NULL) { - return NULL; - } - x = *value_addr; - return x; - } + Py_hash_t hash; + PyDictKeyEntry *entry; + PyObject **value_addr; + PyObject *value; + + if (!PyUnicode_CheckExact(key) || + (hash = ((PyASCIIObject *) key)->hash) == -1) + { + hash = PyObject_Hash(key); + if (hash == -1) + return NULL; } - x = PyDict_GetItemWithError((PyObject *)globals, key); - if (x != NULL) - return x; - if (PyErr_Occurred()) + + /* namespace 1: globals */ + entry = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr); + if (entry == NULL) return NULL; - return PyDict_GetItemWithError((PyObject *)builtins, key); + value = *value_addr; + if (value != NULL) + return value; + + /* namespace 2: builtins */ + entry = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr); + if (entry == NULL) + return NULL; + return *value_addr; } /* CAUTION: PyDict_SetItem() must guarantee that it won't resize the @@ -1920,7 +1923,7 @@ dict_fromkeys_impl(PyTypeObject *type, PyObject *iterable, PyObject *value) } static int -dict_update_common(PyObject *self, PyObject *args, PyObject *kwds, char *methname) +dict_update_common(PyObject *self, PyObject *args, PyObject *kwds, const char *methname) { PyObject *arg = NULL; int result = 0; diff --git a/Objects/exceptions.c b/Objects/exceptions.c 
index d03aada..2c7688c 100644 --- a/Objects/exceptions.c +++ b/Objects/exceptions.c @@ -59,15 +59,11 @@ BaseException_new(PyTypeObject *type, PyObject *args, PyObject *kwds) static int BaseException_init(PyBaseExceptionObject *self, PyObject *args, PyObject *kwds) { - PyObject *tmp; - if (!_PyArg_NoKeywords(Py_TYPE(self)->tp_name, kwds)) return -1; - tmp = self->args; - self->args = args; - Py_INCREF(self->args); - Py_XDECREF(tmp); + Py_INCREF(args); + Py_XSETREF(self->args, args); return 0; } @@ -328,11 +324,10 @@ PyException_GetCause(PyObject *self) { /* Steals a reference to cause */ void -PyException_SetCause(PyObject *self, PyObject *cause) { - PyObject *old_cause = ((PyBaseExceptionObject *)self)->cause; - ((PyBaseExceptionObject *)self)->cause = cause; +PyException_SetCause(PyObject *self, PyObject *cause) +{ ((PyBaseExceptionObject *)self)->suppress_context = 1; - Py_XDECREF(old_cause); + Py_XSETREF(((PyBaseExceptionObject *)self)->cause, cause); } PyObject * @@ -344,10 +339,9 @@ PyException_GetContext(PyObject *self) { /* Steals a reference to context */ void -PyException_SetContext(PyObject *self, PyObject *context) { - PyObject *old_context = ((PyBaseExceptionObject *)self)->context; - ((PyBaseExceptionObject *)self)->context = context; - Py_XDECREF(old_context); +PyException_SetContext(PyObject *self, PyObject *context) +{ + Py_XSETREF(((PyBaseExceptionObject *)self)->context, context); } diff --git a/Objects/floatobject.c b/Objects/floatobject.c index d92bec3..5b2742a 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -1195,7 +1195,7 @@ Return a hexadecimal representation of a floating-point number.\n\ static PyObject * float_fromhex(PyObject *cls, PyObject *arg) { - PyObject *result_as_float, *result; + PyObject *result; double x; long exp, top_exp, lsb, key_digit; char *s, *coeff_start, *s_store, *coeff_end, *exp_start, *s_end; @@ -1410,11 +1410,10 @@ float_fromhex(PyObject *cls, PyObject *arg) s++; if (s != s_end) goto parse_error; - 
result_as_float = Py_BuildValue("(d)", negate ? -x : x); - if (result_as_float == NULL) - return NULL; - result = PyObject_CallObject(cls, result_as_float); - Py_DECREF(result_as_float); + result = PyFloat_FromDouble(negate ? -x : x); + if (cls != (PyObject *)&PyFloat_Type && result != NULL) { + Py_SETREF(result, PyObject_CallFunctionObjArgs(cls, result, NULL)); + } return result; overflow_error: @@ -1451,29 +1450,23 @@ float_as_integer_ratio(PyObject *v, PyObject *unused) int exponent; int i; - PyObject *prev; PyObject *py_exponent = NULL; PyObject *numerator = NULL; PyObject *denominator = NULL; PyObject *result_pair = NULL; PyNumberMethods *long_methods = PyLong_Type.tp_as_number; -#define INPLACE_UPDATE(obj, call) \ - prev = obj; \ - obj = call; \ - Py_DECREF(prev); \ - CONVERT_TO_DOUBLE(v, self); if (Py_IS_INFINITY(self)) { - PyErr_SetString(PyExc_OverflowError, - "Cannot pass infinity to float.as_integer_ratio."); - return NULL; + PyErr_SetString(PyExc_OverflowError, + "cannot convert Infinity to integer ratio"); + return NULL; } if (Py_IS_NAN(self)) { - PyErr_SetString(PyExc_ValueError, - "Cannot pass NaN to float.as_integer_ratio."); - return NULL; + PyErr_SetString(PyExc_ValueError, + "cannot convert NaN to integer ratio"); + return NULL; } PyFPE_START_PROTECT("as_integer_ratio", goto error); @@ -1489,29 +1482,31 @@ float_as_integer_ratio(PyObject *v, PyObject *unused) to be truncated by PyLong_FromDouble(). 
*/ numerator = PyLong_FromDouble(float_part); - if (numerator == NULL) goto error; + if (numerator == NULL) + goto error; + denominator = PyLong_FromLong(1); + if (denominator == NULL) + goto error; + py_exponent = PyLong_FromLong(Py_ABS(exponent)); + if (py_exponent == NULL) + goto error; /* fold in 2**exponent */ - denominator = PyLong_FromLong(1); - py_exponent = PyLong_FromLong(labs((long)exponent)); - if (py_exponent == NULL) goto error; - INPLACE_UPDATE(py_exponent, - long_methods->nb_lshift(denominator, py_exponent)); - if (py_exponent == NULL) goto error; if (exponent > 0) { - INPLACE_UPDATE(numerator, - long_methods->nb_multiply(numerator, py_exponent)); - if (numerator == NULL) goto error; + Py_SETREF(numerator, + long_methods->nb_lshift(numerator, py_exponent)); + if (numerator == NULL) + goto error; } else { - Py_DECREF(denominator); - denominator = py_exponent; - py_exponent = NULL; + Py_SETREF(denominator, + long_methods->nb_lshift(denominator, py_exponent)); + if (denominator == NULL) + goto error; } result_pair = PyTuple_Pack(2, numerator, denominator); -#undef INPLACE_UPDATE error: Py_XDECREF(py_exponent); Py_XDECREF(denominator); diff --git a/Objects/frameobject.c b/Objects/frameobject.c index bdf06db..a4a862a 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -137,7 +137,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno) new_lasti = -1; for (offset = 0; offset < lnotab_len; offset += 2) { addr += lnotab[offset]; - line += lnotab[offset+1]; + line += (signed char)lnotab[offset+1]; if (line >= new_lineno) { new_lasti = addr; new_lineno = line; @@ -349,15 +349,11 @@ frame_gettrace(PyFrameObject *f, void *closure) static int frame_settrace(PyFrameObject *f, PyObject* v, void *closure) { - PyObject* old_value; - /* We rely on f_lineno being accurate when f_trace is set. 
*/ f->f_lineno = PyFrame_GetLineNumber(f); - old_value = f->f_trace; Py_XINCREF(v); - f->f_trace = v; - Py_XDECREF(old_value); + Py_XSETREF(f->f_trace, v); return 0; } diff --git a/Objects/funcobject.c b/Objects/funcobject.c index e6c327d..261c16d 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -249,7 +249,6 @@ func_get_code(PyFunctionObject *op) static int func_set_code(PyFunctionObject *op, PyObject *value) { - PyObject *tmp; Py_ssize_t nfree, nclosure; /* Not legal to del f.func_code or to set it to anything @@ -270,10 +269,8 @@ func_set_code(PyFunctionObject *op, PyObject *value) nclosure, nfree); return -1; } - tmp = op->func_code; Py_INCREF(value); - op->func_code = value; - Py_DECREF(tmp); + Py_XSETREF(op->func_code, value); return 0; } @@ -287,8 +284,6 @@ func_get_name(PyFunctionObject *op) static int func_set_name(PyFunctionObject *op, PyObject *value) { - PyObject *tmp; - /* Not legal to del f.func_name or to set it to anything * other than a string object. */ if (value == NULL || !PyUnicode_Check(value)) { @@ -296,10 +291,8 @@ func_set_name(PyFunctionObject *op, PyObject *value) "__name__ must be set to a string object"); return -1; } - tmp = op->func_name; Py_INCREF(value); - op->func_name = value; - Py_DECREF(tmp); + Py_XSETREF(op->func_name, value); return 0; } @@ -313,8 +306,6 @@ func_get_qualname(PyFunctionObject *op) static int func_set_qualname(PyFunctionObject *op, PyObject *value) { - PyObject *tmp; - /* Not legal to del f.__qualname__ or to set it to anything * other than a string object. 
*/ if (value == NULL || !PyUnicode_Check(value)) { @@ -322,10 +313,8 @@ func_set_qualname(PyFunctionObject *op, PyObject *value) "__qualname__ must be set to a string object"); return -1; } - tmp = op->func_qualname; Py_INCREF(value); - op->func_qualname = value; - Py_DECREF(tmp); + Py_XSETREF(op->func_qualname, value); return 0; } @@ -343,8 +332,6 @@ func_get_defaults(PyFunctionObject *op) static int func_set_defaults(PyFunctionObject *op, PyObject *value) { - PyObject *tmp; - /* Legal to del f.func_defaults. * Can only set func_defaults to NULL or a tuple. */ if (value == Py_None) @@ -354,10 +341,8 @@ func_set_defaults(PyFunctionObject *op, PyObject *value) "__defaults__ must be set to a tuple object"); return -1; } - tmp = op->func_defaults; Py_XINCREF(value); - op->func_defaults = value; - Py_XDECREF(tmp); + Py_XSETREF(op->func_defaults, value); return 0; } @@ -375,8 +360,6 @@ func_get_kwdefaults(PyFunctionObject *op) static int func_set_kwdefaults(PyFunctionObject *op, PyObject *value) { - PyObject *tmp; - if (value == Py_None) value = NULL; /* Legal to del f.func_kwdefaults. @@ -386,10 +369,8 @@ func_set_kwdefaults(PyFunctionObject *op, PyObject *value) "__kwdefaults__ must be set to a dict object"); return -1; } - tmp = op->func_kwdefaults; Py_XINCREF(value); - op->func_kwdefaults = value; - Py_XDECREF(tmp); + Py_XSETREF(op->func_kwdefaults, value); return 0; } @@ -408,8 +389,6 @@ func_get_annotations(PyFunctionObject *op) static int func_set_annotations(PyFunctionObject *op, PyObject *value) { - PyObject *tmp; - if (value == Py_None) value = NULL; /* Legal to del f.func_annotations. 
@@ -420,10 +399,8 @@ func_set_annotations(PyFunctionObject *op, PyObject *value) "__annotations__ must be set to a dict object"); return -1; } - tmp = op->func_annotations; Py_XINCREF(value); - op->func_annotations = value; - Py_XDECREF(tmp); + Py_XSETREF(op->func_annotations, value); return 0; } diff --git a/Objects/genobject.c b/Objects/genobject.c index f74d044..c94a6ed 100644 --- a/Objects/genobject.c +++ b/Objects/genobject.c @@ -187,7 +187,7 @@ gen_send_ex(PyGenObject *gen, PyObject *arg, int exc, int closing) /* Pop the exception before issuing a warning. */ PyErr_Fetch(&exc, &val, &tb); - if (PyErr_WarnFormat(PyExc_PendingDeprecationWarning, 1, + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "generator '%.50S' raised StopIteration", gen->gi_qualname)) { /* Warning was converted to an error. */ @@ -519,8 +519,6 @@ gen_get_name(PyGenObject *op) static int gen_set_name(PyGenObject *op, PyObject *value) { - PyObject *tmp; - /* Not legal to del gen.gi_name or to set it to anything * other than a string object. */ if (value == NULL || !PyUnicode_Check(value)) { @@ -528,10 +526,8 @@ gen_set_name(PyGenObject *op, PyObject *value) "__name__ must be set to a string object"); return -1; } - tmp = op->gi_name; Py_INCREF(value); - op->gi_name = value; - Py_DECREF(tmp); + Py_XSETREF(op->gi_name, value); return 0; } @@ -545,8 +541,6 @@ gen_get_qualname(PyGenObject *op) static int gen_set_qualname(PyGenObject *op, PyObject *value) { - PyObject *tmp; - /* Not legal to del gen.__qualname__ or to set it to anything * other than a string object. 
*/ if (value == NULL || !PyUnicode_Check(value)) { @@ -554,10 +548,8 @@ gen_set_qualname(PyGenObject *op, PyObject *value) "__qualname__ must be set to a string object"); return -1; } - tmp = op->gi_qualname; Py_INCREF(value); - op->gi_qualname = value; - Py_DECREF(tmp); + Py_XSETREF(op->gi_qualname, value); return 0; } diff --git a/Objects/listobject.c b/Objects/listobject.c index d688179..6e2d026 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -216,7 +216,6 @@ int PyList_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem) { - PyObject *olditem; PyObject **p; if (!PyList_Check(op)) { Py_XDECREF(newitem); @@ -230,9 +229,7 @@ PyList_SetItem(PyObject *op, Py_ssize_t i, return -1; } p = ((PyListObject *)op) -> ob_item + i; - olditem = *p; - *p = newitem; - Py_XDECREF(olditem); + Py_XSETREF(*p, newitem); return 0; } @@ -251,7 +248,7 @@ ins1(PyListObject *self, Py_ssize_t where, PyObject *v) return -1; } - if (list_resize(self, n+1) == -1) + if (list_resize(self, n+1) < 0) return -1; if (where < 0) { @@ -291,7 +288,7 @@ app1(PyListObject *self, PyObject *v) return -1; } - if (list_resize(self, n+1) == -1) + if (list_resize(self, n+1) < 0) return -1; Py_INCREF(v); @@ -711,7 +708,7 @@ list_inplace_repeat(PyListObject *self, Py_ssize_t n) return PyErr_NoMemory(); } - if (list_resize(self, size*n) == -1) + if (list_resize(self, size*n) < 0) return NULL; p = size; @@ -730,7 +727,6 @@ list_inplace_repeat(PyListObject *self, Py_ssize_t n) static int list_ass_item(PyListObject *a, Py_ssize_t i, PyObject *v) { - PyObject *old_value; if (i < 0 || i >= Py_SIZE(a)) { PyErr_SetString(PyExc_IndexError, "list assignment index out of range"); @@ -739,9 +735,7 @@ list_ass_item(PyListObject *a, Py_ssize_t i, PyObject *v) if (v == NULL) return list_ass_slice(a, i, i+1, v); Py_INCREF(v); - old_value = a->ob_item[i]; - a->ob_item[i] = v; - Py_DECREF(old_value); + Py_SETREF(a->ob_item[i], v); return 0; } @@ -804,7 +798,7 @@ listextend(PyListObject *self, PyObject *b) 
Py_RETURN_NONE; } m = Py_SIZE(self); - if (list_resize(self, m + n) == -1) { + if (list_resize(self, m + n) < 0) { Py_DECREF(b); return NULL; } @@ -832,7 +826,7 @@ listextend(PyListObject *self, PyObject *b) /* Guess a result list size. */ n = PyObject_LengthHint(b, 8); - if (n == -1) { + if (n < 0) { Py_DECREF(it); return NULL; } @@ -840,7 +834,7 @@ listextend(PyListObject *self, PyObject *b) mn = m + n; if (mn >= m) { /* Make room. */ - if (list_resize(self, mn) == -1) + if (list_resize(self, mn) < 0) goto error; /* Make the list sane again. */ Py_SIZE(self) = m; diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 832e4f2..fef982f 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -486,7 +486,7 @@ sub-run, yet finding such very efficiently when they exist. I first learned about the galloping strategy in a related context; see: "Adaptive Set Intersections, Unions, and Differences" (2000) - Erik D. Demaine, Alejandro López-Ortiz, J. Ian Munro + Erik D. Demaine, Alejandro López-Ortiz, J. Ian Munro and its followup(s). An earlier paper called the same strategy "exponential search": diff --git a/Objects/lnotab_notes.txt b/Objects/lnotab_notes.txt index d247edd..5153757 100644 --- a/Objects/lnotab_notes.txt +++ b/Objects/lnotab_notes.txt @@ -12,42 +12,47 @@ pairs. The details are important and delicate, best illustrated by example: 0 1 6 2 50 7 - 350 307 - 361 308 + 350 207 + 361 208 Instead of storing these numbers literally, we compress the list by storing only -the increments from one row to the next. Conceptually, the stored list might +the difference from one row to the next. Conceptually, the stored list might look like: - 0, 1, 6, 1, 44, 5, 300, 300, 11, 1 + 0, 1, 6, 1, 44, 5, 300, 200, 11, 1 -The above doesn't really work, but it's a start. Note that an unsigned byte -can't hold negative values, or values larger than 255, and the above example -contains two such values.
So we make two tweaks: +The above doesn't really work, but it's a start. An unsigned byte (byte code +offset) can't hold negative values, or values larger than 255, a signed byte +(line number) can't hold values larger than 127 or less than -128, and the +above example contains two such values. So we make two tweaks: - (a) there's a deep assumption that byte code offsets and their corresponding - line #s both increase monotonically, and - (b) if at least one column jumps by more than 255 from one row to the next, - more than one pair is written to the table. In case #b, there's no way to know - from looking at the table later how many were written. That's the delicate - part. A user of co_lnotab desiring to find the source line number - corresponding to a bytecode address A should do something like this + (a) there's a deep assumption that byte code offsets increase monotonically, + and + (b) if byte code offset jumps by more than 255 from one row to the next, or if + source code line number jumps by more than 127 or less than -128 from one row + to the next, more than one pair is written to the table. In case #b, + there's no way to know from looking at the table later how many were written. + That's the delicate part. A user of co_lnotab desiring to find the source + line number corresponding to a bytecode address A should do something like + this: lineno = addr = 0 for addr_incr, line_incr in co_lnotab: addr += addr_incr if addr > A: return lineno + if line_incr >= 0x80: + line_incr -= 0x100 lineno += line_incr (In C, this is implemented by PyCode_Addr2Line().) In order for this to work, when the addr field increments by more than 255, the line # increment in each pair generated must be 0 until the remaining addr increment is < 256. So, in the example above, assemble_lnotab in compile.c should not (as was actually done -until 2.2) expand 300, 300 to +until 2.2) expand 300, 200 to 255, 255, 45, 45, but to - 255, 0, 45, 255, 0, 45. + 255, 0, 45, 128, 0, 72. 
The above is sufficient to reconstruct line numbers for tracebacks, but not for line tracing. Tracing is handled by PyCode_CheckLineNumber() in codeobject.c @@ -90,16 +95,16 @@ which compiles to this: 6 POP_JUMP_IF_FALSE 17 3 9 LOAD_CONST 1 (1) - 12 PRINT_ITEM + 12 PRINT_ITEM - 4 13 BREAK_LOOP + 4 13 BREAK_LOOP 14 JUMP_ABSOLUTE 3 - >> 17 POP_BLOCK + >> 17 POP_BLOCK 6 18 LOAD_CONST 2 (2) - 21 PRINT_ITEM + 21 PRINT_ITEM >> 22 LOAD_CONST 0 (None) - 25 RETURN_VALUE + 25 RETURN_VALUE If 'a' is false, execution will jump to the POP_BLOCK instruction at offset 17 and the co_lnotab will claim that execution has moved to line 4, which is wrong. diff --git a/Objects/longobject.c b/Objects/longobject.c index f68d15e..14d2974 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1582,10 +1582,12 @@ divrem1(PyLongObject *a, digit n, digit *prem) static int long_to_decimal_string_internal(PyObject *aa, PyObject **p_output, - _PyUnicodeWriter *writer) + _PyUnicodeWriter *writer, + _PyBytesWriter *bytes_writer, + char **bytes_str) { PyLongObject *scratch, *a; - PyObject *str; + PyObject *str = NULL; Py_ssize_t size, strlen, size_a, i, j; digit *pout, *pin, rem, tenpow; int negative; @@ -1662,7 +1664,13 @@ long_to_decimal_string_internal(PyObject *aa, return -1; } kind = writer->kind; - str = NULL; + } + else if (bytes_writer) { + *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, strlen); + if (*bytes_str == NULL) { + Py_DECREF(scratch); + return -1; + } } else { str = PyUnicode_New(strlen, '9'); @@ -1673,13 +1681,8 @@ long_to_decimal_string_internal(PyObject *aa, kind = PyUnicode_KIND(str); } -#define WRITE_DIGITS(TYPE) \ +#define WRITE_DIGITS(p) \ do { \ - if (writer) \ - p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \ - else \ - p = (TYPE*)PyUnicode_DATA(str) + strlen; \ - \ /* pout[0] through pout[size-2] contribute exactly \ _PyLong_DECIMAL_SHIFT digits each */ \ for (i=0; i < size - 1; i++) { \ @@ -1699,6 +1702,16 @@ 
long_to_decimal_string_internal(PyObject *aa, /* and sign */ \ if (negative) \ *--p = '-'; \ + } while (0) + +#define WRITE_UNICODE_DIGITS(TYPE) \ + do { \ + if (writer) \ + p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \ + else \ + p = (TYPE*)PyUnicode_DATA(str) + strlen; \ + \ + WRITE_DIGITS(p); \ \ /* check we've counted correctly */ \ if (writer) \ @@ -1708,25 +1721,34 @@ long_to_decimal_string_internal(PyObject *aa, } while (0) /* fill the string right-to-left */ - if (kind == PyUnicode_1BYTE_KIND) { + if (bytes_writer) { + char *p = *bytes_str + strlen; + WRITE_DIGITS(p); + assert(p == *bytes_str); + } + else if (kind == PyUnicode_1BYTE_KIND) { Py_UCS1 *p; - WRITE_DIGITS(Py_UCS1); + WRITE_UNICODE_DIGITS(Py_UCS1); } else if (kind == PyUnicode_2BYTE_KIND) { Py_UCS2 *p; - WRITE_DIGITS(Py_UCS2); + WRITE_UNICODE_DIGITS(Py_UCS2); } else { Py_UCS4 *p; assert (kind == PyUnicode_4BYTE_KIND); - WRITE_DIGITS(Py_UCS4); + WRITE_UNICODE_DIGITS(Py_UCS4); } #undef WRITE_DIGITS +#undef WRITE_UNICODE_DIGITS Py_DECREF(scratch); if (writer) { writer->pos += strlen; } + else if (bytes_writer) { + (*bytes_str) += strlen; + } else { assert(_PyUnicode_CheckConsistency(str, 1)); *p_output = (PyObject *)str; @@ -1738,7 +1760,7 @@ static PyObject * long_to_decimal_string(PyObject *aa) { PyObject *v; - if (long_to_decimal_string_internal(aa, &v, NULL) == -1) + if (long_to_decimal_string_internal(aa, &v, NULL, NULL, NULL) == -1) return NULL; return v; } @@ -1750,10 +1772,11 @@ long_to_decimal_string(PyObject *aa) static int long_format_binary(PyObject *aa, int base, int alternate, - PyObject **p_output, _PyUnicodeWriter *writer) + PyObject **p_output, _PyUnicodeWriter *writer, + _PyBytesWriter *bytes_writer, char **bytes_str) { PyLongObject *a = (PyLongObject *)aa; - PyObject *v; + PyObject *v = NULL; Py_ssize_t sz; Py_ssize_t size_a; enum PyUnicode_Kind kind; @@ -1810,7 +1833,11 @@ long_format_binary(PyObject *aa, int base, int alternate, if 
(_PyUnicodeWriter_Prepare(writer, sz, 'x') == -1) return -1; kind = writer->kind; - v = NULL; + } + else if (bytes_writer) { + *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, sz); + if (*bytes_str == NULL) + return -1; } else { v = PyUnicode_New(sz, 'x'); @@ -1819,13 +1846,8 @@ long_format_binary(PyObject *aa, int base, int alternate, kind = PyUnicode_KIND(v); } -#define WRITE_DIGITS(TYPE) \ +#define WRITE_DIGITS(p) \ do { \ - if (writer) \ - p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \ - else \ - p = (TYPE*)PyUnicode_DATA(v) + sz; \ - \ if (size_a == 0) { \ *--p = '0'; \ } \ @@ -1860,30 +1882,50 @@ long_format_binary(PyObject *aa, int base, int alternate, } \ if (negative) \ *--p = '-'; \ + } while (0) + +#define WRITE_UNICODE_DIGITS(TYPE) \ + do { \ + if (writer) \ + p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \ + else \ + p = (TYPE*)PyUnicode_DATA(v) + sz; \ + \ + WRITE_DIGITS(p); \ + \ if (writer) \ assert(p == ((TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos)); \ else \ assert(p == (TYPE*)PyUnicode_DATA(v)); \ } while (0) - if (kind == PyUnicode_1BYTE_KIND) { + if (bytes_writer) { + char *p = *bytes_str + sz; + WRITE_DIGITS(p); + assert(p == *bytes_str); + } + else if (kind == PyUnicode_1BYTE_KIND) { Py_UCS1 *p; - WRITE_DIGITS(Py_UCS1); + WRITE_UNICODE_DIGITS(Py_UCS1); } else if (kind == PyUnicode_2BYTE_KIND) { Py_UCS2 *p; - WRITE_DIGITS(Py_UCS2); + WRITE_UNICODE_DIGITS(Py_UCS2); } else { Py_UCS4 *p; assert (kind == PyUnicode_4BYTE_KIND); - WRITE_DIGITS(Py_UCS4); + WRITE_UNICODE_DIGITS(Py_UCS4); } #undef WRITE_DIGITS +#undef WRITE_UNICODE_DIGITS if (writer) { writer->pos += sz; } + else if (bytes_writer) { + (*bytes_str) += sz; + } else { assert(_PyUnicode_CheckConsistency(v, 1)); *p_output = v; @@ -1897,9 +1939,9 @@ _PyLong_Format(PyObject *obj, int base) PyObject *str; int err; if (base == 10) - err = long_to_decimal_string_internal(obj, &str, NULL); + err = long_to_decimal_string_internal(obj, &str, 
NULL, NULL, NULL); else - err = long_format_binary(obj, base, 1, &str, NULL); + err = long_format_binary(obj, base, 1, &str, NULL, NULL, NULL); if (err == -1) return NULL; return str; @@ -1911,9 +1953,31 @@ _PyLong_FormatWriter(_PyUnicodeWriter *writer, int base, int alternate) { if (base == 10) - return long_to_decimal_string_internal(obj, NULL, writer); + return long_to_decimal_string_internal(obj, NULL, writer, + NULL, NULL); + else + return long_format_binary(obj, base, alternate, NULL, writer, + NULL, NULL); +} + +char* +_PyLong_FormatBytesWriter(_PyBytesWriter *writer, char *str, + PyObject *obj, + int base, int alternate) +{ + char *str2; + int res; + str2 = str; + if (base == 10) + res = long_to_decimal_string_internal(obj, NULL, NULL, + writer, &str2); else - return long_format_binary(obj, base, alternate, NULL, writer); + res = long_format_binary(obj, base, alternate, NULL, NULL, + writer, &str2); + if (res < 0) + return NULL; + assert(str2 != NULL); + return str2; } /* Table of digit values for 8-bit string -> integer conversion. @@ -2705,6 +2769,13 @@ PyLong_AsDouble(PyObject *v) PyErr_SetString(PyExc_TypeError, "an integer is required"); return -1.0; } + if (Py_ABS(Py_SIZE(v)) <= 1) { + /* Fast path; single digit long (31 bits) will cast safely + to double. This improves performance of FP/long operations + by 20%. + */ + return (double)MEDIUM_VALUE((PyLongObject *)v); + } x = _PyLong_Frexp((PyLongObject *)v, &exponent); if ((x == -1.0 && PyErr_Occurred()) || exponent > DBL_MAX_EXP) { PyErr_SetString(PyExc_OverflowError, @@ -3431,6 +3502,52 @@ long_mul(PyLongObject *a, PyLongObject *b) return (PyObject *)z; } +/* Fast modulo division for single-digit longs. */ +static PyObject * +fast_mod(PyLongObject *a, PyLongObject *b) +{ + sdigit left = a->ob_digit[0]; + sdigit right = b->ob_digit[0]; + sdigit mod; + + assert(Py_ABS(Py_SIZE(a)) == 1); + assert(Py_ABS(Py_SIZE(b)) == 1); + + if (Py_SIZE(a) == Py_SIZE(b)) { + /* 'a' and 'b' have the same sign. 
*/ + mod = left % right; + } + else { + /* Either 'a' or 'b' is negative. */ + mod = right - 1 - (left - 1) % right; + } + + return PyLong_FromLong(mod * (sdigit)Py_SIZE(b)); +} + +/* Fast floor division for single-digit longs. */ +static PyObject * +fast_floor_div(PyLongObject *a, PyLongObject *b) +{ + sdigit left = a->ob_digit[0]; + sdigit right = b->ob_digit[0]; + sdigit div; + + assert(Py_ABS(Py_SIZE(a)) == 1); + assert(Py_ABS(Py_SIZE(b)) == 1); + + if (Py_SIZE(a) == Py_SIZE(b)) { + /* 'a' and 'b' have the same sign. */ + div = left / right; + } + else { + /* Either 'a' or 'b' is negative. */ + div = -1 - (left - 1) / right; + } + + return PyLong_FromLong(div); +} + /* The / and % operators are now defined in terms of divmod(). The expression a mod b has the value a - b*floor(a/b). The long_divrem function gives the remainder after division of @@ -3458,6 +3575,30 @@ l_divmod(PyLongObject *v, PyLongObject *w, { PyLongObject *div, *mod; + if (Py_ABS(Py_SIZE(v)) == 1 && Py_ABS(Py_SIZE(w)) == 1) { + /* Fast path for single-digit longs */ + div = NULL; + if (pdiv != NULL) { + div = (PyLongObject *)fast_floor_div(v, w); + if (div == NULL) { + return -1; + } + } + if (pmod != NULL) { + mod = (PyLongObject *)fast_mod(v, w); + if (mod == NULL) { + Py_XDECREF(div); + return -1; + } + *pmod = mod; + } + if (pdiv != NULL) { + /* We only want to set `*pdiv` when `*pmod` is + set successfully. 
*/ + *pdiv = div; + } + return 0; + } if (long_divrem(v, w, &div, &mod) < 0) return -1; if ((Py_SIZE(mod) < 0 && Py_SIZE(w) > 0) || @@ -3502,6 +3643,11 @@ long_div(PyObject *a, PyObject *b) PyLongObject *div; CHECK_BINOP(a, b); + + if (Py_ABS(Py_SIZE(a)) == 1 && Py_ABS(Py_SIZE(b)) == 1) { + return fast_floor_div((PyLongObject*)a, (PyLongObject*)b); + } + if (l_divmod((PyLongObject*)a, (PyLongObject*)b, &div, NULL) < 0) div = NULL; return (PyObject *)div; @@ -3777,6 +3923,10 @@ long_mod(PyObject *a, PyObject *b) CHECK_BINOP(a, b); + if (Py_ABS(Py_SIZE(a)) == 1 && Py_ABS(Py_SIZE(b)) == 1) { + return fast_mod((PyLongObject*)a, (PyLongObject*)b); + } + if (l_divmod((PyLongObject*)a, (PyLongObject*)b, NULL, &mod) < 0) mod = NULL; return (PyObject *)mod; diff --git a/Objects/memoryobject.c b/Objects/memoryobject.c index 10162cb..e355a83 100644 --- a/Objects/memoryobject.c +++ b/Objects/memoryobject.c @@ -1133,7 +1133,7 @@ get_native_fmtchar(char *result, const char *fmt) return -1; } -Py_LOCAL_INLINE(char *) +Py_LOCAL_INLINE(const char *) get_native_fmtstr(const char *fmt) { int at = 0; @@ -1221,7 +1221,7 @@ cast_to_1D(PyMemoryViewObject *mv, PyObject *format) goto out; } - view->format = get_native_fmtstr(PyBytes_AS_STRING(asciifmt)); + view->format = (char *)get_native_fmtstr(PyBytes_AS_STRING(asciifmt)); if (view->format == NULL) { /* NOT_REACHED: get_native_fmtchar() already validates the format. */ PyErr_SetString(PyExc_RuntimeError, diff --git a/Objects/object.c b/Objects/object.c index 6fc4df1..cc1b2ff 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -644,7 +644,7 @@ PyObject_Bytes(PyObject *v) /* Map rich comparison operators to their swapped version, e.g. 
LT <--> GT */ int _Py_SwappedOp[] = {Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE}; -static char *opstrings[] = {"<", "<=", "==", "!=", ">", ">="}; +static const char * const opstrings[] = {"<", "<=", "==", "!=", ">", ">="}; /* Perform a rich comparison, raising TypeError when the requested comparison operator is not supported. */ @@ -686,11 +686,10 @@ do_richcompare(PyObject *v, PyObject *w, int op) res = (v != w) ? Py_True : Py_False; break; default: - /* XXX Special-case None so it doesn't show as NoneType() */ PyErr_Format(PyExc_TypeError, - "unorderable types: %.100s() %s %.100s()", - v->ob_type->tp_name, + "'%s' not supported between instances of '%.100s' and '%.100s'", opstrings[op], + v->ob_type->tp_name, w->ob_type->tp_name); return NULL; } @@ -1041,8 +1040,7 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict) name->ob_type->tp_name); return NULL; } - else - Py_INCREF(name); + Py_INCREF(name); if (tp->tp_dict == NULL) { if (PyType_Ready(tp) < 0) @@ -1050,10 +1048,10 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict) } descr = _PyType_Lookup(tp, name); - Py_XINCREF(descr); f = NULL; if (descr != NULL) { + Py_INCREF(descr); f = descr->ob_type->tp_descr_get; if (f != NULL && PyDescr_IsData(descr)) { res = f(descr, obj, (PyObject *)obj->ob_type); @@ -1073,8 +1071,9 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict) if (tsize < 0) tsize = -tsize; size = _PyObject_VAR_SIZE(tp, tsize); + assert(size <= PY_SSIZE_T_MAX); - dictoffset += (long)size; + dictoffset += (Py_ssize_t)size; assert(dictoffset > 0); assert(dictoffset % SIZEOF_VOID_P == 0); } @@ -1142,12 +1141,11 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name, Py_INCREF(name); descr = _PyType_Lookup(tp, name); - Py_XINCREF(descr); - f = NULL; if (descr != NULL) { + Py_INCREF(descr); f = descr->ob_type->tp_descr_set; - if (f != NULL && PyDescr_IsData(descr)) { + if (f != NULL) { res = f(descr, obj, 
value); goto done; } @@ -1155,40 +1153,32 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name, if (dict == NULL) { dictptr = _PyObject_GetDictPtr(obj); - if (dictptr != NULL) { - res = _PyObjectDict_SetItem(Py_TYPE(obj), dictptr, name, value); - if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError)) - PyErr_SetObject(PyExc_AttributeError, name); + if (dictptr == NULL) { + if (descr == NULL) { + PyErr_Format(PyExc_AttributeError, + "'%.100s' object has no attribute '%U'", + tp->tp_name, name); + } + else { + PyErr_Format(PyExc_AttributeError, + "'%.50s' object attribute '%U' is read-only", + tp->tp_name, name); + } goto done; } + res = _PyObjectDict_SetItem(tp, dictptr, name, value); } - if (dict != NULL) { + else { Py_INCREF(dict); if (value == NULL) res = PyDict_DelItem(dict, name); else res = PyDict_SetItem(dict, name, value); Py_DECREF(dict); - if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError)) - PyErr_SetObject(PyExc_AttributeError, name); - goto done; - } - - if (f != NULL) { - res = f(descr, obj, value); - goto done; } + if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError)) + PyErr_SetObject(PyExc_AttributeError, name); - if (descr == NULL) { - PyErr_Format(PyExc_AttributeError, - "'%.100s' object has no attribute '%U'", - tp->tp_name, name); - goto done; - } - - PyErr_Format(PyExc_AttributeError, - "'%.50s' object attribute '%U' is read-only", - tp->tp_name, name); done: Py_XDECREF(descr); Py_DECREF(name); @@ -1204,7 +1194,7 @@ PyObject_GenericSetAttr(PyObject *obj, PyObject *name, PyObject *value) int PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context) { - PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj); + PyObject **dictptr = _PyObject_GetDictPtr(obj); if (dictptr == NULL) { PyErr_SetString(PyExc_AttributeError, "This object has no __dict__"); @@ -1220,10 +1210,8 @@ PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context) "not a '%.200s'", Py_TYPE(value)->tp_name); return -1; } - dict = *dictptr; - 
Py_XINCREF(value); - *dictptr = value; - Py_XDECREF(dict); + Py_INCREF(value); + Py_XSETREF(*dictptr, value); return 0; } diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index 7cc889f..3f95133 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -1,17 +1,38 @@ #include "Python.h" + +/* Defined in tracemalloc.c */ +extern void _PyMem_DumpTraceback(int fd, const void *ptr); + + /* Python's malloc wrappers (see pymem.h) */ -#ifdef PYMALLOC_DEBUG /* WITH_PYMALLOC && PYMALLOC_DEBUG */ +/* + * Basic types + * I don't care if these are defined in <sys/types.h> or elsewhere. Axiom. + */ +#undef uchar +#define uchar unsigned char /* assuming == 8 bits */ + +#undef uint +#define uint unsigned int /* assuming >= 16 bits */ + +#undef uptr +#define uptr Py_uintptr_t + /* Forward declaration */ +static void* _PyMem_DebugRawMalloc(void *ctx, size_t size); +static void* _PyMem_DebugRawCalloc(void *ctx, size_t nelem, size_t elsize); +static void* _PyMem_DebugRawRealloc(void *ctx, void *ptr, size_t size); +static void _PyMem_DebugRawFree(void *ctx, void *p); + static void* _PyMem_DebugMalloc(void *ctx, size_t size); static void* _PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize); -static void _PyMem_DebugFree(void *ctx, void *p); static void* _PyMem_DebugRealloc(void *ctx, void *ptr, size_t size); +static void _PyMem_DebugFree(void *ctx, void *p); static void _PyObject_DebugDumpAddress(const void *p); static void _PyMem_DebugCheckAddress(char api_id, const void *p); -#endif #if defined(__has_feature) /* Clang */ #if __has_feature(address_sanitizer) /* is ASAN enabled? 
*/ @@ -145,9 +166,8 @@ _PyObject_ArenaFree(void *ctx, void *ptr, size_t size) #else # define PYOBJ_FUNCS PYRAW_FUNCS #endif -#define PYMEM_FUNCS PYRAW_FUNCS +#define PYMEM_FUNCS PYOBJ_FUNCS -#ifdef PYMALLOC_DEBUG typedef struct { /* We tag each block with an API ID in order to tag API violations */ char api_id; @@ -163,19 +183,21 @@ static struct { {'o', {NULL, PYOBJ_FUNCS}} }; -#define PYDBG_FUNCS _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree -#endif +#define PYRAWDBG_FUNCS \ + _PyMem_DebugRawMalloc, _PyMem_DebugRawCalloc, _PyMem_DebugRawRealloc, _PyMem_DebugRawFree +#define PYDBG_FUNCS \ + _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree static PyMemAllocatorEx _PyMem_Raw = { -#ifdef PYMALLOC_DEBUG - &_PyMem_Debug.raw, PYDBG_FUNCS +#ifdef Py_DEBUG + &_PyMem_Debug.raw, PYRAWDBG_FUNCS #else NULL, PYRAW_FUNCS #endif }; static PyMemAllocatorEx _PyMem = { -#ifdef PYMALLOC_DEBUG +#ifdef Py_DEBUG &_PyMem_Debug.mem, PYDBG_FUNCS #else NULL, PYMEM_FUNCS @@ -183,16 +205,76 @@ static PyMemAllocatorEx _PyMem = { }; static PyMemAllocatorEx _PyObject = { -#ifdef PYMALLOC_DEBUG +#ifdef Py_DEBUG &_PyMem_Debug.obj, PYDBG_FUNCS #else NULL, PYOBJ_FUNCS #endif }; +int +_PyMem_SetupAllocators(const char *opt) +{ + if (opt == NULL || *opt == '\0') { + /* PYTHONMALLOC is empty or is not set or ignored (-E/-I command line + options): use default allocators */ +#ifdef Py_DEBUG +# ifdef WITH_PYMALLOC + opt = "pymalloc_debug"; +# else + opt = "malloc_debug"; +# endif +#else + /* !Py_DEBUG */ +# ifdef WITH_PYMALLOC + opt = "pymalloc"; +# else + opt = "malloc"; +# endif +#endif + } + + if (strcmp(opt, "debug") == 0) { + PyMem_SetupDebugHooks(); + } + else if (strcmp(opt, "malloc") == 0 || strcmp(opt, "malloc_debug") == 0) + { + PyMemAllocatorEx alloc = {NULL, PYRAW_FUNCS}; + + PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &alloc); + PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &alloc); + PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc); + + if 
(strcmp(opt, "malloc_debug") == 0) + PyMem_SetupDebugHooks(); + } +#ifdef WITH_PYMALLOC + else if (strcmp(opt, "pymalloc") == 0 + || strcmp(opt, "pymalloc_debug") == 0) + { + PyMemAllocatorEx raw_alloc = {NULL, PYRAW_FUNCS}; + PyMemAllocatorEx mem_alloc = {NULL, PYMEM_FUNCS}; + PyMemAllocatorEx obj_alloc = {NULL, PYOBJ_FUNCS}; + + PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &raw_alloc); + PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &mem_alloc); + PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &obj_alloc); + + if (strcmp(opt, "pymalloc_debug") == 0) + PyMem_SetupDebugHooks(); + } +#endif + else { + /* unknown allocator */ + return -1; + } + return 0; +} + #undef PYRAW_FUNCS #undef PYMEM_FUNCS #undef PYOBJ_FUNCS +#undef PYRAWDBG_FUNCS #undef PYDBG_FUNCS static PyObjectArenaAllocator _PyObject_Arena = {NULL, @@ -205,23 +287,46 @@ static PyObjectArenaAllocator _PyObject_Arena = {NULL, #endif }; +#ifdef WITH_PYMALLOC +static int +_PyMem_DebugEnabled(void) +{ + return (_PyObject.malloc == _PyMem_DebugMalloc); +} + +int +_PyMem_PymallocEnabled(void) +{ + if (_PyMem_DebugEnabled()) { + return (_PyMem_Debug.obj.alloc.malloc == _PyObject_Malloc); + } + else { + return (_PyObject.malloc == _PyObject_Malloc); + } +} +#endif + void PyMem_SetupDebugHooks(void) { -#ifdef PYMALLOC_DEBUG PyMemAllocatorEx alloc; - alloc.malloc = _PyMem_DebugMalloc; - alloc.calloc = _PyMem_DebugCalloc; - alloc.realloc = _PyMem_DebugRealloc; - alloc.free = _PyMem_DebugFree; + alloc.malloc = _PyMem_DebugRawMalloc; + alloc.calloc = _PyMem_DebugRawCalloc; + alloc.realloc = _PyMem_DebugRawRealloc; + alloc.free = _PyMem_DebugRawFree; - if (_PyMem_Raw.malloc != _PyMem_DebugMalloc) { + if (_PyMem_Raw.malloc != _PyMem_DebugRawMalloc) { alloc.ctx = &_PyMem_Debug.raw; PyMem_GetAllocator(PYMEM_DOMAIN_RAW, &_PyMem_Debug.raw.alloc); PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &alloc); } + alloc.malloc = _PyMem_DebugMalloc; + alloc.calloc = _PyMem_DebugCalloc; + alloc.realloc = _PyMem_DebugRealloc; + alloc.free = _PyMem_DebugFree; + if 
(_PyMem.malloc != _PyMem_DebugMalloc) { alloc.ctx = &_PyMem_Debug.mem; PyMem_GetAllocator(PYMEM_DOMAIN_MEM, &_PyMem_Debug.mem.alloc); @@ -233,7 +338,6 @@ PyMem_SetupDebugHooks(void) PyMem_GetAllocator(PYMEM_DOMAIN_OBJ, &_PyMem_Debug.obj.alloc); PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc); } -#endif } void @@ -264,7 +368,6 @@ PyMem_SetAllocator(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator) case PYMEM_DOMAIN_OBJ: _PyObject = *allocator; break; /* ignore unknown domain */ } - } void @@ -642,22 +745,6 @@ static int running_on_valgrind = -1; #define SIMPLELOCK_LOCK(lock) /* acquire released lock */ #define SIMPLELOCK_UNLOCK(lock) /* release acquired lock */ -/* - * Basic types - * I don't care if these are defined in <sys/types.h> or elsewhere. Axiom. - */ -#undef uchar -#define uchar unsigned char /* assuming == 8 bits */ - -#undef uint -#define uint unsigned int /* assuming >= 16 bits */ - -#undef ulong -#define ulong unsigned long /* assuming >= 32 bits */ - -#undef uptr -#define uptr Py_uintptr_t - /* When you say memory, my mind reasons in terms of (pointers to) blocks */ typedef uchar block; @@ -949,11 +1036,15 @@ new_arena(void) struct arena_object* arenaobj; uint excess; /* number of bytes above pool alignment */ void *address; + static int debug_stats = -1; -#ifdef PYMALLOC_DEBUG - if (Py_GETENV("PYTHONMALLOCSTATS")) + if (debug_stats == -1) { + char *opt = Py_GETENV("PYTHONMALLOCSTATS"); + debug_stats = (opt != NULL && *opt != '\0'); + } + if (debug_stats) _PyObject_DebugMallocStats(stderr); -#endif + if (unused_arena_objects == NULL) { uint i; uint numarenas; @@ -1709,7 +1800,7 @@ _Py_GetAllocatedBlocks(void) #endif /* WITH_PYMALLOC */ -#ifdef PYMALLOC_DEBUG + /*==========================================================================*/ /* A x-platform debugging allocator. This doesn't manage memory directly, * it wraps a real allocator, adding extra debugging info to the memory blocks. 
@@ -1767,31 +1858,6 @@ write_size_t(void *p, size_t n) } } -#ifdef Py_DEBUG -/* Is target in the list? The list is traversed via the nextpool pointers. - * The list may be NULL-terminated, or circular. Return 1 if target is in - * list, else 0. - */ -static int -pool_is_in_list(const poolp target, poolp list) -{ - poolp origlist = list; - assert(target != NULL); - if (list == NULL) - return 0; - do { - if (target == list) - return 1; - list = list->nextpool; - } while (list != NULL && list != origlist); - return 0; -} - -#else -#define pool_is_in_list(X, Y) 1 - -#endif /* Py_DEBUG */ - /* Let S = sizeof(size_t). The debug malloc asks for 4*S extra bytes and fills them with useful stuff, here calling the underlying malloc's result p: @@ -1819,7 +1885,7 @@ p[2*S+n+S: 2*S+n+2*S] */ static void * -_PyMem_DebugAlloc(int use_calloc, void *ctx, size_t nbytes) +_PyMem_DebugRawAlloc(int use_calloc, void *ctx, size_t nbytes) { debug_alloc_api_t *api = (debug_alloc_api_t *)ctx; uchar *p; /* base address of malloc'ed block */ @@ -1856,18 +1922,18 @@ _PyMem_DebugAlloc(int use_calloc, void *ctx, size_t nbytes) } static void * -_PyMem_DebugMalloc(void *ctx, size_t nbytes) +_PyMem_DebugRawMalloc(void *ctx, size_t nbytes) { - return _PyMem_DebugAlloc(0, ctx, nbytes); + return _PyMem_DebugRawAlloc(0, ctx, nbytes); } static void * -_PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize) +_PyMem_DebugRawCalloc(void *ctx, size_t nelem, size_t elsize) { size_t nbytes; assert(elsize == 0 || nelem <= PY_SSIZE_T_MAX / elsize); nbytes = nelem * elsize; - return _PyMem_DebugAlloc(1, ctx, nbytes); + return _PyMem_DebugRawAlloc(1, ctx, nbytes); } /* The debug free first checks the 2*SST bytes on each end for sanity (in @@ -1876,7 +1942,7 @@ _PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize) Then calls the underlying free. 
*/ static void -_PyMem_DebugFree(void *ctx, void *p) +_PyMem_DebugRawFree(void *ctx, void *p) { debug_alloc_api_t *api = (debug_alloc_api_t *)ctx; uchar *q = (uchar *)p - 2*SST; /* address returned from malloc */ @@ -1893,7 +1959,7 @@ _PyMem_DebugFree(void *ctx, void *p) } static void * -_PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes) +_PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes) { debug_alloc_api_t *api = (debug_alloc_api_t *)ctx; uchar *q = (uchar *)p, *oldq; @@ -1903,7 +1969,7 @@ _PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes) int i; if (p == NULL) - return _PyMem_DebugAlloc(0, ctx, nbytes); + return _PyMem_DebugRawAlloc(0, ctx, nbytes); _PyMem_DebugCheckAddress(api->api_id, p); bumpserialno(); @@ -1946,6 +2012,44 @@ _PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes) return q; } +static void +_PyMem_DebugCheckGIL(void) +{ +#ifdef WITH_THREAD + if (!PyGILState_Check()) + Py_FatalError("Python memory allocator called " + "without holding the GIL"); +#endif +} + +static void * +_PyMem_DebugMalloc(void *ctx, size_t nbytes) +{ + _PyMem_DebugCheckGIL(); + return _PyMem_DebugRawMalloc(ctx, nbytes); +} + +static void * +_PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize) +{ + _PyMem_DebugCheckGIL(); + return _PyMem_DebugRawCalloc(ctx, nelem, elsize); +} + +static void +_PyMem_DebugFree(void *ctx, void *ptr) +{ + _PyMem_DebugCheckGIL(); + _PyMem_DebugRawFree(ctx, ptr); +} + +static void * +_PyMem_DebugRealloc(void *ctx, void *ptr, size_t nbytes) +{ + _PyMem_DebugCheckGIL(); + return _PyMem_DebugRawRealloc(ctx, ptr, nbytes); +} + /* Check the forbidden bytes on both ends of the memory allocated for p. * If anything is wrong, print info to stderr via _PyObject_DebugDumpAddress, * and call Py_FatalError to kill the program. 
@@ -2104,9 +2208,12 @@ _PyObject_DebugDumpAddress(const void *p) } fputc('\n', stderr); } + fputc('\n', stderr); + + fflush(stderr); + _PyMem_DumpTraceback(fileno(stderr), p); } -#endif /* PYMALLOC_DEBUG */ static size_t printone(FILE *out, const char* msg, size_t value) @@ -2158,8 +2265,30 @@ _PyDebugAllocatorStats(FILE *out, (void)printone(out, buf2, num_blocks * sizeof_block); } + #ifdef WITH_PYMALLOC +#ifdef Py_DEBUG +/* Is target in the list? The list is traversed via the nextpool pointers. + * The list may be NULL-terminated, or circular. Return 1 if target is in + * list, else 0. + */ +static int +pool_is_in_list(const poolp target, poolp list) +{ + poolp origlist = list; + assert(target != NULL); + if (list == NULL) + return 0; + do { + if (target == list) + return 1; + list = list->nextpool; + } while (list != NULL && list != origlist); + return 0; +} +#endif + /* Print summary info to "out" about the state of pymalloc's structures. * In Py_DEBUG mode, also perform some expensive internal consistency * checks. 
@@ -2233,7 +2362,9 @@ _PyObject_DebugMallocStats(FILE *out) if (p->ref.count == 0) { /* currently unused */ +#ifdef Py_DEBUG assert(pool_is_in_list(p, arenas[i].freepools)); +#endif continue; } ++numpools[sz]; @@ -2273,9 +2404,8 @@ _PyObject_DebugMallocStats(FILE *out) quantization += p * ((POOL_SIZE - POOL_OVERHEAD) % size); } fputc('\n', out); -#ifdef PYMALLOC_DEBUG - (void)printone(out, "# times object malloc called", serialno); -#endif + if (_PyMem_DebugEnabled()) + (void)printone(out, "# times object malloc called", serialno); (void)printone(out, "# arenas allocated total", ntimes_arena_allocated); (void)printone(out, "# arenas reclaimed", ntimes_arena_allocated - narenas); (void)printone(out, "# arenas highwater mark", narenas_highwater); @@ -2303,6 +2433,7 @@ _PyObject_DebugMallocStats(FILE *out) #endif /* #ifdef WITH_PYMALLOC */ + #ifdef Py_USING_MEMORY_DEBUGGER /* Make this function last so gcc won't inline it since the definition is * after the reference. diff --git a/Objects/odictobject.c b/Objects/odictobject.c index 1abdd02..dccbb3e 100644 --- a/Objects/odictobject.c +++ b/Objects/odictobject.c @@ -1424,14 +1424,13 @@ static PyMethodDef odict_methods[] = { * OrderedDict members */ -/* tp_members */ +/* tp_getset */ -static PyMemberDef odict_members[] = { - {"__dict__", T_OBJECT, offsetof(PyODictObject, od_inst_dict), READONLY}, - {0} +static PyGetSetDef odict_getset[] = { + {"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict}, + {NULL} }; - /* ---------------------------------------------- * OrderedDict type slot methods */ @@ -1653,20 +1652,12 @@ odict_init(PyObject *self, PyObject *args, PyObject *kwds) static PyObject * odict_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - PyObject *dict; PyODictObject *od; - dict = PyDict_New(); - if (dict == NULL) - return NULL; - od = (PyODictObject *)PyDict_Type.tp_new(type, args, kwds); - if (od == NULL) { - Py_DECREF(dict); + if (od == NULL) return NULL; - } - od->od_inst_dict = dict; /* 
type constructor fills the memory with zeros (see PyType_GenericAlloc()), there is no need to set them to zero again */ if (_odict_resize(od) < 0) { @@ -1708,8 +1699,8 @@ PyTypeObject PyODict_Type = { (getiterfunc)odict_iter, /* tp_iter */ 0, /* tp_iternext */ odict_methods, /* tp_methods */ - odict_members, /* tp_members */ - 0, /* tp_getset */ + 0, /* tp_members */ + odict_getset, /* tp_getset */ &PyDict_Type, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ diff --git a/Objects/setobject.c b/Objects/setobject.c index 4ef692d..6dd403f 100644 --- a/Objects/setobject.c +++ b/Objects/setobject.c @@ -26,7 +26,6 @@ #include "Python.h" #include "structmember.h" -#include "stringlib/eq.h" /* Object used as dummy key to fill deleted entries */ static PyObject _dummy_struct; @@ -48,19 +47,20 @@ static PyObject _dummy_struct; static setentry * set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash) { - setentry *table = so->table; - setentry *freeslot = NULL; + setentry *table; setentry *entry; - size_t perturb = hash; + size_t perturb; size_t mask = so->mask; size_t i = (size_t)hash & mask; /* Unsigned for defined overflow behavior */ size_t j; int cmp; - entry = &table[i]; + entry = &so->table[i]; if (entry->key == NULL) return entry; + perturb = hash; + while (1) { if (entry->hash == hash) { PyObject *startkey = entry->key; @@ -70,8 +70,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash) return entry; if (PyUnicode_CheckExact(startkey) && PyUnicode_CheckExact(key) - && unicode_eq(startkey, key)) + && _PyUnicode_EQ(startkey, key)) return entry; + table = so->table; Py_INCREF(startkey); cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); Py_DECREF(startkey); @@ -83,14 +84,12 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash) return entry; mask = so->mask; /* help avoid a register spill */ } - if (entry->hash == -1 && freeslot == NULL) - freeslot = entry; if (i + LINEAR_PROBES <= mask) { for (j = 0 ; j < LINEAR_PROBES ; j++) { entry++; - 
if (entry->key == NULL) - goto found_null; + if (entry->hash == 0 && entry->key == NULL) + return entry; if (entry->hash == hash) { PyObject *startkey = entry->key; assert(startkey != dummy); @@ -98,8 +97,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash) return entry; if (PyUnicode_CheckExact(startkey) && PyUnicode_CheckExact(key) - && unicode_eq(startkey, key)) + && _PyUnicode_EQ(startkey, key)) return entry; + table = so->table; Py_INCREF(startkey); cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); Py_DECREF(startkey); @@ -111,7 +111,104 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash) return entry; mask = so->mask; } - if (entry->hash == -1 && freeslot == NULL) + } + } + + perturb >>= PERTURB_SHIFT; + i = (i * 5 + 1 + perturb) & mask; + + entry = &so->table[i]; + if (entry->key == NULL) + return entry; + } +} + +static int set_table_resize(PySetObject *, Py_ssize_t); + +static int +set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash) +{ + setentry *table; + setentry *freeslot; + setentry *entry; + size_t perturb; + size_t mask; + size_t i; /* Unsigned for defined overflow behavior */ + size_t j; + int cmp; + + /* Pre-increment is necessary to prevent arbitrary code in the rich + comparison from deallocating the key just before the insertion. 
*/ + Py_INCREF(key); + + restart: + + mask = so->mask; + i = (size_t)hash & mask; + + entry = &so->table[i]; + if (entry->key == NULL) + goto found_unused; + + freeslot = NULL; + perturb = hash; + + while (1) { + if (entry->hash == hash) { + PyObject *startkey = entry->key; + /* startkey cannot be a dummy because the dummy hash field is -1 */ + assert(startkey != dummy); + if (startkey == key) + goto found_active; + if (PyUnicode_CheckExact(startkey) + && PyUnicode_CheckExact(key) + && _PyUnicode_EQ(startkey, key)) + goto found_active; + table = so->table; + Py_INCREF(startkey); + cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); + Py_DECREF(startkey); + if (cmp > 0) /* likely */ + goto found_active; + if (cmp < 0) + goto comparison_error; + /* Continuing the search from the current entry only makes + sense if the table and entry are unchanged; otherwise, + we have to restart from the beginning */ + if (table != so->table || entry->key != startkey) + goto restart; + mask = so->mask; /* help avoid a register spill */ + } + else if (entry->hash == -1 && freeslot == NULL) + freeslot = entry; + + if (i + LINEAR_PROBES <= mask) { + for (j = 0 ; j < LINEAR_PROBES ; j++) { + entry++; + if (entry->hash == 0 && entry->key == NULL) + goto found_unused_or_dummy; + if (entry->hash == hash) { + PyObject *startkey = entry->key; + assert(startkey != dummy); + if (startkey == key) + goto found_active; + if (PyUnicode_CheckExact(startkey) + && PyUnicode_CheckExact(key) + && _PyUnicode_EQ(startkey, key)) + goto found_active; + table = so->table; + Py_INCREF(startkey); + cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); + Py_DECREF(startkey); + if (cmp > 0) + goto found_active; + if (cmp < 0) + goto comparison_error; + if (table != so->table || entry->key != startkey) + goto restart; + mask = so->mask; + } + else if (entry->hash == -1 && freeslot == NULL) freeslot = entry; } } @@ -119,29 +216,51 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash) perturb >>= 
PERTURB_SHIFT; i = (i * 5 + 1 + perturb) & mask; - entry = &table[i]; + entry = &so->table[i]; if (entry->key == NULL) - goto found_null; + goto found_unused_or_dummy; } - found_null: - return freeslot == NULL ? entry : freeslot; + + found_unused_or_dummy: + if (freeslot == NULL) + goto found_unused; + so->used++; + freeslot->key = key; + freeslot->hash = hash; + return 0; + + found_unused: + so->fill++; + so->used++; + entry->key = key; + entry->hash = hash; + if ((size_t)so->fill*3 < mask*2) + return 0; + return set_table_resize(so, so->used); + + found_active: + Py_DECREF(key); + return 0; + + comparison_error: + Py_DECREF(key); + return -1; } /* Internal routine used by set_table_resize() to insert an item which is known to be absent from the set. This routine also assumes that the set contains no deleted entries. Besides the performance benefit, -using set_insert_clean() in set_table_resize() is dangerous (SF bug #1456209). -Note that no refcounts are changed by this routine; if needed, the caller -is responsible for incref'ing `key`. +there is also safety benefit since using set_add_entry() risks making +a callback in the middle of a set_table_resize(), see issue 1456209. +The caller is responsible for updating the key's reference count and +the setobject's fill and used fields. */ static void -set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash) +set_insert_clean(setentry *table, size_t mask, PyObject *key, Py_hash_t hash) { - setentry *table = so->table; setentry *entry; size_t perturb = hash; - size_t mask = (size_t)so->mask; size_t i = (size_t)hash & mask; size_t j; @@ -162,45 +281,11 @@ set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash) found_null: entry->key = key; entry->hash = hash; - so->fill++; - so->used++; } /* ======== End logic for probing the hash table ========================== */ /* ======================================================================== */ - -/* -Internal routine to insert a new key into the table. 
-Used by the public insert routine. -Eats a reference to key. -*/ -static int -set_insert_key(PySetObject *so, PyObject *key, Py_hash_t hash) -{ - setentry *entry; - - entry = set_lookkey(so, key, hash); - if (entry == NULL) - return -1; - if (entry->key == NULL) { - /* UNUSED */ - entry->key = key; - entry->hash = hash; - so->fill++; - so->used++; - } else if (entry->key == dummy) { - /* DUMMY */ - entry->key = key; - entry->hash = hash; - so->used++; - } else { - /* ACTIVE */ - Py_DECREF(key); - } - return 0; -} - /* Restructure the table by allocating a new table and reinserting all keys again. When entries have been deleted, the new table may @@ -213,10 +298,13 @@ set_table_resize(PySetObject *so, Py_ssize_t minused) setentry *oldtable, *newtable, *entry; Py_ssize_t oldfill = so->fill; Py_ssize_t oldused = so->used; + Py_ssize_t oldmask = so->mask; + size_t newmask; int is_oldtable_malloced; setentry small_copy[PySet_MINSIZE]; assert(minused >= 0); + minused = (minused > 50000) ? minused * 2 : minused * 4; /* Find the smallest table size > minused. */ /* XXX speed-up with intrinsics */ @@ -264,25 +352,24 @@ set_table_resize(PySetObject *so, Py_ssize_t minused) /* Make the set empty, using the new table. 
*/ assert(newtable != oldtable); memset(newtable, 0, sizeof(setentry) * newsize); - so->fill = 0; - so->used = 0; + so->fill = oldused; + so->used = oldused; so->mask = newsize - 1; so->table = newtable; /* Copy the data over; this is refcount-neutral for active entries; dummy entries aren't copied over, of course */ + newmask = (size_t)so->mask; if (oldfill == oldused) { - for (entry = oldtable; oldused > 0; entry++) { + for (entry = oldtable; entry <= oldtable + oldmask; entry++) { if (entry->key != NULL) { - oldused--; - set_insert_clean(so, entry->key, entry->hash); + set_insert_clean(newtable, newmask, entry->key, entry->hash); } } } else { - for (entry = oldtable; oldused > 0; entry++) { + for (entry = oldtable; entry <= oldtable + oldmask; entry++) { if (entry->key != NULL && entry->key != dummy) { - oldused--; - set_insert_clean(so, entry->key, entry->hash); + set_insert_clean(newtable, newmask, entry->key, entry->hash); } } } @@ -292,31 +379,42 @@ set_table_resize(PySetObject *so, Py_ssize_t minused) return 0; } -/* CAUTION: set_add_key/entry() must guarantee it won't resize the table */ +static int +set_contains_entry(PySetObject *so, PyObject *key, Py_hash_t hash) +{ + setentry *entry; + + entry = set_lookkey(so, key, hash); + if (entry != NULL) + return entry->key != NULL; + return -1; +} + +#define DISCARD_NOTFOUND 0 +#define DISCARD_FOUND 1 static int -set_add_entry(PySetObject *so, setentry *entry) +set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash) { - Py_ssize_t n_used; - PyObject *key = entry->key; - Py_hash_t hash = entry->hash; + setentry *entry; + PyObject *old_key; - assert(so->fill <= so->mask); /* at least one empty slot */ - n_used = so->used; - Py_INCREF(key); - if (set_insert_key(so, key, hash)) { - Py_DECREF(key); + entry = set_lookkey(so, key, hash); + if (entry == NULL) return -1; - } - if (!(so->used > n_used && so->fill*3 >= (so->mask+1)*2)) - return 0; - return set_table_resize(so, so->used>50000 ? 
so->used*2 : so->used*4); + if (entry->key == NULL) + return DISCARD_NOTFOUND; + old_key = entry->key; + entry->key = dummy; + entry->hash = -1; + so->used--; + Py_DECREF(old_key); + return DISCARD_FOUND; } static int set_add_key(PySetObject *so, PyObject *key) { - setentry entry; Py_hash_t hash; if (!PyUnicode_CheckExact(key) || @@ -325,50 +423,35 @@ set_add_key(PySetObject *so, PyObject *key) if (hash == -1) return -1; } - entry.key = key; - entry.hash = hash; - return set_add_entry(so, &entry); + return set_add_entry(so, key, hash); } -#define DISCARD_NOTFOUND 0 -#define DISCARD_FOUND 1 - static int -set_discard_entry(PySetObject *so, setentry *oldentry) +set_contains_key(PySetObject *so, PyObject *key) { - setentry *entry; - PyObject *old_key; + Py_hash_t hash; - entry = set_lookkey(so, oldentry->key, oldentry->hash); - if (entry == NULL) - return -1; - if (entry->key == NULL || entry->key == dummy) - return DISCARD_NOTFOUND; - old_key = entry->key; - entry->key = dummy; - entry->hash = -1; - so->used--; - Py_DECREF(old_key); - return DISCARD_FOUND; + if (!PyUnicode_CheckExact(key) || + (hash = ((PyASCIIObject *) key)->hash) == -1) { + hash = PyObject_Hash(key); + if (hash == -1) + return -1; + } + return set_contains_entry(so, key, hash); } static int set_discard_key(PySetObject *so, PyObject *key) { - setentry entry; Py_hash_t hash; - assert (PyAnySet_Check(so)); - if (!PyUnicode_CheckExact(key) || (hash = ((PyASCIIObject *) key)->hash) == -1) { hash = PyObject_Hash(key); if (hash == -1) return -1; } - entry.key = key; - entry.hash = hash; - return set_discard_entry(so, &entry); + return set_discard_entry(so, key, hash); } static void @@ -449,20 +532,22 @@ set_next(PySetObject *so, Py_ssize_t *pos_ptr, setentry **entry_ptr) { Py_ssize_t i; Py_ssize_t mask; - setentry *table; + setentry *entry; assert (PyAnySet_Check(so)); i = *pos_ptr; assert(i >= 0); - table = so->table; mask = so->mask; - while (i <= mask && (table[i].key == NULL || table[i].key == dummy)) 
+ entry = &so->table[i]; + while (i <= mask && (entry->key == NULL || entry->key == dummy)) { i++; + entry++; + } *pos_ptr = i+1; if (i > mask) return 0; - assert(table[i].key != NULL); - *entry_ptr = &table[i]; + assert(entry != NULL); + *entry_ptr = entry; return 1; } @@ -560,8 +645,8 @@ set_merge(PySetObject *so, PyObject *otherset) * incrementally resizing as we insert new keys. Expect * that there will be no (or few) overlapping keys. */ - if ((so->fill + other->used)*3 >= (so->mask+1)*2) { - if (set_table_resize(so, (so->used + other->used)*2) != 0) + if ((so->fill + other->used)*3 >= so->mask*2) { + if (set_table_resize(so, so->used + other->used) != 0) return -1; } so_entry = so->table; @@ -586,11 +671,15 @@ set_merge(PySetObject *so, PyObject *otherset) /* If our table is empty, we can use set_insert_clean() */ if (so->fill == 0) { - for (i = 0; i <= other->mask; i++, other_entry++) { + setentry *newtable = so->table; + size_t newmask = (size_t)so->mask; + so->fill = other->used; + so->used = other->used; + for (i = other->mask + 1; i > 0 ; i--, other_entry++) { key = other_entry->key; if (key != NULL && key != dummy) { Py_INCREF(key); - set_insert_clean(so, key, other_entry->hash); + set_insert_clean(newtable, newmask, key, other_entry->hash); } } return 0; @@ -601,46 +690,13 @@ set_merge(PySetObject *so, PyObject *otherset) other_entry = &other->table[i]; key = other_entry->key; if (key != NULL && key != dummy) { - Py_INCREF(key); - if (set_insert_key(so, key, other_entry->hash)) { - Py_DECREF(key); + if (set_add_entry(so, key, other_entry->hash)) return -1; - } } } return 0; } -static int -set_contains_entry(PySetObject *so, setentry *entry) -{ - PyObject *key; - setentry *lu_entry; - - lu_entry = set_lookkey(so, entry->key, entry->hash); - if (lu_entry == NULL) - return -1; - key = lu_entry->key; - return key != NULL && key != dummy; -} - -static int -set_contains_key(PySetObject *so, PyObject *key) -{ - setentry entry; - Py_hash_t hash; - - if 
(!PyUnicode_CheckExact(key) || - (hash = ((PyASCIIObject *) key)->hash) == -1) { - hash = PyObject_Hash(key); - if (hash == -1) - return -1; - } - entry.key = key; - entry.hash = hash; - return set_contains_entry(so, &entry); -} - static PyObject * set_pop(PySetObject *so) { @@ -682,43 +738,64 @@ set_traverse(PySetObject *so, visitproc visit, void *arg) return 0; } -static Py_hash_t -frozenset_hash(PyObject *self) +/* Work to increase the bit dispersion for closely spaced hash values. + This is important because some use cases have many combinations of a + small number of elements with nearby hashes so that many distinct + combinations collapse to only a handful of distinct hash values. */ + +static Py_uhash_t +_shuffle_bits(Py_uhash_t h) { - /* Most of the constants in this hash algorithm are randomly choosen - large primes with "interesting bit patterns" and that passed - tests for good collision statistics on a variety of problematic - datasets such as: + return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL; +} - ps = [] - for r in range(21): - ps += itertools.combinations(range(20), r) - num_distinct_hashes = len({hash(frozenset(s)) for s in ps}) +/* Most of the constants in this hash algorithm are randomly chosen + large primes with "interesting bit patterns" and that passed tests + for good collision statistics on a variety of problematic datasets + including powersets and graph structures (such as David Eppstein's + graph recipes in Lib/test/test_set.py) */ - */ +static Py_hash_t +frozenset_hash(PyObject *self) +{ PySetObject *so = (PySetObject *)self; - Py_uhash_t h, hash = 1927868237UL; + Py_uhash_t hash = 0; setentry *entry; - Py_ssize_t pos = 0; if (so->hash != -1) return so->hash; - hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1; - while (set_next(so, &pos, &entry)) { - /* Work to increase the bit dispersion for closely spaced hash - values. 
This is important because some use cases have many - combinations of a small number of elements with nearby - hashes so that many distinct combinations collapse to only - a handful of distinct hash values. */ - h = entry->hash; - hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL; - } - /* Make the final result spread-out in a different pattern - than the algorithm for tuples or other python objects. */ + /* Xor-in shuffled bits from every entry's hash field because xor is + commutative and a frozenset hash should be independent of order. + + For speed, include null entries and dummy entries and then + subtract out their effect afterwards so that the final hash + depends only on active entries. This allows the code to be + vectorized by the compiler and it saves the unpredictable + branches that would arise when trying to exclude null and dummy + entries on every iteration. */ + + for (entry = so->table; entry <= &so->table[so->mask]; entry++) + hash ^= _shuffle_bits(entry->hash); + + /* Remove the effect of an odd number of NULL entries */ + if ((so->mask + 1 - so->fill) & 1) + hash ^= _shuffle_bits(0); + + /* Remove the effect of an odd number of dummy entries */ + if ((so->fill - so->used) & 1) + hash ^= _shuffle_bits(-1); + + /* Factor in the number of active entries */ + hash ^= ((Py_uhash_t)PySet_GET_SIZE(self) + 1) * 1927868237UL; + + /* Disperse patterns arising in nested frozensets */ hash = hash * 69069U + 907133923UL; + + /* -1 is reserved as an error code */ if (hash == (Py_uhash_t)-1) hash = 590923713UL; + so->hash = hash; return hash; } @@ -865,7 +942,7 @@ PyTypeObject PySetIter_Type = { PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /* tp_flags */ 0, /* tp_doc */ (traverseproc)setiter_traverse, /* tp_traverse */ 0, /* tp_clear */ @@ -910,18 +987,14 @@ set_update_internal(PySetObject *so, PyObject *other) * 
incrementally resizing as we insert new keys. Expect * that there will be no (or few) overlapping keys. */ - if (dictsize == -1) + if (dictsize < 0) return -1; - if ((so->fill + dictsize)*3 >= (so->mask+1)*2) { - if (set_table_resize(so, (so->used + dictsize)*2) != 0) + if ((so->fill + dictsize)*3 >= so->mask*2) { + if (set_table_resize(so, so->used + dictsize) != 0) return -1; } while (_PyDict_Next(other, &pos, &key, &value, &hash)) { - setentry an_entry; - - an_entry.hash = hash; - an_entry.key = key; - if (set_add_entry(so, &an_entry)) + if (set_add_entry(so, key, hash)) return -1; } return 0; @@ -970,9 +1043,8 @@ PyDoc_STRVAR(update_doc, static PyObject * make_new_set(PyTypeObject *type, PyObject *iterable) { - PySetObject *so = NULL; + PySetObject *so; - /* create PySetObject structure */ so = (PySetObject *)type->tp_alloc(type, 0); if (so == NULL) return NULL; @@ -1015,7 +1087,8 @@ frozenset_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyObject *iterable = NULL, *result; - if (type == &PyFrozenSet_Type && !_PyArg_NoKeywords("frozenset()", kwds)) + if (kwds != NULL && type == &PyFrozenSet_Type + && !_PyArg_NoKeywords("frozenset()", kwds)) return NULL; if (!PyArg_UnpackTuple(args, type->tp_name, 0, 1, &iterable)) @@ -1042,24 +1115,9 @@ frozenset_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return emptyfrozenset; } -int -PySet_ClearFreeList(void) -{ - return 0; -} - -void -PySet_Fini(void) -{ - Py_CLEAR(emptyfrozenset); -} - static PyObject * set_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - if (type == &PySet_Type && !_PyArg_NoKeywords("set()", kwds)) - return NULL; - return make_new_set(type, NULL); } @@ -1201,6 +1259,8 @@ set_intersection(PySetObject *so, PyObject *other) { PySetObject *result; PyObject *key, *it, *tmp; + Py_hash_t hash; + int rv; if ((PyObject *)so == other) return set_copy(so); @@ -1220,13 +1280,15 @@ set_intersection(PySetObject *so, PyObject *other) } while (set_next((PySetObject *)other, &pos, 
&entry)) { - int rv = set_contains_entry(so, entry); - if (rv == -1) { + key = entry->key; + hash = entry->hash; + rv = set_contains_entry(so, key, hash); + if (rv < 0) { Py_DECREF(result); return NULL; } if (rv) { - if (set_add_entry(result, entry)) { + if (set_add_entry(result, key, hash)) { Py_DECREF(result); return NULL; } @@ -1242,32 +1304,15 @@ set_intersection(PySetObject *so, PyObject *other) } while ((key = PyIter_Next(it)) != NULL) { - int rv; - setentry entry; - Py_hash_t hash = PyObject_Hash(key); - - if (hash == -1) { - Py_DECREF(it); - Py_DECREF(result); - Py_DECREF(key); - return NULL; - } - entry.hash = hash; - entry.key = key; - rv = set_contains_entry(so, &entry); - if (rv == -1) { - Py_DECREF(it); - Py_DECREF(result); - Py_DECREF(key); - return NULL; - } + hash = PyObject_Hash(key); + if (hash == -1) + goto error; + rv = set_contains_entry(so, key, hash); + if (rv < 0) + goto error; if (rv) { - if (set_add_entry(result, &entry)) { - Py_DECREF(it); - Py_DECREF(result); - Py_DECREF(key); - return NULL; - } + if (set_add_entry(result, key, hash)) + goto error; } Py_DECREF(key); } @@ -1277,6 +1322,11 @@ set_intersection(PySetObject *so, PyObject *other) return NULL; } return (PyObject *)result; + error: + Py_DECREF(it); + Py_DECREF(result); + Py_DECREF(key); + return NULL; } static PyObject * @@ -1363,6 +1413,7 @@ static PyObject * set_isdisjoint(PySetObject *so, PyObject *other) { PyObject *key, *it, *tmp; + int rv; if ((PyObject *)so == other) { if (PySet_GET_SIZE(so) == 0) @@ -1381,8 +1432,8 @@ set_isdisjoint(PySetObject *so, PyObject *other) other = tmp; } while (set_next((PySetObject *)other, &pos, &entry)) { - int rv = set_contains_entry(so, entry); - if (rv == -1) + rv = set_contains_entry(so, entry->key, entry->hash); + if (rv < 0) return NULL; if (rv) Py_RETURN_FALSE; @@ -1395,8 +1446,6 @@ set_isdisjoint(PySetObject *so, PyObject *other) return NULL; while ((key = PyIter_Next(it)) != NULL) { - int rv; - setentry entry; Py_hash_t hash = 
PyObject_Hash(key); if (hash == -1) { @@ -1404,11 +1453,9 @@ set_isdisjoint(PySetObject *so, PyObject *other) Py_DECREF(it); return NULL; } - entry.hash = hash; - entry.key = key; - rv = set_contains_entry(so, &entry); + rv = set_contains_entry(so, key, hash); Py_DECREF(key); - if (rv == -1) { + if (rv < 0) { Py_DECREF(it); return NULL; } @@ -1437,7 +1484,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other) Py_ssize_t pos = 0; while (set_next((PySetObject *)other, &pos, &entry)) - if (set_discard_entry(so, entry) == -1) + if (set_discard_entry(so, entry->key, entry->hash) < 0) return -1; } else { PyObject *key, *it; @@ -1446,7 +1493,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other) return -1; while ((key = PyIter_Next(it)) != NULL) { - if (set_discard_key(so, key) == -1) { + if (set_discard_key(so, key) < 0) { Py_DECREF(it); Py_DECREF(key); return -1; @@ -1457,10 +1504,10 @@ set_difference_update_internal(PySetObject *so, PyObject *other) if (PyErr_Occurred()) return -1; } - /* If more than 1/5 are dummies, then resize them away. */ - if ((so->fill - so->used) * 5 < so->mask) + /* If more than 1/4th are dummies, then resize them away. */ + if ((size_t)(so->fill - so->used) <= (size_t)so->mask / 4) return 0; - return set_table_resize(so, so->used>50000 ? 
so->used*2 : so->used*4); + return set_table_resize(so, so->used); } static PyObject * @@ -1487,7 +1534,7 @@ set_copy_and_difference(PySetObject *so, PyObject *other) result = set_copy(so); if (result == NULL) return NULL; - if (set_difference_update_internal((PySetObject *) result, other) != -1) + if (set_difference_update_internal((PySetObject *) result, other) == 0) return result; Py_DECREF(result); return NULL; @@ -1497,8 +1544,11 @@ static PyObject * set_difference(PySetObject *so, PyObject *other) { PyObject *result; + PyObject *key; + Py_hash_t hash; setentry *entry; Py_ssize_t pos = 0; + int rv; if (!PyAnySet_Check(other) && !PyDict_CheckExact(other)) { return set_copy_and_difference(so, other); @@ -1516,17 +1566,15 @@ set_difference(PySetObject *so, PyObject *other) if (PyDict_CheckExact(other)) { while (set_next(so, &pos, &entry)) { - setentry entrycopy; - int rv; - entrycopy.hash = entry->hash; - entrycopy.key = entry->key; - rv = _PyDict_Contains(other, entry->key, entry->hash); + key = entry->key; + hash = entry->hash; + rv = _PyDict_Contains(other, key, hash); if (rv < 0) { Py_DECREF(result); return NULL; } if (!rv) { - if (set_add_entry((PySetObject *)result, &entrycopy)) { + if (set_add_entry((PySetObject *)result, key, hash)) { Py_DECREF(result); return NULL; } @@ -1537,13 +1585,15 @@ set_difference(PySetObject *so, PyObject *other) /* Iterate over so, checking for common elements in other. 
*/ while (set_next(so, &pos, &entry)) { - int rv = set_contains_entry((PySetObject *)other, entry); - if (rv == -1) { + key = entry->key; + hash = entry->hash; + rv = set_contains_entry((PySetObject *)other, key, hash); + if (rv < 0) { Py_DECREF(result); return NULL; } if (!rv) { - if (set_add_entry((PySetObject *)result, entry)) { + if (set_add_entry((PySetObject *)result, key, hash)) { Py_DECREF(result); return NULL; } @@ -1605,29 +1655,24 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other) PySetObject *otherset; PyObject *key; Py_ssize_t pos = 0; + Py_hash_t hash; setentry *entry; + int rv; if ((PyObject *)so == other) return set_clear(so); if (PyDict_CheckExact(other)) { PyObject *value; - int rv; - Py_hash_t hash; while (_PyDict_Next(other, &pos, &key, &value, &hash)) { - setentry an_entry; - Py_INCREF(key); - an_entry.hash = hash; - an_entry.key = key; - - rv = set_discard_entry(so, &an_entry); - if (rv == -1) { + rv = set_discard_entry(so, key, hash); + if (rv < 0) { Py_DECREF(key); return NULL; } if (rv == DISCARD_NOTFOUND) { - if (set_add_entry(so, &an_entry)) { + if (set_add_entry(so, key, hash)) { Py_DECREF(key); return NULL; } @@ -1647,13 +1692,15 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other) } while (set_next(otherset, &pos, &entry)) { - int rv = set_discard_entry(so, entry); - if (rv == -1) { + key = entry->key; + hash = entry->hash; + rv = set_discard_entry(so, key, hash); + if (rv < 0) { Py_DECREF(otherset); return NULL; } if (rv == DISCARD_NOTFOUND) { - if (set_add_entry(so, entry)) { + if (set_add_entry(so, key, hash)) { Py_DECREF(otherset); return NULL; } @@ -1715,6 +1762,7 @@ set_issubset(PySetObject *so, PyObject *other) { setentry *entry; Py_ssize_t pos = 0; + int rv; if (!PyAnySet_Check(other)) { PyObject *tmp, *result; @@ -1729,8 +1777,8 @@ set_issubset(PySetObject *so, PyObject *other) Py_RETURN_FALSE; while (set_next(so, &pos, &entry)) { - int rv = set_contains_entry((PySetObject *)other, entry); - 
if (rv == -1) + rv = set_contains_entry((PySetObject *)other, entry->key, entry->hash); + if (rv < 0) return NULL; if (!rv) Py_RETURN_FALSE; @@ -1821,7 +1869,7 @@ set_contains(PySetObject *so, PyObject *key) int rv; rv = set_contains_key(so, key); - if (rv == -1) { + if (rv < 0) { if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError)) return -1; PyErr_Clear(); @@ -1840,7 +1888,7 @@ set_direct_contains(PySetObject *so, PyObject *key) long result; result = set_contains(so, key); - if (result == -1) + if (result < 0) return NULL; return PyBool_FromLong(result); } @@ -1854,7 +1902,7 @@ set_remove(PySetObject *so, PyObject *key) int rv; rv = set_discard_key(so, key); - if (rv == -1) { + if (rv < 0) { if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError)) return NULL; PyErr_Clear(); @@ -1863,7 +1911,7 @@ set_remove(PySetObject *so, PyObject *key) return NULL; rv = set_discard_key(so, tmpkey); Py_DECREF(tmpkey); - if (rv == -1) + if (rv < 0) return NULL; } @@ -1886,7 +1934,7 @@ set_discard(PySetObject *so, PyObject *key) int rv; rv = set_discard_key(so, key); - if (rv == -1) { + if (rv < 0) { if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError)) return NULL; PyErr_Clear(); @@ -1895,7 +1943,7 @@ set_discard(PySetObject *so, PyObject *key) return NULL; rv = set_discard_key(so, tmpkey); Py_DECREF(tmpkey); - if (rv == -1) + if (rv < 0) return NULL; } Py_RETURN_NONE; @@ -1949,13 +1997,12 @@ set_init(PySetObject *self, PyObject *args, PyObject *kwds) { PyObject *iterable = NULL; - if (!PyAnySet_Check(self)) - return -1; - if (PySet_Check(self) && !_PyArg_NoKeywords("set()", kwds)) + if (kwds != NULL && !_PyArg_NoKeywords("set()", kwds)) return -1; if (!PyArg_UnpackTuple(args, Py_TYPE(self)->tp_name, 0, 1, &iterable)) return -1; - set_clear_internal(self); + if (self->fill) + set_clear_internal(self); self->hash = -1; if (iterable == NULL) return 0; @@ -2122,7 +2169,7 @@ static PyMethodDef frozenset_methods[] = { copy_doc}, {"difference", 
(PyCFunction)set_difference_multi, METH_VARARGS, difference_doc}, - {"intersection",(PyCFunction)set_intersection_multi, METH_VARARGS, + {"intersection", (PyCFunction)set_intersection_multi, METH_VARARGS, intersection_doc}, {"isdisjoint", (PyCFunction)set_isdisjoint, METH_O, isdisjoint_doc}, @@ -2193,7 +2240,7 @@ PyTypeObject PyFrozenSet_Type = { (traverseproc)set_traverse, /* tp_traverse */ (inquiry)set_clear_internal, /* tp_clear */ (richcmpfunc)set_richcompare, /* tp_richcompare */ - offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */ + offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */ (getiterfunc)set_iter, /* tp_iter */ 0, /* tp_iternext */ frozenset_methods, /* tp_methods */ @@ -2277,6 +2324,18 @@ PySet_Add(PyObject *anyset, PyObject *key) } int +PySet_ClearFreeList(void) +{ + return 0; +} + +void +PySet_Fini(void) +{ + Py_CLEAR(emptyfrozenset); +} + +int _PySet_NextEntry(PyObject *set, Py_ssize_t *pos, PyObject **key, Py_hash_t *hash) { setentry *entry; diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 0fc6b58..2846d7e 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -1,6 +1,8 @@ /* stringlib: codec implementations */ -#if STRINGLIB_IS_UNICODE +#if !STRINGLIB_IS_UNICODE +# error "codecs.h is specific to Unicode" +#endif /* Mask to quickly check whether a C 'long' contains a non-ASCII, UTF8-encoded char. 
*/ @@ -263,50 +265,34 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ Py_ssize_t i; /* index into s of next input byte */ - PyObject *result; /* result string object */ char *p; /* next free byte in output buffer */ - Py_ssize_t nallocated; /* number of result bytes allocated */ - Py_ssize_t nneeded; /* number of result bytes needed */ #if STRINGLIB_SIZEOF_CHAR > 1 - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; PyObject *rep = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; #endif #if STRINGLIB_SIZEOF_CHAR == 1 const Py_ssize_t max_char_size = 2; - char stackbuf[MAX_SHORT_UNICHARS * 2]; #elif STRINGLIB_SIZEOF_CHAR == 2 const Py_ssize_t max_char_size = 3; - char stackbuf[MAX_SHORT_UNICHARS * 3]; #else /* STRINGLIB_SIZEOF_CHAR == 4 */ const Py_ssize_t max_char_size = 4; - char stackbuf[MAX_SHORT_UNICHARS * 4]; #endif + _PyBytesWriter writer; assert(size >= 0); + _PyBytesWriter_Init(&writer); - if (size <= MAX_SHORT_UNICHARS) { - /* Write into the stack buffer; nallocated can't overflow. - * At the end, we'll allocate exactly as much heap space as it - * turns out we need. - */ - nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); - result = NULL; /* will allocate after we're done */ - p = stackbuf; - } - else { - if (size > PY_SSIZE_T_MAX / max_char_size) { - /* integer overflow */ - return PyErr_NoMemory(); - } - /* Overallocate on the heap, and give the excess back at the end. 
*/ - nallocated = size * max_char_size; - result = PyBytes_FromStringAndSize(NULL, nallocated); - if (result == NULL) - return NULL; - p = PyBytes_AS_STRING(result); + if (size > PY_SSIZE_T_MAX / max_char_size) { + /* integer overflow */ + return PyErr_NoMemory(); } + p = _PyBytesWriter_Alloc(&writer, size * max_char_size); + if (p == NULL) + return NULL; + for (i = 0; i < size;) { Py_UCS4 ch = data[i++]; @@ -326,72 +312,118 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, } #if STRINGLIB_SIZEOF_CHAR > 1 else if (Py_UNICODE_IS_SURROGATE(ch)) { - Py_ssize_t newpos; - Py_ssize_t repsize, k, startpos; + Py_ssize_t startpos, endpos, newpos; + Py_ssize_t k; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + startpos = i-1; - rep = unicode_encode_call_errorhandler( - errors, &errorHandler, "utf-8", "surrogates not allowed", - unicode, &exc, startpos, startpos+1, &newpos); - if (!rep) - goto error; - - if (PyBytes_Check(rep)) - repsize = PyBytes_GET_SIZE(rep); - else - repsize = PyUnicode_GET_LENGTH(rep); + endpos = startpos+1; + + while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) + endpos++; + + /* Only overallocate the buffer if it's not the last write */ + writer.overallocate = (endpos < size); + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + memset(p, '?', endpos - startpos); + p += (endpos - startpos); + /* fall through the ignore handler */ + case _Py_ERROR_IGNORE: + i += (endpos - startpos - 1); + break; - if (repsize > max_char_size) { - Py_ssize_t offset; + case _Py_ERROR_SURROGATEPASS: + for (k=startpos; k<endpos; k++) { + ch = data[k]; + *p++ = (char)(0xe0 | (ch >> 12)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } + i += (endpos - startpos - 1); + break; - if (result == NULL) - offset = p - stackbuf; - else - offset = p - PyBytes_AS_STRING(result); + case _Py_ERROR_BACKSLASHREPLACE: + /* substract preallocated bytes */ + writer.min_size -= max_char_size * 
(endpos - startpos); + p = backslashreplace(&writer, p, + unicode, startpos, endpos); + if (p == NULL) + goto error; + i += (endpos - startpos - 1); + break; - if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) { - /* integer overflow */ - PyErr_NoMemory(); + case _Py_ERROR_XMLCHARREFREPLACE: + /* substract preallocated bytes */ + writer.min_size -= max_char_size * (endpos - startpos); + p = xmlcharrefreplace(&writer, p, + unicode, startpos, endpos); + if (p == NULL) goto error; + i += (endpos - startpos - 1); + break; + + case _Py_ERROR_SURROGATEESCAPE: + for (k=startpos; k<endpos; k++) { + ch = data[k]; + if (!(0xDC80 <= ch && ch <= 0xDCFF)) + break; + *p++ = (char)(ch & 0xff); } - nallocated += repsize - max_char_size; - if (result != NULL) { - if (_PyBytes_Resize(&result, nallocated) < 0) - goto error; - } else { - result = PyBytes_FromStringAndSize(NULL, nallocated); - if (result == NULL) - goto error; - Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); + if (k >= endpos) { + i += (endpos - startpos - 1); + break; } - p = PyBytes_AS_STRING(result) + offset; - } + startpos = k; + assert(startpos < endpos); + /* fall through the default handler */ + default: + rep = unicode_encode_call_errorhandler( + errors, &error_handler_obj, "utf-8", "surrogates not allowed", + unicode, &exc, startpos, endpos, &newpos); + if (!rep) + goto error; - if (PyBytes_Check(rep)) { - char *prep = PyBytes_AS_STRING(rep); - for(k = repsize; k > 0; k--) - *p++ = *prep++; - } else /* rep is unicode */ { - enum PyUnicode_Kind repkind; - void *repdata; + /* substract preallocated bytes */ + writer.min_size -= max_char_size; - if (PyUnicode_READY(rep) < 0) - goto error; - repkind = PyUnicode_KIND(rep); - repdata = PyUnicode_DATA(rep); + if (PyBytes_Check(rep)) { + p = _PyBytesWriter_WriteBytes(&writer, p, + PyBytes_AS_STRING(rep), + PyBytes_GET_SIZE(rep)); + } + else { + /* rep is unicode */ + if (PyUnicode_READY(rep) < 0) + goto error; - for(k=0; k<repsize; k++) { - Py_UCS4 c 
= PyUnicode_READ(repkind, repdata, k); - if (0x80 <= c) { + if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, "utf-8", unicode, i-1, i, "surrogates not allowed"); goto error; } - *p++ = (char)c; + + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + p = _PyBytesWriter_WriteBytes(&writer, p, + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); } + + if (p == NULL) + goto error; + Py_CLEAR(rep); + + i = newpos; } - Py_CLEAR(rep); + + /* If overallocation was disabled, ensure that it was the last + write. Otherwise, we missed an optimization */ + assert(writer.overallocate || i == size); } else #if STRINGLIB_SIZEOF_CHAR > 2 @@ -416,31 +448,18 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ } - if (result == NULL) { - /* This was stack allocated. */ - nneeded = p - stackbuf; - assert(nneeded <= nallocated); - result = PyBytes_FromStringAndSize(stackbuf, nneeded); - } - else { - /* Cut back to size actually needed. */ - nneeded = p - PyBytes_AS_STRING(result); - assert(nneeded <= nallocated); - _PyBytes_Resize(&result, nneeded); - } - #if STRINGLIB_SIZEOF_CHAR > 1 - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); #endif - return result; + return _PyBytesWriter_Finish(&writer, p); #if STRINGLIB_SIZEOF_CHAR > 1 error: Py_XDECREF(rep); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - Py_XDECREF(result); + _PyBytesWriter_Dealloc(&writer); return NULL; #endif @@ -806,5 +825,3 @@ STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, #undef SWAB4 #endif - -#endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h index 739cf3d..f054625 100644 --- a/Objects/stringlib/ctype.h +++ b/Objects/stringlib/ctype.h @@ -1,5 +1,6 @@ -/* NOTE: this API is -ONLY- for use with single byte character strings. */ -/* Do not use it with Unicode. 
*/ +#if STRINGLIB_IS_UNICODE +# error "ctype.h only compatible with byte-wise strings" +#endif #include "bytes_methods.h" diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index cda68e7..98165ad 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -32,52 +32,98 @@ #define STRINGLIB_BLOOM(mask, ch) \ ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) - Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(fastsearch_memchr_1char)(const STRINGLIB_CHAR* s, Py_ssize_t n, - STRINGLIB_CHAR ch, unsigned char needle, - int mode) +STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) { - if (mode == FAST_SEARCH) { - const STRINGLIB_CHAR *ptr = s; - const STRINGLIB_CHAR *e = s + n; - while (ptr < e) { - void *candidate = memchr((const void *) ptr, needle, (e - ptr) * sizeof(STRINGLIB_CHAR)); - if (candidate == NULL) - return -1; - ptr = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); - if (sizeof(STRINGLIB_CHAR) == 1 || *ptr == ch) - return (ptr - s); - /* False positive */ - ptr++; - } + const STRINGLIB_CHAR *p, *e; + + p = s; + e = s + n; + if (n > 10) { +#if STRINGLIB_SIZEOF_CHAR == 1 + p = memchr(s, ch, n); + if (p != NULL) + return (p - s); return -1; +#else + /* use memchr if we can choose a needle without two many likely + false positives */ + unsigned char needle = ch & 0xff; + /* If looking for a multiple of 256, we'd have too + many false positives looking for the '\0' byte in UCS2 + and UCS4 representations. 
*/ + if (needle != 0) { + while (p < e) { + void *candidate = memchr(p, needle, + (e - p) * sizeof(STRINGLIB_CHAR)); + if (candidate == NULL) + return -1; + p = (const STRINGLIB_CHAR *) + _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); + if (*p == ch) + return (p - s); + /* False positive */ + p++; + } + return -1; + } +#endif } + while (p < e) { + if (*p == ch) + return (p - s); + p++; + } + return -1; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) +{ + const STRINGLIB_CHAR *p; #ifdef HAVE_MEMRCHR /* memrchr() is a GNU extension, available since glibc 2.1.91. it doesn't seem as optimized as memchr(), but is still quite - faster than our hand-written loop in FASTSEARCH below */ - else if (mode == FAST_RSEARCH) { - while (n > 0) { - const STRINGLIB_CHAR *found; - void *candidate = memrchr((const void *) s, needle, n * sizeof(STRINGLIB_CHAR)); - if (candidate == NULL) - return -1; - found = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); - n = found - s; - if (sizeof(STRINGLIB_CHAR) == 1 || *found == ch) - return n; - /* False positive */ - } + faster than our hand-written loop below */ + + if (n > 10) { +#if STRINGLIB_SIZEOF_CHAR == 1 + p = memrchr(s, ch, n); + if (p != NULL) + return (p - s); return -1; - } +#else + /* use memrchr if we can choose a needle without two many likely + false positives */ + unsigned char needle = ch & 0xff; + /* If looking for a multiple of 256, we'd have too + many false positives looking for the '\0' byte in UCS2 + and UCS4 representations. 
*/ + if (needle != 0) { + while (n > 0) { + void *candidate = memrchr(s, needle, + n * sizeof(STRINGLIB_CHAR)); + if (candidate == NULL) + return -1; + p = (const STRINGLIB_CHAR *) + _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); + n = p - s; + if (*p == ch) + return n; + /* False positive */ + } + return -1; + } #endif - else { - assert(0); /* Should never get here */ - return 0; } - -#undef DO_MEMCHR +#endif /* HAVE_MEMRCHR */ + p = s + n; + while (p > s) { + p--; + if (*p == ch) + return (p - s); + } + return -1; } Py_LOCAL_INLINE(Py_ssize_t) @@ -99,25 +145,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, if (m <= 0) return -1; /* use special case for 1-character strings */ - if (n > 10 && (mode == FAST_SEARCH -#ifdef HAVE_MEMRCHR - || mode == FAST_RSEARCH -#endif - )) { - /* use memchr if we can choose a needle without two many likely - false positives */ - unsigned char needle; - needle = p[0] & 0xff; -#if STRINGLIB_SIZEOF_CHAR > 1 - /* If looking for a multiple of 256, we'd have too - many false positives looking for the '\0' byte in UCS2 - and UCS4 representations. 
*/ - if (needle != 0) -#endif - return STRINGLIB(fastsearch_memchr_1char) - (s, n, p[0], needle, mode); - } - if (mode == FAST_COUNT) { + if (mode == FAST_SEARCH) + return STRINGLIB(find_char)(s, n, p[0]); + else if (mode == FAST_RSEARCH) + return STRINGLIB(rfind_char)(s, n, p[0]); + else { /* FAST_COUNT */ for (i = 0; i < n; i++) if (s[i] == p[0]) { count++; @@ -125,14 +157,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return maxcount; } return count; - } else if (mode == FAST_SEARCH) { - for (i = 0; i < n; i++) - if (s[i] == p[0]) - return i; - } else { /* FAST_RSEARCH */ - for (i = n - 1; i > -1; i--) - if (s[i] == p[0]) - return i; } return -1; } diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h index 14815f6..509b929 100644 --- a/Objects/stringlib/find.h +++ b/Objects/stringlib/find.h @@ -117,85 +117,3 @@ STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args, } #undef FORMAT_BUFFER_SIZE - -#if STRINGLIB_IS_UNICODE - -/* -Wraps stringlib_parse_args_finds() and additionally ensures that the -first argument is a unicode object. - -Note that we receive a pointer to the pointer of the substring object, -so when we create that object in this function we don't DECREF it, -because it continues living in the caller functions (those functions, -after finishing using the substring, must DECREF it). -*/ - -Py_LOCAL_INLINE(int) -STRINGLIB(parse_args_finds_unicode)(const char * function_name, PyObject *args, - PyObject **substring, - Py_ssize_t *start, Py_ssize_t *end) -{ - PyObject *tmp_substring; - - if(STRINGLIB(parse_args_finds)(function_name, args, &tmp_substring, - start, end)) { - tmp_substring = PyUnicode_FromObject(tmp_substring); - if (!tmp_substring) - return 0; - *substring = tmp_substring; - return 1; - } - return 0; -} - -#else /* !STRINGLIB_IS_UNICODE */ - -/* -Wraps stringlib_parse_args_finds() and additionally checks whether the -first argument is an integer in range(0, 256). 
- -If this is the case, writes the integer value to the byte parameter -and sets subobj to NULL. Otherwise, sets the first argument to subobj -and doesn't touch byte. The other parameters are similar to those of -stringlib_parse_args_finds(). -*/ - -Py_LOCAL_INLINE(int) -STRINGLIB(parse_args_finds_byte)(const char *function_name, PyObject *args, - PyObject **subobj, char *byte, - Py_ssize_t *start, Py_ssize_t *end) -{ - PyObject *tmp_subobj; - Py_ssize_t ival; - PyObject *err; - - if(!STRINGLIB(parse_args_finds)(function_name, args, &tmp_subobj, - start, end)) - return 0; - - if (!PyNumber_Check(tmp_subobj)) { - *subobj = tmp_subobj; - return 1; - } - - ival = PyNumber_AsSsize_t(tmp_subobj, PyExc_OverflowError); - if (ival == -1) { - err = PyErr_Occurred(); - if (err && !PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) { - PyErr_Clear(); - *subobj = tmp_subobj; - return 1; - } - } - - if (ival < 0 || ival > 255) { - PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)"); - return 0; - } - - *subobj = NULL; - *byte = (char)ival; - return 1; -} - -#endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/stringlib/find_max_char.h b/Objects/stringlib/find_max_char.h index eb3fe88..8ccbc30 100644 --- a/Objects/stringlib/find_max_char.h +++ b/Objects/stringlib/find_max_char.h @@ -1,6 +1,8 @@ /* Finding the optimal width of unicode characters in a buffer */ -#if STRINGLIB_IS_UNICODE +#if !STRINGLIB_IS_UNICODE +# error "find_max_char.h is specific to Unicode" +#endif /* Mask to quickly check whether a C 'long' contains a non-ASCII, UTF8-encoded char. 
*/ @@ -129,5 +131,4 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) #undef MAX_CHAR_UCS4 #endif /* STRINGLIB_SIZEOF_CHAR == 1 */ -#endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h index cbf81be..90f966d 100644 --- a/Objects/stringlib/join.h +++ b/Objects/stringlib/join.h @@ -1,6 +1,6 @@ /* stringlib: bytes joining implementation */ -#if STRINGLIB_SIZEOF_CHAR != 1 +#if STRINGLIB_IS_UNICODE #error join.h only compatible with byte-wise strings #endif diff --git a/Objects/stringlib/localeutil.h b/Objects/stringlib/localeutil.h index 6e2f073..df501ed 100644 --- a/Objects/stringlib/localeutil.h +++ b/Objects/stringlib/localeutil.h @@ -2,8 +2,8 @@ #include <locale.h> -#ifndef STRINGLIB_IS_UNICODE -# error "localeutil is specific to Unicode" +#if !STRINGLIB_IS_UNICODE +# error "localeutil.h is specific to Unicode" #endif typedef struct { diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h index b559b53..625507d 100644 --- a/Objects/stringlib/transmogrify.h +++ b/Objects/stringlib/transmogrify.h @@ -1,14 +1,21 @@ -/* NOTE: this API is -ONLY- for use with single byte character strings. */ -/* Do not use it with Unicode. */ +#if STRINGLIB_IS_UNICODE +# error "transmogrify.h only compatible with byte-wise strings" +#endif /* the more complicated methods. parts of these should be pulled out into the shared code in bytes_methods.c to cut down on duplicate code bloat. 
*/ -PyDoc_STRVAR(expandtabs__doc__, -"B.expandtabs(tabsize=8) -> copy of B\n\ -\n\ -Return a copy of B where all tab characters are expanded using spaces.\n\ -If tabsize is not given, a tab size of 8 characters is assumed."); +Py_LOCAL_INLINE(PyObject *) +return_self(PyObject *self) +{ +#if !STRINGLIB_MUTABLE + if (STRINGLIB_CHECK_EXACT(self)) { + Py_INCREF(self); + return self; + } +#endif + return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} static PyObject* stringlib_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) @@ -93,39 +100,25 @@ pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill) if (right < 0) right = 0; - if (left == 0 && right == 0 && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE - /* We're defined as returning a copy; If the object is mutable - * that means we must make an identical copy. */ - return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else - Py_INCREF(self); - return (PyObject *)self; -#endif /* STRINGLIB_MUTABLE */ + if (left == 0 && right == 0) { + return return_self(self); } - u = STRINGLIB_NEW(NULL, - left + STRINGLIB_LEN(self) + right); + u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right); if (u) { if (left) memset(STRINGLIB_STR(u), fill, left); Py_MEMCPY(STRINGLIB_STR(u) + left, - STRINGLIB_STR(self), - STRINGLIB_LEN(self)); + STRINGLIB_STR(self), + STRINGLIB_LEN(self)); if (right) memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self), - fill, right); + fill, right); } return u; } -PyDoc_STRVAR(ljust__doc__, -"B.ljust(width[, fillchar]) -> copy of B\n" -"\n" -"Return B left justified in a string of length width. 
Padding is\n" -"done using the specified fill character (default is a space)."); - static PyObject * stringlib_ljust(PyObject *self, PyObject *args) { @@ -135,27 +128,14 @@ stringlib_ljust(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar)) return NULL; - if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE - /* We're defined as returning a copy; If the object is mutable - * that means we must make an identical copy. */ - return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else - Py_INCREF(self); - return (PyObject*) self; -#endif + if (STRINGLIB_LEN(self) >= width) { + return return_self(self); } return pad(self, 0, width - STRINGLIB_LEN(self), fillchar); } -PyDoc_STRVAR(rjust__doc__, -"B.rjust(width[, fillchar]) -> copy of B\n" -"\n" -"Return B right justified in a string of length width. Padding is\n" -"done using the specified fill character (default is a space)"); - static PyObject * stringlib_rjust(PyObject *self, PyObject *args) { @@ -165,27 +145,14 @@ stringlib_rjust(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar)) return NULL; - if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE - /* We're defined as returning a copy; If the object is mutable - * that means we must make an identical copy. */ - return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else - Py_INCREF(self); - return (PyObject*) self; -#endif + if (STRINGLIB_LEN(self) >= width) { + return return_self(self); } return pad(self, width - STRINGLIB_LEN(self), 0, fillchar); } -PyDoc_STRVAR(center__doc__, -"B.center(width[, fillchar]) -> copy of B\n" -"\n" -"Return B centered in a string of length width. 
Padding is\n" -"done using the specified fill character (default is a space)."); - static PyObject * stringlib_center(PyObject *self, PyObject *args) { @@ -196,15 +163,8 @@ stringlib_center(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar)) return NULL; - if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE - /* We're defined as returning a copy; If the object is mutable - * that means we must make an identical copy. */ - return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else - Py_INCREF(self); - return (PyObject*) self; -#endif + if (STRINGLIB_LEN(self) >= width) { + return return_self(self); } marg = width - STRINGLIB_LEN(self); @@ -213,12 +173,6 @@ stringlib_center(PyObject *self, PyObject *args) return pad(self, left, marg - left, fillchar); } -PyDoc_STRVAR(zfill__doc__, -"B.zfill(width) -> copy of B\n" -"\n" -"Pad a numeric string B with zeros on the left, to fill a field\n" -"of the specified width. B is never truncated."); - static PyObject * stringlib_zfill(PyObject *self, PyObject *args) { @@ -231,21 +185,7 @@ stringlib_zfill(PyObject *self, PyObject *args) return NULL; if (STRINGLIB_LEN(self) >= width) { - if (STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE - /* We're defined as returning a copy; If the object is mutable - * that means we must make an identical copy. 
*/ - return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else - Py_INCREF(self); - return (PyObject*) self; -#endif - } - else - return STRINGLIB_NEW( - STRINGLIB_STR(self), - STRINGLIB_LEN(self) - ); + return return_self(self); } fill = width - STRINGLIB_LEN(self); @@ -262,5 +202,500 @@ stringlib_zfill(PyObject *self, PyObject *args) p[fill] = '0'; } - return (PyObject*) s; + return s; +} + + +/* find and count characters and substrings */ + +#define findchar(target, target_len, c) \ + ((char *)memchr((const void *)(target), c, target_len)) + + +Py_LOCAL_INLINE(Py_ssize_t) +countchar(const char *target, Py_ssize_t target_len, char c, + Py_ssize_t maxcount) +{ + Py_ssize_t count = 0; + const char *start = target; + const char *end = target + target_len; + + while ((start = findchar(start, end - start, c)) != NULL) { + count++; + if (count >= maxcount) + break; + start += 1; + } + return count; +} + + +/* Algorithms for different cases of string replacement */ + +/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ +Py_LOCAL(PyObject *) +stringlib_replace_interleave(PyObject *self, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count, i; + PyObject *result; + + self_len = STRINGLIB_LEN(self); + + /* 1 at the end plus 1 after every character; + count = min(maxcount, self_len + 1) */ + if (maxcount <= self_len) { + count = maxcount; + } + else { + /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. 
*/ + count = self_len + 1; + } + + /* Check for overflow */ + /* result_len = count * to_len + self_len; */ + assert(count > 0); + if (to_len > (PY_SSIZE_T_MAX - self_len) / count) { + PyErr_SetString(PyExc_OverflowError, + "replace bytes are too long"); + return NULL; + } + result_len = count * to_len + self_len; + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + + self_s = STRINGLIB_STR(self); + result_s = STRINGLIB_STR(result); + + if (to_len > 1) { + /* Lay the first one down (guaranteed this will occur) */ + Py_MEMCPY(result_s, to_s, to_len); + result_s += to_len; + count -= 1; + + for (i = 0; i < count; i++) { + *result_s++ = *self_s++; + Py_MEMCPY(result_s, to_s, to_len); + result_s += to_len; + } + } + else { + result_s[0] = to_s[0]; + result_s += to_len; + count -= 1; + for (i = 0; i < count; i++) { + *result_s++ = *self_s++; + result_s[0] = to_s[0]; + result_s += to_len; + } + } + + /* Copy the rest of the original string */ + Py_MEMCPY(result_s, self_s, self_len - i); + + return result; } + +/* Special case for deleting a single character */ +/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ +Py_LOCAL(PyObject *) +stringlib_replace_delete_single_character(PyObject *self, + char from_c, Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count; + PyObject *result; + + self_len = STRINGLIB_LEN(self); + self_s = STRINGLIB_STR(self); + + count = countchar(self_s, self_len, from_c, maxcount); + if (count == 0) { + return return_self(self); + } + + result_len = self_len - count; /* from_len == 1 */ + assert(result_len>=0); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end - start, from_c); + if (next == NULL) + break; + Py_MEMCPY(result_s, start, next - start); + 
result_s += (next - start); + start = next + 1; + } + Py_MEMCPY(result_s, start, end - start); + + return result; +} + +/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ + +Py_LOCAL(PyObject *) +stringlib_replace_delete_substring(PyObject *self, + const char *from_s, Py_ssize_t from_len, + Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count, offset; + PyObject *result; + + self_len = STRINGLIB_LEN(self); + self_s = STRINGLIB_STR(self); + + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); + + if (count == 0) { + /* no matches */ + return return_self(self); + } + + result_len = self_len - (count * from_len); + assert (result_len>=0); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = stringlib_find(start, end - start, + from_s, from_len, + 0); + if (offset == -1) + break; + next = start + offset; + + Py_MEMCPY(result_s, start, next - start); + + result_s += (next - start); + start = next + from_len; + } + Py_MEMCPY(result_s, start, end - start); + return result; +} + +/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ +Py_LOCAL(PyObject *) +stringlib_replace_single_character_in_place(PyObject *self, + char from_c, char to_c, + Py_ssize_t maxcount) +{ + const char *self_s, *end; + char *result_s, *start, *next; + Py_ssize_t self_len; + PyObject *result; + + /* The result string will be the same size */ + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + next = findchar(self_s, self_len, from_c); + + if (next == NULL) { + /* No matches; return the original bytes */ + return return_self(self); + } + + /* Need to make a new bytes */ + result = STRINGLIB_NEW(NULL, self_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + Py_MEMCPY(result_s, self_s, 
self_len); + + /* change everything in-place, starting with this one */ + start = result_s + (next - self_s); + *start = to_c; + start++; + end = result_s + self_len; + + while (--maxcount > 0) { + next = findchar(start, end - start, from_c); + if (next == NULL) + break; + *next = to_c; + start = next + 1; + } + + return result; +} + +/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ +Py_LOCAL(PyObject *) +stringlib_replace_substring_in_place(PyObject *self, + const char *from_s, Py_ssize_t from_len, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s, *end; + char *result_s, *start; + Py_ssize_t self_len, offset; + PyObject *result; + + /* The result bytes will be the same size */ + + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + offset = stringlib_find(self_s, self_len, + from_s, from_len, + 0); + if (offset == -1) { + /* No matches; return the original bytes */ + return return_self(self); + } + + /* Need to make a new bytes */ + result = STRINGLIB_NEW(NULL, self_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + Py_MEMCPY(result_s, self_s, self_len); + + /* change everything in-place, starting with this one */ + start = result_s + offset; + Py_MEMCPY(start, to_s, from_len); + start += from_len; + end = result_s + self_len; + + while ( --maxcount > 0) { + offset = stringlib_find(start, end - start, + from_s, from_len, + 0); + if (offset == -1) + break; + Py_MEMCPY(start + offset, to_s, from_len); + start += offset + from_len; + } + + return result; +} + +/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ +Py_LOCAL(PyObject *) +stringlib_replace_single_character(PyObject *self, + char from_c, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count; + PyObject *result; + + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + 
count = countchar(self_s, self_len, from_c, maxcount); + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + /* use the difference between current and new, hence the "-1" */ + /* result_len = self_len + count * (to_len-1) */ + assert(count > 0); + if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) { + PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); + return NULL; + } + result_len = self_len + count * (to_len - 1); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end - start, from_c); + if (next == NULL) + break; + + if (next == start) { + /* replace with the 'to' */ + Py_MEMCPY(result_s, to_s, to_len); + result_s += to_len; + start += 1; + } else { + /* copy the unchanged old then the 'to' */ + Py_MEMCPY(result_s, start, next - start); + result_s += (next - start); + Py_MEMCPY(result_s, to_s, to_len); + result_s += to_len; + start = next + 1; + } + } + /* Copy the remainder of the remaining bytes */ + Py_MEMCPY(result_s, start, end - start); + + return result; +} + +/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ +Py_LOCAL(PyObject *) +stringlib_replace_substring(PyObject *self, + const char *from_s, Py_ssize_t from_len, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count, offset; + PyObject *result; + + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); + + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + /* Check for overflow */ + /* result_len = self_len + count * (to_len-from_len) */ + assert(count > 0); + if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) 
{ + PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); + return NULL; + } + result_len = self_len + count * (to_len - from_len); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = stringlib_find(start, end - start, + from_s, from_len, + 0); + if (offset == -1) + break; + next = start + offset; + if (next == start) { + /* replace with the 'to' */ + Py_MEMCPY(result_s, to_s, to_len); + result_s += to_len; + start += from_len; + } else { + /* copy the unchanged old then the 'to' */ + Py_MEMCPY(result_s, start, next - start); + result_s += (next - start); + Py_MEMCPY(result_s, to_s, to_len); + result_s += to_len; + start = next + from_len; + } + } + /* Copy the remainder of the remaining bytes */ + Py_MEMCPY(result_s, start, end - start); + + return result; +} + + +Py_LOCAL(PyObject *) +stringlib_replace(PyObject *self, + const char *from_s, Py_ssize_t from_len, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + if (maxcount < 0) { + maxcount = PY_SSIZE_T_MAX; + } else if (maxcount == 0 || STRINGLIB_LEN(self) == 0) { + /* nothing to do; return the original bytes */ + return return_self(self); + } + + /* Handle zero-length special cases */ + if (from_len == 0) { + if (to_len == 0) { + /* nothing to do; return the original bytes */ + return return_self(self); + } + /* insert the 'to' bytes everywhere. */ + /* >>> b"Python".replace(b"", b".") */ + /* b'.P.y.t.h.o.n.' 
*/ + return stringlib_replace_interleave(self, to_s, to_len, maxcount); + } + + /* Except for b"".replace(b"", b"A") == b"A" there is no way beyond this */ + /* point for an empty self bytes to generate a non-empty bytes */ + /* Special case so the remaining code always gets a non-empty bytes */ + if (STRINGLIB_LEN(self) == 0) { + return return_self(self); + } + + if (to_len == 0) { + /* delete all occurrences of 'from' bytes */ + if (from_len == 1) { + return stringlib_replace_delete_single_character( + self, from_s[0], maxcount); + } else { + return stringlib_replace_delete_substring( + self, from_s, from_len, maxcount); + } + } + + /* Handle special case where both bytes have the same length */ + + if (from_len == to_len) { + if (from_len == 1) { + return stringlib_replace_single_character_in_place( + self, from_s[0], to_s[0], maxcount); + } else { + return stringlib_replace_substring_in_place( + self, from_s, from_len, to_s, to_len, maxcount); + } + } + + /* Otherwise use the more generic algorithms */ + if (from_len == 1) { + return stringlib_replace_single_character( + self, from_s[0], to_s, to_len, maxcount); + } else { + /* len('from')>=2, len('to')>=1 */ + return stringlib_replace_substring( + self, from_s, from_len, to_s, to_len, maxcount); + } +} + +#undef findchar diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h index be09b5f..14fa28e 100644 --- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/unicode_format.h @@ -67,7 +67,7 @@ SubString_new_object(SubString *str) return PyUnicode_Substring(str->str, str->start, str->end); } -/* return a new string. if str->str is NULL, return None */ +/* return a new string. 
if str->str is NULL, return a new empty string */ Py_LOCAL_INLINE(PyObject *) SubString_new_object_or_empty(SubString *str) { diff --git a/Objects/structseq.c b/Objects/structseq.c index 664344b..e315cba 100644 --- a/Objects/structseq.c +++ b/Objects/structseq.c @@ -4,9 +4,9 @@ #include "Python.h" #include "structmember.h" -static char visible_length_key[] = "n_sequence_fields"; -static char real_length_key[] = "n_fields"; -static char unnamed_fields_key[] = "n_unnamed_fields"; +static const char visible_length_key[] = "n_sequence_fields"; +static const char real_length_key[] = "n_fields"; +static const char unnamed_fields_key[] = "n_unnamed_fields"; /* Fields with this name have only a field index, not a field name. They are only allowed for indices < n_visible_fields. */ @@ -16,14 +16,14 @@ _Py_IDENTIFIER(n_fields); _Py_IDENTIFIER(n_unnamed_fields); #define VISIBLE_SIZE(op) Py_SIZE(op) -#define VISIBLE_SIZE_TP(tp) PyLong_AsLong( \ +#define VISIBLE_SIZE_TP(tp) PyLong_AsSsize_t( \ _PyDict_GetItemId((tp)->tp_dict, &PyId_n_sequence_fields)) -#define REAL_SIZE_TP(tp) PyLong_AsLong( \ +#define REAL_SIZE_TP(tp) PyLong_AsSsize_t( \ _PyDict_GetItemId((tp)->tp_dict, &PyId_n_fields)) #define REAL_SIZE(op) REAL_SIZE_TP(Py_TYPE(op)) -#define UNNAMED_FIELDS_TP(tp) PyLong_AsLong( \ +#define UNNAMED_FIELDS_TP(tp) PyLong_AsSsize_t( \ _PyDict_GetItemId((tp)->tp_dict, &PyId_n_unnamed_fields)) #define UNNAMED_FIELDS(op) UNNAMED_FIELDS_TP(Py_TYPE(op)) @@ -164,7 +164,8 @@ structseq_repr(PyStructSequence *obj) #define TYPE_MAXSIZE 100 PyTypeObject *typ = Py_TYPE(obj); - int i, removelast = 0; + Py_ssize_t i; + int removelast = 0; Py_ssize_t len; char buf[REPR_BUFFER_SIZE]; char *endofbuf, *pbuf = buf; @@ -236,8 +237,7 @@ structseq_reduce(PyStructSequence* self) PyObject* tup = NULL; PyObject* dict = NULL; PyObject* result; - Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields; - int i; + Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields, i; n_fields = REAL_SIZE(self); 
n_visible_fields = VISIBLE_SIZE(self); @@ -325,7 +325,7 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc) { PyObject *dict; PyMemberDef* members; - int n_members, n_unnamed_members, i, k; + Py_ssize_t n_members, n_unnamed_members, i, k; PyObject *v; #ifdef Py_TRACE_REFS @@ -373,9 +373,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc) Py_INCREF(type); dict = type->tp_dict; -#define SET_DICT_FROM_INT(key, value) \ +#define SET_DICT_FROM_SIZE(key, value) \ do { \ - v = PyLong_FromLong((long) value); \ + v = PyLong_FromSsize_t(value); \ if (v == NULL) \ return -1; \ if (PyDict_SetItemString(dict, key, v) < 0) { \ @@ -385,9 +385,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc) Py_DECREF(v); \ } while (0) - SET_DICT_FROM_INT(visible_length_key, desc->n_in_sequence); - SET_DICT_FROM_INT(real_length_key, n_members); - SET_DICT_FROM_INT(unnamed_fields_key, n_unnamed_members); + SET_DICT_FROM_SIZE(visible_length_key, desc->n_in_sequence); + SET_DICT_FROM_SIZE(real_length_key, n_members); + SET_DICT_FROM_SIZE(unnamed_fields_key, n_unnamed_members); return 0; } diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c index 7920fec..a7774e2 100644 --- a/Objects/tupleobject.c +++ b/Objects/tupleobject.c @@ -149,7 +149,6 @@ PyTuple_GetItem(PyObject *op, Py_ssize_t i) int PyTuple_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem) { - PyObject *olditem; PyObject **p; if (!PyTuple_Check(op) || op->ob_refcnt != 1) { Py_XDECREF(newitem); @@ -163,9 +162,7 @@ PyTuple_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem) return -1; } p = ((PyTupleObject *)op) -> ob_item + i; - olditem = *p; - *p = newitem; - Py_XDECREF(olditem); + Py_XSETREF(*p, newitem); return 0; } diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 0a82f3a..9e7b4e6 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -548,7 +548,7 @@ type_get_bases(PyTypeObject *type, void *context) static 
PyTypeObject *best_base(PyObject *); static int mro_internal(PyTypeObject *, PyObject **); Py_LOCAL_INLINE(int) type_is_subtype_base_chain(PyTypeObject *, PyTypeObject *); -static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, char *); +static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, const char *); static int add_subclass(PyTypeObject*, PyTypeObject*); static int add_all_subclasses(PyTypeObject *type, PyObject *bases); static void remove_subclass(PyTypeObject *, PyTypeObject *); @@ -888,25 +888,33 @@ type_call(PyTypeObject *type, PyObject *args, PyObject *kwds) #endif obj = type->tp_new(type, args, kwds); - if (obj != NULL) { - /* Ugly exception: when the call was type(something), - don't call tp_init on the result. */ - if (type == &PyType_Type && - PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 && - (kwds == NULL || - (PyDict_Check(kwds) && PyDict_Size(kwds) == 0))) - return obj; - /* If the returned object is not an instance of type, - it won't be initialized. */ - if (!PyType_IsSubtype(Py_TYPE(obj), type)) - return obj; - type = Py_TYPE(obj); - if (type->tp_init != NULL) { - int res = type->tp_init(obj, args, kwds); - if (res < 0) { - Py_DECREF(obj); - obj = NULL; - } + obj = _Py_CheckFunctionResult((PyObject*)type, obj, NULL); + if (obj == NULL) + return NULL; + + /* Ugly exception: when the call was type(something), + don't call tp_init on the result. */ + if (type == &PyType_Type && + PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 && + (kwds == NULL || + (PyDict_Check(kwds) && PyDict_Size(kwds) == 0))) + return obj; + + /* If the returned object is not an instance of type, + it won't be initialized. 
*/ + if (!PyType_IsSubtype(Py_TYPE(obj), type)) + return obj; + + type = Py_TYPE(obj); + if (type->tp_init != NULL) { + int res = type->tp_init(obj, args, kwds); + if (res < 0) { + assert(PyErr_Occurred()); + Py_DECREF(obj); + obj = NULL; + } + else { + assert(!PyErr_Occurred()); } } return obj; @@ -1411,7 +1419,7 @@ _PyObject_LookupSpecial(PyObject *self, _Py_Identifier *attrid) as lookup_method to cache the interned name string object. */ static PyObject * -call_method(PyObject *o, _Py_Identifier *nameid, char *format, ...) +call_method(PyObject *o, _Py_Identifier *nameid, const char *format, ...) { va_list va; PyObject *args, *func = 0, *retval; @@ -1447,7 +1455,7 @@ call_method(PyObject *o, _Py_Identifier *nameid, char *format, ...) /* Clone of call_method() that returns NotImplemented when the lookup fails. */ static PyObject * -call_maybe(PyObject *o, _Py_Identifier *nameid, char *format, ...) +call_maybe(PyObject *o, _Py_Identifier *nameid, const char *format, ...) { va_list va; PyObject *args, *func = 0, *retval; @@ -2084,7 +2092,7 @@ subtype_dict(PyObject *obj, void *context) static int subtype_setdict(PyObject *obj, PyObject *value, void *context) { - PyObject *dict, **dictptr; + PyObject **dictptr; PyTypeObject *base; base = get_builtin_base_with_dict(Py_TYPE(obj)); @@ -2115,10 +2123,8 @@ subtype_setdict(PyObject *obj, PyObject *value, void *context) "not a '%.200s'", Py_TYPE(value)->tp_name); return -1; } - dict = *dictptr; Py_XINCREF(value); - *dictptr = value; - Py_XDECREF(dict); + Py_XSETREF(*dictptr, value); return 0; } @@ -2671,7 +2677,7 @@ error: return NULL; } -static short slotoffsets[] = { +static const short slotoffsets[] = { -1, /* invalid slot */ #include "typeslots.inc" }; @@ -3590,7 +3596,7 @@ same_slots_added(PyTypeObject *a, PyTypeObject *b) } static int -compatible_for_assignment(PyTypeObject* oldto, PyTypeObject* newto, char* attr) +compatible_for_assignment(PyTypeObject* oldto, PyTypeObject* newto, const char* attr) { PyTypeObject 
*newbase, *oldbase; @@ -3866,6 +3872,24 @@ _PyObject_GetState(PyObject *obj, int required) } assert(slotnames == Py_None || PyList_Check(slotnames)); + if (required) { + Py_ssize_t basicsize = PyBaseObject_Type.tp_basicsize; + if (obj->ob_type->tp_dictoffset) + basicsize += sizeof(PyObject *); + if (obj->ob_type->tp_weaklistoffset) + basicsize += sizeof(PyObject *); + if (slotnames != Py_None) + basicsize += sizeof(PyObject *) * Py_SIZE(slotnames); + if (obj->ob_type->tp_basicsize > basicsize) { + Py_DECREF(slotnames); + Py_DECREF(state); + PyErr_Format(PyExc_TypeError, + "can't pickle %.200s objects", + Py_TYPE(obj)->tp_name); + return NULL; + } + } + if (slotnames != Py_None && Py_SIZE(slotnames) > 0) { PyObject *slots; Py_ssize_t slotnames_size, i; @@ -3920,7 +3944,7 @@ _PyObject_GetState(PyObject *obj, int required) } /* If we found some slot attributes, pack them in a tuple along - the orginal attribute dictionary. */ + the original attribute dictionary. */ if (PyDict_Size(slots) > 0) { PyObject *state2; @@ -4089,7 +4113,7 @@ _PyObject_GetItemsIter(PyObject *obj, PyObject **listitems, } static PyObject * -reduce_newobj(PyObject *obj, int proto) +reduce_newobj(PyObject *obj) { PyObject *args = NULL, *kwargs = NULL; PyObject *copyreg; @@ -4142,7 +4166,7 @@ reduce_newobj(PyObject *obj, int proto) } Py_XDECREF(args); } - else if (proto >= 4) { + else { _Py_IDENTIFIER(__newobj_ex__); newobj = _PyObject_GetAttrId(copyreg, &PyId___newobj_ex__); @@ -4160,16 +4184,6 @@ reduce_newobj(PyObject *obj, int proto) return NULL; } } - else { - PyErr_SetString(PyExc_ValueError, - "must use protocol 4 or greater to copy this " - "object; since __getnewargs_ex__ returned " - "keyword arguments."); - Py_DECREF(args); - Py_DECREF(kwargs); - Py_DECREF(copyreg); - return NULL; - } state = _PyObject_GetState(obj, !hasargs && !PyList_Check(obj) && !PyDict_Check(obj)); @@ -4215,7 +4229,7 @@ _common_reduce(PyObject *self, int proto) PyObject *copyreg, *res; if (proto >= 2) - return 
reduce_newobj(self, proto); + return reduce_newobj(self); copyreg = import_copyreg(); if (!copyreg) @@ -5342,7 +5356,7 @@ wrap_delitem(PyObject *self, PyObject *args, void *wrapped) /* Helper to check for object.__setattr__ or __delattr__ applied to a type. This is called the Carlo Verre hack after its discoverer. */ static int -hackcheck(PyObject *self, setattrofunc func, char *what) +hackcheck(PyObject *self, setattrofunc func, const char *what) { PyTypeObject *type = Py_TYPE(self); while (type && type->tp_flags & Py_TPFLAGS_HEAPTYPE) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b146da9..1e7cba6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "Python.h" #include "ucnhash.h" #include "bytes_methods.h" +#include "stringlib/eq.h" #ifdef MS_WINDOWS #include <windows.h> @@ -162,6 +163,14 @@ extern "C" { *_to++ = (to_type) *_iter++; \ } while (0) +#ifdef MS_WINDOWS + /* On Windows, overallocate by 50% is the best factor */ +# define OVERALLOCATE_FACTOR 2 +#else + /* On Linux, overallocate by 25% is the best factor */ +# define OVERALLOCATE_FACTOR 4 +#endif + /* This dictionary holds all interned unicode strings. Note that references to strings in this dictionary are *not* counted in the string's ob_refcnt. 
When the interned string reaches a refcnt of 0 the string deallocation @@ -263,7 +272,7 @@ raise_encode_exception(PyObject **exceptionObject, const char *reason); /* Same for linebreaks */ -static unsigned char ascii_linebreak[] = { +static const unsigned char ascii_linebreak[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0x000A, * LINE FEED */ /* 0x000B, * LINE TABULATION */ @@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = { #include "clinic/unicodeobject.c.h" +typedef enum { + _Py_ERROR_UNKNOWN=0, + _Py_ERROR_STRICT, + _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_REPLACE, + _Py_ERROR_IGNORE, + _Py_ERROR_BACKSLASHREPLACE, + _Py_ERROR_SURROGATEPASS, + _Py_ERROR_XMLCHARREFREPLACE, + _Py_ERROR_OTHER +} _Py_error_handler; + +static _Py_error_handler +get_error_handler(const char *errors) +{ + if (errors == NULL || strcmp(errors, "strict") == 0) + return _Py_ERROR_STRICT; + if (strcmp(errors, "surrogateescape") == 0) + return _Py_ERROR_SURROGATEESCAPE; + if (strcmp(errors, "replace") == 0) + return _Py_ERROR_REPLACE; + if (strcmp(errors, "ignore") == 0) + return _Py_ERROR_IGNORE; + if (strcmp(errors, "backslashreplace") == 0) + return _Py_ERROR_BACKSLASHREPLACE; + if (strcmp(errors, "surrogatepass") == 0) + return _Py_ERROR_SURROGATEPASS; + if (strcmp(errors, "xmlcharrefreplace") == 0) + return _Py_ERROR_XMLCHARREFREPLACE; + return _Py_ERROR_OTHER; +} + /* The max unicode value is always 0x10FFFF while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE @@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode) return _PyUnicode_Copy(unicode); } +/* Implementation of the "backslashreplace" error handler for 8-bit encodings: + ASCII, Latin1, UTF-8, etc. 
*/ +static char* +backslashreplace(_PyBytesWriter *writer, char *str, + PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ + Py_ssize_t size, i; + Py_UCS4 ch; + enum PyUnicode_Kind kind; + void *data; + + assert(PyUnicode_IS_READY(unicode)); + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + + size = 0; + /* determine replacement size */ + for (i = collstart; i < collend; ++i) { + Py_ssize_t incr; + + ch = PyUnicode_READ(kind, data, i); + if (ch < 0x100) + incr = 2+2; + else if (ch < 0x10000) + incr = 2+4; + else { + assert(ch <= MAX_UNICODE); + incr = 2+8; + } + if (size > PY_SSIZE_T_MAX - incr) { + PyErr_SetString(PyExc_OverflowError, + "encoded result is too long for a Python string"); + return NULL; + } + size += incr; + } + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + ch = PyUnicode_READ(kind, data, i); + *str++ = '\\'; + if (ch >= 0x00010000) { + *str++ = 'U'; + *str++ = Py_hexdigits[(ch>>28)&0xf]; + *str++ = Py_hexdigits[(ch>>24)&0xf]; + *str++ = Py_hexdigits[(ch>>20)&0xf]; + *str++ = Py_hexdigits[(ch>>16)&0xf]; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else if (ch >= 0x100) { + *str++ = 'u'; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else + *str++ = 'x'; + *str++ = Py_hexdigits[(ch>>4)&0xf]; + *str++ = Py_hexdigits[ch&0xf]; + } + return str; +} + +/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: + ASCII, Latin1, UTF-8, etc. 
*/ +static char* +xmlcharrefreplace(_PyBytesWriter *writer, char *str, + PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ + Py_ssize_t size, i; + Py_UCS4 ch; + enum PyUnicode_Kind kind; + void *data; + + assert(PyUnicode_IS_READY(unicode)); + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + + size = 0; + /* determine replacement size */ + for (i = collstart; i < collend; ++i) { + Py_ssize_t incr; + + ch = PyUnicode_READ(kind, data, i); + if (ch < 10) + incr = 2+1+1; + else if (ch < 100) + incr = 2+2+1; + else if (ch < 1000) + incr = 2+3+1; + else if (ch < 10000) + incr = 2+4+1; + else if (ch < 100000) + incr = 2+5+1; + else if (ch < 1000000) + incr = 2+6+1; + else { + assert(ch <= MAX_UNICODE); + incr = 2+7+1; + } + if (size > PY_SSIZE_T_MAX - incr) { + PyErr_SetString(PyExc_OverflowError, + "encoded result is too long for a Python string"); + return NULL; + } + size += incr; + } + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); + } + return str; +} + /* --- Bloom Filters ----------------------------------------------------- */ /* stuff to implement simple "bloom filters" for Unicode characters. @@ -587,6 +751,18 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len) #undef BLOOM_UPDATE } +static int +ensure_unicode(PyObject *obj) +{ + if (!PyUnicode_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "must be str, not %.100s", + Py_TYPE(obj)->tp_name); + return -1; + } + return PyUnicode_READY(obj); +} + /* Compilation of templated routines */ #include "stringlib/asciilib.h" @@ -647,27 +823,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind, Py_ssize_t size, Py_UCS4 ch, int direction) { - int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; - switch (kind) { case PyUnicode_1BYTE_KIND: - { - Py_UCS1 ch1 = (Py_UCS1) ch; - if (ch1 == ch) - return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); - else - return -1; - } + if ((Py_UCS1) ch != ch) + return -1; + if (direction > 0) + return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch); + else + return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch); case PyUnicode_2BYTE_KIND: - { - Py_UCS2 ch2 = (Py_UCS2) ch; - if (ch2 == ch) - return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); - else - return -1; - } + if ((Py_UCS2) ch != ch) + return -1; + if (direction > 0) + return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch); + else + return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch); case PyUnicode_4BYTE_KIND: - return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); + if (direction > 0) + return ucs4lib_find_char((Py_UCS4 *) s, size, ch); + else + return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch); default: assert(0); return -1; @@ -2903,7 +3078,7 @@ PyUnicode_FromEncodedObject(PyObject *obj, /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { PyErr_Format(PyExc_TypeError, - "coercing to str: need a bytes-like object, %.80s found", + "decoding to str: need a bytes-like object, %.80s found", Py_TYPE(obj)->tp_name); return NULL; } @@ -3167,24 +3342,22 @@ wcstombs_errorpos(const wchar_t *wstr) static int locale_error_handler(const char *errors, int *surrogateescape) { - if (errors == NULL) { - *surrogateescape = 0; - return 0; - } - - if (strcmp(errors, "strict") == 0) { + _Py_error_handler error_handler = get_error_handler(errors); + switch (error_handler) + { + case _Py_ERROR_STRICT: *surrogateescape = 0; return 0; - } - if (strcmp(errors, "surrogateescape") == 0) { + case _Py_ERROR_SURROGATEESCAPE: *surrogateescape = 1; return 0; + default: + PyErr_Format(PyExc_ValueError, + "only 'strict' and 
'surrogateescape' error handlers " + "are supported, not '%s'", + errors); + return -1; } - PyErr_Format(PyExc_ValueError, - "only 'strict' and 'surrogateescape' error handlers " - "are supported, not '%s'", - errors); - return -1; } PyObject * @@ -3626,19 +3799,17 @@ PyUnicode_FSConverter(PyObject* arg, void* addr) output = arg; Py_INCREF(output); } - else { - arg = PyUnicode_FromObject(arg); - if (!arg) - return 0; + else if (PyUnicode_Check(arg)) { output = PyUnicode_EncodeFSDefault(arg); - Py_DECREF(arg); if (!output) return 0; - if (!PyBytes_Check(output)) { - Py_DECREF(output); - PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); - return 0; - } + assert(PyBytes_Check(output)); + } + else { + PyErr_Format(PyExc_TypeError, + "must be str or bytes, not %.100s", + Py_TYPE(arg)->tp_name); + return 0; } size = PyBytes_GET_SIZE(output); data = PyBytes_AS_STRING(output); @@ -3710,7 +3881,7 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) if (PyUnicode_UTF8(unicode) == NULL) { assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); - bytes = _PyUnicode_AsUTF8String(unicode, "strict"); + bytes = _PyUnicode_AsUTF8String(unicode, NULL); if (bytes == NULL) return NULL; _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); @@ -3976,7 +4147,7 @@ unicode_decode_call_errorhandler_wchar( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, PyObject **output, Py_ssize_t *outpos) { - static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4084,7 +4255,7 @@ unicode_decode_call_errorhandler_writer( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) { - static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char 
*argparse = "O!n;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4690,8 +4861,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4716,6 +4888,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, while (s < end) { Py_UCS4 ch; int kind = writer.kind; + if (kind == PyUnicode_1BYTE_KIND) { if (PyUnicode_IS_ASCII(writer.buffer)) ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); @@ -4754,24 +4927,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s, continue; } - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "utf-8", errmsg, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_IGNORE: + s += (endinpos - startinpos); + break; + + case _Py_ERROR_REPLACE: + if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) + goto onError; + s += (endinpos - startinpos); + break; + + case _Py_ERROR_SURROGATEESCAPE: + { + Py_ssize_t i; + + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + for (i=startinpos; i<endinpos; i++) { + ch = (Py_UCS4)(unsigned char)(starts[i]); + PyUnicode_WRITE(writer.kind, writer.data, writer.pos, + ch + 0xdc00); + writer.pos++; + } + s += (endinpos - startinpos); + break; + } + + default: + if (unicode_decode_call_errorhandler_writer( + errors, &error_handler_obj, + "utf-8", errmsg, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &writer)) + goto onError; + } } End: if (consumed) *consumed = s - starts; - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); 
onError: - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); _PyUnicodeWriter_Dealloc(&writer); return NULL; @@ -5862,11 +6067,10 @@ PyObject * PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { Py_ssize_t i, len; - PyObject *repr; char *p; int kind; void *data; - Py_ssize_t expandsize = 0; + _PyBytesWriter writer; /* Initial allocation is based on the longest-possible character escape. @@ -5882,35 +6086,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) } if (PyUnicode_READY(unicode) == -1) return NULL; + + _PyBytesWriter_Init(&writer); + len = PyUnicode_GET_LENGTH(unicode); kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); - switch (kind) { - case PyUnicode_1BYTE_KIND: expandsize = 4; break; - case PyUnicode_2BYTE_KIND: expandsize = 6; break; - case PyUnicode_4BYTE_KIND: expandsize = 10; break; - } - - if (len == 0) - return PyBytes_FromStringAndSize(NULL, 0); - - if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) - return PyErr_NoMemory(); - - repr = PyBytes_FromStringAndSize(NULL, - 2 - + expandsize*len - + 1); - if (repr == NULL) - return NULL; - p = PyBytes_AS_STRING(repr); + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) + goto error; + writer.overallocate = 1; for (i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); /* Escape backslashes */ if (ch == '\\') { + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = (char) ch; continue; @@ -5919,6 +6116,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map 21-bit characters to '\U00xxxxxx' */ else if (ch >= 0x10000) { assert(ch <= MAX_UNICODE); + + p = _PyBytesWriter_Prepare(&writer, p, 10-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; @@ -5934,6 +6136,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map 16-bit characters to '\uxxxx' */ if (ch >= 256) { + p = 
_PyBytesWriter_Prepare(&writer, p, 6-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; @@ -5944,20 +6150,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map special whitespace to '\t', \n', '\r' */ else if (ch == '\t') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 't'; } else if (ch == '\n') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'n'; } else if (ch == '\r') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'r'; } /* Map non-printable US ASCII to '\xhh' */ else if (ch < ' ' || ch >= 0x7F) { + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 4-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'x'; *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; @@ -5969,10 +6192,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) *p++ = (char) ch; } - assert(p - PyBytes_AS_STRING(repr) > 0); - if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) - return NULL; - return repr; + return _PyBytesWriter_Finish(&writer, p); + +error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -6101,13 +6325,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject * PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) { - PyObject *repr; char *p; - char *q; - Py_ssize_t expandsize, pos; + Py_ssize_t pos; int kind; void *data; Py_ssize_t len; + _PyBytesWriter writer; if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); @@ -6115,28 +6338,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) } if (PyUnicode_READY(unicode) == -1) return NULL; + + _PyBytesWriter_Init(&writer); + kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); len = PyUnicode_GET_LENGTH(unicode); - /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 - bytes, and 1 byte characters 4. 
*/ - expandsize = kind * 2 + 2; - if (len > PY_SSIZE_T_MAX / expandsize) - return PyErr_NoMemory(); - - repr = PyBytes_FromStringAndSize(NULL, expandsize * len); - if (repr == NULL) - return NULL; - if (len == 0) - return repr; + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) + goto error; + writer.overallocate = 1; - p = q = PyBytes_AS_STRING(repr); for (pos = 0; pos < len; pos++) { Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* Map 32-bit characters to '\Uxxxxxxxx' */ if (ch >= 0x10000) { assert(ch <= MAX_UNICODE); + + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 10-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0xf]; @@ -6150,6 +6374,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) } /* Map 16-bit characters to '\uxxxx' */ else if (ch >= 256) { + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 6-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0xf]; @@ -6162,10 +6391,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) *p++ = (char) ch; } - assert(p > q); - if (_PyBytes_Resize(&repr, p - q) < 0) - return NULL; - return repr; + return _PyBytesWriter_Finish(&writer, p); + +error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -6342,7 +6572,7 @@ unicode_encode_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; + static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; Py_ssize_t len; PyObject *restuple; PyObject *resunicode; @@ -6396,25 +6626,22 @@ unicode_encode_call_errorhandler(const char *errors, static PyObject * unicode_encode_ucs1(PyObject *unicode, const char *errors, - unsigned int limit) + const Py_UCS4 limit) { /* input state */ Py_ssize_t pos=0, size; int kind; void *data; 
- /* output object */ - PyObject *res; /* pointer into the output */ char *str; - /* current output position */ - Py_ssize_t ressize; const char *encoding = (limit == 256) ? "latin-1" : "ascii"; const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; + PyObject *rep = NULL; + /* output object */ + _PyBytesWriter writer; if (PyUnicode_READY(unicode) == -1) return NULL; @@ -6425,186 +6652,157 @@ unicode_encode_ucs1(PyObject *unicode, replacements, if we need more, we'll resize */ if (size == 0) return PyBytes_FromStringAndSize(NULL, 0); - res = PyBytes_FromStringAndSize(NULL, size); - if (res == NULL) + + _PyBytesWriter_Init(&writer); + str = _PyBytesWriter_Alloc(&writer, size); + if (str == NULL) return NULL; - str = PyBytes_AS_STRING(res); - ressize = size; while (pos < size) { - Py_UCS4 c = PyUnicode_READ(kind, data, pos); + Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* can we encode this? */ - if (c<limit) { + if (ch < limit) { /* no overflow check, because we know that the space is enough */ - *str++ = (char)c; + *str++ = (char)ch; ++pos; } else { - Py_ssize_t requiredsize; - PyObject *repunicode; - Py_ssize_t repsize, newpos, respos, i; + Py_ssize_t newpos, i; /* startpos for collecting unencodable chars */ Py_ssize_t collstart = pos; - Py_ssize_t collend = pos; + Py_ssize_t collend = collstart + 1; /* find all unecodable characters */ + while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) ++collend; + + /* Only overallocate the buffer if it's not the last write */ + writer.overallocate = (collend < size); + /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ - if (known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - known_errorHandler = 4; - else - known_errorHandler = 0; - } - switch (known_errorHandler) { - case 1: /* strict */ + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); goto onError; - case 2: /* replace */ - while (collstart++ < collend) - *str++ = '?'; /* fall through */ - case 3: /* ignore */ + + case _Py_ERROR_REPLACE: + memset(str, '?', collend - collstart); + str += (collend - collstart); + /* fall through ignore error handler */ + case _Py_ERROR_IGNORE: pos = collend; break; - case 4: /* xmlcharrefreplace */ - respos = str - PyBytes_AS_STRING(res); - requiredsize = respos; - /* determine replacement size */ + + case _Py_ERROR_BACKSLASHREPLACE: + /* substract preallocated bytes */ + writer.min_size -= (collend - collstart); + str = backslashreplace(&writer, str, + unicode, collstart, collend); + if (str == NULL) + goto onError; + pos = collend; + break; + + case _Py_ERROR_XMLCHARREFREPLACE: + /* substract preallocated bytes */ + writer.min_size -= (collend - collstart); + str = xmlcharrefreplace(&writer, str, + unicode, collstart, collend); + if (str == NULL) + goto onError; + pos = collend; + break; + + case _Py_ERROR_SURROGATEESCAPE: for (i = collstart; i < collend; ++i) { - Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_ssize_t incr; - if (ch < 10) - incr = 2+1+1; - else if (ch < 100) - incr = 2+2+1; - else if (ch < 1000) - incr = 2+3+1; - else if (ch < 10000) - incr = 2+4+1; - else if (ch < 100000) - incr = 2+5+1; - else if (ch < 1000000) - incr = 2+6+1; - else { - assert(ch <= 
MAX_UNICODE); - incr = 2+7+1; + ch = PyUnicode_READ(kind, data, i); + if (ch < 0xdc80 || 0xdcff < ch) { + /* Not a UTF-8b surrogate */ + break; } - if (requiredsize > PY_SSIZE_T_MAX - incr) - goto overflow; - requiredsize += incr; - } - if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) - goto overflow; - requiredsize += size - collend; - if (requiredsize > ressize) { - if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) - requiredsize = 2*ressize; - if (_PyBytes_Resize(&res, requiredsize)) - goto onError; - str = PyBytes_AS_STRING(res) + respos; - ressize = requiredsize; + *str++ = (char)(ch - 0xdc00); + ++pos; } - /* generate replacement */ - for (i = collstart; i < collend; ++i) { - str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); - } - pos = collend; - break; + if (i >= collend) + break; + collstart = pos; + assert(collstart != collend); + /* fallback to general error handling */ + default: - repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, - encoding, reason, unicode, &exc, - collstart, collend, &newpos); - if (repunicode == NULL || (PyUnicode_Check(repunicode) && - PyUnicode_READY(repunicode) == -1)) + rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, + encoding, reason, unicode, &exc, + collstart, collend, &newpos); + if (rep == NULL) goto onError; - if (PyBytes_Check(repunicode)) { + + /* substract preallocated bytes */ + writer.min_size -= 1; + + if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ - repsize = PyBytes_Size(repunicode); - if (repsize > 1) { - /* Make room for all additional bytes. 
*/ - respos = str - PyBytes_AS_STRING(res); - if (ressize > PY_SSIZE_T_MAX - repsize - 1) { - Py_DECREF(repunicode); - goto overflow; - } - if (_PyBytes_Resize(&res, ressize+repsize-1)) { - Py_DECREF(repunicode); - goto onError; - } - str = PyBytes_AS_STRING(res) + respos; - ressize += repsize-1; - } - memcpy(str, PyBytes_AsString(repunicode), repsize); - str += repsize; - pos = newpos; - Py_DECREF(repunicode); - break; - } - /* need more space? (at least enough for what we - have+the replacement+the rest of the string, so - we won't have to check space for encodable characters) */ - respos = str - PyBytes_AS_STRING(res); - repsize = PyUnicode_GET_LENGTH(repunicode); - requiredsize = respos; - if (requiredsize > PY_SSIZE_T_MAX - repsize) - goto overflow; - requiredsize += repsize; - if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) - goto overflow; - requiredsize += size - collend; - if (requiredsize > ressize) { - if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) - requiredsize = 2*ressize; - if (_PyBytes_Resize(&res, requiredsize)) { - Py_DECREF(repunicode); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyBytes_AS_STRING(rep), + PyBytes_GET_SIZE(rep)); + if (str == NULL) goto onError; - } - str = PyBytes_AS_STRING(res) + respos; - ressize = requiredsize; } - /* check if there is anything unencodable in the replacement - and copy it to the output */ - for (i = 0; repsize-->0; ++i, ++str) { - c = PyUnicode_READ_CHAR(repunicode, i); - if (c >= limit) { - raise_encode_exception(&exc, encoding, unicode, - pos, pos+1, reason); - Py_DECREF(repunicode); + else { + assert(PyUnicode_Check(rep)); + + if (PyUnicode_READY(rep) < 0) goto onError; + + if (PyUnicode_IS_ASCII(rep)) { + /* Fast path: all characters are smaller than limit */ + assert(limit >= 128); + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); + } + else { + Py_ssize_t repsize = 
PyUnicode_GET_LENGTH(rep); + + str = _PyBytesWriter_Prepare(&writer, str, repsize); + if (str == NULL) + goto onError; + + /* check if there is anything unencodable in the + replacement and copy it to the output */ + for (i = 0; repsize-->0; ++i, ++str) { + ch = PyUnicode_READ_CHAR(rep, i); + if (ch >= limit) { + raise_encode_exception(&exc, encoding, unicode, + pos, pos+1, reason); + goto onError; + } + *str = (char)ch; + } } - *str = (char)c; } pos = newpos; - Py_DECREF(repunicode); + Py_CLEAR(rep); } + + /* If overallocation was disabled, ensure that it was the last + write. Otherwise, we missed an optimization */ + assert(writer.overallocate || pos == size); } } - /* Resize if we allocated to much */ - size = str - PyBytes_AS_STRING(res); - if (size < ressize) { /* If this falls res will be NULL */ - assert(size >= 0); - if (_PyBytes_Resize(&res, size) < 0) - goto onError; - } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - return res; - - overflow: - PyErr_SetString(PyExc_OverflowError, - "encoded result is too long for a Python string"); + return _PyBytesWriter_Finish(&writer, str); onError: - Py_XDECREF(res); - Py_XDECREF(errorHandler); + Py_XDECREF(rep); + _PyBytesWriter_Dealloc(&writer); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -6664,8 +6862,9 @@ PyUnicode_DecodeASCII(const char *s, Py_ssize_t endinpos; Py_ssize_t outpos; const char *e; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) _Py_RETURN_UNICODE_EMPTY(); @@ -6694,12 +6893,42 @@ PyUnicode_DecodeASCII(const char *s, PyUnicode_WRITE(kind, data, writer.pos, c); writer.pos++; ++s; + continue; } - else { + + /* byte outsize range 0x00..0x7f: call the error handler */ + + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + case 
_Py_ERROR_SURROGATEESCAPE: + /* Fast-path: the error handler only writes one character, + but we may switch to UCS2 at the first write */ + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + kind = writer.kind; + data = writer.data; + + if (error_handler == _Py_ERROR_REPLACE) + PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); + else + PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); + writer.pos++; + ++s; + break; + + case _Py_ERROR_IGNORE: + ++s; + break; + + default: startinpos = s-starts; endinpos = startinpos + 1; if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, + errors, &error_handler_obj, "ascii", "ordinal not in range(128)", &starts, &e, &startinpos, &endinpos, &exc, &s, &writer)) @@ -6708,13 +6937,13 @@ PyUnicode_DecodeASCII(const char *s, data = writer.data; } } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: _PyUnicodeWriter_Dealloc(&writer); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -6769,7 +6998,7 @@ PyUnicode_AsASCIIString(PyObject *unicode) # define WC_ERR_INVALID_CHARS 0x0080 #endif -static char* +static const char* code_page_name(UINT code_page, PyObject **obj) { *obj = NULL; @@ -6877,7 +7106,7 @@ decode_code_page_errors(UINT code_page, PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *encoding_obj = NULL; - char *encoding; + const char *encoding; DWORD err; int ret = -1; @@ -7113,7 +7342,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, BOOL usedDefaultChar = FALSE; BOOL *pusedDefaultChar = &usedDefaultChar; int outsize; - PyObject *exc = NULL; wchar_t *p; Py_ssize_t size; const DWORD flags = encode_code_page_flags(code_page, NULL); @@ -7222,7 +7450,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *encoding_obj = NULL; - char *encoding; + const char 
*encoding; Py_ssize_t newpos, newoutsize; PyObject *rep; int ret = -1; @@ -8080,7 +8308,7 @@ static int charmap_encoding_error( PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, PyObject **exceptionObject, - int *known_errorHandler, PyObject **errorHandler, const char *errors, + _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors, PyObject **res, Py_ssize_t *respos) { PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ @@ -8127,23 +8355,15 @@ charmap_encoding_error( } /* cache callback name lookup * (if not done yet, i.e. it's the first error) */ - if (*known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - *known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - *known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - *known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - *known_errorHandler = 4; - else - *known_errorHandler = 0; - } - switch (*known_errorHandler) { - case 1: /* strict */ + if (*error_handler == _Py_ERROR_UNKNOWN) + *error_handler = get_error_handler(errors); + + switch (*error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); return -1; - case 2: /* replace */ + + case _Py_ERROR_REPLACE: for (collpos = collstartpos; collpos<collendpos; ++collpos) { x = charmapencode_output('?', mapping, res, respos); if (x==enc_EXCEPTION) { @@ -8155,10 +8375,11 @@ charmap_encoding_error( } } /* fall through */ - case 3: /* ignore */ + case _Py_ERROR_IGNORE: *inpos = collendpos; break; - case 4: /* xmlcharrefreplace */ + + case _Py_ERROR_XMLCHARREFREPLACE: /* generate replacement (temporarily (mis)uses p) */ for (collpos = collstartpos; collpos < collendpos; ++collpos) { char buffer[2+29+1+1]; @@ -8176,8 +8397,9 @@ charmap_encoding_error( } *inpos = collendpos; break; + default: - repunicode = unicode_encode_call_errorhandler(errors, errorHandler, + repunicode = 
unicode_encode_call_errorhandler(errors, error_handler_obj, encoding, reason, unicode, exceptionObject, collstartpos, collendpos, &newpos); if (repunicode == NULL) @@ -8240,12 +8462,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, Py_ssize_t size; /* current output position */ Py_ssize_t respos = 0; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, - * 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; void *data; int kind; @@ -8276,7 +8495,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, if (x==enc_FAILED) { /* unencodable character */ if (charmap_encoding_error(unicode, &inpos, mapping, &exc, - &known_errorHandler, &errorHandler, errors, + &error_handler, &error_handler_obj, errors, &res, &respos)) { goto onError; } @@ -8292,13 +8511,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, goto onError; Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return res; onError: Py_XDECREF(res); Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return NULL; } @@ -8365,7 +8584,7 @@ unicode_translate_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static char *argparse = "O!n;translating error handler must return (str, int) tuple"; + static const char *argparse = "O!n;translating error handler must return (str, int) tuple"; Py_ssize_t i_newpos; PyObject *restuple; @@ -8622,7 +8841,7 @@ exit: return res; } -PyObject * +static PyObject * _PyUnicode_TranslateCharmap(PyObject *input, PyObject *mapping, const char *errors) @@ -8651,10 +8870,8 @@ _PyUnicode_TranslateCharmap(PyObject *input, kind = PyUnicode_KIND(input); size = PyUnicode_GET_LENGTH(input); - if (size == 0) { - Py_INCREF(input); - return input; - } + if (size == 0) + return 
PyUnicode_FromObject(input); /* allocate enough for a simple 1:1 translation without replacements, if we need more, we'll resize */ @@ -8765,14 +8982,9 @@ PyUnicode_Translate(PyObject *str, PyObject *mapping, const char *errors) { - PyObject *result; - - str = PyUnicode_FromObject(str); - if (str == NULL) + if (ensure_unicode(str) < 0) return NULL; - result = _PyUnicode_TranslateCharmap(str, mapping, errors); - Py_DECREF(str); - return result; + return _PyUnicode_TranslateCharmap(str, mapping, errors); } static Py_UCS4 @@ -8954,9 +9166,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, } static Py_ssize_t -any_find_slice(int direction, PyObject* s1, PyObject* s2, +any_find_slice(PyObject* s1, PyObject* s2, Py_ssize_t start, - Py_ssize_t end) + Py_ssize_t end, + int direction) { int kind1, kind2; void *buf1, *buf2; @@ -9125,54 +9338,35 @@ PyUnicode_Count(PyObject *str, Py_ssize_t end) { Py_ssize_t result; - PyObject* str_obj; - PyObject* sub_obj; int kind1, kind2; void *buf1 = NULL, *buf2 = NULL; Py_ssize_t len1, len2; - str_obj = PyUnicode_FromObject(str); - if (!str_obj) + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -1; - sub_obj = PyUnicode_FromObject(substr); - if (!sub_obj) { - Py_DECREF(str_obj); - return -1; - } - if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { - Py_DECREF(sub_obj); - Py_DECREF(str_obj); - return -1; - } - kind1 = PyUnicode_KIND(str_obj); - kind2 = PyUnicode_KIND(sub_obj); - if (kind1 < kind2) { - Py_DECREF(sub_obj); - Py_DECREF(str_obj); + kind1 = PyUnicode_KIND(str); + kind2 = PyUnicode_KIND(substr); + if (kind1 < kind2) return 0; - } - len1 = PyUnicode_GET_LENGTH(str_obj); - len2 = PyUnicode_GET_LENGTH(sub_obj); + len1 = PyUnicode_GET_LENGTH(str); + len2 = PyUnicode_GET_LENGTH(substr); ADJUST_INDICES(start, end, len1); - if (end - start < len2) { - Py_DECREF(sub_obj); - Py_DECREF(str_obj); + if (end - start < len2) return 0; - } - buf1 = PyUnicode_DATA(str_obj); - buf2 = PyUnicode_DATA(sub_obj); + 
buf1 = PyUnicode_DATA(str); + buf2 = PyUnicode_DATA(substr); if (kind2 != kind1) { - buf2 = _PyUnicode_AsKind(sub_obj, kind1); + buf2 = _PyUnicode_AsKind(substr, kind1); if (!buf2) goto onError; } switch (kind1) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) + if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) result = asciilib_count( ((Py_UCS1*)buf1) + start, end - start, buf2, len2, PY_SSIZE_T_MAX @@ -9199,16 +9393,11 @@ PyUnicode_Count(PyObject *str, assert(0); result = 0; } - Py_DECREF(sub_obj); - Py_DECREF(str_obj); - if (kind2 != kind1) PyMem_Free(buf2); return result; onError: - Py_DECREF(sub_obj); - Py_DECREF(str_obj); if (kind2 != kind1 && buf2) PyMem_Free(buf2); return -1; @@ -9216,35 +9405,15 @@ PyUnicode_Count(PyObject *str, Py_ssize_t PyUnicode_Find(PyObject *str, - PyObject *sub, + PyObject *substr, Py_ssize_t start, Py_ssize_t end, int direction) { - Py_ssize_t result; - - str = PyUnicode_FromObject(str); - if (!str) - return -2; - sub = PyUnicode_FromObject(sub); - if (!sub) { - Py_DECREF(str); + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -2; - } - if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { - Py_DECREF(sub); - Py_DECREF(str); - return -2; - } - result = any_find_slice(direction, - str, sub, start, end - ); - - Py_DECREF(str); - Py_DECREF(sub); - - return result; + return any_find_slice(str, substr, start, end, direction); } Py_ssize_t @@ -9347,22 +9516,10 @@ PyUnicode_Tailmatch(PyObject *str, Py_ssize_t end, int direction) { - Py_ssize_t result; - - str = PyUnicode_FromObject(str); - if (str == NULL) - return -1; - substr = PyUnicode_FromObject(substr); - if (substr == NULL) { - Py_DECREF(str); + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -1; - } - result = tailmatch(str, substr, - start, end, direction); - Py_DECREF(str); - Py_DECREF(substr); - return result; + return tailmatch(str, substr, start, end, direction); } /* Apply 
fixfct filter to the Unicode object self and return a @@ -9968,13 +10125,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends) { PyObject *list; - string = PyUnicode_FromObject(string); - if (string == NULL) + if (ensure_unicode(string) < 0) return NULL; - if (PyUnicode_READY(string) == -1) { - Py_DECREF(string); - return NULL; - } switch (PyUnicode_KIND(string)) { case PyUnicode_1BYTE_KIND: @@ -10001,7 +10153,6 @@ PyUnicode_Splitlines(PyObject *string, int keepends) assert(0); list = 0; } - Py_DECREF(string); return list; } @@ -10562,28 +10713,27 @@ unicode_casefold(PyObject *self) } -/* Argument converter. Coerces to a single unicode character */ +/* Argument converter. Accepts a single Unicode character. */ static int convert_uc(PyObject *obj, void *addr) { Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; - PyObject *uniobj; - uniobj = PyUnicode_FromObject(obj); - if (uniobj == NULL) { - PyErr_SetString(PyExc_TypeError, - "The fill character cannot be converted to Unicode"); + if (!PyUnicode_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "The fill character must be a unicode character, " + "not %.100s", Py_TYPE(obj)->tp_name); return 0; } - if (PyUnicode_GET_LENGTH(uniobj) != 1) { + if (PyUnicode_READY(obj) < 0) + return 0; + if (PyUnicode_GET_LENGTH(obj) != 1) { PyErr_SetString(PyExc_TypeError, "The fill character must be exactly one character long"); - Py_DECREF(uniobj); return 0; } - *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); - Py_DECREF(uniobj); + *fillcharloc = PyUnicode_READ_CHAR(obj, 0); return 1; } @@ -10899,59 +11049,49 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) } int -PyUnicode_Contains(PyObject *container, PyObject *element) +_PyUnicode_EQ(PyObject *aa, PyObject *bb) +{ + return unicode_eq(aa, bb); +} + +int +PyUnicode_Contains(PyObject *str, PyObject *substr) { - PyObject *str, *sub; int kind1, kind2; void *buf1, *buf2; Py_ssize_t len1, len2; int result; - /* Coerce the two arguments */ - sub = PyUnicode_FromObject(element); - 
if (!sub) { + if (!PyUnicode_Check(substr)) { PyErr_Format(PyExc_TypeError, - "'in <string>' requires string as left operand, not %s", - element->ob_type->tp_name); + "'in <string>' requires string as left operand, not %.100s", + Py_TYPE(substr)->tp_name); return -1; } - - str = PyUnicode_FromObject(container); - if (!str) { - Py_DECREF(sub); + if (PyUnicode_READY(substr) == -1) + return -1; + if (ensure_unicode(str) < 0) return -1; - } kind1 = PyUnicode_KIND(str); - kind2 = PyUnicode_KIND(sub); - if (kind1 < kind2) { - Py_DECREF(sub); - Py_DECREF(str); + kind2 = PyUnicode_KIND(substr); + if (kind1 < kind2) return 0; - } len1 = PyUnicode_GET_LENGTH(str); - len2 = PyUnicode_GET_LENGTH(sub); - if (len1 < len2) { - Py_DECREF(sub); - Py_DECREF(str); + len2 = PyUnicode_GET_LENGTH(substr); + if (len1 < len2) return 0; - } buf1 = PyUnicode_DATA(str); - buf2 = PyUnicode_DATA(sub); + buf2 = PyUnicode_DATA(substr); if (len2 == 1) { Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; - Py_DECREF(sub); - Py_DECREF(str); return result; } if (kind2 != kind1) { - buf2 = _PyUnicode_AsKind(sub, kind1); - if (!buf2) { - Py_DECREF(sub); - Py_DECREF(str); + buf2 = _PyUnicode_AsKind(substr, kind1); + if (!buf2) return -1; - } } switch (kind1) { @@ -10969,9 +11109,6 @@ PyUnicode_Contains(PyObject *container, PyObject *element) assert(0); } - Py_DECREF(str); - Py_DECREF(sub); - if (kind2 != kind1) PyMem_Free(buf2); @@ -10983,56 +11120,40 @@ PyUnicode_Contains(PyObject *container, PyObject *element) PyObject * PyUnicode_Concat(PyObject *left, PyObject *right) { - PyObject *u = NULL, *v = NULL, *w; + PyObject *result; Py_UCS4 maxchar, maxchar2; - Py_ssize_t u_len, v_len, new_len; + Py_ssize_t left_len, right_len, new_len; - /* Coerce the two arguments */ - u = PyUnicode_FromObject(left); - if (u == NULL) - goto onError; - v = PyUnicode_FromObject(right); - if (v == NULL) - goto onError; + if (ensure_unicode(left) < 0 || 
ensure_unicode(right) < 0) + return NULL; /* Shortcuts */ - if (v == unicode_empty) { - Py_DECREF(v); - return u; - } - if (u == unicode_empty) { - Py_DECREF(u); - return v; - } + if (left == unicode_empty) + return PyUnicode_FromObject(right); + if (right == unicode_empty) + return PyUnicode_FromObject(left); - u_len = PyUnicode_GET_LENGTH(u); - v_len = PyUnicode_GET_LENGTH(v); - if (u_len > PY_SSIZE_T_MAX - v_len) { + left_len = PyUnicode_GET_LENGTH(left); + right_len = PyUnicode_GET_LENGTH(right); + if (left_len > PY_SSIZE_T_MAX - right_len) { PyErr_SetString(PyExc_OverflowError, "strings are too large to concat"); - goto onError; + return NULL; } - new_len = u_len + v_len; + new_len = left_len + right_len; - maxchar = PyUnicode_MAX_CHAR_VALUE(u); - maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); + maxchar = PyUnicode_MAX_CHAR_VALUE(left); + maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); maxchar = Py_MAX(maxchar, maxchar2); /* Concat the two Unicode strings */ - w = PyUnicode_New(new_len, maxchar); - if (w == NULL) - goto onError; - _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); - _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); - Py_DECREF(u); - Py_DECREF(v); - assert(_PyUnicode_CheckConsistency(w, 1)); - return w; - - onError: - Py_XDECREF(u); - Py_XDECREF(v); - return NULL; + result = PyUnicode_New(new_len, maxchar); + if (result == NULL) + return NULL; + _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); + _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); + assert(_PyUnicode_CheckConsistency(result, 1)); + return result; } void @@ -11123,6 +11244,25 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) Py_XDECREF(right); } +/* +Wraps stringlib_parse_args_finds() and additionally ensures that the +first argument is a unicode object. 
+*/ + +Py_LOCAL_INLINE(int) +parse_args_finds_unicode(const char * function_name, PyObject *args, + PyObject **substring, + Py_ssize_t *start, Py_ssize_t *end) +{ + if(stringlib_parse_args_finds(function_name, args, substring, + start, end)) { + if (ensure_unicode(*substring) < 0) + return 0; + return 1; + } + return 0; +} + PyDoc_STRVAR(count__doc__, "S.count(sub[, start[, end]]) -> int\n\ \n\ @@ -11141,31 +11281,26 @@ unicode_count(PyObject *self, PyObject *args) void *buf1, *buf2; Py_ssize_t len1, len2, iresult; - if (!stringlib_parse_args_finds_unicode("count", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) return NULL; kind1 = PyUnicode_KIND(self); kind2 = PyUnicode_KIND(substring); - if (kind1 < kind2) { - Py_DECREF(substring); + if (kind1 < kind2) return PyLong_FromLong(0); - } + len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); ADJUST_INDICES(start, end, len1); - if (end - start < len2) { - Py_DECREF(substring); + if (end - start < len2) return PyLong_FromLong(0); - } + buf1 = PyUnicode_DATA(self); buf2 = PyUnicode_DATA(substring); if (kind2 != kind1) { buf2 = _PyUnicode_AsKind(substring, kind1); - if (!buf2) { - Py_DECREF(substring); + if (!buf2) return NULL; - } } switch (kind1) { case PyUnicode_1BYTE_KIND: @@ -11195,8 +11330,6 @@ unicode_count(PyObject *self, PyObject *args) if (kind2 != kind1) PyMem_Free(buf2); - Py_DECREF(substring); - return result; } @@ -11330,22 +11463,13 @@ unicode_find(PyObject *self, PyObject *args) Py_ssize_t end = 0; Py_ssize_t result; - if (!stringlib_parse_args_finds_unicode("find", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - result = any_find_slice(1, self, substring, 
start, end); - - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, 1); if (result == -2) return NULL; @@ -11418,22 +11542,13 @@ unicode_index(PyObject *self, PyObject *args) Py_ssize_t start = 0; Py_ssize_t end = 0; - if (!stringlib_parse_args_finds_unicode("index", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - result = any_find_slice(1, self, substring, start, end); - - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, 1); if (result == -2) return NULL; @@ -11947,7 +12062,7 @@ unicode_lower(PyObject *self) #define BOTHSTRIP 2 /* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; +static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; #define STRIPNAME(i) (stripformat[i]+3) @@ -12242,40 +12357,15 @@ unicode_repeat(PyObject *str, Py_ssize_t len) } PyObject * -PyUnicode_Replace(PyObject *obj, - PyObject *subobj, - PyObject *replobj, +PyUnicode_Replace(PyObject *str, + PyObject *substr, + PyObject *replstr, Py_ssize_t maxcount) { - PyObject *self; - PyObject *str1; - PyObject *str2; - PyObject *result; - - self = PyUnicode_FromObject(obj); - if (self == NULL) + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 || + ensure_unicode(replstr) < 0) return NULL; - str1 = PyUnicode_FromObject(subobj); - if (str1 == NULL) { - Py_DECREF(self); - return NULL; - } - str2 = PyUnicode_FromObject(replobj); - if (str2 == NULL) { - Py_DECREF(self); - Py_DECREF(str1); - return NULL; - } - if (PyUnicode_READY(self) == -1 || - PyUnicode_READY(str1) == -1 || - PyUnicode_READY(str2) == -1) - result = NULL; - else - result = replace(self, str1, str2, maxcount); - Py_DECREF(self); 
- Py_DECREF(str1); - Py_DECREF(str2); - return result; + return replace(str, substr, replstr, maxcount); } PyDoc_STRVAR(replace__doc__, @@ -12291,28 +12381,12 @@ unicode_replace(PyObject *self, PyObject *args) PyObject *str1; PyObject *str2; Py_ssize_t maxcount = -1; - PyObject *result; - if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) + if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount)) return NULL; if (PyUnicode_READY(self) == -1) return NULL; - str1 = PyUnicode_FromObject(str1); - if (str1 == NULL) - return NULL; - str2 = PyUnicode_FromObject(str2); - if (str2 == NULL) { - Py_DECREF(str1); - return NULL; - } - if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) - result = NULL; - else - result = replace(self, str1, str2, maxcount); - - Py_DECREF(str1); - Py_DECREF(str2); - return result; + return replace(self, str1, str2, maxcount); } static PyObject * @@ -12497,22 +12571,13 @@ unicode_rfind(PyObject *self, PyObject *args) Py_ssize_t end = 0; Py_ssize_t result; - if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - result = any_find_slice(-1, self, substring, start, end); - - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, -1); if (result == -2) return NULL; @@ -12534,22 +12599,13 @@ unicode_rindex(PyObject *self, PyObject *args) Py_ssize_t end = 0; Py_ssize_t result; - if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - 
Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - - result = any_find_slice(-1, self, substring, start, end); - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, -1); if (result == -2) return NULL; @@ -12589,24 +12645,10 @@ unicode_rjust(PyObject *self, PyObject *args) PyObject * PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) { - PyObject *result; - - s = PyUnicode_FromObject(s); - if (s == NULL) + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - if (sep != NULL) { - sep = PyUnicode_FromObject(sep); - if (sep == NULL) { - Py_DECREF(s); - return NULL; - } - } - - result = split(s, sep, maxsplit); - Py_DECREF(s); - Py_XDECREF(sep); - return result; + return split(s, sep, maxsplit); } PyDoc_STRVAR(split__doc__, @@ -12631,35 +12673,26 @@ unicode_split(PyObject *self, PyObject *args, PyObject *kwds) if (substring == Py_None) return split(self, NULL, maxcount); - else if (PyUnicode_Check(substring)) + + if (PyUnicode_Check(substring)) return split(self, substring, maxcount); - else - return PyUnicode_Split(self, substring, maxcount); + + PyErr_Format(PyExc_TypeError, + "must be str or None, not %.100s", + Py_TYPE(substring)->tp_name); + return NULL; } PyObject * -PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) +PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) { - PyObject* str_obj; - PyObject* sep_obj; PyObject* out; int kind1, kind2; void *buf1, *buf2; Py_ssize_t len1, len2; - str_obj = PyUnicode_FromObject(str_in); - if (!str_obj) + if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) return NULL; - sep_obj = PyUnicode_FromObject(sep_in); - if (!sep_obj) { - Py_DECREF(str_obj); - return NULL; - } - if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { - Py_DECREF(sep_obj); - Py_DECREF(str_obj); - return NULL; - } kind1 = PyUnicode_KIND(str_obj); kind2 = PyUnicode_KIND(sep_obj); @@ -12673,8 +12706,6 @@ 
PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); Py_DECREF(unicode_empty); } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); return out; } buf1 = PyUnicode_DATA(str_obj); @@ -12682,7 +12713,7 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) if (kind2 != kind1) { buf2 = _PyUnicode_AsKind(sep_obj, kind1); if (!buf2) - goto onError; + return NULL; } switch (kind1) { @@ -12703,39 +12734,23 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) out = 0; } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); if (kind2 != kind1) PyMem_Free(buf2); return out; - onError: - Py_DECREF(sep_obj); - Py_DECREF(str_obj); - if (kind2 != kind1 && buf2) - PyMem_Free(buf2); - return NULL; } PyObject * -PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) +PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) { - PyObject* str_obj; - PyObject* sep_obj; PyObject* out; int kind1, kind2; void *buf1, *buf2; Py_ssize_t len1, len2; - str_obj = PyUnicode_FromObject(str_in); - if (!str_obj) - return NULL; - sep_obj = PyUnicode_FromObject(sep_in); - if (!sep_obj) { - Py_DECREF(str_obj); + if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) return NULL; - } kind1 = PyUnicode_KIND(str_obj); kind2 = PyUnicode_KIND(sep_obj); @@ -12749,8 +12764,6 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); Py_DECREF(unicode_empty); } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); return out; } buf1 = PyUnicode_DATA(str_obj); @@ -12758,7 +12771,7 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) if (kind2 != kind1) { buf2 = _PyUnicode_AsKind(sep_obj, kind1); if (!buf2) - goto onError; + return NULL; } switch (kind1) { @@ -12779,18 +12792,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) out = 0; } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); if (kind2 != kind1) PyMem_Free(buf2); return out; - onError: - Py_DECREF(sep_obj); - 
Py_DECREF(str_obj); - if (kind2 != kind1 && buf2) - PyMem_Free(buf2); - return NULL; } PyDoc_STRVAR(partition__doc__, @@ -12822,24 +12827,10 @@ unicode_rpartition(PyObject *self, PyObject *separator) PyObject * PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) { - PyObject *result; - - s = PyUnicode_FromObject(s); - if (s == NULL) + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - if (sep != NULL) { - sep = PyUnicode_FromObject(sep); - if (sep == NULL) { - Py_DECREF(s); - return NULL; - } - } - result = rsplit(s, sep, maxsplit); - - Py_DECREF(s); - Py_XDECREF(sep); - return result; + return rsplit(s, sep, maxsplit); } PyDoc_STRVAR(rsplit__doc__, @@ -12864,10 +12855,14 @@ unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) if (substring == Py_None) return rsplit(self, NULL, maxcount); - else if (PyUnicode_Check(substring)) + + if (PyUnicode_Check(substring)) return rsplit(self, substring, maxcount); - else - return PyUnicode_RSplit(self, substring, maxcount); + + PyErr_Format(PyExc_TypeError, + "must be str or None, not %.100s", + Py_TYPE(substring)->tp_name); + return NULL; } PyDoc_STRVAR(splitlines__doc__, @@ -13148,11 +13143,15 @@ unicode_startswith(PyObject *self, if (PyTuple_Check(subobj)) { Py_ssize_t i; for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); - if (substring == NULL) + substring = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substring)) { + PyErr_Format(PyExc_TypeError, + "tuple for startswith must only contain str, " + "not %.100s", + Py_TYPE(substring)->tp_name); return NULL; + } result = tailmatch(self, substring, start, end, -1); - Py_DECREF(substring); if (result == -1) return NULL; if (result) { @@ -13162,15 +13161,13 @@ unicode_startswith(PyObject *self, /* nothing matched */ Py_RETURN_FALSE; } - substring = PyUnicode_FromObject(subobj); - if (substring == NULL) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) 
- PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " - "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); + if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "startswith first arg must be str or " + "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); return NULL; } - result = tailmatch(self, substring, start, end, -1); - Py_DECREF(substring); + result = tailmatch(self, subobj, start, end, -1); if (result == -1) return NULL; return PyBool_FromLong(result); @@ -13200,12 +13197,15 @@ unicode_endswith(PyObject *self, if (PyTuple_Check(subobj)) { Py_ssize_t i; for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - substring = PyUnicode_FromObject( - PyTuple_GET_ITEM(subobj, i)); - if (substring == NULL) + substring = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substring)) { + PyErr_Format(PyExc_TypeError, + "tuple for endswith must only contain str, " + "not %.100s", + Py_TYPE(substring)->tp_name); return NULL; + } result = tailmatch(self, substring, start, end, +1); - Py_DECREF(substring); if (result == -1) return NULL; if (result) { @@ -13214,15 +13214,13 @@ unicode_endswith(PyObject *self, } Py_RETURN_FALSE; } - substring = PyUnicode_FromObject(subobj); - if (substring == NULL) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " - "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); + if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "endswith first arg must be str or " + "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); return NULL; } - result = tailmatch(self, substring, start, end, +1); - Py_DECREF(substring); + result = tailmatch(self, subobj, start, end, +1); if (result == -1) return NULL; return PyBool_FromLong(result); @@ -13231,44 +13229,50 @@ unicode_endswith(PyObject *self, Py_LOCAL_INLINE(void) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) { - if (!writer->readonly) + writer->maxchar = 
PyUnicode_MAX_CHAR_VALUE(writer->buffer); + writer->data = PyUnicode_DATA(writer->buffer); + + if (!writer->readonly) { + writer->kind = PyUnicode_KIND(writer->buffer); writer->size = PyUnicode_GET_LENGTH(writer->buffer); + } else { + /* use a value smaller than PyUnicode_1BYTE_KIND() so + _PyUnicodeWriter_PrepareKind() will copy the buffer. */ + writer->kind = PyUnicode_WCHAR_KIND; + assert(writer->kind <= PyUnicode_1BYTE_KIND); + /* Copy-on-write mode: set buffer size to 0 so * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on * next write. */ writer->size = 0; } - writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); - writer->data = PyUnicode_DATA(writer->buffer); - writer->kind = PyUnicode_KIND(writer->buffer); } void _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { memset(writer, 0, sizeof(*writer)); -#ifdef Py_DEBUG - writer->kind = 5; /* invalid kind */ -#endif + + /* ASCII is the bare minimum */ writer->min_char = 127; + + /* use a value smaller than PyUnicode_1BYTE_KIND() so + _PyUnicodeWriter_PrepareKind() will copy the buffer. 
*/ + writer->kind = PyUnicode_WCHAR_KIND; + assert(writer->kind <= PyUnicode_1BYTE_KIND); } int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar) { -#ifdef MS_WINDOWS - /* On Windows, overallocate by 50% is the best factor */ -# define OVERALLOCATE_FACTOR 2 -#else - /* On Linux, overallocate by 25% is the best factor */ -# define OVERALLOCATE_FACTOR 4 -#endif Py_ssize_t newlen; PyObject *newbuffer; - assert(length > 0); + /* ensure that the _PyUnicodeWriter_Prepare macro was used */ + assert((maxchar > writer->maxchar && length >= 0) + || length > 0); if (length > PY_SSIZE_T_MAX - writer->pos) { PyErr_NoMemory(); @@ -13334,6 +13338,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, #undef OVERALLOCATE_FACTOR } +int +_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, + enum PyUnicode_Kind kind) +{ + Py_UCS4 maxchar; + + /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ + assert(writer->kind < kind); + + switch (kind) + { + case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; + case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; + case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; + default: + assert(0 && "invalid kind"); + return -1; + } + + return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); +} + Py_LOCAL_INLINE(int) _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) { @@ -13504,17 +13530,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) assert(PyUnicode_GET_LENGTH(str) == writer->pos); return str; } - if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { - PyObject *newbuffer; - newbuffer = resize_compact(writer->buffer, writer->pos); - if (newbuffer == NULL) { - Py_CLEAR(writer->buffer); - return NULL; + if (writer->pos == 0) { + Py_CLEAR(writer->buffer); + + /* Get the empty Unicode string singleton ('') */ + _Py_INCREF_UNICODE_EMPTY(); + str = unicode_empty; + } + else { + str = writer->buffer; + writer->buffer = NULL; + + if 
(PyUnicode_GET_LENGTH(str) != writer->pos) { + PyObject *str2; + str2 = resize_compact(str, writer->pos); + if (str2 == NULL) + return NULL; + str = str2; } - writer->buffer = newbuffer; } - str = writer->buffer; - writer->buffer = NULL; + assert(_PyUnicode_CheckConsistency(str, 1)); return unicode_result_ready(str); } @@ -14655,13 +14690,10 @@ PyUnicode_Format(PyObject *format, PyObject *args) return NULL; } - ctx.fmtstr = PyUnicode_FromObject(format); - if (ctx.fmtstr == NULL) - return NULL; - if (PyUnicode_READY(ctx.fmtstr) == -1) { - Py_DECREF(ctx.fmtstr); + if (ensure_unicode(format) < 0) return NULL; - } + + ctx.fmtstr = format; ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); @@ -14721,11 +14753,9 @@ PyUnicode_Format(PyObject *format, PyObject *args) if (ctx.args_owned) { Py_DECREF(ctx.args); } - Py_DECREF(ctx.fmtstr); return _PyUnicodeWriter_Finish(&ctx.writer); onError: - Py_DECREF(ctx.fmtstr); _PyUnicodeWriter_Dealloc(&ctx.writer); if (ctx.args_owned) { Py_DECREF(ctx.args); diff --git a/Objects/weakrefobject.c b/Objects/weakrefobject.c index 7e6f364..f75b1e8 100644 --- a/Objects/weakrefobject.c +++ b/Objects/weakrefobject.c @@ -265,7 +265,7 @@ insert_head(PyWeakReference *newref, PyWeakReference **list) } static int -parse_weakref_init_args(char *funcname, PyObject *args, PyObject *kwargs, +parse_weakref_init_args(const char *funcname, PyObject *args, PyObject *kwargs, PyObject **obp, PyObject **callbackp) { return PyArg_UnpackTuple(args, funcname, 1, 2, obp, callbackp); |