diff options
-rw-r--r-- | Makefile.pre.in | 1 | ||||
-rw-r--r-- | Misc/NEWS | 4 | ||||
-rw-r--r-- | Objects/bytearrayobject.c | 506 | ||||
-rw-r--r-- | Objects/bytesobject.c | 523 | ||||
-rw-r--r-- | Objects/stringlib/count.h | 16 | ||||
-rw-r--r-- | Objects/stringlib/ctype.h | 1 | ||||
-rw-r--r-- | Objects/stringlib/fastsearch.h | 40 | ||||
-rw-r--r-- | Objects/stringlib/find.h | 56 | ||||
-rw-r--r-- | Objects/stringlib/partition.h | 83 | ||||
-rw-r--r-- | Objects/stringlib/split.h | 788 | ||||
-rw-r--r-- | Objects/stringlib/stringdefs.h | 2 | ||||
-rw-r--r-- | Objects/stringlib/transmogrify.h | 91 | ||||
-rw-r--r-- | Objects/stringlib/unicodedefs.h | 2 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 420 | ||||
-rw-r--r-- | PC/VS8.0/pythoncore.vcproj | 4 | ||||
-rw-r--r-- | PCbuild/pythoncore.vcproj | 4 |
16 files changed, 1123 insertions, 1418 deletions
diff --git a/Makefile.pre.in b/Makefile.pre.in index 86d077c..638776c 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -582,6 +582,7 @@ BYTESTR_DEPS = \ $(srcdir)/Objects/stringlib/fastsearch.h \ $(srcdir)/Objects/stringlib/find.h \ $(srcdir)/Objects/stringlib/partition.h \ + $(srcdir)/Objects/stringlib/split.h \ $(srcdir)/Objects/stringlib/stringdefs.h \ $(srcdir)/Objects/stringlib/string_format.h \ $(srcdir)/Objects/stringlib/transmogrify.h \ @@ -12,6 +12,10 @@ What's New in Python 3.2 Alpha 1? Core and Builtins ----------------- +- Issue #7622: Improve the split(), rsplit(), splitlines() and replace() + methods of bytes, bytearray and unicode objects by using a common + implementation based on stringlib's fast search. Patch by Florent Xicluna. + - Issue #7632: Fix a crash in dtoa.c that occurred in debug builds when parsing certain long numeric strings corresponding to subnormal values. Also fix a number of bugs in dtoa.c that could lead to diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index 827c47b..823c800 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -1039,14 +1039,16 @@ bytearray_dealloc(PyByteArrayObject *self) #define STRINGLIB_STR PyByteArray_AS_STRING #define STRINGLIB_NEW PyByteArray_FromStringAndSize #define STRINGLIB_EMPTY nullbytes +#define STRINGLIB_ISSPACE Py_ISSPACE +#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) #define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact #define STRINGLIB_MUTABLE 1 -#define FROM_BYTEARRAY 1 #include "stringlib/fastsearch.h" #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/partition.h" +#include "stringlib/split.h" #include "stringlib/ctype.h" #include "stringlib/transmogrify.h" @@ -1054,21 +1056,20 @@ bytearray_dealloc(PyByteArrayObject *self) /* The following Py_LOCAL_INLINE and Py_LOCAL functions were copied from the old char* style string object. */ -Py_LOCAL_INLINE(void) -_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len) -{ - if (*end > len) - *end = len; - else if (*end < 0) - *end += len; - if (*end < 0) - *end = 0; - if (*start < 0) - *start += len; - if (*start < 0) - *start = 0; -} - +/* helper macro to fixup start/end slice values */ +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } Py_LOCAL_INLINE(Py_ssize_t) bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir) @@ -1136,10 +1137,10 @@ bytearray_count(PyByteArrayObject *self, PyObject *args) if (_getbuffer(sub_obj, &vsub) < 0) return NULL; - _adjust_indices(&start, &end, PyByteArray_GET_SIZE(self)); + ADJUST_INDICES(start, end, PyByteArray_GET_SIZE(self)); count_obj = PyLong_FromSsize_t( - stringlib_count(str + start, end - start, vsub.buf, vsub.len) + stringlib_count(str + start, end - start, vsub.buf, vsub.len, PY_SSIZE_T_MAX) ); PyBuffer_Release(&vsub); return count_obj; @@ -1247,7 +1248,7 @@ _bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start if (_getbuffer(substr, &vsubstr) < 0) return -1; - _adjust_indices(&start, &end, len); + ADJUST_INDICES(start, end, len); if (direction < 0) { /* startswith */ @@ -1459,20 +1460,11 @@ bytearray_maketrans(PyObject *null, PyObject *args) } -#define FORWARD 1 -#define REVERSE -1 - /* find and count characters and substrings */ #define findchar(target, target_len, c) \ ((char *)memchr((const void *)(target), c, target_len)) -/* Don't call if length < 2 */ -#define Py_STRING_MATCH(target, offset, pattern, length) \ - (target[offset] == pattern[0] && \ - target[offset+length-1] == pattern[length-1] && \ - !memcmp(target+offset+1, pattern+1, length-2) ) - /* Bytes ops must return a string, create a copy */ Py_LOCAL(PyByteArrayObject *) @@ -1500,93 +1492,6 @@ countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount return count; } -Py_LOCAL(Py_ssize_t) -findstring(const char *target, Py_ssize_t target_len, - const char *pattern, Py_ssize_t pattern_len, - Py_ssize_t start, - Py_ssize_t end, - int direction) -{ - if (start < 0) { - start += target_len; - if (start < 0) - start = 0; - } - if (end > target_len) { - end = target_len; - } else if (end < 0) { - end += target_len; - if (end < 0) - end = 0; - } - - /* zero-length substrings always match at the first attempt */ - if (pattern_len == 0) - return (direction > 0) ? start : end; - - end -= pattern_len; - - if (direction < 0) { - for (; end >= start; end--) - if (Py_STRING_MATCH(target, end, pattern, pattern_len)) - return end; - } else { - for (; start <= end; start++) - if (Py_STRING_MATCH(target, start, pattern, pattern_len)) - return start; - } - return -1; -} - -Py_LOCAL_INLINE(Py_ssize_t) -countstring(const char *target, Py_ssize_t target_len, - const char *pattern, Py_ssize_t pattern_len, - Py_ssize_t start, - Py_ssize_t end, - int direction, Py_ssize_t maxcount) -{ - Py_ssize_t count=0; - - if (start < 0) { - start += target_len; - if (start < 0) - start = 0; - } - if (end > target_len) { - end = target_len; - } else if (end < 0) { - end += target_len; - if (end < 0) - end = 0; - } - - /* zero-length substrings match everywhere */ - if (pattern_len == 0 || maxcount == 0) { - if (target_len+1 < maxcount) - return target_len+1; - return maxcount; - } - - end -= pattern_len; - if (direction < 0) { - for (; (end >= start); end--) - if (Py_STRING_MATCH(target, end, pattern, pattern_len)) { - count++; - if (--maxcount <= 0) break; - end -= pattern_len-1; - } - } else { - for (; (start <= end); start++) - if (Py_STRING_MATCH(target, start, pattern, pattern_len)) { - count++; - if (--maxcount <= 0) - break; - start += pattern_len-1; - } - } - return count; -} - /* Algorithms for different cases of string replacement */ @@ -1708,10 +1613,9 @@ replace_delete_substring(PyByteArrayObject *self, self_len = PyByteArray_GET_SIZE(self); self_s = PyByteArray_AS_STRING(self); - count = countstring(self_s, self_len, - from_s, from_len, - 0, self_len, 1, - maxcount); + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); if (count == 0) { /* no matches */ @@ -1730,9 +1634,9 @@ replace_delete_substring(PyByteArrayObject *self, start = self_s; end = self_s + self_len; while (count-- > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset == -1) break; next = start + offset; @@ -1808,9 +1712,9 @@ replace_substring_in_place(PyByteArrayObject *self, self_s = PyByteArray_AS_STRING(self); self_len = PyByteArray_GET_SIZE(self); - offset = findstring(self_s, self_len, - from_s, from_len, - 0, self_len, FORWARD); + offset = stringlib_find(self_s, self_len, + from_s, from_len, + 0); if (offset == -1) { /* No matches; return the original bytes */ return return_self(self); @@ -1830,9 +1734,9 @@ replace_substring_in_place(PyByteArrayObject *self, end = result_s + self_len; while ( --maxcount > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset==-1) break; Py_MEMCPY(start+offset, to_s, from_len); @@ -1925,9 +1829,10 @@ replace_substring(PyByteArrayObject *self, self_s = PyByteArray_AS_STRING(self); self_len = PyByteArray_GET_SIZE(self); - count = countstring(self_s, self_len, - from_s, from_len, - 0, self_len, FORWARD, maxcount); + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); + if (count == 0) { /* no matches, return unchanged */ return return_self(self); @@ -1954,9 +1859,9 @@ replace_substring(PyByteArrayObject *self, start = self_s; end = self_s + self_len; while (count-- > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset == -1) break; next = start+offset; @@ -2085,123 +1990,6 @@ bytearray_replace(PyByteArrayObject *self, PyObject *args) return res; } - -/* Overallocate the initial list to reduce the number of reallocs for small - split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three - resizes, to sizes 4, 8, then 16. Most observed string splits are for human - text (roughly 11 words per line) and field delimited data (usually 1-10 - fields). For large strings the split algorithms are bandwidth limited - so increasing the preallocation likely will not improve things.*/ - -#define MAX_PREALLOC 12 - -/* 5 splits gives 6 elements */ -#define PREALLOC_SIZE(maxsplit) \ - (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) - -#define SPLIT_APPEND(data, left, right) \ - str = PyByteArray_FromStringAndSize((data) + (left), \ - (right) - (left)); \ - if (str == NULL) \ - goto onError; \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - -#define SPLIT_ADD(data, left, right) { \ - str = PyByteArray_FromStringAndSize((data) + (left), \ - (right) - (left)); \ - if (str == NULL) \ - goto onError; \ - if (count < MAX_PREALLOC) { \ - PyList_SET_ITEM(list, count, str); \ - } else { \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); \ - } \ - count++; } - -/* Always force the list to the expected size. */ -#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count - - -Py_LOCAL_INLINE(PyObject *) -split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) -{ - register Py_ssize_t i, j, count = 0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - i = j = 0; - while ((j < len) && (maxcount-- > 0)) { - for(; j < len; j++) { - /* I found that using memchr makes no difference */ - if (s[j] == ch) { - SPLIT_ADD(s, i, j); - i = j = j + 1; - break; - } - } - } - if (i <= len) { - SPLIT_ADD(s, i, len); - } - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; -} - - -Py_LOCAL_INLINE(PyObject *) -split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount) -{ - register Py_ssize_t i, j, count = 0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - for (i = j = 0; i < len; ) { - /* find a token */ - while (i < len && Py_ISSPACE(s[i])) - i++; - j = i; - while (i < len && !Py_ISSPACE(s[i])) - i++; - if (j < i) { - if (maxcount-- <= 0) - break; - SPLIT_ADD(s, j, i); - while (i < len && Py_ISSPACE(s[i])) - i++; - j = i; - } - } - if (j < len) { - SPLIT_ADD(s, j, len); - } - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; -} - PyDoc_STRVAR(split__doc__, "B.split([sep[, maxsplit]]) -> list of bytearrays\n\ \n\ @@ -2213,10 +2001,10 @@ If maxsplit is given, at most maxsplit splits are done."); static PyObject * bytearray_split(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t len = PyByteArray_GET_SIZE(self), n, i, j, pos; - Py_ssize_t maxsplit = -1, count = 0; + Py_ssize_t len = PyByteArray_GET_SIZE(self), n; + Py_ssize_t maxsplit = -1; const char *s = PyByteArray_AS_STRING(self), *sub; - PyObject *list, *str, *subobj = Py_None; + PyObject *list, *subobj = Py_None; Py_buffer vsub; if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit)) @@ -2225,73 +2013,18 @@ bytearray_split(PyByteArrayObject *self, PyObject *args) maxsplit = PY_SSIZE_T_MAX; if (subobj == Py_None) - return split_whitespace(s, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); if (_getbuffer(subobj, &vsub) < 0) return NULL; sub = vsub.buf; n = vsub.len; - if (n == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - PyBuffer_Release(&vsub); - return NULL; - } - if (n == 1) { - list = split_char(s, len, sub[0], maxsplit); - PyBuffer_Release(&vsub); - return list; - } - - list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) { - PyBuffer_Release(&vsub); - return NULL; - } - - i = j = 0; - while (maxsplit-- > 0) { - pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH); - if (pos < 0) - break; - j = i+pos; - SPLIT_ADD(s, i, j); - i = j + n; - } - SPLIT_ADD(s, i, len); - FIX_PREALLOC_SIZE(list); + list = stringlib_split( + (PyObject*) self, s, len, sub, n, maxsplit + ); PyBuffer_Release(&vsub); return list; - - onError: - Py_DECREF(list); - PyBuffer_Release(&vsub); - return NULL; -} - -/* stringlib's partition shares nullbytes in some cases. - undo this, we don't want the nullbytes to be shared. */ -static PyObject * -make_nullbytes_unique(PyObject *result) -{ - if (result != NULL) { - int i; - assert(PyTuple_Check(result)); - assert(PyTuple_GET_SIZE(result) == 3); - for (i = 0; i < 3; i++) { - if (PyTuple_GET_ITEM(result, i) == (PyObject *)nullbytes) { - PyObject *new = PyByteArray_FromStringAndSize(NULL, 0); - if (new == NULL) { - Py_DECREF(result); - result = NULL; - break; - } - Py_DECREF(nullbytes); - PyTuple_SET_ITEM(result, i, new); - } - } - } - return result; } PyDoc_STRVAR(partition__doc__, @@ -2318,7 +2051,7 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep_obj) ); Py_DECREF(bytesep); - return make_nullbytes_unique(result); + return result; } PyDoc_STRVAR(rpartition__doc__, @@ -2346,81 +2079,7 @@ bytearray_rpartition(PyByteArrayObject *self, PyObject *sep_obj) ); Py_DECREF(bytesep); - return make_nullbytes_unique(result); -} - -Py_LOCAL_INLINE(PyObject *) -rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) -{ - register Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - i = j = len - 1; - while ((i >= 0) && (maxcount-- > 0)) { - for (; i >= 0; i--) { - if (s[i] == ch) { - SPLIT_ADD(s, i + 1, j + 1); - j = i = i - 1; - break; - } - } - } - if (j >= -1) { - SPLIT_ADD(s, 0, j + 1); - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount) -{ - register Py_ssize_t i, j, count = 0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - for (i = j = len - 1; i >= 0; ) { - /* find a token */ - while (i >= 0 && Py_ISSPACE(s[i])) - i--; - j = i; - while (i >= 0 && !Py_ISSPACE(s[i])) - i--; - if (j > i) { - if (maxcount-- <= 0) - break; - SPLIT_ADD(s, i + 1, j + 1); - while (i >= 0 && Py_ISSPACE(s[i])) - i--; - j = i; - } - } - if (j >= 0) { - SPLIT_ADD(s, 0, j + 1); - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - - return list; - - onError: - Py_DECREF(list); - return NULL; + return result; } PyDoc_STRVAR(rsplit__doc__, @@ -2435,10 +2094,10 @@ If maxsplit is given, at most maxsplit splits are done."); static PyObject * bytearray_rsplit(PyByteArrayObject *self, PyObject *args) { - Py_ssize_t len = PyByteArray_GET_SIZE(self), n, j, pos; - Py_ssize_t maxsplit = -1, count = 0; + Py_ssize_t len = PyByteArray_GET_SIZE(self), n; + Py_ssize_t maxsplit = -1; const char *s = PyByteArray_AS_STRING(self), *sub; - PyObject *list, *str, *subobj = Py_None; + PyObject *list, *subobj = Py_None; Py_buffer vsub; if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit)) @@ -2447,50 +2106,18 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *args) maxsplit = PY_SSIZE_T_MAX; if (subobj == Py_None) - return rsplit_whitespace(s, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); if (_getbuffer(subobj, &vsub) < 0) return NULL; sub = vsub.buf; n = vsub.len; - if (n == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - PyBuffer_Release(&vsub); - return NULL; - } - else if (n == 1) { - list = rsplit_char(s, len, sub[0], maxsplit); - PyBuffer_Release(&vsub); - return list; - } - - list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) { - PyBuffer_Release(&vsub); - return NULL; - } - - j = len; - - while (maxsplit-- > 0) { - pos = fastsearch(s, j, sub, n, FAST_RSEARCH); - if (pos < 0) - break; - SPLIT_ADD(s, pos + n, j); - j = pos; - } - SPLIT_ADD(s, 0, j); - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; + list = stringlib_rsplit( + (PyObject*) self, s, len, sub, n, maxsplit + ); PyBuffer_Release(&vsub); return list; - -onError: - Py_DECREF(list); - PyBuffer_Release(&vsub); - return NULL; } PyDoc_STRVAR(reverse__doc__, @@ -2956,6 +2583,27 @@ bytearray_join(PyByteArrayObject *self, PyObject *it) return NULL; } +PyDoc_STRVAR(splitlines__doc__, +"B.splitlines([keepends]) -> list of lines\n\ +\n\ +Return a list of the lines in B, breaking at line boundaries.\n\ +Line breaks are not included in the resulting list unless keepends\n\ +is given and true."); + +static PyObject* +bytearray_splitlines(PyObject *self, PyObject *args) +{ + int keepends = 0; + + if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) + return NULL; + + return stringlib_splitlines( + (PyObject*) self, PyByteArray_AS_STRING(self), + PyByteArray_GET_SIZE(self), keepends + ); +} + PyDoc_STRVAR(fromhex_doc, "bytearray.fromhex(string) -> bytearray (static method)\n\ \n\ @@ -3134,7 +2782,7 @@ bytearray_methods[] = { {"rsplit", (PyCFunction)bytearray_rsplit, METH_VARARGS, rsplit__doc__}, {"rstrip", (PyCFunction)bytearray_rstrip, METH_VARARGS, rstrip__doc__}, {"split", (PyCFunction)bytearray_split, METH_VARARGS, split__doc__}, - {"splitlines", (PyCFunction)stringlib_splitlines, METH_VARARGS, + {"splitlines", (PyCFunction)bytearray_splitlines, METH_VARARGS, splitlines__doc__}, {"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS , startswith__doc__}, diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 680a769..183722c 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -56,7 +56,7 @@ static PyBytesObject *nullstring; If `str' is NULL then PyBytes_FromStringAndSize() will allocate `size+1' bytes (setting the last byte to the null terminating character) and you can fill in the data yourself. If `str' is non-NULL then the resulting - PyString object must be treated as immutable and you must not fill in nor + PyBytes object must be treated as immutable and you must not fill in nor alter the data yourself, since the strings may be shared. The PyObject member `op->ob_size', which denotes the number of "extra @@ -568,9 +568,9 @@ PyBytes_AsStringAndSize(register PyObject *obj, #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/partition.h" +#include "stringlib/split.h" #include "stringlib/ctype.h" -#define STRINGLIB_MUTABLE 0 #include "stringlib/transmogrify.h" PyObject * @@ -1000,133 +1000,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; #define STRIPNAME(i) (stripformat[i]+3) - -/* Don't call if length < 2 */ -#define Py_STRING_MATCH(target, offset, pattern, length) \ - (target[offset] == pattern[0] && \ - target[offset+length-1] == pattern[length-1] && \ - !memcmp(target+offset+1, pattern+1, length-2) ) - - -/* Overallocate the initial list to reduce the number of reallocs for small - split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three - resizes, to sizes 4, 8, then 16. Most observed string splits are for human - text (roughly 11 words per line) and field delimited data (usually 1-10 - fields). For large strings the split algorithms are bandwidth limited - so increasing the preallocation likely will not improve things.*/ - -#define MAX_PREALLOC 12 - -/* 5 splits gives 6 elements */ -#define PREALLOC_SIZE(maxsplit) \ - (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) - -#define SPLIT_ADD(data, left, right) { \ - str = PyBytes_FromStringAndSize((data) + (left), \ - (right) - (left)); \ - if (str == NULL) \ - goto onError; \ - if (count < MAX_PREALLOC) { \ - PyList_SET_ITEM(list, count, str); \ - } else { \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); \ - } \ - count++; } - -/* Always force the list to the expected size. */ -#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count - -#define SKIP_SPACE(s, i, len) { while (i<len && ISSPACE(s[i])) i++; } -#define SKIP_NONSPACE(s, i, len) { while (i<len && !ISSPACE(s[i])) i++; } -#define RSKIP_SPACE(s, i) { while (i>=0 && ISSPACE(s[i])) i--; } -#define RSKIP_NONSPACE(s, i) { while (i>=0 && !ISSPACE(s[i])) i--; } - -Py_LOCAL_INLINE(PyObject *) -split_whitespace(PyBytesObject *self, Py_ssize_t len, Py_ssize_t maxsplit) -{ - const char *s = PyBytes_AS_STRING(self); - Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit)); - - if (list == NULL) - return NULL; - - i = j = 0; - - while (maxsplit-- > 0) { - SKIP_SPACE(s, i, len); - if (i==len) break; - j = i; i++; - SKIP_NONSPACE(s, i, len); - if (j == 0 && i == len && PyBytes_CheckExact(self)) { - /* No whitespace in self, so just use it as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - break; - } - SPLIT_ADD(s, j, i); - } - - if (i < len) { - /* Only occurs when maxsplit was reached */ - /* Skip any remaining whitespace and copy to end of string */ - SKIP_SPACE(s, i, len); - if (i != len) - SPLIT_ADD(s, i, len); - } - FIX_PREALLOC_SIZE(list); - return list; - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -split_char(PyBytesObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount) -{ - const char *s = PyBytes_AS_STRING(self); - register Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - i = j = 0; - while ((j < len) && (maxcount-- > 0)) { - for(; j<len; j++) { - /* I found that using memchr makes no difference */ - if (s[j] == ch) { - SPLIT_ADD(s, i, j); - i = j = j + 1; - break; - } - } - } - if (i == 0 && count == 0 && PyBytes_CheckExact(self)) { - /* ch not in self, so just use self as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - } - else if (i <= len) { - SPLIT_ADD(s, i, len); - } - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; -} - PyDoc_STRVAR(split__doc__, "B.split([sep[, maxsplit]]) -> list of bytes\n\ \n\ @@ -1138,74 +1011,26 @@ If maxsplit is given, at most maxsplit splits are done."); static PyObject * bytes_split(PyBytesObject *self, PyObject *args) { - Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; - Py_ssize_t maxsplit = -1, count=0; + Py_ssize_t len = PyBytes_GET_SIZE(self), n; + Py_ssize_t maxsplit = -1; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; - PyObject *list, *str, *subobj = Py_None; -#ifdef USE_FAST - Py_ssize_t pos; -#endif + PyObject *list, *subobj = Py_None; if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit)) return NULL; if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (subobj == Py_None) - return split_whitespace(self, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); if (_getbuffer(subobj, &vsub) < 0) return NULL; sub = vsub.buf; n = vsub.len; - if (n == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - PyBuffer_Release(&vsub); - return NULL; - } - else if (n == 1) { - list = split_char(self, len, sub[0], maxsplit); - PyBuffer_Release(&vsub); - return list; - } - - list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) { - PyBuffer_Release(&vsub); - return NULL; - } - -#ifdef USE_FAST - i = j = 0; - while (maxsplit-- > 0) { - pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH); - if (pos < 0) - break; - j = i+pos; - SPLIT_ADD(s, i, j); - i = j + n; - } -#else - i = j = 0; - while ((j+n <= len) && (maxsplit-- > 0)) { - for (; j+n <= len; j++) { - if (Py_STRING_MATCH(s, j, sub, n)) { - SPLIT_ADD(s, i, j); - i = j = j + n; - break; - } - } - } -#endif - SPLIT_ADD(s, i, len); - FIX_PREALLOC_SIZE(list); + list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit); PyBuffer_Release(&vsub); return list; - - onError: - Py_DECREF(list); - PyBuffer_Release(&vsub); - return NULL; } PyDoc_STRVAR(partition__doc__, @@ -1263,90 +1088,6 @@ bytes_rpartition(PyBytesObject *self, PyObject *sep_obj) ); } -Py_LOCAL_INLINE(PyObject *) -rsplit_whitespace(PyBytesObject *self, Py_ssize_t len, Py_ssize_t maxsplit) -{ - const char *s = PyBytes_AS_STRING(self); - Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit)); - - if (list == NULL) - return NULL; - - i = j = len-1; - - while (maxsplit-- > 0) { - RSKIP_SPACE(s, i); - if (i<0) break; - j = i; i--; - RSKIP_NONSPACE(s, i); - if (j == len-1 && i < 0 && PyBytes_CheckExact(self)) { - /* No whitespace in self, so just use it as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - break; - } - SPLIT_ADD(s, i + 1, j + 1); - } - if (i >= 0) { - /* Only occurs when maxsplit was reached. Skip any remaining - whitespace and copy to beginning of string. */ - RSKIP_SPACE(s, i); - if (i >= 0) - SPLIT_ADD(s, 0, i + 1); - - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -rsplit_char(PyBytesObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount) -{ - const char *s = PyBytes_AS_STRING(self); - register Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - i = j = len - 1; - while ((i >= 0) && (maxcount-- > 0)) { - for (; i >= 0; i--) { - if (s[i] == ch) { - SPLIT_ADD(s, i + 1, j + 1); - j = i = i - 1; - break; - } - } - } - if (i < 0 && count == 0 && PyBytes_CheckExact(self)) { - /* ch not in self, so just use self as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - } - else if (j >= -1) { - SPLIT_ADD(s, 0, j + 1); - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - PyDoc_STRVAR(rsplit__doc__, "B.rsplit([sep[, maxsplit]]) -> list of bytes\n\ \n\ @@ -1360,71 +1101,28 @@ If maxsplit is given, at most maxsplit splits are done."); static PyObject * bytes_rsplit(PyBytesObject *self, PyObject *args) { - Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; - Py_ssize_t maxsplit = -1, count=0; - const char *s, *sub; + Py_ssize_t len = PyBytes_GET_SIZE(self), n; + Py_ssize_t maxsplit = -1; + const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; - PyObject *list, *str, *subobj = Py_None; + PyObject *list, *subobj = Py_None; if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit)) return NULL; if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (subobj == Py_None) - return rsplit_whitespace(self, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); if (_getbuffer(subobj, &vsub) < 0) return NULL; sub = vsub.buf; n = vsub.len; - if (n == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - PyBuffer_Release(&vsub); - return NULL; - } - else if (n == 1) { - list = rsplit_char(self, len, sub[0], maxsplit); - PyBuffer_Release(&vsub); - return list; - } - - list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) { - PyBuffer_Release(&vsub); - return NULL; - } - - j = len; - i = j - n; - - s = PyBytes_AS_STRING(self); - while ( (i >= 0) && (maxsplit-- > 0) ) { - for (; i>=0; i--) { - if (Py_STRING_MATCH(s, i, sub, n)) { - SPLIT_ADD(s, i + n, j); - j = i; - i -= n; - break; - } - } - } - SPLIT_ADD(s, 0, j); - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; + list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit); PyBuffer_Release(&vsub); return list; - -onError: - Py_DECREF(list); - PyBuffer_Release(&vsub); - return NULL; } -#undef SPLIT_ADD -#undef MAX_PREALLOC -#undef PREALLOC_SIZE - PyDoc_STRVAR(join__doc__, "B.join(iterable_of_bytes) -> bytes\n\ @@ -1531,20 +1229,20 @@ _PyBytes_Join(PyObject *sep, PyObject *x) return bytes_join(sep, x); } -Py_LOCAL_INLINE(void) -bytes_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len) -{ - if (*end > len) - *end = len; - else if (*end < 0) - *end += len; - if (*end < 0) - *end = 0; - if (*start < 0) - *start += len; - if (*start < 0) - *start = 0; -} +/* helper macro to fixup start/end slice values */ +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } Py_LOCAL_INLINE(Py_ssize_t) bytes_find_internal(PyBytesObject *self, PyObject *args, int dir) @@ -1591,7 +1289,7 @@ bytes_find_internal(PyBytesObject *self, PyObject *args, int dir) PyDoc_STRVAR(find__doc__, "B.find(sub[, start[, end]]) -> int\n\ \n\ -Return the lowest index in S where substring sub is found,\n\ +Return the lowest index in B where substring sub is found,\n\ such that sub is contained within s[start:end]. Optional\n\ arguments start and end are interpreted as in slice notation.\n\ \n\ @@ -1801,7 +1499,7 @@ PyDoc_STRVAR(count__doc__, "B.count(sub[, start[, end]]) -> int\n\ \n\ Return the number of non-overlapping occurrences of substring sub in\n\ -string S[start:end]. Optional arguments start and end are interpreted\n\ +string B[start:end]. Optional arguments start and end are interpreted\n\ as in slice notation."); static PyObject * @@ -1823,10 +1521,10 @@ bytes_count(PyBytesObject *self, PyObject *args) else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len)) return NULL; - bytes_adjust_indices(&start, &end, PyBytes_GET_SIZE(self)); + ADJUST_INDICES(start, end, PyBytes_GET_SIZE(self)); return PyLong_FromSsize_t( - stringlib_count(str + start, end - start, sub, sub_len) + stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX) ); } @@ -1943,9 +1641,6 @@ bytes_maketrans(PyObject *null, PyObject *args) return _Py_bytes_maketrans(args); } -#define FORWARD 1 -#define REVERSE -1 - /* find and count characters and substrings */ #define findchar(target, target_len, c) \ @@ -1981,94 +1676,6 @@ countchar(const char *target, int target_len, char c, Py_ssize_t maxcount) return count; } -Py_LOCAL(Py_ssize_t) -findstring(const char *target, Py_ssize_t target_len, - const char *pattern, Py_ssize_t pattern_len, - Py_ssize_t start, - Py_ssize_t end, - int direction) -{ - if (start < 0) { - start += target_len; - if (start < 0) - start = 0; - } - if (end > target_len) { - end = target_len; - } else if (end < 0) { - end += target_len; - if (end < 0) - end = 0; - } - - /* zero-length substrings always match at the first attempt */ - if (pattern_len == 0) - return (direction > 0) ? start : end; - - end -= pattern_len; - - if (direction < 0) { - for (; end >= start; end--) - if (Py_STRING_MATCH(target, end, pattern, pattern_len)) - return end; - } else { - for (; start <= end; start++) - if (Py_STRING_MATCH(target, start,pattern,pattern_len)) - return start; - } - return -1; -} - -Py_LOCAL_INLINE(Py_ssize_t) -countstring(const char *target, Py_ssize_t target_len, - const char *pattern, Py_ssize_t pattern_len, - Py_ssize_t start, - Py_ssize_t end, - int direction, Py_ssize_t maxcount) -{ - Py_ssize_t count=0; - - if (start < 0) { - start += target_len; - if (start < 0) - start = 0; - } - if (end > target_len) { - end = target_len; - } else if (end < 0) { - end += target_len; - if (end < 0) - end = 0; - } - - /* zero-length substrings match everywhere */ - if (pattern_len == 0 || maxcount == 0) { - if (target_len+1 < maxcount) - return target_len+1; - return maxcount; - } - - end -= pattern_len; - if (direction < 0) { - for (; (end >= start); end--) - if (Py_STRING_MATCH(target, end,pattern,pattern_len)) { - count++; - if (--maxcount <= 0) break; - end -= pattern_len-1; - } - } else { - for (; (start <= end); start++) - if (Py_STRING_MATCH(target, start, - pattern, pattern_len)) { - count++; - if (--maxcount <= 0) - break; - start += pattern_len-1; - } - } - return count; -} - /* Algorithms for different cases of string replacement */ @@ -2189,10 +1796,9 @@ replace_delete_substring(PyBytesObject *self, self_len = PyBytes_GET_SIZE(self); self_s = PyBytes_AS_STRING(self); - count = countstring(self_s, self_len, - from_s, from_len, - 0, self_len, 1, - maxcount); + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); if (count == 0) { /* no matches */ @@ -2211,9 +1817,9 @@ replace_delete_substring(PyBytesObject *self, start = self_s; end = self_s + self_len; while (count-- > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset == -1) break; next = start + offset; @@ -2289,9 +1895,9 @@ replace_substring_in_place(PyBytesObject *self, self_s = PyBytes_AS_STRING(self); self_len = PyBytes_GET_SIZE(self); - offset = findstring(self_s, self_len, - from_s, from_len, - 0, self_len, FORWARD); + offset = stringlib_find(self_s, self_len, + from_s, from_len, + 0); if (offset == -1) { /* No matches; return the original string */ return return_self(self); @@ -2311,9 +1917,9 @@ replace_substring_in_place(PyBytesObject *self, end = result_s + self_len; while ( --maxcount > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset==-1) break; Py_MEMCPY(start+offset, to_s, from_len); @@ -2407,9 +2013,10 @@ replace_substring(PyBytesObject *self, self_s = PyBytes_AS_STRING(self); self_len = PyBytes_GET_SIZE(self); - count = countstring(self_s, self_len, - from_s, from_len, - 0, self_len, FORWARD, maxcount); + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); + if (count == 0) { /* no matches, return unchanged */ return return_self(self); @@ -2438,9 +2045,9 @@ replace_substring(PyBytesObject *self, start = self_s; end = self_s + self_len; while (count-- > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset == -1) break; next = start+offset; @@ -2598,7 +2205,7 @@ _bytes_tailmatch(PyBytesObject *self, PyObject *substr, Py_ssize_t start, return -1; str = PyBytes_AS_STRING(self); - bytes_adjust_indices(&start, &end, len); + ADJUST_INDICES(start, end, len); if (direction < 0) { /* startswith */ @@ -2703,7 +2310,7 @@ bytes_endswith(PyBytesObject *self, PyObject *args) PyDoc_STRVAR(decode__doc__, "B.decode([encoding[, errors]]) -> str\n\ \n\ -Decode S using the codec registered for encoding. encoding defaults\n\ +Decode B using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ @@ -2725,6 +2332,28 @@ bytes_decode(PyObject *self, PyObject *args, PyObject *kwargs) } +PyDoc_STRVAR(splitlines__doc__, +"B.splitlines([keepends]) -> list of lines\n\ +\n\ +Return a list of the lines in B, breaking at line boundaries.\n\ +Line breaks are not included in the resulting list unless keepends\n\ +is given and true."); + +static PyObject* +bytes_splitlines(PyObject *self, PyObject *args) +{ + int keepends = 0; + + if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) + return NULL; + + return stringlib_splitlines( + (PyObject*) self, PyBytes_AS_STRING(self), + PyBytes_GET_SIZE(self), keepends + ); +} + + PyDoc_STRVAR(fromhex_doc, "bytes.fromhex(string) -> bytes\n\ \n\ @@ -2857,7 +2486,7 @@ bytes_methods[] = { {"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS, rsplit__doc__}, {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__}, {"split", (PyCFunction)bytes_split, METH_VARARGS, split__doc__}, - {"splitlines", (PyCFunction)stringlib_splitlines, METH_VARARGS, + {"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS, splitlines__doc__}, {"startswith", (PyCFunction)bytes_startswith, METH_VARARGS, startswith__doc__}, @@ -3239,7 +2868,7 @@ _PyBytes_Resize(PyObject **pv, Py_ssize_t newsize) /* _PyBytes_FormatLong emulates the format codes d, u, o, x and X, and * the F_ALT flag, for Python's long (unbounded) ints. It's not used for * Python's regular ints. - * Return value: a new PyString*, or NULL if error. + * Return value: a new PyBytes*, or NULL if error. * . *pbuf is set to point into it, * *plen set to the # of chars following that. * Caller must decref it when done using pbuf. diff --git a/Objects/stringlib/count.h b/Objects/stringlib/count.h index eba37e9..de34f96 100644 --- a/Objects/stringlib/count.h +++ b/Objects/stringlib/count.h @@ -9,28 +9,22 @@ Py_LOCAL_INLINE(Py_ssize_t) stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sub, Py_ssize_t sub_len) + const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, + Py_ssize_t maxcount) { Py_ssize_t count; if (str_len < 0) return 0; /* start > len(str) */ if (sub_len == 0) - return str_len + 1; + return (str_len < maxcount) ? str_len + 1 : maxcount; - count = fastsearch(str, str_len, sub, sub_len, FAST_COUNT); + count = fastsearch(str, str_len, sub, sub_len, maxcount, FAST_COUNT); if (count < 0) - count = 0; /* no match */ + return 0; /* no match */ return count; } #endif - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h index 8951276..739cf3d 100644 --- a/Objects/stringlib/ctype.h +++ b/Objects/stringlib/ctype.h @@ -107,4 +107,3 @@ stringlib_swapcase(PyObject *self) STRINGLIB_LEN(self)); return newobj; } - diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 7b9dd47..21cf3a2 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -18,10 +18,13 @@ #define FAST_SEARCH 1 #define FAST_RSEARCH 2 +#define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1))))) +#define BLOOM(mask, ch) ((mask & (1 << ((ch) & (LONG_BIT - 1))))) + Py_LOCAL_INLINE(Py_ssize_t) fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, - int mode) + Py_ssize_t maxcount, int mode) { long mask; Py_ssize_t skip, count = 0; @@ -29,7 +32,7 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, w = n - m; - if (w < 0) + if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) return -1; /* look for special cases */ @@ -39,8 +42,11 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, /* use special case for 1-character strings */ if (mode == FAST_COUNT) { for (i = 0; i < n; i++) - if (s[i] == p[0]) + if (s[i] == p[0]) { count++; + if (count == maxcount) + return maxcount; + } return count; } else if (mode == FAST_SEARCH) { for (i = 0; i < n; i++) @@ -56,19 +62,20 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, mlast = m - 1; skip = mlast - 1; + mask = 0; if (mode != FAST_RSEARCH) { /* create compressed boyer-moore delta 1 table */ /* process pattern[:-1] */ - for (mask = i = 0; i < mlast; i++) { - mask |= (1 << (p[i] & 0x1F)); + for (i = 0; i < mlast; i++) { + BLOOM_ADD(mask, p[i]); if (p[i] == p[mlast]) skip = mlast - i - 1; } /* process pattern[-1] outside the loop */ - mask |= (1 << (p[mlast] & 0x1F)); + BLOOM_ADD(mask, p[mlast]); for (i = 0; i <= w; i++) { /* note: using mlast in the skip path slows things down on x86 */ @@ -82,17 +89,19 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, if (mode != FAST_COUNT) return i; count++; + if (count == maxcount) + return maxcount; i = i + mlast; continue; } /* miss: check if next character is part of pattern */ - if (!(mask & (1 << (s[i+m] & 0x1F)))) + if (!BLOOM(mask, s[i+m])) i = i + m; else i = i + skip; } else { /* skip: check if next character is part of pattern */ - if (!(mask & (1 << (s[i+m] & 0x1F)))) + if (!BLOOM(mask, s[i+m])) i = i + m; } } @@ -101,10 +110,10 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, /* create compressed boyer-moore delta 1 table */ /* process pattern[0] outside the loop */ - mask = (1 << (p[0] & 0x1F)); + BLOOM_ADD(mask, p[0]); /* process pattern[:0:-1] */ for (i = mlast; i > 0; i--) { - mask |= (1 << (p[i] & 0x1F)); + BLOOM_ADD(mask, p[i]); if (p[i] == p[0]) skip = i - 1; } @@ -119,13 +128,13 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, /* got a match! */ return i; /* miss: check if previous character is part of pattern */ - if (!(mask & (1 << (s[i-1] & 0x1F)))) + if (!BLOOM(mask, s[i-1])) i = i - m; else i = i - skip; } else { /* skip: check if previous character is part of pattern */ - if (!(mask & (1 << (s[i-1] & 0x1F)))) + if (!BLOOM(mask, s[i-1])) i = i - m; } } @@ -137,10 +146,3 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, } #endif - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h index bf27d6a..f915296c 100644 --- a/Objects/stringlib/find.h +++ b/Objects/stringlib/find.h @@ -19,7 +19,7 @@ stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len, if (sub_len == 0) return offset; - pos = fastsearch(str, str_len, sub, sub_len, FAST_SEARCH); + pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_SEARCH); if (pos >= 0) pos += offset; @@ -39,7 +39,7 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len, if (sub_len == 0) return str_len + offset; - pos = fastsearch(str, str_len, sub, sub_len, FAST_RSEARCH); + pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_RSEARCH); if (pos >= 0) pos += offset; @@ -47,22 +47,27 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len, return pos; } +/* helper macro to fixup start/end slice values */ +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } + Py_LOCAL_INLINE(Py_ssize_t) stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end) { - if (start < 0) - start += str_len; - if (start < 0) - start = 0; - if (end > str_len) - end = str_len; - if (end < 0) - end += str_len; - if (end < 0) - end = 0; - + ADJUST_INDICES(start, end, str_len); return stringlib_find(str + start, end - start, sub, sub_len, start); } @@ -71,17 +76,7 @@ stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end) { - if (start < 0) - start += str_len; - if (start < 0) - start = 0; - if (end > str_len) - end = str_len; - if (end < 0) - end += str_len; - if (end < 0) - end = 0; - + ADJUST_INDICES(start, end, str_len); return stringlib_rfind(str + start, end - start, sub, sub_len, start); } @@ -96,9 +91,9 @@ stringlib_contains_obj(PyObject* str, PyObject* sub) ) != -1; } -#endif /* STRINGLIB_STR */ +#endif /* STRINGLIB_WANT_CONTAINS_OBJ */ -#ifdef FROM_UNICODE +#if STRINGLIB_IS_UNICODE /* This function is a helper for the "find" family (find, rfind, index, @@ -146,13 +141,6 @@ _ParseTupleFinds (PyObject *args, PyObject **substring, return 1; } -#endif /* FROM_UNICODE */ +#endif /* STRINGLIB_IS_UNICODE */ #endif /* STRINGLIB_FIND_H */ - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/partition.h b/Objects/stringlib/partition.h index 2f26212..0170bdd 100644 --- a/Objects/stringlib/partition.h +++ b/Objects/stringlib/partition.h @@ -8,33 +8,39 @@ #endif Py_LOCAL_INLINE(PyObject*) -stringlib_partition( - PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len - ) +stringlib_partition(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + PyObject* sep_obj, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) { PyObject* out; Py_ssize_t pos; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; + return NULL; } out = PyTuple_New(3); if (!out) - return NULL; + return NULL; - pos = fastsearch(str, str_len, sep, sep_len, FAST_SEARCH); + pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_SEARCH); if (pos < 0) { - Py_INCREF(str_obj); - PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY); - return out; +#if STRINGLIB_MUTABLE + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len)); + PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0)); +#else + Py_INCREF(str_obj); + PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY); +#endif + return out; } PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); @@ -44,41 +50,47 @@ stringlib_partition( PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; + Py_DECREF(out); + return NULL; } return out; } Py_LOCAL_INLINE(PyObject*) -stringlib_rpartition( - PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len - ) +stringlib_rpartition(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + PyObject* sep_obj, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) { PyObject* out; Py_ssize_t pos; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; + return NULL; } out = PyTuple_New(3); if (!out) - return NULL; + return NULL; - pos = fastsearch(str, str_len, sep, sep_len, FAST_RSEARCH); + pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_RSEARCH); if (pos < 0) { - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY); - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); - Py_INCREF(str_obj); - PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj); - return out; +#if STRINGLIB_MUTABLE + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len)); +#else + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY); + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); + Py_INCREF(str_obj); + PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj); +#endif + return out; } PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); @@ -88,18 +100,11 @@ stringlib_rpartition( PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; + Py_DECREF(out); + return NULL; } return out; } #endif - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h new file mode 100644 index 0000000..00b29b3 --- /dev/null +++ b/Objects/stringlib/split.h @@ -0,0 +1,788 @@ +/* stringlib: split implementation */ + +#ifndef STRINGLIB_SPLIT_H +#define STRINGLIB_SPLIT_H + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +/* Overallocate the initial list to reduce the number of reallocs for small + split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three + resizes, to sizes 4, 8, then 16. Most observed string splits are for human + text (roughly 11 words per line) and field delimited data (usually 1-10 + fields). For large strings the split algorithms are bandwidth limited + so increasing the preallocation likely will not improve things.*/ + +#define MAX_PREALLOC 12 + +/* 5 splits gives 6 elements */ +#define PREALLOC_SIZE(maxsplit) \ + (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) + +#define SPLIT_APPEND(data, left, right) \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); + +#define SPLIT_ADD(data, left, right) { \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (count < MAX_PREALLOC) { \ + PyList_SET_ITEM(list, count, sub); \ + } else { \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); \ + } \ + count++; } + + +/* Always force the list to the expected size. */ +#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_whitespace(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i == str_len) break; + j = i; i++; + while (i < str_len && !STRINGLIB_ISSPACE(str[i])) + i++; +#ifndef STRINGLIB_MUTABLE + if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, j, i); + } + + if (i < str_len) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to end of string */ + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i != str_len) + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_char(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while ((j < str_len) && (maxcount-- > 0)) { + for(; j < str_len; j++) { + /* I found that using memchr makes no difference */ + if (str[j] == ch) { + SPLIT_ADD(str, i, j); + i = j = j + 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (i <= str_len) { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + if (pos < 0) + break; + j = i + pos; + SPLIT_ADD(str, i, j); + i = j + sep_len; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_whitespace(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while (maxcount-- > 0) { + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i < 0) break; + j = i; i--; + while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) + i--; +#ifndef STRINGLIB_MUTABLE + if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, i + 1, j + 1); + } + + if (i >= 0) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to beginning of string */ + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i >= 0) + SPLIT_ADD(str, 0, i + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_char(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while ((i >= 0) && (maxcount-- > 0)) { + for(; i >= 0; i--) { + if (str[i] == ch) { + SPLIT_ADD(str, i + 1, j + 1); + j = i = i - 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (j >= -1) { + SPLIT_ADD(str, 0, j + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + j = str_len; + while (maxcount-- > 0) { + pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); + if (pos < 0) + break; + SPLIT_ADD(str, pos + sep_len, j); + j = pos; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, 0, j); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_splitlines(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + int keepends) +{ + /* This does not use the preallocated list because splitlines is + usually run with hundreds of newlines. The overhead of + switching between PyList_SET_ITEM and append causes about a + 2-3% slowdown for that common case. A smarter implementation + could move the if check out, so the SET_ITEMs are done first + and the appends only done when the prealloc buffer is full. + That's too much work for little gain.*/ + + register Py_ssize_t i; + register Py_ssize_t j; + PyObject *list = PyList_New(0); + PyObject *sub; + + if (list == NULL) + return NULL; + + for (i = j = 0; i < str_len; ) { + Py_ssize_t eol; + + /* Find a line and append it */ + while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) + i++; + + /* Skip the line break reading CRLF as one line break */ + eol = i; + if (i < str_len) { + if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') + i += 2; + else + i++; + if (keepends) + eol = i; + } +#ifndef STRINGLIB_MUTABLE + if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No linebreak in str_obj, so just use it as list[0] */ + if (PyList_Append(list, str_obj)) + goto onError; + break; + } +#endif + SPLIT_APPEND(str, j, eol); + j = i; + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +#endif +/* stringlib: split implementation */ + +#ifndef STRINGLIB_SPLIT_H +#define STRINGLIB_SPLIT_H + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +/* Overallocate the initial list to reduce the number of reallocs for small + split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three + resizes, to sizes 4, 8, then 16. Most observed string splits are for human + text (roughly 11 words per line) and field delimited data (usually 1-10 + fields). For large strings the split algorithms are bandwidth limited + so increasing the preallocation likely will not improve things.*/ + +#define MAX_PREALLOC 12 + +/* 5 splits gives 6 elements */ +#define PREALLOC_SIZE(maxsplit) \ + (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) + +#define SPLIT_APPEND(data, left, right) \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); + +#define SPLIT_ADD(data, left, right) { \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (count < MAX_PREALLOC) { \ + PyList_SET_ITEM(list, count, sub); \ + } else { \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); \ + } \ + count++; } + + +/* Always force the list to the expected size. */ +#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_whitespace(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i == str_len) break; + j = i; i++; + while (i < str_len && !STRINGLIB_ISSPACE(str[i])) + i++; +#ifndef STRINGLIB_MUTABLE + if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, j, i); + } + + if (i < str_len) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to end of string */ + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i != str_len) + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_char(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while ((j < str_len) && (maxcount-- > 0)) { + for(; j < str_len; j++) { + /* I found that using memchr makes no difference */ + if (str[j] == ch) { + SPLIT_ADD(str, i, j); + i = j = j + 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (i <= str_len) { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + if (pos < 0) + break; + j = i + pos; + SPLIT_ADD(str, i, j); + i = j + sep_len; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_whitespace(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while (maxcount-- > 0) { + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i < 0) break; + j = i; i--; + while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) + i--; +#ifndef STRINGLIB_MUTABLE + if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, i + 1, j + 1); + } + + if (i >= 0) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to beginning of string */ + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i >= 0) + SPLIT_ADD(str, 0, i + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_char(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while ((i >= 0) && (maxcount-- > 0)) { + for(; i >= 0; i--) { + if (str[i] == ch) { + SPLIT_ADD(str, i + 1, j + 1); + j = i = i - 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (j >= -1) { + SPLIT_ADD(str, 0, j + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + j = str_len; + while (maxcount-- > 0) { + pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); + if (pos < 0) + break; + SPLIT_ADD(str, pos + sep_len, j); + j = pos; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, 0, j); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_splitlines(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + int keepends) +{ + /* This does not use the preallocated list because splitlines is + usually run with hundreds of newlines. The overhead of + switching between PyList_SET_ITEM and append causes about a + 2-3% slowdown for that common case. A smarter implementation + could move the if check out, so the SET_ITEMs are done first + and the appends only done when the prealloc buffer is full. + That's too much work for little gain.*/ + + register Py_ssize_t i; + register Py_ssize_t j; + PyObject *list = PyList_New(0); + PyObject *sub; + + if (list == NULL) + return NULL; + + for (i = j = 0; i < str_len; ) { + Py_ssize_t eol; + + /* Find a line and append it */ + while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) + i++; + + /* Skip the line break reading CRLF as one line break */ + eol = i; + if (i < str_len) { + if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') + i += 2; + else + i++; + if (keepends) + eol = i; + } +#ifndef STRINGLIB_MUTABLE + if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No linebreak in str_obj, so just use it as list[0] */ + if (PyList_Append(list, str_obj)) + goto onError; + break; + } +#endif + SPLIT_APPEND(str, j, eol); + j = i; + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +#endif diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index 8a650fe..1c49426 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -11,6 +11,8 @@ #define STRINGLIB_TYPE_NAME "string" #define STRINGLIB_PARSE_CODE "S" #define STRINGLIB_EMPTY nullstring +#define STRINGLIB_ISSPACE Py_ISSPACE +#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) #define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9')) #define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1) #define STRINGLIB_TOUPPER Py_TOUPPER diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h index 7dc8177..1e132e5 100644 --- a/Objects/stringlib/transmogrify.h +++ b/Objects/stringlib/transmogrify.h @@ -1,13 +1,6 @@ /* NOTE: this API is -ONLY- for use with single byte character strings. */ /* Do not use it with Unicode. */ -#include "bytes_methods.h" - -#ifndef STRINGLIB_MUTABLE -#warning "STRINGLIB_MUTABLE not defined before #include, assuming 0" -#define STRINGLIB_MUTABLE 0 -#endif - /* the more complicated methods. parts of these should be pulled out into the shared code in bytes_methods.c to cut down on duplicate code bloat. */ @@ -269,87 +262,3 @@ stringlib_zfill(PyObject *self, PyObject *args) return (PyObject*) s; } - - -#define _STRINGLIB_SPLIT_APPEND(data, left, right) \ - str = STRINGLIB_NEW((data) + (left), \ - (right) - (left)); \ - if (str == NULL) \ - goto onError; \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - -PyDoc_STRVAR(splitlines__doc__, -"B.splitlines([keepends]) -> list of lines\n\ -\n\ -Return a list of the lines in B, breaking at line boundaries.\n\ -Line breaks are not included in the resulting list unless keepends\n\ -is given and true."); - -static PyObject* -stringlib_splitlines(PyObject *self, PyObject *args) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len; - int keepends = 0; - PyObject *list; - PyObject *str; - char *data; - - if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) - return NULL; - - data = STRINGLIB_STR(self); - len = STRINGLIB_LEN(self); - - /* This does not use the preallocated list because splitlines is - usually run with hundreds of newlines. The overhead of - switching between PyList_SET_ITEM and append causes about a - 2-3% slowdown for that common case. A smarter implementation - could move the if check out, so the SET_ITEMs are done first - and the appends only done when the prealloc buffer is full. - That's too much work for little gain.*/ - - list = PyList_New(0); - if (!list) - goto onError; - - for (i = j = 0; i < len; ) { - Py_ssize_t eol; - - /* Find a line and append it */ - while (i < len && data[i] != '\n' && data[i] != '\r') - i++; - - /* Skip the line break reading CRLF as one line break */ - eol = i; - if (i < len) { - if (data[i] == '\r' && i + 1 < len && - data[i+1] == '\n') - i += 2; - else - i++; - if (keepends) - eol = i; - } - _STRINGLIB_SPLIT_APPEND(data, j, eol); - j = i; - } - if (j < len) { - _STRINGLIB_SPLIT_APPEND(data, j, len); - } - - return list; - - onError: - Py_XDECREF(list); - return NULL; -} - -#undef _STRINGLIB_SPLIT_APPEND - diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h index a4d2144..09dae6d 100644 --- a/Objects/stringlib/unicodedefs.h +++ b/Objects/stringlib/unicodedefs.h @@ -11,6 +11,8 @@ #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 40377ea..3d49db1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -210,7 +210,8 @@ PyUnicode_GetMax(void) static BLOOM_MASK bloom_linebreak; -#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) +#define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1))))) +#define BLOOM(mask, ch) ((mask & (1 << ((ch) & (LONG_BIT - 1))))) #define BLOOM_LINEBREAK(ch) \ ((ch) < 128U ? ascii_linebreak[(ch)] : \ @@ -225,7 +226,7 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) mask = 0; for (i = 0; i < len; i++) - mask |= (1 << (ptr[i] & 0x1F)); + BLOOM_ADD(mask, ptr[i]); return mask; } @@ -5873,28 +5874,30 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, #include "stringlib/unicodedefs.h" #include "stringlib/fastsearch.h" + #include "stringlib/count.h" -/* Include _ParseTupleFinds from find.h */ -#define FROM_UNICODE #include "stringlib/find.h" #include "stringlib/partition.h" +#include "stringlib/split.h" #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale #include "stringlib/localeutil.h" /* helper macro to fixup start/end slice values */ -#define FIX_START_END(obj) \ - if (start < 0) \ - start += (obj)->length; \ - if (start < 0) \ - start = 0; \ - if (end > (obj)->length) \ - end = (obj)->length; \ - if (end < 0) \ - end += (obj)->length; \ - if (end < 0) \ - end = 0; +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } Py_ssize_t PyUnicode_Count(PyObject *str, PyObject *substr, @@ -5914,10 +5917,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str, return -1; } - FIX_START_END(str_obj); - + ADJUST_INDICES(start, end, str_obj->length); result = stringlib_count( - str_obj->str + start, end - start, sub_obj->str, sub_obj->length + str_obj->str + start, end - start, sub_obj->str, sub_obj->length, + PY_SSIZE_T_MAX ); Py_DECREF(sub_obj); @@ -5972,8 +5975,7 @@ int tailmatch(PyUnicodeObject *self, if (substring->length == 0) return 1; - FIX_START_END(self); - + ADJUST_INDICES(start, end, self->length); end -= substring->length; if (end < start) return 0; @@ -6314,305 +6316,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self, return u; } -#define SPLIT_APPEND(data, left, right) \ - str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ - if (!str) \ - goto onError; \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - -static -PyObject *split_whitespace(PyUnicodeObject *self, - PyObject *list, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = 0; i < len; ) { - /* find a token */ - while (i < len && Py_UNICODE_ISSPACE(buf[i])) - i++; - j = i; - while (i < len && !Py_UNICODE_ISSPACE(buf[i])) - i++; - if (j < i) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, j, i); - while (i < len && Py_UNICODE_ISSPACE(buf[i])) - i++; - j = i; - } - } - if (j < len) { - SPLIT_APPEND(buf, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -PyObject *PyUnicode_Splitlines(PyObject *string, - int keepends) +PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) { - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len; PyObject *list; - PyObject *str; - Py_UNICODE *data; string = PyUnicode_FromObject(string); if (string == NULL) return NULL; - data = PyUnicode_AS_UNICODE(string); - len = PyUnicode_GET_SIZE(string); - list = PyList_New(0); - if (!list) - goto onError; - - for (i = j = 0; i < len; ) { - Py_ssize_t eol; - - /* Find a line and append it */ - while (i < len && !BLOOM_LINEBREAK(data[i])) - i++; - - /* Skip the line break reading CRLF as one line break */ - eol = i; - if (i < len) { - if (data[i] == '\r' && i + 1 < len && - data[i+1] == '\n') - i += 2; - else - i++; - if (keepends) - eol = i; - } - SPLIT_APPEND(data, j, eol); - j = i; - } - if (j < len) { - SPLIT_APPEND(data, j, len); - } + list = stringlib_splitlines( + (PyObject*) string, PyUnicode_AS_UNICODE(string), + PyUnicode_GET_SIZE(string), keepends); Py_DECREF(string); return list; - - onError: - Py_XDECREF(list); - Py_DECREF(string); - return NULL; } static -PyObject *split_char(PyUnicodeObject *self, - PyObject *list, - Py_UNICODE ch, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = 0; i < len; ) { - if (buf[i] == ch) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, j, i); - i = j = i + 1; - } else - i++; - } - if (j <= len) { - SPLIT_APPEND(buf, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *split_substring(PyUnicodeObject *self, - PyObject *list, - PyUnicodeObject *substring, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - Py_ssize_t sublen = substring->length; - PyObject *str; - - for (i = j = 0; i <= len - sublen; ) { - if (Py_UNICODE_MATCH(self, i, substring)) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(self->str, j, i); - i = j = i + sublen; - } else - i++; - } - if (j <= len) { - SPLIT_APPEND(self->str, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_whitespace(PyUnicodeObject *self, - PyObject *list, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = len - 1; i >= 0; ) { - /* find a token */ - while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) - i--; - j = i; - while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) - i--; - if (j > i) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, i + 1, j + 1); - while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) - i--; - j = i; - } - } - if (j >= 0) { - SPLIT_APPEND(buf, 0, j + 1); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_char(PyUnicodeObject *self, - PyObject *list, - Py_UNICODE ch, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = len - 1; i >= 0; ) { - if (buf[i] == ch) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, i + 1, j + 1); - j = i = i - 1; - } else - i--; - } - if (j >= -1) { - SPLIT_APPEND(buf, 0, j + 1); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_substring(PyUnicodeObject *self, - PyObject *list, - PyUnicodeObject *substring, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - Py_ssize_t sublen = substring->length; - PyObject *str; - - for (i = len - sublen, j = len; i >= 0; ) { - if (Py_UNICODE_MATCH(self, i, substring)) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(self->str, i + sublen, j); - j = i; - i -= sublen; - } else - i--; - } - if (j >= 0) { - SPLIT_APPEND(self->str, 0, j); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -#undef SPLIT_APPEND - -static PyObject *split(PyUnicodeObject *self, PyUnicodeObject *substring, Py_ssize_t maxcount) { - PyObject *list; - if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - list = PyList_New(0); - if (!list) - return NULL; - if (substring == NULL) - return split_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return split_char(self,list,substring->str[0],maxcount); + return stringlib_split_whitespace( + (PyObject*) self, self->str, self->length, maxcount + ); - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else - return split_substring(self,list,substring,maxcount); + return stringlib_split( + (PyObject*) self, self->str, self->length, + substring->str, substring->length, + maxcount + ); } static @@ -6620,28 +6357,19 @@ PyObject *rsplit(PyUnicodeObject *self, PyUnicodeObject *substring, Py_ssize_t maxcount) { - PyObject *list; - if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - list = PyList_New(0); - if (!list) - return NULL; - if (substring == NULL) - return rsplit_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return rsplit_char(self,list,substring->str[0],maxcount); + return stringlib_rsplit_whitespace( + (PyObject*) self, self->str, self->length, maxcount + ); - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else - return rsplit_substring(self,list,substring,maxcount); + return stringlib_rsplit( + (PyObject*) self, self->str, self->length, + substring->str, substring->length, + maxcount + ); } static @@ -6654,9 +6382,13 @@ PyObject *replace(PyUnicodeObject *self, if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; + else if (maxcount == 0 || self->length == 0) + goto nothing; if (str1->length == str2->length) { /* same length */ + if (str1->length == 0) + goto nothing; Py_ssize_t i; if (str1->length == 1) { /* replace characters */ @@ -6676,8 +6408,8 @@ PyObject *replace(PyUnicodeObject *self, u->str[i] = u2; } } else { - i = fastsearch( - self->str, self->length, str1->str, str1->length, FAST_SEARCH + i = stringlib_find( + self->str, self->length, str1->str, str1->length, 0 ); if (i < 0) goto nothing; @@ -6685,14 +6417,20 @@ PyObject *replace(PyUnicodeObject *self, if (!u) return NULL; Py_UNICODE_COPY(u->str, self->str, self->length); - while (i <= self->length - str1->length) - if (Py_UNICODE_MATCH(self, i, str1)) { - if (--maxcount < 0) - break; - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; - } else - i++; + + /* change everything in-place, starting with this one */ + Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + i += str1->length; + + while ( --maxcount > 0) { + i = stringlib_find(self->str+i, self->length-i, + str1->str, str1->length, + i); + if (i == -1) + break; + Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + i += str1->length; + } } } else { @@ -6701,9 +6439,8 @@ PyObject *replace(PyUnicodeObject *self, Py_UNICODE *p; /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length); - if (n > maxcount) - n = maxcount; + n = stringlib_count(self->str, self->length, str1->str, str1->length, + maxcount); if (n == 0) goto nothing; /* new_size = self->length + n * (str2->length - str1->length)); */ @@ -6733,15 +6470,12 @@ PyObject *replace(PyUnicodeObject *self, if (str1->length > 0) { while (n-- > 0) { /* look for next match */ - j = i; - while (j <= e) { - if (Py_UNICODE_MATCH(self, j, str1)) - break; - j++; - } - if (j > i) { - if (j > e) - break; + j = stringlib_find(self->str+i, self->length-i, + str1->str, str1->length, + i); + if (j == -1) + break; + else if (j > i) { /* copy unchanged part [i:j] */ Py_UNICODE_COPY(p, self->str+i, j-i); p += j - i; @@ -7192,11 +6926,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args) if (substring == NULL) return NULL; - FIX_START_END(self); - + ADJUST_INDICES(start, end, self->length); result = PyLong_FromSsize_t( stringlib_count(self->str + start, end - start, - substring->str, substring->length) + substring->str, substring->length, + PY_SSIZE_T_MAX) ); Py_DECREF(substring); @@ -10066,11 +9800,3 @@ Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) #ifdef __cplusplus } #endif - - -/* - Local variables: - c-basic-offset: 4 - indent-tabs-mode: nil - End: -*/ diff --git a/PC/VS8.0/pythoncore.vcproj b/PC/VS8.0/pythoncore.vcproj index 2285013..86d440a 100644 --- a/PC/VS8.0/pythoncore.vcproj +++ b/PC/VS8.0/pythoncore.vcproj @@ -1491,6 +1491,10 @@ > </File> <File + RelativePath="..\..\Objects\stringlib\split.h" + > + </File> + <File RelativePath="..\..\Objects\structseq.c" > </File> diff --git a/PCbuild/pythoncore.vcproj b/PCbuild/pythoncore.vcproj index 21594cd..fde5cd1 100644 --- a/PCbuild/pythoncore.vcproj +++ b/PCbuild/pythoncore.vcproj @@ -1496,6 +1496,10 @@ > </File> <File + RelativePath="..\Objects\stringlib\split.h" + > + </File> + <File RelativePath="..\Objects\structseq.c" > </File> |