summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Makefile.pre.in 1
-rw-r--r-- Misc/NEWS 4
-rw-r--r-- Objects/bytearrayobject.c 506
-rw-r--r-- Objects/stringlib/README.txt 9
-rw-r--r-- Objects/stringlib/count.h 16
-rw-r--r-- Objects/stringlib/ctype.h 1
-rw-r--r-- Objects/stringlib/fastsearch.h 40
-rw-r--r-- Objects/stringlib/find.h 58
-rw-r--r-- Objects/stringlib/partition.h 83
-rw-r--r-- Objects/stringlib/split.h 394
-rw-r--r-- Objects/stringlib/stringdefs.h 5
-rw-r--r-- Objects/stringlib/transmogrify.h 91
-rw-r--r-- Objects/stringlib/unicodedefs.h 3
-rw-r--r-- Objects/stringobject.c 520
-rw-r--r-- Objects/unicodeobject.c 420
-rw-r--r-- PC/VS8.0/pythoncore.vcproj 4
-rw-r--r-- PCbuild/pythoncore.vcproj 4
17 files changed, 717 insertions, 1442 deletions
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 1491a9b..300a034 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -563,6 +563,7 @@ STRINGLIB_HEADERS= \
$(srcdir)/Objects/stringlib/find.h \
$(srcdir)/Objects/stringlib/formatter.h \
$(srcdir)/Objects/stringlib/partition.h \
+ $(srcdir)/Objects/stringlib/split.h \
$(srcdir)/Objects/stringlib/stringdefs.h \
$(srcdir)/Objects/stringlib/string_format.h \
$(srcdir)/Objects/stringlib/transmogrify.h \
diff --git a/Misc/NEWS b/Misc/NEWS
index d2b75b1..20cdf88 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,10 @@ What's New in Python 2.7 alpha 3?
Core and Builtins
-----------------
+- Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
+ methods of bytes, bytearray and unicode objects by using a common
+ implementation based on stringlib's fast search. Patch by Florent Xicluna.
+
- Issue #7632: Fix a crash in dtoa.c that occurred in debug builds
when parsing certain long numeric strings corresponding to subnormal
values. Also fix a number of bugs in dtoa.c that could lead to
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
index 6157c83..74cb1f1 100644
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -1115,14 +1115,16 @@ bytearray_dealloc(PyByteArrayObject *self)
#define STRINGLIB_STR PyByteArray_AS_STRING
#define STRINGLIB_NEW PyByteArray_FromStringAndSize
#define STRINGLIB_EMPTY nullbytes
+#define STRINGLIB_ISSPACE Py_ISSPACE
+/* True when x is a line terminator ('\n' or '\r').  x is expanded twice;
+   it is parenthesized so that an expression argument keeps its meaning. */
+#define STRINGLIB_ISLINEBREAK(x) (((x) == '\n') || ((x) == '\r'))
#define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact
#define STRINGLIB_MUTABLE 1
-#define FROM_BYTEARRAY 1
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
+#include "stringlib/split.h"
#include "stringlib/ctype.h"
#include "stringlib/transmogrify.h"
@@ -1130,21 +1132,20 @@ bytearray_dealloc(PyByteArrayObject *self)
/* The following Py_LOCAL_INLINE and Py_LOCAL functions
were copied from the old char* style string object. */
-Py_LOCAL_INLINE(void)
-_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
-{
- if (*end > len)
- *end = len;
- else if (*end < 0)
- *end += len;
- if (*end < 0)
- *end = 0;
- if (*start < 0)
- *start += len;
- if (*start < 0)
- *start = 0;
-}
-
+/* Helper macro to fix up start/end slice values against a length `len`:
+   end is capped at len; a negative end or start has len added to it and
+   is then raised to 0 if it is still negative (start is never capped at
+   len).  `start` and `end` are assigned to, so they must be lvalues, and
+   every argument is expanded more than once.  The expansion is two bare
+   `if` statements with no do/while wrapper, so use it only as a complete
+   statement inside a braced block.  stringlib/find.h, included above,
+   defines this macro as well; keep both replacement lists identical. */
+#define ADJUST_INDICES(start, end, len) \
+ if (end > len) \
+ end = len; \
+ else if (end < 0) { \
+ end += len; \
+ if (end < 0) \
+ end = 0; \
+ } \
+ if (start < 0) { \
+ start += len; \
+ if (start < 0) \
+ start = 0; \
+ }
Py_LOCAL_INLINE(Py_ssize_t)
bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir)
@@ -1212,10 +1213,10 @@ bytearray_count(PyByteArrayObject *self, PyObject *args)
if (_getbuffer(sub_obj, &vsub) < 0)
return NULL;
- _adjust_indices(&start, &end, PyByteArray_GET_SIZE(self));
+ ADJUST_INDICES(start, end, PyByteArray_GET_SIZE(self));
count_obj = PyInt_FromSsize_t(
- stringlib_count(str + start, end - start, vsub.buf, vsub.len)
+ stringlib_count(str + start, end - start, vsub.buf, vsub.len, PY_SSIZE_T_MAX)
);
PyBuffer_Release(&vsub);
return count_obj;
@@ -1323,7 +1324,7 @@ _bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start
if (_getbuffer(substr, &vsubstr) < 0)
return -1;
- _adjust_indices(&start, &end, len);
+ ADJUST_INDICES(start, end, len);
if (direction < 0) {
/* startswith */
@@ -1528,20 +1529,11 @@ done:
}
-#define FORWARD 1
-#define REVERSE -1
-
/* find and count characters and substrings */
#define findchar(target, target_len, c) \
((char *)memchr((const void *)(target), c, target_len))
-/* Don't call if length < 2 */
-#define Py_STRING_MATCH(target, offset, pattern, length) \
- (target[offset] == pattern[0] && \
- target[offset+length-1] == pattern[length-1] && \
- !memcmp(target+offset+1, pattern+1, length-2) )
-
/* Bytes ops must return a string, create a copy */
Py_LOCAL(PyByteArrayObject *)
@@ -1568,93 +1560,6 @@ countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount
return count;
}
-Py_LOCAL(Py_ssize_t)
-findstring(const char *target, Py_ssize_t target_len,
- const char *pattern, Py_ssize_t pattern_len,
- Py_ssize_t start,
- Py_ssize_t end,
- int direction)
-{
- if (start < 0) {
- start += target_len;
- if (start < 0)
- start = 0;
- }
- if (end > target_len) {
- end = target_len;
- } else if (end < 0) {
- end += target_len;
- if (end < 0)
- end = 0;
- }
-
- /* zero-length substrings always match at the first attempt */
- if (pattern_len == 0)
- return (direction > 0) ? start : end;
-
- end -= pattern_len;
-
- if (direction < 0) {
- for (; end >= start; end--)
- if (Py_STRING_MATCH(target, end, pattern, pattern_len))
- return end;
- } else {
- for (; start <= end; start++)
- if (Py_STRING_MATCH(target, start, pattern, pattern_len))
- return start;
- }
- return -1;
-}
-
-Py_LOCAL_INLINE(Py_ssize_t)
-countstring(const char *target, Py_ssize_t target_len,
- const char *pattern, Py_ssize_t pattern_len,
- Py_ssize_t start,
- Py_ssize_t end,
- int direction, Py_ssize_t maxcount)
-{
- Py_ssize_t count=0;
-
- if (start < 0) {
- start += target_len;
- if (start < 0)
- start = 0;
- }
- if (end > target_len) {
- end = target_len;
- } else if (end < 0) {
- end += target_len;
- if (end < 0)
- end = 0;
- }
-
- /* zero-length substrings match everywhere */
- if (pattern_len == 0 || maxcount == 0) {
- if (target_len+1 < maxcount)
- return target_len+1;
- return maxcount;
- }
-
- end -= pattern_len;
- if (direction < 0) {
- for (; (end >= start); end--)
- if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
- count++;
- if (--maxcount <= 0) break;
- end -= pattern_len-1;
- }
- } else {
- for (; (start <= end); start++)
- if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
- count++;
- if (--maxcount <= 0)
- break;
- start += pattern_len-1;
- }
- }
- return count;
-}
-
/* Algorithms for different cases of string replacement */
@@ -1776,10 +1681,9 @@ replace_delete_substring(PyByteArrayObject *self,
self_len = PyByteArray_GET_SIZE(self);
self_s = PyByteArray_AS_STRING(self);
- count = countstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, 1,
- maxcount);
+ count = stringlib_count(self_s, self_len,
+ from_s, from_len,
+ maxcount);
if (count == 0) {
/* no matches */
@@ -1798,9 +1702,9 @@ replace_delete_substring(PyByteArrayObject *self,
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset == -1)
break;
next = start + offset;
@@ -1876,9 +1780,9 @@ replace_substring_in_place(PyByteArrayObject *self,
self_s = PyByteArray_AS_STRING(self);
self_len = PyByteArray_GET_SIZE(self);
- offset = findstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, FORWARD);
+ offset = stringlib_find(self_s, self_len,
+ from_s, from_len,
+ 0);
if (offset == -1) {
/* No matches; return the original bytes */
return return_self(self);
@@ -1898,9 +1802,9 @@ replace_substring_in_place(PyByteArrayObject *self,
end = result_s + self_len;
while ( --maxcount > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset==-1)
break;
Py_MEMCPY(start+offset, to_s, from_len);
@@ -1993,9 +1897,10 @@ replace_substring(PyByteArrayObject *self,
self_s = PyByteArray_AS_STRING(self);
self_len = PyByteArray_GET_SIZE(self);
- count = countstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, FORWARD, maxcount);
+ count = stringlib_count(self_s, self_len,
+ from_s, from_len,
+ maxcount);
+
if (count == 0) {
/* no matches, return unchanged */
return return_self(self);
@@ -2022,9 +1927,9 @@ replace_substring(PyByteArrayObject *self,
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset == -1)
break;
next = start+offset;
@@ -2153,123 +2058,6 @@ bytearray_replace(PyByteArrayObject *self, PyObject *args)
return res;
}
-
-/* Overallocate the initial list to reduce the number of reallocs for small
- split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
- resizes, to sizes 4, 8, then 16. Most observed string splits are for human
- text (roughly 11 words per line) and field delimited data (usually 1-10
- fields). For large strings the split algorithms are bandwidth limited
- so increasing the preallocation likely will not improve things.*/
-
-#define MAX_PREALLOC 12
-
-/* 5 splits gives 6 elements */
-#define PREALLOC_SIZE(maxsplit) \
- (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
-
-#define SPLIT_APPEND(data, left, right) \
- str = PyByteArray_FromStringAndSize((data) + (left), \
- (right) - (left)); \
- if (str == NULL) \
- goto onError; \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
-#define SPLIT_ADD(data, left, right) { \
- str = PyByteArray_FromStringAndSize((data) + (left), \
- (right) - (left)); \
- if (str == NULL) \
- goto onError; \
- if (count < MAX_PREALLOC) { \
- PyList_SET_ITEM(list, count, str); \
- } else { \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str); \
- } \
- count++; }
-
-/* Always force the list to the expected size. */
-#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
-
-
-Py_LOCAL_INLINE(PyObject *)
-split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
-{
- register Py_ssize_t i, j, count = 0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- i = j = 0;
- while ((j < len) && (maxcount-- > 0)) {
- for(; j < len; j++) {
- /* I found that using memchr makes no difference */
- if (s[j] == ch) {
- SPLIT_ADD(s, i, j);
- i = j = j + 1;
- break;
- }
- }
- }
- if (i <= len) {
- SPLIT_ADD(s, i, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-
-Py_LOCAL_INLINE(PyObject *)
-split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
-{
- register Py_ssize_t i, j, count = 0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- for (i = j = 0; i < len; ) {
- /* find a token */
- while (i < len && Py_ISSPACE(s[i]))
- i++;
- j = i;
- while (i < len && !Py_ISSPACE(s[i]))
- i++;
- if (j < i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_ADD(s, j, i);
- while (i < len && Py_ISSPACE(s[i]))
- i++;
- j = i;
- }
- }
- if (j < len) {
- SPLIT_ADD(s, j, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
PyDoc_STRVAR(split__doc__,
"B.split([sep[, maxsplit]]) -> list of bytearray\n\
\n\
@@ -2281,10 +2069,10 @@ If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytearray_split(PyByteArrayObject *self, PyObject *args)
{
- Py_ssize_t len = PyByteArray_GET_SIZE(self), n, i, j, pos;
- Py_ssize_t maxsplit = -1, count = 0;
+ Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
+ Py_ssize_t maxsplit = -1;
const char *s = PyByteArray_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj = Py_None;
+ PyObject *list, *subobj = Py_None;
Py_buffer vsub;
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
@@ -2293,73 +2081,18 @@ bytearray_split(PyByteArrayObject *self, PyObject *args)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
- return split_whitespace(s, len, maxsplit);
+ return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0)
return NULL;
sub = vsub.buf;
n = vsub.len;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- PyBuffer_Release(&vsub);
- return NULL;
- }
- if (n == 1) {
- list = split_char(s, len, sub[0], maxsplit);
- PyBuffer_Release(&vsub);
- return list;
- }
-
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL) {
- PyBuffer_Release(&vsub);
- return NULL;
- }
-
- i = j = 0;
- while (maxsplit-- > 0) {
- pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
- if (pos < 0)
- break;
- j = i+pos;
- SPLIT_ADD(s, i, j);
- i = j + n;
- }
- SPLIT_ADD(s, i, len);
- FIX_PREALLOC_SIZE(list);
+ list = stringlib_split(
+ (PyObject*) self, s, len, sub, n, maxsplit
+ );
PyBuffer_Release(&vsub);
return list;
-
- onError:
- Py_DECREF(list);
- PyBuffer_Release(&vsub);
- return NULL;
-}
-
-/* stringlib's partition shares nullbytes in some cases.
- undo this, we don't want the nullbytes to be shared. */
-static PyObject *
-make_nullbytes_unique(PyObject *result)
-{
- if (result != NULL) {
- int i;
- assert(PyTuple_Check(result));
- assert(PyTuple_GET_SIZE(result) == 3);
- for (i = 0; i < 3; i++) {
- if (PyTuple_GET_ITEM(result, i) == (PyObject *)nullbytes) {
- PyObject *new = PyByteArray_FromStringAndSize(NULL, 0);
- if (new == NULL) {
- Py_DECREF(result);
- result = NULL;
- break;
- }
- Py_DECREF(nullbytes);
- PyTuple_SET_ITEM(result, i, new);
- }
- }
- }
- return result;
}
PyDoc_STRVAR(partition__doc__,
@@ -2386,7 +2119,7 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep_obj)
);
Py_DECREF(bytesep);
- return make_nullbytes_unique(result);
+ return result;
}
PyDoc_STRVAR(rpartition__doc__,
@@ -2414,81 +2147,7 @@ bytearray_rpartition(PyByteArrayObject *self, PyObject *sep_obj)
);
Py_DECREF(bytesep);
- return make_nullbytes_unique(result);
-}
-
-Py_LOCAL_INLINE(PyObject *)
-rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
-{
- register Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- i = j = len - 1;
- while ((i >= 0) && (maxcount-- > 0)) {
- for (; i >= 0; i--) {
- if (s[i] == ch) {
- SPLIT_ADD(s, i + 1, j + 1);
- j = i = i - 1;
- break;
- }
- }
- }
- if (j >= -1) {
- SPLIT_ADD(s, 0, j + 1);
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
-
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-Py_LOCAL_INLINE(PyObject *)
-rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
-{
- register Py_ssize_t i, j, count = 0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- for (i = j = len - 1; i >= 0; ) {
- /* find a token */
- while (i >= 0 && Py_ISSPACE(s[i]))
- i--;
- j = i;
- while (i >= 0 && !Py_ISSPACE(s[i]))
- i--;
- if (j > i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_ADD(s, i + 1, j + 1);
- while (i >= 0 && Py_ISSPACE(s[i]))
- i--;
- j = i;
- }
- }
- if (j >= 0) {
- SPLIT_ADD(s, 0, j + 1);
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
-
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
+ return result;
}
PyDoc_STRVAR(rsplit__doc__,
@@ -2503,10 +2162,10 @@ If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytearray_rsplit(PyByteArrayObject *self, PyObject *args)
{
- Py_ssize_t len = PyByteArray_GET_SIZE(self), n, j, pos;
- Py_ssize_t maxsplit = -1, count = 0;
+ Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
+ Py_ssize_t maxsplit = -1;
const char *s = PyByteArray_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj = Py_None;
+ PyObject *list, *subobj = Py_None;
Py_buffer vsub;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
@@ -2515,50 +2174,18 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *args)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
- return rsplit_whitespace(s, len, maxsplit);
+ return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0)
return NULL;
sub = vsub.buf;
n = vsub.len;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- PyBuffer_Release(&vsub);
- return NULL;
- }
- else if (n == 1) {
- list = rsplit_char(s, len, sub[0], maxsplit);
- PyBuffer_Release(&vsub);
- return list;
- }
-
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL) {
- PyBuffer_Release(&vsub);
- return NULL;
- }
-
- j = len;
-
- while (maxsplit-- > 0) {
- pos = fastsearch(s, j, sub, n, FAST_RSEARCH);
- if (pos < 0)
- break;
- SPLIT_ADD(s, pos + n, j);
- j = pos;
- }
- SPLIT_ADD(s, 0, j);
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
+ list = stringlib_rsplit(
+ (PyObject*) self, s, len, sub, n, maxsplit
+ );
PyBuffer_Release(&vsub);
return list;
-
-onError:
- Py_DECREF(list);
- PyBuffer_Release(&vsub);
- return NULL;
}
PyDoc_STRVAR(reverse__doc__,
@@ -3026,6 +2653,27 @@ bytearray_join(PyByteArrayObject *self, PyObject *it)
return NULL;
}
+PyDoc_STRVAR(splitlines__doc__,
+"B.splitlines([keepends]) -> list of lines\n\
+\n\
+Return a list of the lines in B, breaking at line boundaries.\n\
+Line breaks are not included in the resulting list unless keepends\n\
+is given and true.");
+
+/* B.splitlines([keepends]): parse the optional integer flag, then hand the
+   bytearray's buffer and length to stringlib_splitlines and return its
+   result.  Returns NULL when argument parsing fails. */
+static PyObject*
+bytearray_splitlines(PyObject *self, PyObject *args)
+{
+    int keep_line_ends = 0;
+    int parsed_ok;
+
+    parsed_ok = PyArg_ParseTuple(args, "|i:splitlines", &keep_line_ends);
+    if (parsed_ok) {
+        return stringlib_splitlines(self,
+                                    PyByteArray_AS_STRING(self),
+                                    PyByteArray_GET_SIZE(self),
+                                    keep_line_ends);
+    }
+    return NULL;
+}
+
PyDoc_STRVAR(fromhex_doc,
"bytearray.fromhex(string) -> bytearray\n\
\n\
@@ -3209,7 +2857,7 @@ bytearray_methods[] = {
{"rsplit", (PyCFunction)bytearray_rsplit, METH_VARARGS, rsplit__doc__},
{"rstrip", (PyCFunction)bytearray_rstrip, METH_VARARGS, rstrip__doc__},
{"split", (PyCFunction)bytearray_split, METH_VARARGS, split__doc__},
- {"splitlines", (PyCFunction)stringlib_splitlines, METH_VARARGS,
+ {"splitlines", (PyCFunction)bytearray_splitlines, METH_VARARGS,
splitlines__doc__},
{"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS ,
startswith__doc__},
diff --git a/Objects/stringlib/README.txt b/Objects/stringlib/README.txt
index d948386..c884ec3 100644
--- a/Objects/stringlib/README.txt
+++ b/Objects/stringlib/README.txt
@@ -28,3 +28,12 @@ STRINGLIB_CHAR* STRINGLIB_STR(PyObject*)
returns the pointer to the character data for the given string
object (which must be of the right type)
+
+int STRINGLIB_CHECK_EXACT(PyObject *)
+
+ returns true if the object is an instance of our type, not a subclass.
+
+STRINGLIB_MUTABLE
+
+ Must be 0 or 1 to tell the cpp macros in stringlib code if the object
+ being operated on is mutable or not.
diff --git a/Objects/stringlib/count.h b/Objects/stringlib/count.h
index eba37e9..de34f96 100644
--- a/Objects/stringlib/count.h
+++ b/Objects/stringlib/count.h
@@ -9,28 +9,22 @@
Py_LOCAL_INLINE(Py_ssize_t)
stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
- const STRINGLIB_CHAR* sub, Py_ssize_t sub_len)
+ const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
+ Py_ssize_t maxcount) /* cap on the number of matches counted */
{
Py_ssize_t count;
if (str_len < 0)
return 0; /* start > len(str) */
if (sub_len == 0)
- return str_len + 1;
+ return (str_len < maxcount) ? str_len + 1 : maxcount; /* empty sub: str_len + 1 matches unless maxcount is smaller */
- count = fastsearch(str, str_len, sub, sub_len, FAST_COUNT);
+ count = fastsearch(str, str_len, sub, sub_len, maxcount, FAST_COUNT); /* counting stops once maxcount matches are seen */
if (count < 0)
- count = 0; /* no match */
+ return 0; /* no match */
return count;
}
#endif
-
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h
index 8951276..739cf3d 100644
--- a/Objects/stringlib/ctype.h
+++ b/Objects/stringlib/ctype.h
@@ -107,4 +107,3 @@ stringlib_swapcase(PyObject *self)
STRINGLIB_LEN(self));
return newobj;
}
-
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index 7b9dd47..21cf3a2 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -18,10 +18,13 @@
#define FAST_SEARCH 1
#define FAST_RSEARCH 2
+/* One-word bloom filter over the pattern's characters: each character
+   selects bit ((ch) & (LONG_BIT - 1)) of `mask`.  The shifted constant is
+   an unsigned long because the shift count can reach LONG_BIT - 1, and
+   shifting a plain int 1 by the width of int or more is undefined.
+   NOTE(review): any other definition of BLOOM/BLOOM_ADD visible to a file
+   that includes this header must use the same replacement list. */
+#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (LONG_BIT - 1)))))
+#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (LONG_BIT - 1)))))
+
Py_LOCAL_INLINE(Py_ssize_t)
fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
const STRINGLIB_CHAR* p, Py_ssize_t m,
- int mode)
+ Py_ssize_t maxcount, int mode)
{
long mask;
Py_ssize_t skip, count = 0;
@@ -29,7 +32,7 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
w = n - m;
- if (w < 0)
+ if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
return -1;
/* look for special cases */
@@ -39,8 +42,11 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* use special case for 1-character strings */
if (mode == FAST_COUNT) {
for (i = 0; i < n; i++)
- if (s[i] == p[0])
+ if (s[i] == p[0]) {
count++;
+ if (count == maxcount)
+ return maxcount;
+ }
return count;
} else if (mode == FAST_SEARCH) {
for (i = 0; i < n; i++)
@@ -56,19 +62,20 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
mlast = m - 1;
skip = mlast - 1;
+ mask = 0;
if (mode != FAST_RSEARCH) {
/* create compressed boyer-moore delta 1 table */
/* process pattern[:-1] */
- for (mask = i = 0; i < mlast; i++) {
- mask |= (1 << (p[i] & 0x1F));
+ for (i = 0; i < mlast; i++) {
+ BLOOM_ADD(mask, p[i]);
if (p[i] == p[mlast])
skip = mlast - i - 1;
}
/* process pattern[-1] outside the loop */
- mask |= (1 << (p[mlast] & 0x1F));
+ BLOOM_ADD(mask, p[mlast]);
for (i = 0; i <= w; i++) {
/* note: using mlast in the skip path slows things down on x86 */
@@ -82,17 +89,19 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
if (mode != FAST_COUNT)
return i;
count++;
+ if (count == maxcount)
+ return maxcount;
i = i + mlast;
continue;
}
/* miss: check if next character is part of pattern */
- if (!(mask & (1 << (s[i+m] & 0x1F))))
+ if (!BLOOM(mask, s[i+m]))
i = i + m;
else
i = i + skip;
} else {
/* skip: check if next character is part of pattern */
- if (!(mask & (1 << (s[i+m] & 0x1F))))
+ if (!BLOOM(mask, s[i+m]))
i = i + m;
}
}
@@ -101,10 +110,10 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* create compressed boyer-moore delta 1 table */
/* process pattern[0] outside the loop */
- mask = (1 << (p[0] & 0x1F));
+ BLOOM_ADD(mask, p[0]);
/* process pattern[:0:-1] */
for (i = mlast; i > 0; i--) {
- mask |= (1 << (p[i] & 0x1F));
+ BLOOM_ADD(mask, p[i]);
if (p[i] == p[0])
skip = i - 1;
}
@@ -119,13 +128,13 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* got a match! */
return i;
/* miss: check if previous character is part of pattern */
- if (!(mask & (1 << (s[i-1] & 0x1F))))
+ if (!BLOOM(mask, s[i-1]))
i = i - m;
else
i = i - skip;
} else {
/* skip: check if previous character is part of pattern */
- if (!(mask & (1 << (s[i-1] & 0x1F))))
+ if (!BLOOM(mask, s[i-1]))
i = i - m;
}
}
@@ -137,10 +146,3 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
}
#endif
-
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h
index b5bace7..f915296c 100644
--- a/Objects/stringlib/find.h
+++ b/Objects/stringlib/find.h
@@ -19,7 +19,7 @@ stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
if (sub_len == 0)
return offset;
- pos = fastsearch(str, str_len, sub, sub_len, FAST_SEARCH);
+ pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_SEARCH);
if (pos >= 0)
pos += offset;
@@ -39,7 +39,7 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
if (sub_len == 0)
return str_len + offset;
- pos = fastsearch(str, str_len, sub, sub_len, FAST_RSEARCH);
+ pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_RSEARCH);
if (pos >= 0)
pos += offset;
@@ -47,22 +47,27 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
return pos;
}
+/* Helper macro to fix up start/end slice values against a length `len`:
+   end is capped at len; a negative end or start has len added to it and
+   is then raised to 0 if it is still negative (start is never capped at
+   len).  `start` and `end` are assigned to, so they must be lvalues, and
+   every argument is expanded more than once.  The expansion is two bare
+   `if` statements with no do/while wrapper, so use it only as a complete
+   statement inside a braced block.  Objects/bytearrayobject.c defines the
+   same macro after including this header; keep both replacement lists
+   identical. */
+#define ADJUST_INDICES(start, end, len) \
+ if (end > len) \
+ end = len; \
+ else if (end < 0) { \
+ end += len; \
+ if (end < 0) \
+ end = 0; \
+ } \
+ if (start < 0) { \
+ start += len; \
+ if (start < 0) \
+ start = 0; \
+ }
+
Py_LOCAL_INLINE(Py_ssize_t)
stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
Py_ssize_t start, Py_ssize_t end)
{
- if (start < 0)
- start += str_len;
- if (start < 0)
- start = 0;
- if (end > str_len)
- end = str_len;
- if (end < 0)
- end += str_len;
- if (end < 0)
- end = 0;
-
+ ADJUST_INDICES(start, end, str_len); /* resolve negative start/end against str_len and cap end at str_len; start is passed on as the offset below */
return stringlib_find(str + start, end - start, sub, sub_len, start);
}
@@ -71,21 +76,11 @@ stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
Py_ssize_t start, Py_ssize_t end)
{
- if (start < 0)
- start += str_len;
- if (start < 0)
- start = 0;
- if (end > str_len)
- end = str_len;
- if (end < 0)
- end += str_len;
- if (end < 0)
- end = 0;
-
+ ADJUST_INDICES(start, end, str_len);
return stringlib_rfind(str + start, end - start, sub, sub_len, start);
}
-#if defined(STRINGLIB_STR) && !defined(FROM_BYTEARRAY)
+#ifdef STRINGLIB_WANT_CONTAINS_OBJ
Py_LOCAL_INLINE(int)
stringlib_contains_obj(PyObject* str, PyObject* sub)
@@ -96,9 +91,9 @@ stringlib_contains_obj(PyObject* str, PyObject* sub)
) != -1;
}
-#endif /* STRINGLIB_STR */
+#endif /* STRINGLIB_WANT_CONTAINS_OBJ */
-#ifdef FROM_UNICODE
+#if STRINGLIB_IS_UNICODE
/*
This function is a helper for the "find" family (find, rfind, index,
@@ -146,13 +141,6 @@ _ParseTupleFinds (PyObject *args, PyObject **substring,
return 1;
}
-#endif /* FROM_UNICODE */
+#endif /* STRINGLIB_IS_UNICODE */
#endif /* STRINGLIB_FIND_H */
-
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
diff --git a/Objects/stringlib/partition.h b/Objects/stringlib/partition.h
index 2f26212..0170bdd 100644
--- a/Objects/stringlib/partition.h
+++ b/Objects/stringlib/partition.h
@@ -8,33 +8,39 @@
#endif
Py_LOCAL_INLINE(PyObject*)
-stringlib_partition(
- PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
- PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len
- )
+stringlib_partition(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ PyObject* sep_obj,
+ const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
{
PyObject* out;
Py_ssize_t pos;
if (sep_len == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
+ return NULL;
}
out = PyTuple_New(3);
if (!out)
- return NULL;
+ return NULL;
- pos = fastsearch(str, str_len, sep, sep_len, FAST_SEARCH);
+ pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_SEARCH);
if (pos < 0) {
- Py_INCREF(str_obj);
- PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
- return out;
+#if STRINGLIB_MUTABLE
+        /* No match on a mutable type: fill the tuple with three new objects
+           (a copy of the whole input, then two new empty ones) instead of
+           sharing str_obj and STRINGLIB_EMPTY. */
+        PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len));
+        PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
+        PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0));
+
+        /* STRINGLIB_NEW can fail; never return a tuple with NULL slots.
+           This is the same check the match path of this function uses. */
+        if (PyErr_Occurred()) {
+            Py_DECREF(out);
+            return NULL;
+        }
+#else
+ Py_INCREF(str_obj);
+ PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
+ Py_INCREF(STRINGLIB_EMPTY);
+ PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
+ Py_INCREF(STRINGLIB_EMPTY);
+ PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
+#endif
+ return out;
}
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos));
@@ -44,41 +50,47 @@ stringlib_partition(
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos));
if (PyErr_Occurred()) {
- Py_DECREF(out);
- return NULL;
+ Py_DECREF(out);
+ return NULL;
}
return out;
}
Py_LOCAL_INLINE(PyObject*)
-stringlib_rpartition(
- PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
- PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len
- )
+stringlib_rpartition(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ PyObject* sep_obj,
+ const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
{
PyObject* out;
Py_ssize_t pos;
if (sep_len == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
+ return NULL;
}
out = PyTuple_New(3);
if (!out)
- return NULL;
+ return NULL;
- pos = fastsearch(str, str_len, sep, sep_len, FAST_RSEARCH);
+ pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_RSEARCH);
if (pos < 0) {
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
- Py_INCREF(str_obj);
- PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
- return out;
+#if STRINGLIB_MUTABLE
+        /* No match on a mutable type: fill the tuple with three new objects
+           (two new empty ones, then a copy of the whole input) instead of
+           sharing STRINGLIB_EMPTY and str_obj. */
+        PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0));
+        PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
+        PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len));
+
+        /* STRINGLIB_NEW can fail; never return a tuple with NULL slots.
+           This is the same check the match path of this function uses. */
+        if (PyErr_Occurred()) {
+            Py_DECREF(out);
+            return NULL;
+        }
+#else
+ Py_INCREF(STRINGLIB_EMPTY);
+ PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
+ Py_INCREF(STRINGLIB_EMPTY);
+ PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
+ Py_INCREF(str_obj);
+ PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
+#endif
+ return out;
}
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos));
@@ -88,18 +100,11 @@ stringlib_rpartition(
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos));
if (PyErr_Occurred()) {
- Py_DECREF(out);
- return NULL;
+ Py_DECREF(out);
+ return NULL;
}
return out;
}
#endif
-
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h
new file mode 100644
index 0000000..60e7767
--- /dev/null
+++ b/Objects/stringlib/split.h
@@ -0,0 +1,394 @@
+/* stringlib: split implementation */
+
+#ifndef STRINGLIB_SPLIT_H
+#define STRINGLIB_SPLIT_H
+
+#ifndef STRINGLIB_FASTSEARCH_H
+#error must include "stringlib/fastsearch.h" before including this module
+#endif
+
+/* Overallocate the initial list to reduce the number of reallocs for small
+ split sizes. E.g., "A A A A A A A A A A".split() (10 elements) has three
+ resizes, to sizes 4, 8, then 16. Most observed string splits are for human
+ text (roughly 11 words per line) and field delimited data (usually 1-10
+ fields). For large strings the split algorithms are bandwidth limited
+ so increasing the preallocation likely will not improve things.
+
+ MAX_PREALLOC caps the initial list length chosen by PREALLOC_SIZE, and
+ SPLIT_ADD uses it to decide between storing into a preallocated slot and
+ appending to the list. */
+
+#define MAX_PREALLOC 12
+
+/* Initial list length for a split limited to `maxsplit` splits:
+ maxsplit+1 items (5 splits gives 6 elements), capped at MAX_PREALLOC.
+ NOTE(review): the argument is expanded without parentheses, so pass a
+ plain variable, as the callers in this file do. */
+#define PREALLOC_SIZE(maxsplit) \
+ (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
+/* Build a new object with STRINGLIB_NEW from data[left:right] and append
+ it to `list`. Relies on the locals `list` and `sub` and on an `onError`
+ label in the enclosing function; control jumps there if creating or
+ appending the item fails. `sub` is released after the append whether it
+ succeeded or not (PyList_Append does not take over the caller's
+ reference).
+ NOTE(review): expands to several statements with no enclosing block, so
+ it must not be used as the unbraced body of an if/else or a loop. */
+#define SPLIT_APPEND(data, left, right) \
+ sub = STRINGLIB_NEW((data) + (left), \
+ (right) - (left)); \
+ if (sub == NULL) \
+ goto onError; \
+ if (PyList_Append(list, sub)) { \
+ Py_DECREF(sub); \
+ goto onError; \
+ } \
+ else \
+ Py_DECREF(sub);
+/* Like SPLIT_APPEND, but for lists created with PREALLOC_SIZE: while fewer
+ than MAX_PREALLOC items have been stored, the new item is written
+ straight into slot `count` with PyList_SET_ITEM (which takes over the
+ reference to `sub`); later items are appended. Also relies on the local
+ `count`, the number of items stored so far, which it increments. */
+#define SPLIT_ADD(data, left, right) { \
+ sub = STRINGLIB_NEW((data) + (left), \
+ (right) - (left)); \
+ if (sub == NULL) \
+ goto onError; \
+ if (count < MAX_PREALLOC) { \
+ PyList_SET_ITEM(list, count, sub); \
+ } else { \
+ if (PyList_Append(list, sub)) { \
+ Py_DECREF(sub); \
+ goto onError; \
+ } \
+ else \
+ Py_DECREF(sub); \
+ } \
+ count++; }
+
+
+/* Always force the list to the expected size: overwrite the list's size
+ field with the local `count`, the number of items actually stored,
+ because fewer than the preallocated number of slots may have been
+ filled. */
+#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
+/* Split str[0:str_len] on runs of whitespace (as classified by
+ STRINGLIB_ISSPACE), making at most maxcount splits, and return the
+ pieces as a new list. Whitespace in front of each piece is skipped, so
+ no empty items are produced. Once maxcount pieces have been split off,
+ the rest of the string, minus its leading whitespace, becomes the last
+ item. str_obj is the object the buffer was taken from; it is only used
+ by the no-whitespace shortcut. Returns NULL if the list or an item
+ cannot be created or appended.
+ NOTE(review): maxcount is assumed to be >= 0; string_split maps a
+ negative maxsplit to PY_SSIZE_T_MAX before calling. */
+Py_LOCAL_INLINE(PyObject *)
+stringlib_split_whitespace(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ Py_ssize_t maxcount)
+{
+ Py_ssize_t i, j, count=0;
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+ PyObject *sub;
+
+ if (list == NULL)
+ return NULL;
+
+ i = j = 0;
+ while (maxcount-- > 0) {
+ while (i < str_len && STRINGLIB_ISSPACE(str[i]))
+ i++;
+ if (i == str_len) break; /* only whitespace was left */
+ j = i; i++; /* j = start of the piece; advance i to its end */
+ while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
+ i++;
+#ifndef STRINGLIB_MUTABLE
+ if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
+ /* No whitespace in str_obj, so just use it as list[0]
+ (a new reference to str_obj itself instead of a copy;
+ only done for exact instances of the string type) */
+ Py_INCREF(str_obj);
+ PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
+ count++;
+ break;
+ }
+#endif
+ SPLIT_ADD(str, j, i);
+ }
+
+ if (i < str_len) {
+ /* Only occurs when maxcount was reached */
+ /* Skip any remaining whitespace and copy to end of string */
+ while (i < str_len && STRINGLIB_ISSPACE(str[i]))
+ i++;
+ if (i != str_len)
+ SPLIT_ADD(str, i, str_len);
+ }
+ FIX_PREALLOC_SIZE(list);
+ return list;
+
+ onError: /* target of the gotos inside SPLIT_ADD */
+ Py_DECREF(list);
+ return NULL;
+}
+/* Split str[0:str_len] at occurrences of the single character ch, making
+ at most maxcount splits, and return the pieces as a new list. Adjacent
+ separators, or a separator at either end, yield empty pieces. The text
+ after the last split is always added as the final item. str_obj is the
+ object the buffer was taken from; it is only used by the no-split
+ shortcut. Returns NULL if the list or an item cannot be created or
+ appended. */
+Py_LOCAL_INLINE(PyObject *)
+stringlib_split_char(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ const STRINGLIB_CHAR ch,
+ Py_ssize_t maxcount)
+{
+ Py_ssize_t i, j, count=0;
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+ PyObject *sub;
+
+ if (list == NULL)
+ return NULL;
+
+ i = j = 0; /* i = start of the current piece, j = scan position */
+ while ((j < str_len) && (maxcount-- > 0)) {
+ for(; j < str_len; j++) {
+ /* I found that using memchr makes no difference */
+ if (str[j] == ch) {
+ SPLIT_ADD(str, i, j);
+ i = j = j + 1; /* next piece starts after the separator */
+ break;
+ }
+ }
+ }
+#ifndef STRINGLIB_MUTABLE
+ if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
+ /* No split was made (ch not in str_obj, or maxcount was 0), so
+ the whole string is the only item: just use str_obj as list[0] */
+ Py_INCREF(str_obj);
+ PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
+ count++;
+ } else
+#endif
+ if (i <= str_len) {
+ SPLIT_ADD(str, i, str_len); /* remainder after the last split */
+ }
+ FIX_PREALLOC_SIZE(list);
+ return list;
+
+ onError: /* target of the gotos inside SPLIT_ADD */
+ Py_DECREF(list);
+ return NULL;
+}
+/* Split str[0:str_len] at occurrences of the separator sep[0:sep_len],
+ making at most maxcount splits, and return the pieces as a new list.
+ An empty separator sets ValueError("empty separator") and returns NULL;
+ a one-character separator is delegated to stringlib_split_char. The
+ text after the last split is always added as the final item. str_obj
+ is the object the buffer was taken from; it is only used by the
+ no-match shortcut. Returns NULL if the list or an item cannot be
+ created or appended. */
+Py_LOCAL_INLINE(PyObject *)
+stringlib_split(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
+ Py_ssize_t maxcount)
+{
+ Py_ssize_t i, j, pos, count=0;
+ PyObject *list, *sub;
+
+ if (sep_len == 0) {
+ PyErr_SetString(PyExc_ValueError, "empty separator");
+ return NULL;
+ }
+ else if (sep_len == 1)
+ return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
+
+ list = PyList_New(PREALLOC_SIZE(maxcount));
+ if (list == NULL)
+ return NULL;
+
+ i = j = 0; /* i = start of the current piece */
+ while (maxcount-- > 0) {
+ pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
+ if (pos < 0)
+ break;
+ j = i + pos; /* pos is relative to str+i */
+ SPLIT_ADD(str, i, j);
+ i = j + sep_len; /* continue after the separator */
+ }
+#ifndef STRINGLIB_MUTABLE
+ if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
+ /* No match in str_obj (or maxcount was 0), so just use it as
+ list[0] */
+ Py_INCREF(str_obj);
+ PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
+ count++;
+ } else
+#endif
+ {
+ SPLIT_ADD(str, i, str_len); /* remainder after the last split */
+ }
+ FIX_PREALLOC_SIZE(list);
+ return list;
+
+ onError: /* target of the gotos inside SPLIT_ADD */
+ Py_DECREF(list);
+ return NULL;
+}
+/* Like stringlib_split_whitespace, but scanning from the end of
+ str[0:str_len]: at most maxcount pieces are split off the right-hand
+ side, and whatever is left at the start of the string (minus its
+ trailing whitespace) becomes one more item. Items are collected from
+ right to left and the list is reversed before it is returned. Returns
+ NULL if the list or an item cannot be created or appended, or if
+ reversing the list fails. */
+Py_LOCAL_INLINE(PyObject *)
+stringlib_rsplit_whitespace(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ Py_ssize_t maxcount)
+{
+ Py_ssize_t i, j, count=0;
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+ PyObject *sub;
+
+ if (list == NULL)
+ return NULL;
+
+ i = j = str_len - 1;
+ while (maxcount-- > 0) {
+ while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
+ i--;
+ if (i < 0) break; /* only whitespace was left */
+ j = i; i--; /* j = last character of the piece; move i before it */
+ while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
+ i--;
+#ifndef STRINGLIB_MUTABLE
+ if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
+ /* No whitespace in str_obj, so just use it as list[0]
+ (a new reference to str_obj itself instead of a copy) */
+ Py_INCREF(str_obj);
+ PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
+ count++;
+ break;
+ }
+#endif
+ SPLIT_ADD(str, i + 1, j + 1);
+ }
+
+ if (i >= 0) {
+ /* Only occurs when maxcount was reached */
+ /* Skip any remaining whitespace and copy to beginning of string */
+ while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
+ i--;
+ if (i >= 0)
+ SPLIT_ADD(str, 0, i + 1);
+ }
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0) /* items were stored right to left */
+ goto onError;
+ return list;
+
+ onError: /* also the target of the gotos inside SPLIT_ADD */
+ Py_DECREF(list);
+ return NULL;
+}
+/* Like stringlib_split_char, but scanning from the end of str[0:str_len]:
+ at most maxcount splits are made at the right-most occurrences of ch,
+ and the text before the last split is added as one more item. Items
+ are collected from right to left and the list is reversed before it is
+ returned. Returns NULL if the list or an item cannot be created or
+ appended, or if reversing the list fails. */
+Py_LOCAL_INLINE(PyObject *)
+stringlib_rsplit_char(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ const STRINGLIB_CHAR ch,
+ Py_ssize_t maxcount)
+{
+ Py_ssize_t i, j, count=0;
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+ PyObject *sub;
+
+ if (list == NULL)
+ return NULL;
+
+ i = j = str_len - 1; /* i = scan position, j = last char of the piece */
+ while ((i >= 0) && (maxcount-- > 0)) {
+ for(; i >= 0; i--) {
+ if (str[i] == ch) {
+ SPLIT_ADD(str, i + 1, j + 1);
+ j = i = i - 1; /* next piece ends just before the separator */
+ break;
+ }
+ }
+ }
+#ifndef STRINGLIB_MUTABLE
+ if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
+ /* No split was made (ch not in str_obj, or maxcount was 0), so
+ the whole string is the only item: just use str_obj as list[0] */
+ Py_INCREF(str_obj);
+ PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
+ count++;
+ } else
+#endif
+ if (j >= -1) {
+ SPLIT_ADD(str, 0, j + 1); /* text before the last split */
+ }
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0) /* items were stored right to left */
+ goto onError;
+ return list;
+
+ onError: /* also the target of the gotos inside SPLIT_ADD */
+ Py_DECREF(list);
+ return NULL;
+}
+/* Like stringlib_split, but searching from the end of str[0:str_len]: at
+ most maxcount splits are made at the right-most occurrences of
+ sep[0:sep_len], and the text before the last split is added as one more
+ item. An empty separator sets ValueError("empty separator") and
+ returns NULL; a one-character separator is delegated to
+ stringlib_rsplit_char. Items are collected from right to left and the
+ list is reversed before it is returned. Returns NULL if the list or an
+ item cannot be created or appended, or if reversing the list fails. */
+Py_LOCAL_INLINE(PyObject *)
+stringlib_rsplit(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
+ Py_ssize_t maxcount)
+{
+ Py_ssize_t j, pos, count=0;
+ PyObject *list, *sub;
+
+ if (sep_len == 0) {
+ PyErr_SetString(PyExc_ValueError, "empty separator");
+ return NULL;
+ }
+ else if (sep_len == 1)
+ return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
+
+ list = PyList_New(PREALLOC_SIZE(maxcount));
+ if (list == NULL)
+ return NULL;
+
+ j = str_len; /* end (exclusive) of the part not yet split */
+ while (maxcount-- > 0) {
+ pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
+ if (pos < 0)
+ break;
+ SPLIT_ADD(str, pos + sep_len, j);
+ j = pos; /* keep searching to the left of this match */
+ }
+#ifndef STRINGLIB_MUTABLE
+ if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
+ /* No match in str_obj (or maxcount was 0), so just use it as
+ list[0] */
+ Py_INCREF(str_obj);
+ PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
+ count++;
+ } else
+#endif
+ {
+ SPLIT_ADD(str, 0, j); /* text before the last split */
+ }
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0) /* items were stored right to left */
+ goto onError;
+ return list;
+
+ onError: /* also the target of the gotos inside SPLIT_ADD */
+ Py_DECREF(list);
+ return NULL;
+}
+/* Split str[0:str_len] into lines at the characters accepted by
+ STRINGLIB_ISLINEBREAK and return them as a new list. A "\r\n" pair
+ counts as a single line break. The line break is kept at the end of
+ each item only when keepends is non-zero. An empty string gives an
+ empty list, and text after the last line break is added as a final
+ line. str_obj is the object the buffer was taken from; it is only used
+ by the single-line shortcut. Returns NULL if the list or an item
+ cannot be created or appended. */
+Py_LOCAL_INLINE(PyObject *)
+stringlib_splitlines(PyObject* str_obj,
+ const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+ int keepends)
+{
+ /* This does not use the preallocated list because splitlines is
+ usually run with hundreds of newlines. The overhead of
+ switching between PyList_SET_ITEM and append causes about a
+ 2-3% slowdown for that common case. A smarter implementation
+ could move the if check out, so the SET_ITEMs are done first
+ and the appends only done when the prealloc buffer is full.
+ That's too much work for little gain.*/
+
+ register Py_ssize_t i; /* scan position */
+ register Py_ssize_t j; /* start of the current line */
+ PyObject *list = PyList_New(0);
+ PyObject *sub;
+
+ if (list == NULL)
+ return NULL;
+
+ for (i = j = 0; i < str_len; ) {
+ Py_ssize_t eol; /* end (exclusive) of the text stored for this line */
+
+ /* Find a line and append it */
+ while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
+ i++;
+
+ /* Skip the line break reading CRLF as one line break */
+ eol = i;
+ if (i < str_len) {
+ if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
+ i += 2;
+ else
+ i++;
+ if (keepends)
+ eol = i;
+ }
+#ifndef STRINGLIB_MUTABLE
+ if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
+ /* The first line covers all of str_obj (no linebreak, or with
+ keepends a single linebreak at the very end), so just use
+ it as list[0] */
+ if (PyList_Append(list, str_obj))
+ goto onError;
+ break;
+ }
+#endif
+ SPLIT_APPEND(str, j, eol);
+ j = i;
+ }
+ return list;
+
+ onError: /* also the target of the gotos inside SPLIT_APPEND */
+ Py_DECREF(list);
+ return NULL;
+}
+
+#endif
diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h
index 4a95258..84e4616 100644
--- a/Objects/stringlib/stringdefs.h
+++ b/Objects/stringlib/stringdefs.h
@@ -11,6 +11,8 @@
#define STRINGLIB_TYPE_NAME "string"
#define STRINGLIB_PARSE_CODE "S"
#define STRINGLIB_EMPTY nullstring
+#define STRINGLIB_ISSPACE Py_ISSPACE
+#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r'))
#define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9'))
#define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1)
#define STRINGLIB_TOUPPER Py_TOUPPER
@@ -21,8 +23,11 @@
#define STRINGLIB_NEW PyString_FromStringAndSize
#define STRINGLIB_RESIZE _PyString_Resize
#define STRINGLIB_CHECK PyString_Check
+#define STRINGLIB_CHECK_EXACT PyString_CheckExact
#define STRINGLIB_TOSTR PyObject_Str
#define STRINGLIB_GROUPING _PyString_InsertThousandsGrouping
#define STRINGLIB_GROUPING_LOCALE _PyString_InsertThousandsGroupingLocale
+#define STRINGLIB_WANT_CONTAINS_OBJ 1
+
#endif /* !STRINGLIB_STRINGDEFS_H */
diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h
index 7dc8177..1e132e5 100644
--- a/Objects/stringlib/transmogrify.h
+++ b/Objects/stringlib/transmogrify.h
@@ -1,13 +1,6 @@
/* NOTE: this API is -ONLY- for use with single byte character strings. */
/* Do not use it with Unicode. */
-#include "bytes_methods.h"
-
-#ifndef STRINGLIB_MUTABLE
-#warning "STRINGLIB_MUTABLE not defined before #include, assuming 0"
-#define STRINGLIB_MUTABLE 0
-#endif
-
/* the more complicated methods. parts of these should be pulled out into the
shared code in bytes_methods.c to cut down on duplicate code bloat. */
@@ -269,87 +262,3 @@ stringlib_zfill(PyObject *self, PyObject *args)
return (PyObject*) s;
}
-
-
-#define _STRINGLIB_SPLIT_APPEND(data, left, right) \
- str = STRINGLIB_NEW((data) + (left), \
- (right) - (left)); \
- if (str == NULL) \
- goto onError; \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
-PyDoc_STRVAR(splitlines__doc__,
-"B.splitlines([keepends]) -> list of lines\n\
-\n\
-Return a list of the lines in B, breaking at line boundaries.\n\
-Line breaks are not included in the resulting list unless keepends\n\
-is given and true.");
-
-static PyObject*
-stringlib_splitlines(PyObject *self, PyObject *args)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len;
- int keepends = 0;
- PyObject *list;
- PyObject *str;
- char *data;
-
- if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
- return NULL;
-
- data = STRINGLIB_STR(self);
- len = STRINGLIB_LEN(self);
-
- /* This does not use the preallocated list because splitlines is
- usually run with hundreds of newlines. The overhead of
- switching between PyList_SET_ITEM and append causes about a
- 2-3% slowdown for that common case. A smarter implementation
- could move the if check out, so the SET_ITEMs are done first
- and the appends only done when the prealloc buffer is full.
- That's too much work for little gain.*/
-
- list = PyList_New(0);
- if (!list)
- goto onError;
-
- for (i = j = 0; i < len; ) {
- Py_ssize_t eol;
-
- /* Find a line and append it */
- while (i < len && data[i] != '\n' && data[i] != '\r')
- i++;
-
- /* Skip the line break reading CRLF as one line break */
- eol = i;
- if (i < len) {
- if (data[i] == '\r' && i + 1 < len &&
- data[i+1] == '\n')
- i += 2;
- else
- i++;
- if (keepends)
- eol = i;
- }
- _STRINGLIB_SPLIT_APPEND(data, j, eol);
- j = i;
- }
- if (j < len) {
- _STRINGLIB_SPLIT_APPEND(data, j, len);
- }
-
- return list;
-
- onError:
- Py_XDECREF(list);
- return NULL;
-}
-
-#undef _STRINGLIB_SPLIT_APPEND
-
diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h
index 524781f..dd814f6 100644
--- a/Objects/stringlib/unicodedefs.h
+++ b/Objects/stringlib/unicodedefs.h
@@ -11,6 +11,8 @@
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_EMPTY unicode_empty
+#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
+#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
#define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER
@@ -21,6 +23,7 @@
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_RESIZE PyUnicode_Resize
#define STRINGLIB_CHECK PyUnicode_Check
+#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
#define STRINGLIB_GROUPING _PyUnicode_InsertThousandsGrouping
#if PY_VERSION_HEX < 0x03000000
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index 0f3874e..43ef3fa 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -841,6 +841,7 @@ PyString_AsStringAndSize(register PyObject *obj,
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
+#include "stringlib/split.h"
#define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
#include "stringlib/localeutil.h"
@@ -1425,145 +1426,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
#define STRIPNAME(i) (stripformat[i]+3)
-
-/* Don't call if length < 2 */
-#define Py_STRING_MATCH(target, offset, pattern, length) \
- (target[offset] == pattern[0] && \
- target[offset+length-1] == pattern[length-1] && \
- !memcmp(target+offset+1, pattern+1, length-2) )
-
-
-/* Overallocate the initial list to reduce the number of reallocs for small
- split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
- resizes, to sizes 4, 8, then 16. Most observed string splits are for human
- text (roughly 11 words per line) and field delimited data (usually 1-10
- fields). For large strings the split algorithms are bandwidth limited
- so increasing the preallocation likely will not improve things.*/
-
-#define MAX_PREALLOC 12
-
-/* 5 splits gives 6 elements */
-#define PREALLOC_SIZE(maxsplit) \
- (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
-
-#define SPLIT_APPEND(data, left, right) \
- str = PyString_FromStringAndSize((data) + (left), \
- (right) - (left)); \
- if (str == NULL) \
- goto onError; \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
-#define SPLIT_ADD(data, left, right) { \
- str = PyString_FromStringAndSize((data) + (left), \
- (right) - (left)); \
- if (str == NULL) \
- goto onError; \
- if (count < MAX_PREALLOC) { \
- PyList_SET_ITEM(list, count, str); \
- } else { \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str); \
- } \
- count++; }
-
-/* Always force the list to the expected size. */
-#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
-
-#define SKIP_SPACE(s, i, len) { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; }
-#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; }
-#define RSKIP_SPACE(s, i) { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; }
-#define RSKIP_NONSPACE(s, i) { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; }
-
-Py_LOCAL_INLINE(PyObject *)
-split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
-{
- const char *s = PyString_AS_STRING(self);
- Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
-
- if (list == NULL)
- return NULL;
-
- i = j = 0;
-
- while (maxsplit-- > 0) {
- SKIP_SPACE(s, i, len);
- if (i==len) break;
- j = i; i++;
- SKIP_NONSPACE(s, i, len);
- if (j == 0 && i == len && PyString_CheckExact(self)) {
- /* No whitespace in self, so just use it as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- break;
- }
- SPLIT_ADD(s, j, i);
- }
-
- if (i < len) {
- /* Only occurs when maxsplit was reached */
- /* Skip any remaining whitespace and copy to end of string */
- SKIP_SPACE(s, i, len);
- if (i != len)
- SPLIT_ADD(s, i, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-Py_LOCAL_INLINE(PyObject *)
-split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
-{
- const char *s = PyString_AS_STRING(self);
- register Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- i = j = 0;
- while ((j < len) && (maxcount-- > 0)) {
- for(; j<len; j++) {
- /* I found that using memchr makes no difference */
- if (s[j] == ch) {
- SPLIT_ADD(s, i, j);
- i = j = j + 1;
- break;
- }
- }
- }
- if (i == 0 && count == 0 && PyString_CheckExact(self)) {
- /* ch not in self, so just use self as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- }
- else if (i <= len) {
- SPLIT_ADD(s, i, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
PyDoc_STRVAR(split__doc__,
"S.split([sep [,maxsplit]]) -> list of strings\n\
\n\
@@ -1576,17 +1438,17 @@ from the result.");
static PyObject *
string_split(PyStringObject *self, PyObject *args)
{
- Py_ssize_t len = PyString_GET_SIZE(self), n, i, j, pos;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t len = PyString_GET_SIZE(self), n;
+ Py_ssize_t maxsplit = -1;
const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj = Py_None;
+ PyObject *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
- return split_whitespace(self, len, maxsplit);
+ return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
@@ -1598,33 +1460,7 @@ string_split(PyStringObject *self, PyObject *args)
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else if (n == 1)
- return split_char(self, len, sub[0], maxsplit);
-
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
-
- i = j = 0;
- while (maxsplit-- > 0) {
- pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
- if (pos < 0)
- break;
- j = i + pos;
- SPLIT_ADD(s, i, j);
- i = j + n;
- }
- SPLIT_ADD(s, i, len);
- FIX_PREALLOC_SIZE(list);
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
+ return stringlib_split((PyObject*) self, s, len, sub, n, maxsplit);
}
PyDoc_STRVAR(partition__doc__,
@@ -1689,90 +1525,6 @@ string_rpartition(PyStringObject *self, PyObject *sep_obj)
);
}
-Py_LOCAL_INLINE(PyObject *)
-rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
-{
- const char *s = PyString_AS_STRING(self);
- Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
-
- if (list == NULL)
- return NULL;
-
- i = j = len-1;
-
- while (maxsplit-- > 0) {
- RSKIP_SPACE(s, i);
- if (i<0) break;
- j = i; i--;
- RSKIP_NONSPACE(s, i);
- if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
- /* No whitespace in self, so just use it as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- break;
- }
- SPLIT_ADD(s, i + 1, j + 1);
- }
- if (i >= 0) {
- /* Only occurs when maxsplit was reached */
- /* Skip any remaining whitespace and copy to beginning of string */
- RSKIP_SPACE(s, i);
- if (i >= 0)
- SPLIT_ADD(s, 0, i + 1);
-
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-Py_LOCAL_INLINE(PyObject *)
-rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
-{
- const char *s = PyString_AS_STRING(self);
- register Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- i = j = len - 1;
- while ((i >= 0) && (maxcount-- > 0)) {
- for (; i >= 0; i--) {
- if (s[i] == ch) {
- SPLIT_ADD(s, i + 1, j + 1);
- j = i = i - 1;
- break;
- }
- }
- }
- if (i < 0 && count == 0 && PyString_CheckExact(self)) {
- /* ch not in self, so just use self as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- }
- else if (j >= -1) {
- SPLIT_ADD(s, 0, j + 1);
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
PyDoc_STRVAR(rsplit__doc__,
"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
\n\
@@ -1785,17 +1537,17 @@ is a separator.");
static PyObject *
string_rsplit(PyStringObject *self, PyObject *args)
{
- Py_ssize_t len = PyString_GET_SIZE(self), n, j, pos;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t len = PyString_GET_SIZE(self), n;
+ Py_ssize_t maxsplit = -1;
const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj = Py_None;
+ PyObject *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
- return rsplit_whitespace(self, len, maxsplit);
+ return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
@@ -1807,35 +1559,7 @@ string_rsplit(PyStringObject *self, PyObject *args)
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else if (n == 1)
- return rsplit_char(self, len, sub[0], maxsplit);
-
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
-
- j = len;
-
- while (maxsplit-- > 0) {
- pos = fastsearch(s, j, sub, n, FAST_RSEARCH);
- if (pos < 0)
- break;
- SPLIT_ADD(s, pos + n, j);
- j = pos;
- }
- SPLIT_ADD(s, 0, j);
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
-onError:
- Py_DECREF(list);
- return NULL;
+ return stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit);
}
@@ -1950,20 +1674,20 @@ _PyString_Join(PyObject *sep, PyObject *x)
return string_join((PyStringObject *)sep, x);
}
-Py_LOCAL_INLINE(void)
-string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
-{
- if (*end > len)
- *end = len;
- else if (*end < 0)
- *end += len;
- if (*end < 0)
- *end = 0;
- if (*start < 0)
- *start += len;
- if (*start < 0)
- *start = 0;
-}
+/* helper macro to fix up start/end slice values: an end beyond len is
+ set to len, and len is added to a negative start or end, with the
+ result clamped at 0. start and end are assigned to, so they must be
+ lvalues; len is expanded several times.
+ NOTE(review): expands to bare if statements without an enclosing block,
+ so it is only safe as a complete statement, not as the unbraced body of
+ an if/else or a loop. */
+#define ADJUST_INDICES(start, end, len) \
+ if (end > len) \
+ end = len; \
+ else if (end < 0) { \
+ end += len; \
+ if (end < 0) \
+ end = 0; \
+ } \
+ if (start < 0) { \
+ start += len; \
+ if (start < 0) \
+ start = 0; \
+ }
Py_LOCAL_INLINE(Py_ssize_t)
string_find_internal(PyStringObject *self, PyObject *args, int dir)
@@ -2417,10 +2141,10 @@ string_count(PyStringObject *self, PyObject *args)
else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
return NULL;
- string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
+ ADJUST_INDICES(start, end, PyString_GET_SIZE(self));
return PyInt_FromSsize_t(
- stringlib_count(str + start, end - start, sub, sub_len)
+ stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
);
}
@@ -2583,9 +2307,6 @@ string_translate(PyStringObject *self, PyObject *args)
}
-#define FORWARD 1
-#define REVERSE -1
-
/* find and count characters and substrings */
#define findchar(target, target_len, c) \
@@ -2621,93 +2342,6 @@ countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
return count;
}
-Py_LOCAL(Py_ssize_t)
-findstring(const char *target, Py_ssize_t target_len,
- const char *pattern, Py_ssize_t pattern_len,
- Py_ssize_t start,
- Py_ssize_t end,
- int direction)
-{
- if (start < 0) {
- start += target_len;
- if (start < 0)
- start = 0;
- }
- if (end > target_len) {
- end = target_len;
- } else if (end < 0) {
- end += target_len;
- if (end < 0)
- end = 0;
- }
-
- /* zero-length substrings always match at the first attempt */
- if (pattern_len == 0)
- return (direction > 0) ? start : end;
-
- end -= pattern_len;
-
- if (direction < 0) {
- for (; end >= start; end--)
- if (Py_STRING_MATCH(target, end, pattern, pattern_len))
- return end;
- } else {
- for (; start <= end; start++)
- if (Py_STRING_MATCH(target, start, pattern, pattern_len))
- return start;
- }
- return -1;
-}
-
-Py_LOCAL_INLINE(Py_ssize_t)
-countstring(const char *target, Py_ssize_t target_len,
- const char *pattern, Py_ssize_t pattern_len,
- Py_ssize_t start,
- Py_ssize_t end,
- int direction, Py_ssize_t maxcount)
-{
- Py_ssize_t count=0;
-
- if (start < 0) {
- start += target_len;
- if (start < 0)
- start = 0;
- }
- if (end > target_len) {
- end = target_len;
- } else if (end < 0) {
- end += target_len;
- if (end < 0)
- end = 0;
- }
-
- /* zero-length substrings match everywhere */
- if (pattern_len == 0 || maxcount == 0) {
- if (target_len+1 < maxcount)
- return target_len+1;
- return maxcount;
- }
-
- end -= pattern_len;
- if (direction < 0) {
- for (; (end >= start); end--)
- if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
- count++;
- if (--maxcount <= 0) break;
- end -= pattern_len-1;
- }
- } else {
- for (; (start <= end); start++)
- if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
- count++;
- if (--maxcount <= 0)
- break;
- start += pattern_len-1;
- }
- }
- return count;
-}
-
/* Algorithms for different cases of string replacement */
@@ -2828,10 +2462,9 @@ replace_delete_substring(PyStringObject *self,
self_len = PyString_GET_SIZE(self);
self_s = PyString_AS_STRING(self);
- count = countstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, 1,
- maxcount);
+ count = stringlib_count(self_s, self_len,
+ from_s, from_len,
+ maxcount);
if (count == 0) {
/* no matches */
@@ -2850,9 +2483,9 @@ replace_delete_substring(PyStringObject *self,
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset == -1)
break;
next = start + offset;
@@ -2928,9 +2561,9 @@ replace_substring_in_place(PyStringObject *self,
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
- offset = findstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, FORWARD);
+ offset = stringlib_find(self_s, self_len,
+ from_s, from_len,
+ 0);
if (offset == -1) {
/* No matches; return the original string */
return return_self(self);
@@ -2950,9 +2583,9 @@ replace_substring_in_place(PyStringObject *self,
end = result_s + self_len;
while ( --maxcount > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset==-1)
break;
Py_MEMCPY(start+offset, to_s, from_len);
@@ -3044,9 +2677,10 @@ replace_substring(PyStringObject *self,
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
- count = countstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, FORWARD, maxcount);
+ count = stringlib_count(self_s, self_len,
+ from_s, from_len,
+ maxcount);
+
if (count == 0) {
/* no matches, return unchanged */
return return_self(self);
@@ -3073,9 +2707,9 @@ replace_substring(PyStringObject *self,
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset == -1)
break;
next = start+offset;
@@ -3245,7 +2879,7 @@ _string_tailmatch(PyStringObject *self, PyObject *substr, Py_ssize_t start,
return -1;
str = PyString_AS_STRING(self);
- string_adjust_indices(&start, &end, len);
+ ADJUST_INDICES(start, end, len);
if (direction < 0) {
/* startswith */
@@ -3913,62 +3547,15 @@ is given and true.");
static PyObject*
string_splitlines(PyStringObject *self, PyObject *args)
{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len;
int keepends = 0;
- PyObject *list;
- PyObject *str;
- char *data;
if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
return NULL;
- data = PyString_AS_STRING(self);
- len = PyString_GET_SIZE(self);
-
- /* This does not use the preallocated list because splitlines is
- usually run with hundreds of newlines. The overhead of
- switching between PyList_SET_ITEM and append causes about a
- 2-3% slowdown for that common case. A smarter implementation
- could move the if check out, so the SET_ITEMs are done first
- and the appends only done when the prealloc buffer is full.
- That's too much work for little gain.*/
-
- list = PyList_New(0);
- if (!list)
- goto onError;
-
- for (i = j = 0; i < len; ) {
- Py_ssize_t eol;
-
- /* Find a line and append it */
- while (i < len && data[i] != '\n' && data[i] != '\r')
- i++;
-
- /* Skip the line break reading CRLF as one line break */
- eol = i;
- if (i < len) {
- if (data[i] == '\r' && i + 1 < len &&
- data[i+1] == '\n')
- i += 2;
- else
- i++;
- if (keepends)
- eol = i;
- }
- SPLIT_APPEND(data, j, eol);
- j = i;
- }
- if (j < len) {
- SPLIT_APPEND(data, j, len);
- }
-
- return list;
-
- onError:
- Py_XDECREF(list);
- return NULL;
+ return stringlib_splitlines(
+ (PyObject*) self, PyString_AS_STRING(self), PyString_GET_SIZE(self),
+ keepends
+ );
}
PyDoc_STRVAR(sizeof__doc__,
@@ -3982,11 +3569,6 @@ string_sizeof(PyStringObject *v)
return PyInt_FromSsize_t(res);
}
-#undef SPLIT_APPEND
-#undef SPLIT_ADD
-#undef MAX_PREALLOC
-#undef PREALLOC_SIZE
-
static PyObject *
string_getnewargs(PyStringObject *v)
{
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 79e824e..b7874f1 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -194,7 +194,8 @@ PyUnicode_GetMax(void)
static BLOOM_MASK bloom_linebreak;
-#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
+#define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1)))))
+#define BLOOM(mask, ch) ((mask & (1 << ((ch) & (LONG_BIT - 1)))))
#define BLOOM_LINEBREAK(ch) \
((ch) < 128U ? ascii_linebreak[(ch)] : \
@@ -209,7 +210,7 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
mask = 0;
for (i = 0; i < len; i++)
- mask |= (1 << (ptr[i] & 0x1F));
+ BLOOM_ADD(mask, ptr[i]);
return mask;
}
@@ -5245,27 +5246,27 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
/* --- Helpers ------------------------------------------------------------ */
#include "stringlib/unicodedefs.h"
-
-#define FROM_UNICODE
-
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
+#include "stringlib/split.h"
/* helper macro to fixup start/end slice values */
-#define FIX_START_END(obj) \
- if (start < 0) \
- start += (obj)->length; \
- if (start < 0) \
- start = 0; \
- if (end > (obj)->length) \
- end = (obj)->length; \
- if (end < 0) \
- end += (obj)->length; \
- if (end < 0) \
- end = 0;
+/* end is capped at len; a negative end or start has len added and is then
+   floored at 0.  The body is wrapped in do { } while (0) so the expansion
+   is a single statement and stays correct after an unbraced if/else. */
+#define ADJUST_INDICES(start, end, len) \
+    do { \
+        if ((end) > (len)) \
+            (end) = (len); \
+        else if ((end) < 0) { \
+            (end) += (len); \
+            if ((end) < 0) \
+                (end) = 0; \
+        } \
+        if ((start) < 0) { \
+            (start) += (len); \
+            if ((start) < 0) \
+                (start) = 0; \
+        } \
+    } while (0)
Py_ssize_t PyUnicode_Count(PyObject *str,
PyObject *substr,
@@ -5285,10 +5286,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str,
return -1;
}
- FIX_START_END(str_obj);
-
+ ADJUST_INDICES(start, end, str_obj->length);
result = stringlib_count(
- str_obj->str + start, end - start, sub_obj->str, sub_obj->length
+ str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
+ PY_SSIZE_T_MAX
);
Py_DECREF(sub_obj);
@@ -5343,8 +5344,7 @@ int tailmatch(PyUnicodeObject *self,
if (substring->length == 0)
return 1;
- FIX_START_END(self);
-
+ ADJUST_INDICES(start, end, self->length);
end -= substring->length;
if (end < start)
return 0;
@@ -5721,305 +5721,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
return u;
}
-#define SPLIT_APPEND(data, left, right) \
- str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
- if (!str) \
- goto onError; \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
-static
-PyObject *split_whitespace(PyUnicodeObject *self,
- PyObject *list,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = 0; i < len; ) {
- /* find a token */
- while (i < len && Py_UNICODE_ISSPACE(buf[i]))
- i++;
- j = i;
- while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
- i++;
- if (j < i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, j, i);
- while (i < len && Py_UNICODE_ISSPACE(buf[i]))
- i++;
- j = i;
- }
- }
- if (j < len) {
- SPLIT_APPEND(buf, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-PyObject *PyUnicode_Splitlines(PyObject *string,
- int keepends)
+PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len;
PyObject *list;
- PyObject *str;
- Py_UNICODE *data;
string = PyUnicode_FromObject(string);
if (string == NULL)
return NULL;
- data = PyUnicode_AS_UNICODE(string);
- len = PyUnicode_GET_SIZE(string);
-
- list = PyList_New(0);
- if (!list)
- goto onError;
- for (i = j = 0; i < len; ) {
- Py_ssize_t eol;
+ list = stringlib_splitlines(
+ (PyObject*) string, PyUnicode_AS_UNICODE(string),
+ PyUnicode_GET_SIZE(string), keepends);
- /* Find a line and append it */
- while (i < len && !BLOOM_LINEBREAK(data[i]))
- i++;
-
- /* Skip the line break reading CRLF as one line break */
- eol = i;
- if (i < len) {
- if (data[i] == '\r' && i + 1 < len &&
- data[i+1] == '\n')
- i += 2;
- else
- i++;
- if (keepends)
- eol = i;
- }
- SPLIT_APPEND(data, j, eol);
- j = i;
- }
- if (j < len) {
- SPLIT_APPEND(data, j, len);
- }
-
- Py_DECREF(string);
- return list;
-
- onError:
- Py_XDECREF(list);
Py_DECREF(string);
- return NULL;
-}
-
-static
-PyObject *split_char(PyUnicodeObject *self,
- PyObject *list,
- Py_UNICODE ch,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = 0; i < len; ) {
- if (buf[i] == ch) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, j, i);
- i = j = i + 1;
- } else
- i++;
- }
- if (j <= len) {
- SPLIT_APPEND(buf, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *split_substring(PyUnicodeObject *self,
- PyObject *list,
- PyUnicodeObject *substring,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- Py_ssize_t sublen = substring->length;
- PyObject *str;
-
- for (i = j = 0; i <= len - sublen; ) {
- if (Py_UNICODE_MATCH(self, i, substring)) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(self->str, j, i);
- i = j = i + sublen;
- } else
- i++;
- }
- if (j <= len) {
- SPLIT_APPEND(self->str, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_whitespace(PyUnicodeObject *self,
- PyObject *list,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = len - 1; i >= 0; ) {
- /* find a token */
- while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
- i--;
- j = i;
- while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
- i--;
- if (j > i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, i + 1, j + 1);
- while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
- i--;
- j = i;
- }
- }
- if (j >= 0) {
- SPLIT_APPEND(buf, 0, j + 1);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_char(PyUnicodeObject *self,
- PyObject *list,
- Py_UNICODE ch,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = len - 1; i >= 0; ) {
- if (buf[i] == ch) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, i + 1, j + 1);
- j = i = i - 1;
- } else
- i--;
- }
- if (j >= -1) {
- SPLIT_APPEND(buf, 0, j + 1);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
}
static
-PyObject *rsplit_substring(PyUnicodeObject *self,
- PyObject *list,
- PyUnicodeObject *substring,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- Py_ssize_t sublen = substring->length;
- PyObject *str;
-
- for (i = len - sublen, j = len; i >= 0; ) {
- if (Py_UNICODE_MATCH(self, i, substring)) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(self->str, i + sublen, j);
- j = i;
- i -= sublen;
- } else
- i--;
- }
- if (j >= 0) {
- SPLIT_APPEND(self->str, 0, j);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-#undef SPLIT_APPEND
-
-static
PyObject *split(PyUnicodeObject *self,
PyUnicodeObject *substring,
Py_ssize_t maxcount)
{
- PyObject *list;
-
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
- list = PyList_New(0);
- if (!list)
- return NULL;
-
if (substring == NULL)
- return split_whitespace(self,list,maxcount);
-
- else if (substring->length == 1)
- return split_char(self,list,substring->str[0],maxcount);
+ return stringlib_split_whitespace(
+ (PyObject*) self, self->str, self->length, maxcount
+ );
- else if (substring->length == 0) {
- Py_DECREF(list);
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else
- return split_substring(self,list,substring,maxcount);
+ return stringlib_split(
+ (PyObject*) self, self->str, self->length,
+ substring->str, substring->length,
+ maxcount
+ );
}
static
@@ -6027,28 +5762,19 @@ PyObject *rsplit(PyUnicodeObject *self,
PyUnicodeObject *substring,
Py_ssize_t maxcount)
{
- PyObject *list;
-
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
- list = PyList_New(0);
- if (!list)
- return NULL;
-
if (substring == NULL)
- return rsplit_whitespace(self,list,maxcount);
-
- else if (substring->length == 1)
- return rsplit_char(self,list,substring->str[0],maxcount);
+ return stringlib_rsplit_whitespace(
+ (PyObject*) self, self->str, self->length, maxcount
+ );
- else if (substring->length == 0) {
- Py_DECREF(list);
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else
- return rsplit_substring(self,list,substring,maxcount);
+ return stringlib_rsplit(
+ (PyObject*) self, self->str, self->length,
+ substring->str, substring->length,
+ maxcount
+ );
}
static
@@ -6061,9 +5787,13 @@ PyObject *replace(PyUnicodeObject *self,
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
+ else if (maxcount == 0 || self->length == 0)
+ goto nothing;
if (str1->length == str2->length) {
/* same length */
Py_ssize_t i;
+ /* equal lengths and an empty pattern means both strings are empty:
+    jump to the `nothing` label.  Placed after the declaration so the
+    block remains valid C89 (no statement before a declaration). */
+ if (str1->length == 0)
+ goto nothing;
if (str1->length == 1) {
/* replace characters */
@@ -6083,8 +5813,8 @@ PyObject *replace(PyUnicodeObject *self,
u->str[i] = u2;
}
} else {
- i = fastsearch(
- self->str, self->length, str1->str, str1->length, FAST_SEARCH
+ i = stringlib_find(
+ self->str, self->length, str1->str, str1->length, 0
);
if (i < 0)
goto nothing;
@@ -6092,14 +5822,20 @@ PyObject *replace(PyUnicodeObject *self,
if (!u)
return NULL;
Py_UNICODE_COPY(u->str, self->str, self->length);
- while (i <= self->length - str1->length)
- if (Py_UNICODE_MATCH(self, i, str1)) {
- if (--maxcount < 0)
- break;
- Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
- i += str1->length;
- } else
- i++;
+
+ /* change everything in-place, starting with this one */
+ Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+ i += str1->length;
+
+ while ( --maxcount > 0) {
+ i = stringlib_find(self->str+i, self->length-i,
+ str1->str, str1->length,
+ i);
+ if (i == -1)
+ break;
+ Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+ i += str1->length;
+ }
}
} else {
@@ -6108,9 +5844,8 @@ PyObject *replace(PyUnicodeObject *self,
Py_UNICODE *p;
/* replace strings */
- n = stringlib_count(self->str, self->length, str1->str, str1->length);
- if (n > maxcount)
- n = maxcount;
+ n = stringlib_count(self->str, self->length, str1->str, str1->length,
+ maxcount);
if (n == 0)
goto nothing;
/* new_size = self->length + n * (str2->length - str1->length)); */
@@ -6140,15 +5875,12 @@ PyObject *replace(PyUnicodeObject *self,
if (str1->length > 0) {
while (n-- > 0) {
/* look for next match */
- j = i;
- while (j <= e) {
- if (Py_UNICODE_MATCH(self, j, str1))
- break;
- j++;
- }
- if (j > i) {
- if (j > e)
- break;
+ j = stringlib_find(self->str+i, self->length-i,
+ str1->str, str1->length,
+ i);
+ if (j == -1)
+ break;
+ else if (j > i) {
/* copy unchanged part [i:j] */
Py_UNICODE_COPY(p, self->str+i, j-i);
p += j - i;
@@ -6585,11 +6317,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
if (substring == NULL)
return NULL;
- FIX_START_END(self);
-
+ ADJUST_INDICES(start, end, self->length);
result = PyInt_FromSsize_t(
stringlib_count(self->str + start, end - start,
- substring->str, substring->length)
+ substring->str, substring->length,
+ PY_SSIZE_T_MAX)
);
Py_DECREF(substring);
@@ -9132,11 +8864,3 @@ _PyUnicode_Fini(void)
#ifdef __cplusplus
}
#endif
-
-
-/*
- Local variables:
- c-basic-offset: 4
- indent-tabs-mode: nil
- End:
-*/
diff --git a/PC/VS8.0/pythoncore.vcproj b/PC/VS8.0/pythoncore.vcproj
index ad48cee..31c7992 100644
--- a/PC/VS8.0/pythoncore.vcproj
+++ b/PC/VS8.0/pythoncore.vcproj
@@ -1539,6 +1539,10 @@
>
</File>
<File
+ RelativePath="..\..\Objects\stringlib\split.h"
+ >
+ </File>
+ <File
RelativePath="..\..\Objects\structseq.c"
>
</File>
diff --git a/PCbuild/pythoncore.vcproj b/PCbuild/pythoncore.vcproj
index 47521f01..6078c86 100644
--- a/PCbuild/pythoncore.vcproj
+++ b/PCbuild/pythoncore.vcproj
@@ -1539,6 +1539,10 @@
>
</File>
<File
+ RelativePath="..\Objects\stringlib\split.h"
+ >
+ </File>
+ <File
RelativePath="..\Objects\structseq.c"
>
</File>