summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Dalke <dalke@dalkescientific.com>2006-05-26 19:02:09 (GMT)
committerAndrew Dalke <dalke@dalkescientific.com>2006-05-26 19:02:09 (GMT)
commitc5da53ba78cc00d8de448f99eb185067d53302ba (patch)
tree3f74f2eb5ad64f46ff8687fa1aa7ac8a272020c1
parentafe6598732b4a8eee9f2d746080cd0bb28f17704 (diff)
downloadcpython-c5da53ba78cc00d8de448f99eb185067d53302ba.zip
cpython-c5da53ba78cc00d8de448f99eb185067d53302ba.tar.gz
cpython-c5da53ba78cc00d8de448f99eb185067d53302ba.tar.bz2
substring split now uses /F's fast string matching algorithm.
(If compiled without FAST search support, changed the pre-memcmp test to check the last character as well as the first. This gave a 25% speedup for my test case.) Rewrote the split algorithms so they stop when maxsplit gets to 0. Previously they did a string match first then checked if the maxsplit was reached. The new way prevents a needless string search.
-rw-r--r--Objects/stringobject.c97
1 files changed, 57 insertions, 40 deletions
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index 44deeed..5d57f15 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -1322,6 +1322,13 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
#define STRIPNAME(i) (stripformat[i]+3)
+/* Don't call if length < 2 */
+#define Py_STRING_MATCH(target, offset, pattern, length) \
+ (target[offset] == pattern[0] && \
+ target[offset+length-1] == pattern[length-1] && \
+ !memcmp(target+offset+1, pattern+1, length-2) )
+
+
/* Overallocate the initial list to reduce the number of reallocs for small
split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
resizes, to sizes 4, 8, then 16. Most observed string splits are for human
@@ -1416,18 +1423,19 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
if (list == NULL)
return NULL;
- for (i = j = 0; i < len; ) {
- /* TODO: Use findchar/memchr for this? */
- if (s[i] == ch) {
- if (maxcount-- <= 0)
+ i = j = 0;
+ while ((j < len) && (maxcount-- > 0)) {
+ for(; j<len; j++) {
+ /* I found that using memchr makes no difference */
+ if (s[j] == ch) {
+ SPLIT_ADD(s, i, j);
+ i = j = j + 1;
break;
- SPLIT_ADD(s, j, i);
- i = j = i + 1;
- } else
- i++;
+ }
+ }
}
- if (j <= len) {
- SPLIT_ADD(s, j, len);
+ if (i <= len) {
+ SPLIT_ADD(s, i, len);
}
FIX_PREALLOC_SIZE(list);
return list;
@@ -1452,6 +1460,9 @@ string_split(PyStringObject *self, PyObject *args)
Py_ssize_t maxsplit = -1, count=0;
const char *s = PyString_AS_STRING(self), *sub;
PyObject *list, *str, *subobj = Py_None;
+#ifdef USE_FAST
+ Py_ssize_t pos;
+#endif
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL;
@@ -1481,19 +1492,30 @@ string_split(PyStringObject *self, PyObject *args)
if (list == NULL)
return NULL;
+#ifdef USE_FAST
i = j = 0;
- while (i+n <= len) {
- /* TODO: Use Py_STRING_MATCH */
- if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) {
- if (maxsplit-- <= 0)
+ while (maxsplit-- > 0) {
+ pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
+ if (pos < 0)
+ break;
+ j = i+pos;
+ SPLIT_ADD(s, i, j);
+ i = j + n;
+
+ }
+#else
+ i = j = 0;
+ while ((j+n <= len) && (maxsplit-- > 0)) {
+ for (; j+n <= len; j++) {
+ if (Py_STRING_MATCH(s, j, sub, n)) {
+ SPLIT_ADD(s, i, j);
+ i = j = j + n;
break;
- SPLIT_ADD(s, j, i);
- i = j = i + n;
+ }
}
- else
- i++;
}
- SPLIT_ADD(s, j, len);
+#endif
+ SPLIT_ADD(s, i, len);
FIX_PREALLOC_SIZE(list);
return list;
@@ -1610,14 +1632,15 @@ rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
if (list == NULL)
return NULL;
- for (i = j = len - 1; i >= 0; ) {
- if (s[i] == ch) {
- if (maxcount-- <= 0)
+ i = j = len - 1;
+ while ((i >= 0) && (maxcount-- > 0)) {
+ for (; i >= 0; i--) {
+ if (s[i] == ch) {
+ SPLIT_ADD(s, i + 1, j + 1);
+ j = i = i - 1;
break;
- SPLIT_ADD(s, i + 1, j + 1);
- j = i = i - 1;
- } else
- i--;
+ }
+ }
}
if (j >= -1) {
SPLIT_ADD(s, 0, j + 1);
@@ -1679,16 +1702,16 @@ string_rsplit(PyStringObject *self, PyObject *args)
j = len;
i = j - n;
- while (i >= 0) {
- if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) {
- if (maxsplit-- <= 0)
+
+ while ( (i >= 0) && (maxsplit-- > 0) ) {
+ for (; i>=0; i--) {
+ if (Py_STRING_MATCH(s, i, sub, n)) {
+ SPLIT_ADD(s, i + n, j);
+ j = i;
+ i -= n;
break;
- SPLIT_ADD(s, i+n, j);
- j = i;
- i -= n;
+ }
}
- else
- i--;
}
SPLIT_ADD(s, 0, j);
FIX_PREALLOC_SIZE(list);
@@ -2449,12 +2472,6 @@ string_translate(PyStringObject *self, PyObject *args)
/* find and count characters and substrings */
-/* Don't call if length < 2 */
-#define Py_STRING_MATCH(target, offset, pattern, length) \
- (target[offset] == pattern[0] && \
- target[offset+length-1] == pattern[length-1] && \
- !memcmp(target+offset+1, pattern+1, length-2) )
-
#define findchar(target, target_len, c) \
((char *)memchr((const void *)(target), c, target_len))