From e68955cf3299efa93c95a6638e8e68191c57302b Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Thu, 25 May 2006 17:08:14 +0000 Subject: needforspeed: new replace implementation by Andrew Dalke. replace is now about 3x faster on my machine, for the replace tests from string- bench. --- Objects/stringobject.c | 787 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 605 insertions(+), 182 deletions(-) diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 77796df..aed0c23 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -2379,175 +2379,623 @@ string_translate(PyStringObject *self, PyObject *args) } -/* What follows is used for implementing replace(). Perry Stoll. */ +#define FORWARD 1 +#define REVERSE -1 -/* - mymemfind +/* find and count characters and substrings */ - strstr replacement for arbitrary blocks of memory. +/* Don't call if length < 2 */ +#define Py_STRING_MATCH(target, offset, pattern, length) \ + (target[offset] == pattern[0] && \ + target[offset+length-1] == pattern[length-1] && \ + !memcmp(target+offset+1, pattern+1, length-2) ) + +#define findchar(target, target_len, c) \ + ((char *)memchr((const void *)(target), c, target_len)) + +/* String ops must return a string. */ +/* If the object is subclass of string, create a copy */ +static PyStringObject * +return_self(PyStringObject *self) +{ + if (PyString_CheckExact(self)) { + Py_INCREF(self); + return self; + } + return (PyStringObject *)PyString_FromStringAndSize( + PyString_AS_STRING(self), + PyString_GET_SIZE(self)); +} - Locates the first occurrence in the memory pointed to by MEM of the - contents of memory pointed to by PAT. Returns the index into MEM if - found, or -1 if not found. If len of PAT is greater than length of - MEM, the function returns -1. -*/ static Py_ssize_t -mymemfind(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len) +countchar(char *target, int target_len, char c) { - register Py_ssize_t ii; + Py_ssize_t count=0; + char *start=target; + char *end=target+target_len; + + while ( (start=findchar(start, end-start, c)) != NULL ) { + count++; + start += 1; + } - /* pattern can not occur in the last pat_len-1 chars */ - len -= pat_len; + return count; +} - for (ii = 0; ii <= len; ii++) { - if (mem[ii] == pat[0] && memcmp(&mem[ii], pat, pat_len) == 0) { - return ii; - } +static Py_ssize_t +findstring(char *target, Py_ssize_t target_len, + char *pattern, Py_ssize_t pattern_len, + Py_ssize_t start, + Py_ssize_t end, + int direction) +{ + if (start < 0) { + start += target_len; + if (start < 0) + start = 0; + } + if (end > target_len) { + end = target_len; + } else if (end < 0) { + end += target_len; + if (end < 0) + end = 0; + } + + /* zero-length substrings always match at the first attempt */ + if (pattern_len == 0) + return (direction > 0) ? start : end; + + end -= pattern_len; + + if (direction < 0) { + for (; end >= start; end--) + if (Py_STRING_MATCH(target, end, pattern, pattern_len)) + return end; + } else { + for (; start <= end; start++) + if (Py_STRING_MATCH(target, start, pattern, pattern_len)) + return start; } return -1; } -/* - mymemcnt +Py_ssize_t +countstring(char *target, Py_ssize_t target_len, + char *pattern, Py_ssize_t pattern_len, + Py_ssize_t start, + Py_ssize_t end, + int direction) +{ + Py_ssize_t count=0; - Return the number of distinct times PAT is found in MEM. - meaning mem=1111 and pat==11 returns 2. - mem=11111 and pat==11 also return 2. - */ -static Py_ssize_t -mymemcnt(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len) + if (start < 0) { + start += target_len; + if (start < 0) + start = 0; + } + if (end > target_len) { + end = target_len; + } else if (end < 0) { + end += target_len; + if (end < 0) + end = 0; + } + + /* zero-length substrings match everywhere */ + if (pattern_len == 0) + return target_len+1; + + end -= pattern_len; + + if (direction < 0) { + for (; end >= start; end--) + if (Py_STRING_MATCH(target, end, pattern, pattern_len)) { + count++; + end -= pattern_len-1; + } + } else { + for (; start <= end; start++) + if (Py_STRING_MATCH(target, start, pattern, pattern_len)) { + count++; + start += pattern_len-1; + } + } + return count; +} + + +/* Algorithms for difference cases of string replacement */ + +/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ +static PyStringObject * +replace_interleave(PyStringObject *self, + PyStringObject *to, + Py_ssize_t maxcount) { - register Py_ssize_t offset = 0; - Py_ssize_t nfound = 0; + char *self_s, *to_s, *result_s; + Py_ssize_t self_len, to_len, result_len; + Py_ssize_t count, i, product; + PyStringObject *result; + + self_len = PyString_GET_SIZE(self); + to_len = PyString_GET_SIZE(to); + + /* 1 at the end plus 1 after every character */ + count = self_len+1; + if (maxcount < count) + count = maxcount; + + /* Check for overflow */ + /* result_len = count * to_len + self_len; */ + product = count * to_len; + if (product / to_len != count) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; + } + result_len = product + self_len; + if (result_len < 0) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; + } + + if (! (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) ) + return NULL; - while (len >= 0) { - offset = mymemfind(mem, len, pat, pat_len); - if (offset == -1) - break; - mem += offset + pat_len; - len -= offset + pat_len; - nfound++; + self_s = PyString_AS_STRING(self); + to_s = PyString_AS_STRING(to); + to_len = PyString_GET_SIZE(to); + result_s = PyString_AS_STRING(result); + + /* TODO: special case single character, which doesn't need memcpy */ + + /* Lay the first one down (guaranteed this will occur) */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + count -= 1; + + for (i=0; i=1, len(from)==1, to="", maxcount>=1 */ +static PyStringObject * +replace_delete_single_character(PyStringObject *self, + char from_c, Py_ssize_t maxcount) +{ + char *self_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, result_len; + Py_ssize_t count; + PyStringObject *result; + + self_len = PyString_GET_SIZE(self); + self_s = PyString_AS_STRING(self); + + count = countchar(self_s, self_len, from_c); + if (count == 0) { + return return_self(self); + } + if (count > maxcount) + count = maxcount; + + result_len = self_len - count; /* from_len == 1 */ + assert(result_len>=0); + + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL) + return NULL; + result_s = PyString_AS_STRING(result); - Return a string in which all occurrences of PAT in memory STR are - replaced with SUB. + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end-start, from_c); + if (next == NULL) + break; + memcpy(result_s, start, next-start); + result_s += (next-start); + start = next+1; + } + memcpy(result_s, start, end-start); + + return result; +} - If length of PAT is less than length of STR or there are no occurrences - of PAT in STR, then the original string is returned. Otherwise, a new - string is allocated here and returned. +/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ + +static PyStringObject * +replace_delete_substring(PyStringObject *self, PyStringObject *from, + Py_ssize_t maxcount) { + char *self_s, *from_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, from_len, result_len; + Py_ssize_t count, offset; + PyStringObject *result; + + self_len = PyString_GET_SIZE(self); + self_s = PyString_AS_STRING(self); + from_len = PyString_GET_SIZE(from); + from_s = PyString_AS_STRING(from); + + count = countstring(self_s, self_len, + from_s, from_len, + 0, self_len, 1); + + if (count > maxcount) + count = maxcount; + + if (count == 0) { + /* no matches */ + return return_self(self); + } + + result_len = self_len - (count * from_len); + assert (result_len>=0); + + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL ) + return NULL; + + result_s = PyString_AS_STRING(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = findstring(start, end-start, + from_s, from_len, + 0, end-start, FORWARD); + if (offset == -1) + break; + next = start + offset; + + memcpy(result_s, start, next-start); + + result_s += (next-start); + start = next+from_len; + } + memcpy(result_s, start, end-start); + return result; +} - on return, out_len is: - the length of output string, or - -1 if the input string is returned, or - unchanged if an error occurs (no memory). +/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ +static PyStringObject * +replace_single_character_in_place(PyStringObject *self, + char from_c, char to_c, + Py_ssize_t maxcount) +{ + char *self_s, *result_s, *start, *end, *next; + Py_ssize_t self_len; + PyStringObject *result; + + /* The result string will be the same size */ + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + + next = findchar(self_s, self_len, from_c); + + if (next == NULL) { + /* No matches; return the original string */ + return return_self(self); + } + + /* Need to make a new string */ + result = (PyStringObject *) PyString_FromStringAndSize(self_s, self_len); + if (result == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + + /* change everything in-place, starting with this one */ + start = result_s + (next-self_s); + *start = to_c; + start++; + end = result_s + self_len; + + while (--maxcount > 0) { + next = findchar(start, end-start, from_c); + if (next == NULL) + break; + *next = to_c; + start = next+1; + } + + return result; +} - return value is: - the new string allocated locally, or - NULL if an error occurred. -*/ -static char * -mymemreplace(const char *str, Py_ssize_t len, /* input string */ - const char *pat, Py_ssize_t pat_len, /* pattern string to find */ - const char *sub, Py_ssize_t sub_len, /* substitution string */ - Py_ssize_t count, /* number of replacements */ - Py_ssize_t *out_len) +/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ +static PyStringObject * +replace_substring_in_place(PyStringObject *self, + PyStringObject *from, + PyStringObject *to, + Py_ssize_t maxcount) { - char *out_s; - char *new_s; - Py_ssize_t nfound, offset, new_len; - Py_ssize_t product, delta; - - if (len == 0 || (pat_len == 0 && sub_len == 0) || pat_len > len) - goto return_same; - - /* find length of output string */ - nfound = (pat_len > 0) ? mymemcnt(str, len, pat, pat_len) : len + 1; - if (count < 0) - count = PY_SSIZE_T_MAX; - else if (nfound > count) - nfound = count; - if (nfound == 0) - goto return_same; - - delta = (sub_len - pat_len); - if (delta == 0) { - new_len = len; - } else { - product = nfound * (sub_len - pat_len); - if ((product / (sub_len - pat_len)) != nfound) { - PyErr_SetString(PyExc_OverflowError, - "replace string is too long"); - return NULL; - } - new_len = len + product; - if (new_len < 0) { - PyErr_SetString(PyExc_OverflowError, - "replace string is too long"); - return NULL; - } - } + char *result_s, *start, *end; + char *self_s, *from_s, *to_s; + Py_ssize_t self_len, from_len, offset; + PyStringObject *result; + + /* The result string will be the same size */ + + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + + from_s = PyString_AS_STRING(from); + from_len = PyString_GET_SIZE(from); + to_s = PyString_AS_STRING(to); + + offset = findstring(self_s, self_len, + from_s, from_len, + 0, self_len, FORWARD); + + if (offset == -1) { + /* No matches; return the original string */ + return return_self(self); + } + + /* Need to make a new string */ + result = (PyStringObject *) PyString_FromStringAndSize(self_s, self_len); + if (result == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + + /* change everything in-place, starting with this one */ + start = result_s + offset; + memcpy(start, to_s, from_len); + start += from_len; + end = result_s + self_len; + + while ( --maxcount > 0) { + offset = findstring(start, end-start, + from_s, from_len, + 0, end-start, FORWARD); + if (offset==-1) + break; + memcpy(start+offset, to_s, from_len); + start += offset+from_len; + } + + return result; +} - if (new_len == 0) { - /* Have to allocate something for the caller to free(). */ - out_s = (char *)PyMem_MALLOC(1); - if (out_s == NULL) - return NULL; - out_s[0] = '\0'; +/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ +static PyStringObject * +replace_single_character(PyStringObject *self, + char from_c, + PyStringObject *to, + Py_ssize_t maxcount) +{ + char *self_s, *to_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, to_len, result_len; + Py_ssize_t count, product; + PyStringObject *result; + + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + + count = countchar(self_s, self_len, from_c); + if (count > maxcount) + count = maxcount; + + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + to_s = PyString_AS_STRING(to); + to_len = PyString_GET_SIZE(to); + + /* use the difference between current and new, hence the "-1" */ + /* result_len = self_len + count * (to_len-1) */ + product = count * (to_len-1); + if (product / (to_len-1) != count) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; } - else { - assert(new_len > 0); - new_s = (char *)PyMem_MALLOC(new_len); - if (new_s == NULL) - return NULL; - out_s = new_s; - - if (pat_len > 0) { - for (; nfound > 0; --nfound) { - /* find index of next instance of pattern */ - offset = mymemfind(str, len, pat, pat_len); - if (offset == -1) - break; - - /* copy non matching part of input string */ - memcpy(new_s, str, offset); - str += offset + pat_len; - len -= offset + pat_len; - - /* copy substitute into the output string */ - new_s += offset; - memcpy(new_s, sub, sub_len); - new_s += sub_len; - } - /* copy any remaining values into output string */ - if (len > 0) - memcpy(new_s, str, len); - } - else { - for (;;++str, --len) { - memcpy(new_s, sub, sub_len); - new_s += sub_len; - if (--nfound <= 0) { - memcpy(new_s, str, len); - break; - } - *new_s++ = *str; - } + result_len = self_len + product; + if (result_len < 0) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; + } + + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end-start, from_c); + if (next == NULL) + break; + + if (next == start) { + /* replace with the 'to' */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + start += 1; + } else { + /* copy the unchanged old then the 'to' */ + memcpy(result_s, start, next-start); + result_s += (next-start); + memcpy(result_s, to_s, to_len); + result_s += to_len; + start = next+1; } } - *out_len = new_len; - return out_s; + /* Copy the remainder of the remaining string */ + memcpy(result_s, start, end-start); + + return result; +} - return_same: - *out_len = -1; - return (char *)str; /* cast away const */ +/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ +static PyStringObject * +replace_substring(PyStringObject *self, + PyStringObject *from, + PyStringObject *to, + Py_ssize_t maxcount) { + char *self_s, *from_s, *to_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, from_len, to_len, result_len; + Py_ssize_t count, offset, product; + PyStringObject *result; + + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + from_s = PyString_AS_STRING(from); + from_len = PyString_GET_SIZE(from); + + count = countstring(self_s, self_len, + from_s, from_len, + 0, self_len, FORWARD); + if (count > maxcount) + count = maxcount; + + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + to_s = PyString_AS_STRING(to); + to_len = PyString_GET_SIZE(to); + + /* Check for overflow */ + /* result_len = self_len + count * (to_len-from_len) */ + product = count * (to_len-from_len); + if (product / (to_len-from_len) != count) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; + } + result_len = self_len + product; + if (result_len < 0) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; + } + + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = findstring(start, end-start, + from_s, from_len, + 0, end-start, FORWARD); + if (offset == -1) + break; + next = start+offset; + if (next == start) { + /* replace with the 'to' */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + start += from_len; + } else { + /* copy the unchanged old then the 'to' */ + memcpy(result_s, start, next-start); + result_s += (next-start); + memcpy(result_s, to_s, to_len); + result_s += to_len; + start = next+from_len; + } + } + /* Copy the remainder of the remaining string */ + memcpy(result_s, start, end-start); + + return result; } +static PyStringObject * +replace(PyStringObject *self, + PyStringObject *from, + PyStringObject *to, + Py_ssize_t maxcount) +{ + Py_ssize_t from_len, to_len; + + if (maxcount < 0) { + maxcount = PY_SSIZE_T_MAX; + } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) { + /* nothing to do; return the original string */ + return return_self(self); + } + + from_len = PyString_GET_SIZE(from); + to_len = PyString_GET_SIZE(to); + + if (maxcount == 0 || + (from_len == 0 && to_len == 0)) { + /* nothing to do; return the original string */ + return return_self(self); + } + + /* Handle zero-length special cases */ + + if (from_len == 0) { + /* insert the 'to' string everywhere. */ + /* >>> "Python".replace("", ".") */ + /* '.P.y.t.h.o.n.' */ + return replace_interleave(self, to, maxcount); + } + + /* Except for "".replace("", "A") == "A" there is no way beyond this */ + /* point for an empty self string to generate a non-empty string */ + /* Special case so the remaining code always gets a non-empty string */ + if (PyString_GET_SIZE(self) == 0) { + return return_self(self); + } + + if (to_len == 0) { + /* delete all occurances of 'from' string */ + if (from_len == 1) { + return replace_delete_single_character( + self, PyString_AS_STRING(from)[0], maxcount); + } else { + return replace_delete_substring(self, from, maxcount); + } + } + + /* Handle special case where both strings have the same length */ + + if (from_len == to_len) { + if (from_len == 1) { + return replace_single_character_in_place( + self, + PyString_AS_STRING(from)[0], + PyString_AS_STRING(to)[0], + maxcount); + } else { + return replace_substring_in_place( + self, from, to, maxcount); + } + } + + /* Otherwise use the more generic algorithms */ + if (from_len == 1) { + return replace_single_character(self, PyString_AS_STRING(from)[0], + to, maxcount); + } else { + /* len('from')>=2, len('to')>=1 */ + return replace_substring(self, from, to, maxcount); + } +} + PyDoc_STRVAR(replace__doc__, "S.replace (old, new[, count]) -> string\n\ \n\ @@ -2558,67 +3006,42 @@ given, only the first count occurrences are replaced."); static PyObject * string_replace(PyStringObject *self, PyObject *args) { - const char *str = PyString_AS_STRING(self), *sub, *repl; - char *new_s; - const Py_ssize_t len = PyString_GET_SIZE(self); - Py_ssize_t sub_len, repl_len, out_len; Py_ssize_t count = -1; - PyObject *newobj; - PyObject *subobj, *replobj; + PyObject *from, *to; + char *tmp_s; + Py_ssize_t tmp_len; - if (!PyArg_ParseTuple(args, "OO|n:replace", - &subobj, &replobj, &count)) + if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count)) return NULL; - if (PyString_Check(subobj)) { - sub = PyString_AS_STRING(subobj); - sub_len = PyString_GET_SIZE(subobj); + if (PyString_Check(from)) { + /* Can this be made a '!check' after the Unicode check? */ } #ifdef Py_USING_UNICODE - else if (PyUnicode_Check(subobj)) + if (PyUnicode_Check(from)) return PyUnicode_Replace((PyObject *)self, - subobj, replobj, count); + from, to, count); #endif - else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len)) + else if (PyObject_AsCharBuffer(from, &tmp_s, &tmp_len)) return NULL; - if (PyString_Check(replobj)) { - repl = PyString_AS_STRING(replobj); - repl_len = PyString_GET_SIZE(replobj); + if (PyString_Check(to)) { + /* Can this be made a '!check' after the Unicode check? */ } #ifdef Py_USING_UNICODE - else if (PyUnicode_Check(replobj)) + else if (PyUnicode_Check(to)) return PyUnicode_Replace((PyObject *)self, - subobj, replobj, count); + from, to, count); #endif - else if (PyObject_AsCharBuffer(replobj, &repl, &repl_len)) + else if (PyObject_AsCharBuffer(to, &tmp_s, &tmp_len)) return NULL; - new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len); - if (new_s == NULL) { - if (!PyErr_Occurred()) - PyErr_NoMemory(); - return NULL; - } - if (out_len == -1) { - if (PyString_CheckExact(self)) { - /* we're returning another reference to self */ - newobj = (PyObject*)self; - Py_INCREF(newobj); - } - else { - newobj = PyString_FromStringAndSize(str, len); - if (newobj == NULL) - return NULL; - } - } - else { - newobj = PyString_FromStringAndSize(new_s, out_len); - PyMem_FREE(new_s); - } - return newobj; + return (PyObject *)replace((PyStringObject *) self, + (PyStringObject *) from, + (PyStringObject *) to, count); } +/** End DALKE **/ PyDoc_STRVAR(startswith__doc__, "S.startswith(prefix[, start[, end]]) -> bool\n\ -- cgit v0.12