summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorFredrik Lundh <fredrik@pythonware.com>2006-05-24 14:28:11 (GMT)
committerFredrik Lundh <fredrik@pythonware.com>2006-05-24 14:28:11 (GMT)
commit6471ee4f1862432363294e5e069524f4ee8442ee (patch)
treeed222f3812c11619e46d80050c02f1fc70c8e353 /Objects
parent240bf2a8e48369e330bfd25e3a346e3f18151006 (diff)
downloadcpython-6471ee4f1862432363294e5e069524f4ee8442ee.zip
cpython-6471ee4f1862432363294e5e069524f4ee8442ee.tar.gz
cpython-6471ee4f1862432363294e5e069524f4ee8442ee.tar.bz2
needforspeed: use "fastsearch" for count and findstring helpers. this
results in a 2.5x speedup on the stringbench count tests, and a 20x (!) speedup on the stringbench search/find/contains test, compared to 2.5a2. for more on the algorithm, see: http://effbot.org/zone/stringlib.htm if you get weird results, you can disable the new algoritm by undefining USE_FAST in Objects/unicodeobject.c. enjoy /F
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c110
1 files changed, 109 insertions, 1 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4bb0eae..485e360 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3848,7 +3848,94 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
/* --- Helpers ------------------------------------------------------------ */
-static Py_ssize_t count(PyUnicodeObject *self,
+#define USE_FAST /* experimental fast search implementation */
+
+/* fast search/count implementation, based on a mix between boyer-
+ moore and horspool, with a few more bells and whistles on the top.
+ for some more background, see: http://effbot.org/stringlib */
+
+#define FAST_COUNT 0
+#define FAST_SEARCH 1
+
+LOCAL(int) fastsearch(Py_UNICODE* s, Py_ssize_t n,
+ Py_UNICODE* p, Py_ssize_t m,
+ int mode)
+{
+ long mask;
+ int skip, count = 0;
+ Py_ssize_t i, j, mlast, w;
+
+ w = n - m;
+
+ if (w < 0)
+ return -1;
+
+ /* look for special cases */
+ if (m <= 1) {
+ if (m < 0)
+ return -1;
+ /* use special case for 1-character strings */
+ if (mode == FAST_COUNT) {
+ for (i = 0; i < n; i++)
+ if (s[i] == p[0])
+ count++;
+ return count;
+ } else {
+ for (i = 0; i < n; i++)
+ if (s[i] == p[0])
+ return i;
+ }
+ return -1;
+ }
+
+ mlast = m - 1;
+
+ /* create compressed boyer-moore delta 1 table */
+ skip = mlast - 1;
+ /* process pattern[:-1] */
+ for (mask = i = 0; i < mlast; i++) {
+ mask |= (1 << (p[i] & 0x1F));
+ if (p[i] == p[mlast])
+ skip = mlast - i - 1;
+ }
+ /* process pattern[-1] outside the loop */
+ mask |= (1 << (p[mlast] & 0x1F));
+
+ for (i = 0; i <= w; i++) {
+ /* note: using mlast in the skip path slows things down on x86 */
+ if (s[i+m-1] == p[m-1]) {
+ /* candidate match */
+ for (j = 0; j < mlast; j++)
+ if (s[i+j] != p[j])
+ break;
+ if (j == mlast) {
+ /* got a match! */
+ if (mode != FAST_COUNT)
+ return i;
+ count++;
+ i = i + mlast;
+ continue;
+ }
+ /* miss: check if next character is part of pattern */
+ if (!(mask & (1 << (s[i+m] & 0x1F))))
+ i = i + m;
+ else {
+ i = i + skip;
+ continue;
+ }
+ } else {
+ /* skip: check if next character is part of pattern */
+ if (!(mask & (1 << (s[i+m] & 0x1F))))
+ i = i + m;
+ }
+ }
+
+ if (mode != FAST_COUNT)
+ return -1;
+ return count;
+}
+
+LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
Py_ssize_t start,
Py_ssize_t end,
PyUnicodeObject *substring)
@@ -3869,6 +3956,14 @@ static Py_ssize_t count(PyUnicodeObject *self,
if (substring->length == 0)
return (end - start + 1);
+#ifdef USE_FAST
+ count = fastsearch(
+ PyUnicode_AS_UNICODE(self) + start, end - start,
+ substring->str, substring->length, FAST_COUNT
+ );
+ if (count < 0)
+ count = 0; /* no match */
+#else
end -= substring->length;
while (start <= end)
@@ -3877,6 +3972,7 @@ static Py_ssize_t count(PyUnicodeObject *self,
start += substring->length;
} else
start++;
+#endif
return count;
}
@@ -3927,6 +4023,18 @@ static Py_ssize_t findstring(PyUnicodeObject *self,
if (substring->length == 0)
return (direction > 0) ? start : end;
+#ifdef USE_FAST
+ if (direction > 0) {
+ Py_ssize_t pos = fastsearch(
+ PyUnicode_AS_UNICODE(self) + start, end - start,
+ substring->str, substring->length, FAST_SEARCH
+ );
+ if (pos < 0)
+ return pos;
+ return pos + start;
+ }
+#endif
+
end -= substring->length;
if (direction < 0) {