1 files changed, 455 insertions, 317 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e62c774..6cdb0fc 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4,6 +4,9 @@ Unicode implementation based on original code by Fredrik Lundh,
 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
 Unicode Integration Proposal (see file Misc/unicode.txt).
 
+Major speed upgrades to the method implementations at the Reykjavik
+NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
+
 Copyright (c) Corporation for National Research Initiatives.
 
 --------------------------------------------------------------------
@@ -121,6 +124,51 @@ PyUnicode_GetMax(void)
 #endif
 }
 
+/* --- Bloom Filters ----------------------------------------------------- */
+
+/* stuff to implement simple "bloom filters" for Unicode characters.
+   to keep things simple, we use a single bitmask, using the least 5
+   bits from each unicode characters as the bit index. */
+
+/* the linebreak mask is set up by Unicode_Init below */
+
+#define BLOOM_MASK unsigned long
+
+static BLOOM_MASK bloom_linebreak;
+
+#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
+
+#define BLOOM_LINEBREAK(ch)\
+    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
+
+Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
+{
+    /* calculate simple bloom-style bitmask for a given unicode string */
+
+    long mask;
+    Py_ssize_t i;
+
+    mask = 0;
+    for (i = 0; i < len; i++)
+        mask |= (1 << (ptr[i] & 0x1F));
+
+    return mask;
+}
+
+Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
+{
+    Py_ssize_t i;
+
+    for (i = 0; i < setlen; i++)
+        if (set[i] == chr)
+            return 1;
+
+    return 0;
+}
+
+#define BLOOM_MEMBER(mask, chr, set, setlen)\
+    BLOOM(mask, chr) && unicode_member(chr, set, setlen)
+
 /* --- Unicode Object ----------------------------------------------------- */
 
 static
@@ -136,6 +184,7 @@ int unicode_resize(register PyUnicodeObject *unicode,
     /* Resizing shared object (unicode_empty or single character
        objects) in-place is not allowed. Use PyUnicode_Resize()
        instead ! */
+
     if (unicode == unicode_empty || 
 	(unicode->length == 1 && 
 	 unicode->str[0] < 256U &&
@@ -145,8 +194,11 @@ int unicode_resize(register PyUnicodeObject *unicode,
         return -1;
     }
 
-    /* We allocate one more byte to make sure the string is
-       Ux0000 terminated -- XXX is this needed ? */
+    /* We allocate one more byte to make sure the string is Ux0000 terminated.
+       The overallocation is also used by fastsearch, which assumes that it's
+       safe to look at str[length] (without making any assumptions about what
+       it contains). */
+
     oldstr = unicode->str;
     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
     if (!unicode->str) {
@@ -181,7 +233,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 {
     register PyUnicodeObject *unicode;
 
-    /* Optimization fo empty strings */
+    /* Optimization for empty strings */
     if (length == 0 && unicode_empty != NULL) {
         Py_INCREF(unicode_empty);
         return unicode_empty;
@@ -1963,9 +2015,20 @@ onError:
 
 */
 
-static const Py_UNICODE *findchar(const Py_UNICODE *s,
-				  Py_ssize_t size,
-				  Py_UNICODE ch);
+Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
+                                      Py_ssize_t size,
+                                      Py_UNICODE ch)
+{
+    /* like wcschr, but doesn't stop at NULL characters */
+
+    while (size-- > 0) {
+        if (*s == ch)
+            return s;
+        s++;
+    }
+
+    return NULL;
+}
 
 static
 PyObject *unicodeescape_string(const Py_UNICODE *s,
@@ -2313,7 +2376,7 @@ PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
     end = s + size;
 
     while (s < end) {
-        *p = *(Py_UNICODE *)s;
+        memcpy(p, s, sizeof(Py_UNICODE));
         /* We have to sanity check the raw data, otherwise doom looms for
            some malformed UCS-4 data. */
         if (
@@ -3791,124 +3854,104 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
 
 /* --- Helpers ------------------------------------------------------------ */
 
-static
-Py_ssize_t count(PyUnicodeObject *self,
-		 Py_ssize_t start,
-		 Py_ssize_t end,
-		 PyUnicodeObject *substring)
-{
-    Py_ssize_t count = 0;
+#define STRINGLIB_CHAR Py_UNICODE
 
-    if (start < 0)
-        start += self->length;
-    if (start < 0)
-        start = 0;
-    if (end > self->length)
-        end = self->length;
-    if (end < 0)
-        end += self->length;
-    if (end < 0)
-        end = 0;
+#define STRINGLIB_LEN PyUnicode_GET_SIZE
+#define STRINGLIB_NEW PyUnicode_FromUnicode
+#define STRINGLIB_STR PyUnicode_AS_UNICODE
 
-    if (substring->length == 0)
-	return (end - start + 1);
+Py_LOCAL_INLINE(int)
+STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
+{
+    if (str[0] != other[0])
+        return 1;
+    return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
+}
 
-    end -= substring->length;
+#define STRINGLIB_EMPTY unicode_empty
 
-    while (start <= end)
-        if (Py_UNICODE_MATCH(self, start, substring)) {
-            count++;
-            start += substring->length;
-        } else
-            start++;
+#include "stringlib/fastsearch.h"
 
-    return count;
-}
+#include "stringlib/count.h"
+#include "stringlib/find.h"
+#include "stringlib/partition.h"
+
+/* helper macro to fixup start/end slice values */
+#define FIX_START_END(obj)                      \
+    if (start < 0)                              \
+        start += (obj)->length;                 \
+    if (start < 0)                              \
+        start = 0;                              \
+    if (end > (obj)->length)                    \
+        end = (obj)->length;                    \
+    if (end < 0)                                \
+        end += (obj)->length;                   \
+    if (end < 0)                                \
+        end = 0;
 
 Py_ssize_t PyUnicode_Count(PyObject *str,
-		    PyObject *substr,
-		    Py_ssize_t start,
-		    Py_ssize_t end)
+                           PyObject *substr,
+                           Py_ssize_t start,
+                           Py_ssize_t end)
 {
     Py_ssize_t result;
+    PyUnicodeObject* str_obj;
+    PyUnicodeObject* sub_obj;
 
-    str = PyUnicode_FromObject(str);
-    if (str == NULL)
+    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
+    if (!str_obj)
 	return -1;
-    substr = PyUnicode_FromObject(substr);
-    if (substr == NULL) {
-	Py_DECREF(str);
+    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
+    if (!sub_obj) {
+	Py_DECREF(str_obj);
 	return -1;
     }
 
-    result = count((PyUnicodeObject *)str,
-		   start, end,
-		   (PyUnicodeObject *)substr);
-
-    Py_DECREF(str);
-    Py_DECREF(substr);
-    return result;
-}
-
-static
-Py_ssize_t findstring(PyUnicodeObject *self,
-	       PyUnicodeObject *substring,
-	       Py_ssize_t start,
-	       Py_ssize_t end,
-	       int direction)
-{
-    if (start < 0)
-        start += self->length;
-    if (start < 0)
-        start = 0;
-
-    if (end > self->length)
-        end = self->length;
-    if (end < 0)
-        end += self->length;
-    if (end < 0)
-        end = 0;
+    FIX_START_END(str_obj);
 
-    if (substring->length == 0)
-	return (direction > 0) ? start : end;
-
-    end -= substring->length;
+    result = stringlib_count(
+        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
+        );
 
-    if (direction < 0) {
-        for (; end >= start; end--)
-            if (Py_UNICODE_MATCH(self, end, substring))
-                return end;
-    } else {
-        for (; start <= end; start++)
-            if (Py_UNICODE_MATCH(self, start, substring))
-                return start;
-    }
+    Py_DECREF(sub_obj);
+    Py_DECREF(str_obj);
 
-    return -1;
+    return result;
 }
 
 Py_ssize_t PyUnicode_Find(PyObject *str,
-		   PyObject *substr,
-		   Py_ssize_t start,
-		   Py_ssize_t end,
-		   int direction)
+                          PyObject *sub,
+                          Py_ssize_t start,
+                          Py_ssize_t end,
+                          int direction)
 {
     Py_ssize_t result;
 
     str = PyUnicode_FromObject(str);
-    if (str == NULL)
+    if (!str)
 	return -2;
-    substr = PyUnicode_FromObject(substr);
-    if (substr == NULL) {
+    sub = PyUnicode_FromObject(sub);
+    if (!sub) {
 	Py_DECREF(str);
 	return -2;
     }
 
-    result = findstring((PyUnicodeObject *)str,
-			(PyUnicodeObject *)substr,
-			start, end, direction);
+    if (direction > 0)
+        result = stringlib_find_slice(
+            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
+            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
+            start, end
+            );
+    else
+        result = stringlib_rfind_slice(
+            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
+            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
+            start, end
+            );
+
     Py_DECREF(str);
-    Py_DECREF(substr);
+    Py_DECREF(sub);
+
     return result;
 }
 
@@ -3919,20 +3962,10 @@ int tailmatch(PyUnicodeObject *self,
 	      Py_ssize_t end,
 	      int direction)
 {
-    if (start < 0)
-        start += self->length;
-    if (start < 0)
-        start = 0;
-
     if (substring->length == 0)
         return 1;
 
-    if (end > self->length)
-        end = self->length;
-    if (end < 0)
-        end += self->length;
-    if (end < 0)
-        end = 0;
+    FIX_START_END(self);
 
     end -= substring->length;
     if (end < start)
@@ -3974,22 +4007,6 @@ Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
     return result;
 }
 
-static
-const Py_UNICODE *findchar(const Py_UNICODE *s,
-		     Py_ssize_t size,
-		     Py_UNICODE ch)
-{
-    /* like wcschr, but doesn't stop at NULL characters */
-
-    while (size-- > 0) {
-        if (*s == ch)
-            return s;
-        s++;
-    }
-
-    return NULL;
-}
-
 /* Apply fixfct filter to the Unicode object self and return a
    reference to the modified object */
 
@@ -4148,10 +4165,10 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
     PyObject *internal_separator = NULL;
     const Py_UNICODE blank = ' ';
     const Py_UNICODE *sep = &blank;
-    size_t seplen = 1;
+    Py_ssize_t seplen = 1;
     PyUnicodeObject *res = NULL; /* the result */
-    size_t res_alloc = 100;  /* # allocated bytes for string in res */
-    size_t res_used;         /* # used bytes */
+    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
+    Py_ssize_t res_used;         /* # used bytes */
     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
     PyObject *fseq;          /* PySequence_Fast(seq) */
     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
@@ -4212,8 +4229,8 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
     res_used = 0;
 
     for (i = 0; i < seqlen; ++i) {
-	size_t itemlen;
-	size_t new_res_used;
+	Py_ssize_t itemlen;
+	Py_ssize_t new_res_used;
 
 	item = PySequence_Fast_GET_ITEM(fseq, i);
 	/* Convert item to Unicode. */
@@ -4235,19 +4252,18 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
         /* Make sure we have enough space for the separator and the item. */
 	itemlen = PyUnicode_GET_SIZE(item);
 	new_res_used = res_used + itemlen;
-	if (new_res_used < res_used ||  new_res_used > PY_SSIZE_T_MAX)
+	if (new_res_used <= 0)
 	    goto Overflow;
 	if (i < seqlen - 1) {
 	    new_res_used += seplen;
-	    if (new_res_used < res_used ||  new_res_used > PY_SSIZE_T_MAX)
+	    if (new_res_used <= 0)
 		goto Overflow;
 	}
 	if (new_res_used > res_alloc) {
 	    /* double allocated size until it's big enough */
 	    do {
-	        size_t oldsize = res_alloc;
 	        res_alloc += res_alloc;
-	        if (res_alloc < oldsize || res_alloc > PY_SSIZE_T_MAX)
+	        if (res_alloc <= 0)
 	            goto Overflow;
 	    } while (new_res_used > res_alloc);
 	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
@@ -4333,17 +4349,6 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
         else								\
             Py_DECREF(str);
 
-#define SPLIT_INSERT(data, left, right)					\
-	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
-	if (!str)							\
-	    goto onError;						\
-	if (PyList_Insert(list, 0, str)) {				\
-	    Py_DECREF(str);						\
-	    goto onError;						\
-	}								\
-        else								\
-            Py_DECREF(str);
-
 static
 PyObject *split_whitespace(PyUnicodeObject *self,
 			   PyObject *list,
@@ -4404,7 +4409,7 @@ PyObject *PyUnicode_Splitlines(PyObject *string,
 	Py_ssize_t eol;
 
 	/* Find a line and append it */
-	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
+	while (i < len && !BLOOM_LINEBREAK(data[i]))
 	    i++;
 
 	/* Skip the line break reading CRLF as one line break */
@@ -4515,15 +4520,17 @@ PyObject *rsplit_whitespace(PyUnicodeObject *self,
 	if (j > i) {
 	    if (maxcount-- <= 0)
 		break;
-	    SPLIT_INSERT(self->str, i + 1, j + 1);
+	    SPLIT_APPEND(self->str, i + 1, j + 1);
 	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
 		i--;
 	    j = i;
 	}
     }
     if (j >= 0) {
-	SPLIT_INSERT(self->str, 0, j + 1);
+	SPLIT_APPEND(self->str, 0, j + 1);
     }
+    if (PyList_Reverse(list) < 0)
+        goto onError;
     return list;
 
  onError:
@@ -4546,14 +4553,16 @@ PyObject *rsplit_char(PyUnicodeObject *self,
 	if (self->str[i] == ch) {
 	    if (maxcount-- <= 0)
 		break;
-	    SPLIT_INSERT(self->str, i + 1, j + 1);
+	    SPLIT_APPEND(self->str, i + 1, j + 1);
 	    j = i = i - 1;
 	} else
 	    i--;
     }
     if (j >= -1) {
-	SPLIT_INSERT(self->str, 0, j + 1);
+	SPLIT_APPEND(self->str, 0, j + 1);
     }
+    if (PyList_Reverse(list) < 0)
+        goto onError;
     return list;
 
  onError:
@@ -4577,15 +4586,17 @@ PyObject *rsplit_substring(PyUnicodeObject *self,
 	if (Py_UNICODE_MATCH(self, i, substring)) {
 	    if (maxcount-- <= 0)
 		break;
-	    SPLIT_INSERT(self->str, i + sublen, j);
+	    SPLIT_APPEND(self->str, i + sublen, j);
 	    j = i;
 	    i -= sublen;
 	} else
 	    i--;
     }
     if (j >= 0) {
-	SPLIT_INSERT(self->str, 0, j);
+	SPLIT_APPEND(self->str, 0, j);
     }
+    if (PyList_Reverse(list) < 0)
+        goto onError;
     return list;
 
  onError:
@@ -4594,7 +4605,6 @@ PyObject *rsplit_substring(PyUnicodeObject *self,
 }
 
 #undef SPLIT_APPEND
-#undef SPLIT_INSERT
 
 static
 PyObject *split(PyUnicodeObject *self,
@@ -4665,88 +4675,128 @@ PyObject *replace(PyUnicodeObject *self,
     if (maxcount < 0)
 	maxcount = PY_SSIZE_T_MAX;
 
-    if (str1->length == 1 && str2->length == 1) {
+    if (str1->length == str2->length) {
+        /* same length */
         Py_ssize_t i;
-
-        /* replace characters */
-        if (!findchar(self->str, self->length, str1->str[0]) &&
-            PyUnicode_CheckExact(self)) {
-            /* nothing to replace, return original string */
-            Py_INCREF(self);
-            u = self;
+        if (str1->length == 1) {
+            /* replace characters */
+            Py_UNICODE u1, u2;
+            if (!findchar(self->str, self->length, str1->str[0]))
+                goto nothing;
+            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
+            if (!u)
+                return NULL;
+            Py_UNICODE_COPY(u->str, self->str, self->length);
+            u1 = str1->str[0];
+            u2 = str2->str[0];
+            for (i = 0; i < u->length; i++)
+                if (u->str[i] == u1) {
+                    if (--maxcount < 0)
+                        break;
+                    u->str[i] = u2;
+                }
         } else {
-	    Py_UNICODE u1 = str1->str[0];
-	    Py_UNICODE u2 = str2->str[0];
-
-            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
-                NULL,
-                self->length
+            i = fastsearch(
+                self->str, self->length, str1->str, str1->length, FAST_SEARCH
                 );
-            if (u != NULL) {
-		Py_UNICODE_COPY(u->str, self->str,
-				self->length);
-                for (i = 0; i < u->length; i++)
-                    if (u->str[i] == u1) {
-                        if (--maxcount < 0)
-                            break;
-                        u->str[i] = u2;
-                    }
-        }
+            if (i < 0)
+                goto nothing;
+            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
+            if (!u)
+                return NULL;
+            Py_UNICODE_COPY(u->str, self->str, self->length);
+            while (i <= self->length - str1->length)
+                if (Py_UNICODE_MATCH(self, i, str1)) {
+                    if (--maxcount < 0)
+                        break;
+                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+                    i += str1->length;
+                } else
+                    i++;
         }
-
     } else {
-        Py_ssize_t n, i;
+
+        Py_ssize_t n, i, j, e;
+        Py_ssize_t product, new_size, delta;
         Py_UNICODE *p;
 
         /* replace strings */
-        n = count(self, 0, self->length, str1);
+        n = stringlib_count(self->str, self->length, str1->str, str1->length);
         if (n > maxcount)
             n = maxcount;
-        if (n == 0) {
-            /* nothing to replace, return original string */
-            if (PyUnicode_CheckExact(self)) {
-                Py_INCREF(self);
-                u = self;
-            }
-            else {
-                u = (PyUnicodeObject *)
-                    PyUnicode_FromUnicode(self->str, self->length);
-	    }
+        if (n == 0)
+            goto nothing;
+        /* new_size = self->length + n * (str2->length - str1->length)); */
+        delta = (str2->length - str1->length);
+        if (delta == 0) {
+            new_size = self->length;
         } else {
-            u = _PyUnicode_New(
-                self->length + n * (str2->length - str1->length));
-            if (u) {
-                i = 0;
-                p = u->str;
-                if (str1->length > 0) {
-                    while (i <= self->length - str1->length)
-                        if (Py_UNICODE_MATCH(self, i, str1)) {
-                            /* replace string segment */
-                            Py_UNICODE_COPY(p, str2->str, str2->length);
-                            p += str2->length;
-                            i += str1->length;
-                            if (--n <= 0) {
-                                /* copy remaining part */
-                                Py_UNICODE_COPY(p, self->str+i, self->length-i);
-                                break;
-                            }
-                        } else
-                            *p++ = self->str[i++];
-                } else {
-                    while (n > 0) {
-                        Py_UNICODE_COPY(p, str2->str, str2->length);
-                        p += str2->length;
-                        if (--n <= 0)
-                            break;
-                        *p++ = self->str[i++];
-                    }
-                    Py_UNICODE_COPY(p, self->str+i, self->length-i);
+            product = n * (str2->length - str1->length);
+            if ((product / (str2->length - str1->length)) != n) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "replace string is too long");
+                return NULL;
+            }
+            new_size = self->length + product;
+            if (new_size < 0) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "replace string is too long");
+                return NULL;
+            }
+        }
+        u = _PyUnicode_New(new_size);
+        if (!u)
+            return NULL;
+        i = 0;
+        p = u->str;
+        e = self->length - str1->length;
+        if (str1->length > 0) {
+            while (n-- > 0) {
+                /* look for next match */
+                j = i;
+                while (j <= e) {
+                    if (Py_UNICODE_MATCH(self, j, str1))
+                        break;
+                    j++;
+                }
+		if (j > i) {
+                    if (j > e)
+                        break;
+                    /* copy unchanged part [i:j] */
+                    Py_UNICODE_COPY(p, self->str+i, j-i);
+                    p += j - i;
+                }
+                /* copy substitution string */
+                if (str2->length > 0) {
+                    Py_UNICODE_COPY(p, str2->str, str2->length);
+                    p += str2->length;
                 }
+                i = j + str1->length;
+            }
+            if (i < self->length)
+                /* copy tail [i:] */
+                Py_UNICODE_COPY(p, self->str+i, self->length-i);
+        } else {
+            /* interleave */
+            while (n > 0) {
+                Py_UNICODE_COPY(p, str2->str, str2->length);
+                p += str2->length;
+                if (--n <= 0)
+                    break;
+                *p++ = self->str[i++];
             }
+            Py_UNICODE_COPY(p, self->str+i, self->length-i);
         }
     }
-
     return (PyObject *) u;
+
+nothing:
+    /* nothing to replace; return original string (when possible) */
+    if (PyUnicode_CheckExact(self)) {
+        Py_INCREF(self);
+        return (PyObject *) self;
+    }
+    return PyUnicode_FromUnicode(self->str, self->length);
 }
 
 /* --- Unicode Object Methods --------------------------------------------- */
@@ -4983,54 +5033,29 @@ onError:
 int PyUnicode_Contains(PyObject *container,
 		       PyObject *element)
 {
-    PyUnicodeObject *u = NULL, *v = NULL;
+    PyObject *str, *sub;
     int result;
-    Py_ssize_t size;
-    register const Py_UNICODE *lhs, *end, *rhs;
 
     /* Coerce the two arguments */
-    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
-    if (v == NULL) {
+    sub = PyUnicode_FromObject(element);
+    if (!sub) {
 	PyErr_SetString(PyExc_TypeError,
 	    "'in <string>' requires string as left operand");
-	goto onError;
+        return -1;
     }
-    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
-    if (u == NULL)
-	goto onError;
-
-    size = PyUnicode_GET_SIZE(v);
-    rhs = PyUnicode_AS_UNICODE(v);
-    lhs = PyUnicode_AS_UNICODE(u);
 
-    result = 0;
-    if (size == 1) {
-	end = lhs + PyUnicode_GET_SIZE(u);
-	while (lhs < end) {
-	    if (*lhs++ == *rhs) {
-		result = 1;
-		break;
-	    }
-	}
-    }
-    else {
-	end = lhs + (PyUnicode_GET_SIZE(u) - size);
-	while (lhs <= end) {
-	    if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
-		result = 1;
-		break;
-	    }
-	}
+    str = PyUnicode_FromObject(container);
+    if (!str) {
+        Py_DECREF(sub);
+        return -1;
     }
 
-    Py_DECREF(u);
-    Py_DECREF(v);
-    return result;
+    result = stringlib_contains_obj(str, sub);
 
-onError:
-    Py_XDECREF(u);
-    Py_XDECREF(v);
-    return -1;
+    Py_DECREF(str);
+    Py_DECREF(sub);
+
+    return result;
 }
 
 /* Concat to string or Unicode object giving a new Unicode object. */
@@ -5078,8 +5103,8 @@ onError:
 PyDoc_STRVAR(count__doc__,
 "S.count(sub[, start[, end]]) -> int\n\
 \n\
-Return the number of occurrences of substring sub in Unicode string\n\
-S[start:end].  Optional arguments start and end are\n\
+Return the number of non-overlapping occurrences of substring sub in\n\
+Unicode string S[start:end].  Optional arguments start and end are\n\
 interpreted as in slice notation.");
 
 static PyObject *
@@ -5095,24 +5120,19 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
         return NULL;
 
     substring = (PyUnicodeObject *)PyUnicode_FromObject(
-						(PyObject *)substring);
+        (PyObject *)substring);
     if (substring == NULL)
 	return NULL;
 
-    if (start < 0)
-        start += self->length;
-    if (start < 0)
-        start = 0;
-    if (end > self->length)
-        end = self->length;
-    if (end < 0)
-        end += self->length;
-    if (end < 0)
-        end = 0;
+    FIX_START_END(self);
 
-    result = PyInt_FromLong((long) count(self, start, end, substring));
+    result = PyInt_FromSsize_t(
+        stringlib_count(self->str + start, end - start,
+                        substring->str, substring->length)
+        );
 
     Py_DECREF(substring);
+
     return result;
 }
 
@@ -5262,23 +5282,27 @@ Return -1 on failure.");
 static PyObject *
 unicode_find(PyUnicodeObject *self, PyObject *args)
 {
-    PyUnicodeObject *substring;
+    PyObject *substring;
     Py_ssize_t start = 0;
     Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *result;
+    Py_ssize_t result;
 
     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
 		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
         return NULL;
-    substring = (PyUnicodeObject *)PyUnicode_FromObject(
-						(PyObject *)substring);
-    if (substring == NULL)
+    substring = PyUnicode_FromObject(substring);
+    if (!substring)
 	return NULL;
 
-    result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
+    result = stringlib_find_slice(
+        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
+        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
+        start, end
+        );
 
     Py_DECREF(substring);
-    return result;
+
+    return PyInt_FromSsize_t(result);
 }
 
 static PyObject *
@@ -5328,26 +5352,30 @@ static PyObject *
 unicode_index(PyUnicodeObject *self, PyObject *args)
 {
     Py_ssize_t result;
-    PyUnicodeObject *substring;
+    PyObject *substring;
     Py_ssize_t start = 0;
     Py_ssize_t end = PY_SSIZE_T_MAX;
 
     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
 		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
         return NULL;
-
-    substring = (PyUnicodeObject *)PyUnicode_FromObject(
-						(PyObject *)substring);
-    if (substring == NULL)
+    substring = PyUnicode_FromObject(substring);
+    if (!substring)
 	return NULL;
 
-    result = findstring(self, substring, start, end, 1);
+    result = stringlib_find_slice(
+        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
+        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
+        start, end
+        );
 
     Py_DECREF(substring);
+
     if (result < 0) {
         PyErr_SetString(PyExc_ValueError, "substring not found");
         return NULL;
     }
+
     return PyInt_FromSsize_t(result);
 }
 
@@ -5702,16 +5730,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
 
 #define STRIPNAME(i) (stripformat[i]+3)
 
-static const Py_UNICODE *
-unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
-{
-	size_t i;
-	for (i = 0; i < n; ++i)
-		if (s[i] == c)
-			return s+i;
-	return NULL;
-}
-
 /* externally visible for str.strip(unicode) */
 PyObject *
 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
@@ -5722,27 +5740,29 @@ _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
 	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
 	Py_ssize_t i, j;
 
+        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
+
 	i = 0;
 	if (striptype != RIGHTSTRIP) {
-		while (i < len && unicode_memchr(sep, s[i], seplen)) {
-			i++;
-		}
+            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
+                i++;
+            }
 	}
 
 	j = len;
 	if (striptype != LEFTSTRIP) {
-		do {
-			j--;
-		} while (j >= i && unicode_memchr(sep, s[j], seplen));
-		j++;
+            do {
+                j--;
+            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
+            j++;
 	}
 
 	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
-		Py_INCREF(self);
-		return (PyObject*)self;
+            Py_INCREF(self);
+            return (PyObject*)self;
 	}
 	else
-		return PyUnicode_FromUnicode(s+i, j-i);
+            return PyUnicode_FromUnicode(s+i, j-i);
 }
 
 
@@ -5898,9 +5918,19 @@ unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
 
     p = u->str;
 
-    while (len-- > 0) {
-        Py_UNICODE_COPY(p, str->str, str->length);
-        p += str->length;
+    if (str->length == 1 && len > 0) {
+        Py_UNICODE_FILL(p, str->str[0], len);
+    } else {
+	Py_ssize_t done = 0; /* number of characters copied this far */
+	if (done < nchars) {
+            Py_UNICODE_COPY(p, str->str, str->length);
+            done = str->length;
+	}
+	while (done < nchars) {
+            int n = (done <= nchars-done) ? done : nchars-done;
+            Py_UNICODE_COPY(p+done, p, n);
+            done += n;
+	}
     }
 
     return (PyObject*) u;
@@ -5993,23 +6023,27 @@ Return -1 on failure.");
 static PyObject *
 unicode_rfind(PyUnicodeObject *self, PyObject *args)
 {
-    PyUnicodeObject *substring;
+    PyObject *substring;
     Py_ssize_t start = 0;
     Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *result;
+    Py_ssize_t result;
 
     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
 		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
         return NULL;
-    substring = (PyUnicodeObject *)PyUnicode_FromObject(
-						(PyObject *)substring);
-    if (substring == NULL)
+    substring = PyUnicode_FromObject(substring);
+    if (!substring)
 	return NULL;
 
-    result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
+    result = stringlib_rfind_slice(
+        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
+        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
+        start, end
+        );
 
     Py_DECREF(substring);
-    return result;
+
+    return PyInt_FromSsize_t(result);
 }
 
 PyDoc_STRVAR(rindex__doc__,
@@ -6020,22 +6054,26 @@ Like S.rfind() but raise ValueError when the substring is not found.");
 static PyObject *
 unicode_rindex(PyUnicodeObject *self, PyObject *args)
 {
-    Py_ssize_t result;
-    PyUnicodeObject *substring;
+    PyObject *substring;
     Py_ssize_t start = 0;
     Py_ssize_t end = PY_SSIZE_T_MAX;
+    Py_ssize_t result;
 
     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
 		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
         return NULL;
-    substring = (PyUnicodeObject *)PyUnicode_FromObject(
-						(PyObject *)substring);
-    if (substring == NULL)
+    substring = PyUnicode_FromObject(substring);
+    if (!substring)
 	return NULL;
 
-    result = findstring(self, substring, start, end, -1);
+    result = stringlib_rfind_slice(
+        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
+        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
+        start, end
+        );
 
     Py_DECREF(substring);
+
     if (result < 0) {
         PyErr_SetString(PyExc_ValueError, "substring not found");
         return NULL;
@@ -6137,6 +6175,87 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
 	return PyUnicode_Split((PyObject *)self, substring, maxcount);
 }
 
+PyObject *
+PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
+{
+    PyObject* str_obj;
+    PyObject* sep_obj;
+    PyObject* out;
+
+    str_obj = PyUnicode_FromObject(str_in);
+    if (!str_obj)
+	return NULL;
+    sep_obj = PyUnicode_FromObject(sep_in);
+    if (!sep_obj) {
+        Py_DECREF(str_obj);
+        return NULL;
+    }
+
+    out = stringlib_partition(
+        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
+        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
+        );
+
+    Py_DECREF(sep_obj);
+    Py_DECREF(str_obj);
+
+    return out;
+}
+
+
+PyObject *
+PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
+{
+    PyObject* str_obj;
+    PyObject* sep_obj;
+    PyObject* out;
+
+    str_obj = PyUnicode_FromObject(str_in);
+    if (!str_obj)
+	return NULL;
+    sep_obj = PyUnicode_FromObject(sep_in);
+    if (!sep_obj) {
+        Py_DECREF(str_obj);
+        return NULL;
+    }
+
+    out = stringlib_rpartition(
+        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
+        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
+        );
+
+    Py_DECREF(sep_obj);
+    Py_DECREF(str_obj);
+
+    return out;
+}
+
+PyDoc_STRVAR(partition__doc__,
+"S.partition(sep) -> (head, sep, tail)\n\
+\n\
+Searches for the separator sep in S, and returns the part before it,\n\
+the separator itself, and the part after it.  If the separator is not\n\
+found, returns S and two empty strings.");
+
+static PyObject*
+unicode_partition(PyUnicodeObject *self, PyObject *separator)
+{
+    return PyUnicode_Partition((PyObject *)self, separator);
+}
+
+PyDoc_STRVAR(rpartition__doc__,
+"S.rpartition(sep) -> (head, sep, tail)\n\
+\n\
+Searches for the separator sep in S, starting at the end of S, and returns\n\
+the part before it, the separator itself, and the part after it.  If the\n\
+separator is not found, returns S and two empty strings.");
+
+static PyObject*
+unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
+{
+    return PyUnicode_RPartition((PyObject *)self, separator);
+}
+
 PyObject *PyUnicode_RSplit(PyObject *s,
 			   PyObject *sep,
 			   Py_ssize_t maxsplit)
@@ -6390,6 +6509,7 @@ static PyMethodDef unicode_methods[] = {
     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
+    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
@@ -6400,6 +6520,7 @@ static PyMethodDef unicode_methods[] = {
     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
+    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
@@ -7375,6 +7496,18 @@ void _PyUnicode_Init(void)
 {
     int i;
 
+    /* XXX - move this array to unicodectype.c ? */
+    Py_UNICODE linebreak[] = {
+        0x000A, /* LINE FEED */
+        0x000D, /* CARRIAGE RETURN */
+        0x001C, /* FILE SEPARATOR */
+        0x001D, /* GROUP SEPARATOR */
+        0x001E, /* RECORD SEPARATOR */
+        0x0085, /* NEXT LINE */
+        0x2028, /* LINE SEPARATOR */
+        0x2029, /* PARAGRAPH SEPARATOR */
+    };
+
     /* Init the implementation */
     unicode_freelist = NULL;
     unicode_freelist_size = 0;
@@ -7384,6 +7517,11 @@ void _PyUnicode_Init(void)
 	unicode_latin1[i] = NULL;
     if (PyType_Ready(&PyUnicode_Type) < 0)
 	Py_FatalError("Can't initialize 'unicode'");
+
+    /* initialize the linebreak bloom filter */
+    bloom_linebreak = make_bloom_mask(
+        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
+        );
 }
 
 /* Finalize the Unicode implementation */