12 files changed, 203 insertions, 28 deletions
diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h
index f62813d..d0fc18d 100644
--- a/Objects/stringlib/asciilib.h
+++ b/Objects/stringlib/asciilib.h
@@ -19,7 +19,6 @@
 #define STRINGLIB_STR            PyUnicode_1BYTE_DATA
 #define STRINGLIB_LEN            PyUnicode_GET_LENGTH
 #define STRINGLIB_NEW(STR,LEN)   _PyUnicode_FromASCII((char*)(STR),(LEN))
-#define STRINGLIB_RESIZE         not_supported
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
 
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index f353367..f855003 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -47,7 +47,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                     unsigned long value = *(unsigned long *) _s;
                     if (value & ASCII_CHAR_MASK)
                         break;
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if PY_LITTLE_ENDIAN
                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
@@ -486,7 +486,7 @@ STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
     const unsigned char *q = *inptr;
     STRINGLIB_CHAR *p = dest + *outpos;
     /* Offsets from q for retrieving byte pairs in the right order. */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if PY_LITTLE_ENDIAN
     int ihi = !!native_ordering, ilo = !native_ordering;
 #else
     int ihi = !native_ordering, ilo = !!native_ordering;
@@ -517,7 +517,7 @@ STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
                     block = SWAB(block);
 #endif
                 }
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if PY_LITTLE_ENDIAN
 # if SIZEOF_LONG == 4
                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                 p[1] = (STRINGLIB_CHAR)(block >> 16);
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index 55ac77d..cd7cac4 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -142,6 +142,8 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
     mask = 0;
 
     if (mode != FAST_RSEARCH) {
+        const STRINGLIB_CHAR *ss = s + m - 1;
+        const STRINGLIB_CHAR *pp = p + m - 1;
 
         /* create compressed boyer-moore delta 1 table */
 
@@ -156,7 +158,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
 
         for (i = 0; i <= w; i++) {
             /* note: using mlast in the skip path slows things down on x86 */
-            if (s[i+m-1] == p[m-1]) {
+            if (ss[i] == pp[0]) {
                 /* candidate match */
                 for (j = 0; j < mlast; j++)
                     if (s[i+j] != p[j])
@@ -172,13 +174,13 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
                     continue;
                 }
                 /* miss: check if next character is part of pattern */
-                if (!STRINGLIB_BLOOM(mask, s[i+m]))
+                if (!STRINGLIB_BLOOM(mask, ss[i+1]))
                     i = i + m;
                 else
                     i = i + skip;
             } else {
                 /* skip: check if next character is part of pattern */
-                if (!STRINGLIB_BLOOM(mask, s[i+m]))
+                if (!STRINGLIB_BLOOM(mask, ss[i+1]))
                     i = i + m;
             }
         }
diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h
new file mode 100644
index 0000000..5568b31
--- /dev/null
+++ b/Objects/stringlib/join.h
@@ -0,0 +1,133 @@
+/* stringlib: bytes joining implementation */
+
+#if STRINGLIB_SIZEOF_CHAR != 1
+#error join.h only compatible with byte-wise strings
+#endif
+
+Py_LOCAL_INLINE(PyObject *)
+STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable)
+{
+    char *sepstr = STRINGLIB_STR(sep);
+    const Py_ssize_t seplen = STRINGLIB_LEN(sep);
+    PyObject *res = NULL;
+    char *p;
+    Py_ssize_t seqlen = 0;
+    Py_ssize_t sz = 0;
+    Py_ssize_t i, nbufs;
+    PyObject *seq, *item;
+    Py_buffer *buffers = NULL;
+#define NB_STATIC_BUFFERS 10
+    Py_buffer static_buffers[NB_STATIC_BUFFERS];
+
+    seq = PySequence_Fast(iterable, "can only join an iterable");
+    if (seq == NULL) {
+        return NULL;
+    }
+
+    seqlen = PySequence_Fast_GET_SIZE(seq);
+    if (seqlen == 0) {
+        Py_DECREF(seq);
+        return STRINGLIB_NEW(NULL, 0);
+    }
+#ifndef STRINGLIB_MUTABLE
+    if (seqlen == 1) {
+        item = PySequence_Fast_GET_ITEM(seq, 0);
+        if (STRINGLIB_CHECK_EXACT(item)) {
+            Py_INCREF(item);
+            Py_DECREF(seq);
+            return item;
+        }
+    }
+#endif
+    if (seqlen > NB_STATIC_BUFFERS) {
+        buffers = PyMem_NEW(Py_buffer, seqlen);
+        if (buffers == NULL) {
+            Py_DECREF(seq);
+            PyErr_NoMemory();
+            return NULL;
+        }
+    }
+    else {
+        buffers = static_buffers;
+    }
+
+    /* Here is the general case.  Do a pre-pass to figure out the total
+     * amount of space we'll need (sz), and see whether all arguments are
+     * buffer-compatible.
+     */
+    for (i = 0, nbufs = 0; i < seqlen; i++) {
+        Py_ssize_t itemlen;
+        item = PySequence_Fast_GET_ITEM(seq, i);
+        if (_getbuffer(item, &buffers[i]) < 0) {
+            PyErr_Format(PyExc_TypeError,
+                         "sequence item %zd: expected bytes, bytearray, "
+                         "or an object with the buffer interface, %.80s found",
+                         i, Py_TYPE(item)->tp_name);
+            goto error;
+        }
+        nbufs = i + 1;  /* for error cleanup */
+        itemlen = buffers[i].len;
+        if (itemlen > PY_SSIZE_T_MAX - sz) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "join() result is too long");
+            goto error;
+        }
+        sz += itemlen;
+        if (i != 0) {
+            if (seplen > PY_SSIZE_T_MAX - sz) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "join() result is too long");
+                goto error;
+            }
+            sz += seplen;
+        }
+        if (seqlen != PySequence_Fast_GET_SIZE(seq)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                            "sequence changed size during iteration");
+            goto error;
+        }
+    }
+
+    /* Allocate result space. */
+    res = STRINGLIB_NEW(NULL, sz);
+    if (res == NULL)
+        goto error;
+
+    /* Catenate everything. */
+    p = STRINGLIB_STR(res);
+    if (!seplen) {
+        /* fast path */
+        for (i = 0; i < nbufs; i++) {
+            Py_ssize_t n = buffers[i].len;
+            char *q = buffers[i].buf;
+            Py_MEMCPY(p, q, n);
+            p += n;
+        }
+        goto done;
+    }
+    for (i = 0; i < nbufs; i++) {
+        Py_ssize_t n;
+        char *q;
+        if (i) {
+            Py_MEMCPY(p, sepstr, seplen);
+            p += seplen;
+        }
+        n = buffers[i].len;
+        q = buffers[i].buf;
+        Py_MEMCPY(p, q, n);
+        p += n;
+    }
+    goto done;
+
+error:
+    res = NULL;
+done:
+    Py_DECREF(seq);
+    for (i = 0; i < nbufs; i++)
+        PyBuffer_Release(&buffers[i]);
+    if (buffers != static_buffers)
+        PyMem_FREE(buffers);
+    return res;
+}
+
+#undef NB_STATIC_BUFFERS
diff --git a/Objects/stringlib/replace.h b/Objects/stringlib/replace.h
new file mode 100644
index 0000000..ef318ed
--- /dev/null
+++ b/Objects/stringlib/replace.h
@@ -0,0 +1,53 @@
+/* stringlib: replace implementation */
+
+#ifndef STRINGLIB_FASTSEARCH_H
+#error must include "stringlib/fastsearch.h" before including this module
+#endif
+
+Py_LOCAL_INLINE(void)
+STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end,
+                                 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
+{
+    *s = u2;
+    while (--maxcount && ++s != end) {
+        /* Find the next character to be replaced.
+
+           If it occurs often, it is faster to scan for it using an inline
+           loop.  If it occurs seldom, it is faster to scan for it using a
+           function call; the overhead of the function call is amortized
+           across the many characters that call covers.  We start with an
+           inline loop and use a heuristic to determine whether to fall back
+           to a function call. */
+        if (*s != u1) {
+            int attempts = 10;
+            /* search u1 in a dummy loop */
+            while (1) {
+                if (++s == end)
+                    return;
+                if (*s == u1)
+                    break;
+                if (!--attempts) {
+                    /* if u1 was not found for attempts iterations,
+                       use FASTSEARCH() or memchr() */
+#if STRINGLIB_SIZEOF_CHAR == 1
+                    s++;
+                    s = memchr(s, u1, end - s);
+                    if (s == NULL)
+                        return;
+#else
+                    Py_ssize_t i;
+                    STRINGLIB_CHAR ch1 = (STRINGLIB_CHAR) u1;
+                    s++;
+                    i = FASTSEARCH(s, end - s, &ch1, 1, 0, FAST_SEARCH);
+                    if (i < 0)
+                        return;
+                    s += i;
+#endif
+                    /* restart the dummy loop */
+                    break;
+                }
+            }
+        }
+        *s = u2;
+    }
+}
diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h
index 7bb91a7..ce27f3e 100644
--- a/Objects/stringlib/stringdefs.h
+++ b/Objects/stringlib/stringdefs.h
@@ -21,7 +21,6 @@
 #define STRINGLIB_STR            PyBytes_AS_STRING
 #define STRINGLIB_LEN            PyBytes_GET_SIZE
 #define STRINGLIB_NEW            PyBytes_FromStringAndSize
-#define STRINGLIB_RESIZE         _PyBytes_Resize
 #define STRINGLIB_CHECK          PyBytes_Check
 #define STRINGLIB_CHECK_EXACT    PyBytes_CheckExact
 #define STRINGLIB_TOSTR          PyObject_Str
diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h
index e8c6fcb..ce1eb57 100644
--- a/Objects/stringlib/ucs1lib.h
+++ b/Objects/stringlib/ucs1lib.h
@@ -19,7 +19,6 @@
 #define STRINGLIB_STR            PyUnicode_1BYTE_DATA
 #define STRINGLIB_LEN            PyUnicode_GET_LENGTH
 #define STRINGLIB_NEW            _PyUnicode_FromUCS1
-#define STRINGLIB_RESIZE         not_supported
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
 
diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h
index 45e5729..f900cb6 100644
--- a/Objects/stringlib/ucs2lib.h
+++ b/Objects/stringlib/ucs2lib.h
@@ -19,7 +19,6 @@
 #define STRINGLIB_STR            PyUnicode_2BYTE_DATA
 #define STRINGLIB_LEN            PyUnicode_GET_LENGTH
 #define STRINGLIB_NEW            _PyUnicode_FromUCS2
-#define STRINGLIB_RESIZE         not_supported
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
 
diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h
index 647a27e..86a480f 100644
--- a/Objects/stringlib/ucs4lib.h
+++ b/Objects/stringlib/ucs4lib.h
@@ -19,7 +19,6 @@
 #define STRINGLIB_STR            PyUnicode_4BYTE_DATA
 #define STRINGLIB_LEN            PyUnicode_GET_LENGTH
 #define STRINGLIB_NEW            _PyUnicode_FromUCS4
-#define STRINGLIB_RESIZE         not_supported
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
 
diff --git a/Objects/stringlib/undef.h b/Objects/stringlib/undef.h
index 03117ec..f9d3f1d 100644
--- a/Objects/stringlib/undef.h
+++ b/Objects/stringlib/undef.h
@@ -6,7 +6,6 @@
 #undef  STRINGLIB_STR
 #undef  STRINGLIB_LEN
 #undef  STRINGLIB_NEW
-#undef  STRINGLIB_RESIZE
 #undef  _Py_InsertThousandsGrouping
 #undef STRINGLIB_IS_UNICODE
 
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
index e9be516..9429169 100644
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -869,25 +869,19 @@ do_markup(SubString *input, PyObject *args, PyObject *kwargs,
     SubString literal;
     SubString field_name;
     SubString format_spec;
-    Py_UCS4 conversion, maxchar;
-    Py_ssize_t sublen;
-    int err;
+    Py_UCS4 conversion;
 
     MarkupIterator_init(&iter, input->str, input->start, input->end);
     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
                                          &field_name, &format_spec,
                                          &conversion,
                                          &format_spec_needs_expanding)) == 2) {
-        sublen = literal.end - literal.start;
-        if (sublen) {
-            maxchar = _PyUnicode_FindMaxChar(literal.str,
-                                             literal.start, literal.end);
-            err = _PyUnicodeWriter_Prepare(writer, sublen, maxchar);
-            if (err == -1)
+        if (literal.end != literal.start) {
+            if (!field_present && iter.str.start == iter.str.end)
+                writer->overallocate = 0;
+            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
+                                                literal.start, literal.end) < 0)
                 return 0;
-            _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
-                                          literal.str, literal.start, sublen);
-            writer->pos += sublen;
         }
 
         if (field_present) {
@@ -912,7 +906,6 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
              int recursion_depth, AutoNumber *auto_number)
 {
     _PyUnicodeWriter writer;
-    Py_ssize_t minlen;
 
     /* check the recursion level */
     if (recursion_depth <= 0) {
@@ -921,8 +914,9 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
         return NULL;
     }
 
-    minlen = PyUnicode_GET_LENGTH(input->str) + 100;
-    _PyUnicodeWriter_Init(&writer, minlen);
+    _PyUnicodeWriter_Init(&writer);
+    writer.overallocate = 1;
+    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
 
     if (!do_markup(input, args, kwargs, &writer, recursion_depth,
                    auto_number)) {
diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h
index f16f21e..48d00ec 100644
--- a/Objects/stringlib/unicodedefs.h
+++ b/Objects/stringlib/unicodedefs.h
@@ -21,7 +21,6 @@
 #define STRINGLIB_STR            PyUnicode_AS_UNICODE
 #define STRINGLIB_LEN            PyUnicode_GET_SIZE
 #define STRINGLIB_NEW            PyUnicode_FromUnicode
-#define STRINGLIB_RESIZE         PyUnicode_Resize
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact