40 files changed, 4208 insertions, 4129 deletions
diff --git a/Objects/abstract.c b/Objects/abstract.c
index 88205bd..5d8a44b 100644
--- a/Objects/abstract.c
+++ b/Objects/abstract.c
@@ -141,8 +141,11 @@ PyObject_GetItem(PyObject *o, PyObject *key)
         return null_error();
 
     m = o->ob_type->tp_as_mapping;
-    if (m && m->mp_subscript)
-        return m->mp_subscript(o, key);
+    if (m && m->mp_subscript) {
+        PyObject *item = m->mp_subscript(o, key);
+        assert((item != NULL) ^ (PyErr_Occurred() != NULL));
+        return item;
+    }
 
     if (o->ob_type->tp_as_sequence) {
         if (PyIndex_Check(key)) {
@@ -1348,21 +1351,39 @@ PyNumber_Float(PyObject *o)
 
     if (o == NULL)
         return null_error();
+    if (PyFloat_CheckExact(o)) {
+        Py_INCREF(o);
+        return o;
+    }
     m = o->ob_type->tp_as_number;
     if (m && m->nb_float) { /* This should include subclasses of float */
         PyObject *res = m->nb_float(o);
-        if (res && !PyFloat_Check(res)) {
+        double val;
+        if (!res || PyFloat_CheckExact(res)) {
+            return res;
+        }
+        if (!PyFloat_Check(res)) {
             PyErr_Format(PyExc_TypeError,
-              "__float__ returned non-float (type %.200s)",
-              res->ob_type->tp_name);
+                         "%.50s.__float__ returned non-float (type %.50s)",
+                         o->ob_type->tp_name, res->ob_type->tp_name);
             Py_DECREF(res);
             return NULL;
         }
-        return res;
+        /* Issue #26983: warn if 'res' not of exact type float. */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "%.50s.__float__ returned non-float (type %.50s).  "
+                "The ability to return an instance of a strict subclass of float "
+                "is deprecated, and may be removed in a future version of Python.",
+                o->ob_type->tp_name, res->ob_type->tp_name)) {
+            Py_DECREF(res);
+            return NULL;
+        }
+        val = PyFloat_AS_DOUBLE(res);
+        Py_DECREF(res);
+        return PyFloat_FromDouble(val);
     }
     if (PyFloat_Check(o)) { /* A float subclass with nb_float == NULL */
-        PyFloatObject *po = (PyFloatObject *)o;
-        return PyFloat_FromDouble(po->ob_fval);
+        return PyFloat_FromDouble(PyFloat_AS_DOUBLE(o));
     }
     return PyFloat_FromString(o);
 }
@@ -1544,8 +1565,10 @@ PySequence_GetItem(PyObject *s, Py_ssize_t i)
         if (i < 0) {
             if (m->sq_length) {
                 Py_ssize_t l = (*m->sq_length)(s);
-                if (l < 0)
+                if (l < 0) {
+                    assert(PyErr_Occurred());
                     return NULL;
+                }
                 i += l;
             }
         }
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
index 1458635..f8c21d4 100644
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -278,31 +278,6 @@ PyByteArray_Concat(PyObject *a, PyObject *b)
     return (PyObject *)result;
 }
 
-static PyObject *
-bytearray_format(PyByteArrayObject *self, PyObject *args)
-{
-    PyObject *bytes_in, *bytes_out, *res;
-    char *bytestring;
-
-    if (self == NULL || !PyByteArray_Check(self) || args == NULL) {
-        PyErr_BadInternalCall();
-        return NULL;
-    }
-    bytestring = PyByteArray_AS_STRING(self);
-    bytes_in = PyBytes_FromString(bytestring);
-    if (bytes_in == NULL)
-        return NULL;
-    bytes_out = _PyBytes_Format(bytes_in, args);
-    Py_DECREF(bytes_in);
-    if (bytes_out == NULL)
-        return NULL;
-    res = PyByteArray_FromObject(bytes_out);
-    Py_DECREF(bytes_out);
-    if (res == NULL)
-        return NULL;
-    return res;
-}
-
 /* Functions stuffed into the type object */
 
 static Py_ssize_t
@@ -1119,161 +1094,27 @@ bytearray_dealloc(PyByteArrayObject *self)
 #include "stringlib/transmogrify.h"
 
 
-/* The following Py_LOCAL_INLINE and Py_LOCAL functions
-were copied from the old char* style string object. */
-
-/* helper macro to fixup start/end slice values */
-#define ADJUST_INDICES(start, end, len)         \
-    if (end > len)                              \
-        end = len;                              \
-    else if (end < 0) {                         \
-        end += len;                             \
-        if (end < 0)                            \
-            end = 0;                            \
-    }                                           \
-    if (start < 0) {                            \
-        start += len;                           \
-        if (start < 0)                          \
-            start = 0;                          \
-    }
-
-Py_LOCAL_INLINE(Py_ssize_t)
-bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir)
-{
-    PyObject *subobj;
-    char byte;
-    Py_buffer subbuf;
-    const char *sub;
-    Py_ssize_t len, sub_len;
-    Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
-    Py_ssize_t res;
-
-    if (!stringlib_parse_args_finds_byte("find/rfind/index/rindex",
-                                         args, &subobj, &byte, &start, &end))
-        return -2;
-
-    if (subobj) {
-        if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0)
-            return -2;
-
-        sub = subbuf.buf;
-        sub_len = subbuf.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-    len = PyByteArray_GET_SIZE(self);
-
-    ADJUST_INDICES(start, end, len);
-    if (end - start < sub_len)
-        res = -1;
-    else if (sub_len == 1
-#ifndef HAVE_MEMRCHR
-            && dir > 0
-#endif
-    ) {
-        unsigned char needle = *sub;
-        int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH;
-        res = stringlib_fastsearch_memchr_1char(
-            PyByteArray_AS_STRING(self) + start, end - start,
-            needle, needle, mode);
-        if (res >= 0)
-            res += start;
-    }
-    else {
-        if (dir > 0)
-            res = stringlib_find_slice(
-                PyByteArray_AS_STRING(self), len,
-                sub, sub_len, start, end);
-        else
-            res = stringlib_rfind_slice(
-                PyByteArray_AS_STRING(self), len,
-                sub, sub_len, start, end);
-    }
-
-    if (subobj)
-        PyBuffer_Release(&subbuf);
-
-    return res;
-}
-
-PyDoc_STRVAR(find__doc__,
-"B.find(sub[, start[, end]]) -> int\n\
-\n\
-Return the lowest index in B where subsection sub is found,\n\
-such that sub is contained within B[start,end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytearray_find(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_find(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-PyDoc_STRVAR(count__doc__,
-"B.count(sub[, start[, end]]) -> int\n\
-\n\
-Return the number of non-overlapping occurrences of subsection sub in\n\
-bytes B[start:end].  Optional arguments start and end are interpreted\n\
-as in slice notation.");
-
 static PyObject *
 bytearray_count(PyByteArrayObject *self, PyObject *args)
 {
-    PyObject *sub_obj;
-    const char *str = PyByteArray_AS_STRING(self), *sub;
-    Py_ssize_t sub_len;
-    char byte;
-    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
-
-    Py_buffer vsub;
-    PyObject *count_obj;
-
-    if (!stringlib_parse_args_finds_byte("count", args, &sub_obj, &byte,
-                                         &start, &end))
-        return NULL;
-
-    if (sub_obj) {
-        if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0)
-            return NULL;
-
-        sub = vsub.buf;
-        sub_len = vsub.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-
-    ADJUST_INDICES(start, end, PyByteArray_GET_SIZE(self));
-
-    count_obj = PyLong_FromSsize_t(
-        stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
-        );
-
-    if (sub_obj)
-        PyBuffer_Release(&vsub);
-
-    return count_obj;
+    return _Py_bytes_count(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
 /*[clinic input]
 bytearray.clear
 
-    self: self(type="PyByteArrayObject *")
-
 Remove all items from the bytearray.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_clear_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=85c2fe6aede0956c input=e524fd330abcdc18]*/
+/*[clinic end generated code: output=85c2fe6aede0956c input=ed6edae9de447ac4]*/
 {
     if (PyByteArray_Resize((PyObject *)self, 0) < 0)
         return NULL;
@@ -1283,236 +1124,57 @@ bytearray_clear_impl(PyByteArrayObject *self)
 /*[clinic input]
 bytearray.copy
 
-    self: self(type="PyByteArrayObject *")
-
 Return a copy of B.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_copy_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=68cfbcfed484c132 input=6d5d2975aa0f33f3]*/
+/*[clinic end generated code: output=68cfbcfed484c132 input=6597b0c01bccaa9e]*/
 {
     return PyByteArray_FromStringAndSize(PyByteArray_AS_STRING((PyObject *)self),
                                          PyByteArray_GET_SIZE(self));
 }
 
-PyDoc_STRVAR(index__doc__,
-"B.index(sub[, start[, end]]) -> int\n\
-\n\
-Like B.find() but raise ValueError when the subsection is not found.");
-
 static PyObject *
 bytearray_index(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "subsection not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_index(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(rfind__doc__,
-"B.rfind(sub[, start[, end]]) -> int\n\
-\n\
-Return the highest index in B where subsection sub is found,\n\
-such that sub is contained within B[start,end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytearray_rfind(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rfind(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(rindex__doc__,
-"B.rindex(sub[, start[, end]]) -> int\n\
-\n\
-Like B.rfind() but raise ValueError when the subsection is not found.");
-
 static PyObject *
 bytearray_rindex(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "subsection not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rindex(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-
 static int
 bytearray_contains(PyObject *self, PyObject *arg)
 {
-    Py_ssize_t ival = PyNumber_AsSsize_t(arg, PyExc_ValueError);
-    if (ival == -1 && PyErr_Occurred()) {
-        Py_buffer varg;
-        Py_ssize_t pos;
-        PyErr_Clear();
-        if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0)
-            return -1;
-        pos = stringlib_find(PyByteArray_AS_STRING(self), Py_SIZE(self),
-                             varg.buf, varg.len, 0);
-        PyBuffer_Release(&varg);
-        return pos >= 0;
-    }
-    if (ival < 0 || ival >= 256) {
-        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
-        return -1;
-    }
-
-    return memchr(PyByteArray_AS_STRING(self), (int) ival, Py_SIZE(self)) != NULL;
+    return _Py_bytes_contains(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), arg);
 }
 
-
-/* Matches the end (direction >= 0) or start (direction < 0) of self
- * against substr, using the start and end arguments. Returns
- * -1 on error, 0 if not found and 1 if found.
- */
-Py_LOCAL(int)
-_bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start,
-                 Py_ssize_t end, int direction)
-{
-    Py_ssize_t len = PyByteArray_GET_SIZE(self);
-    const char* str;
-    Py_buffer vsubstr;
-    int rv = 0;
-
-    str = PyByteArray_AS_STRING(self);
-
-    if (PyObject_GetBuffer(substr, &vsubstr, PyBUF_SIMPLE) != 0)
-        return -1;
-
-    ADJUST_INDICES(start, end, len);
-
-    if (direction < 0) {
-        /* startswith */
-        if (start+vsubstr.len > len) {
-            goto done;
-        }
-    } else {
-        /* endswith */
-        if (end-start < vsubstr.len || start > len) {
-            goto done;
-        }
-
-        if (end-vsubstr.len > start)
-            start = end - vsubstr.len;
-    }
-    if (end-start >= vsubstr.len)
-        rv = ! memcmp(str+start, vsubstr.buf, vsubstr.len);
-
-done:
-    PyBuffer_Release(&vsubstr);
-    return rv;
-}
-
-
-PyDoc_STRVAR(startswith__doc__,
-"B.startswith(prefix[, start[, end]]) -> bool\n\
-\n\
-Return True if B starts with the specified prefix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-prefix can also be a tuple of bytes to try.");
-
 static PyObject *
 bytearray_startswith(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytearray_tailmatch(self,
-                                      PyTuple_GET_ITEM(subobj, i),
-                                      start, end, -1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytearray_tailmatch(self, subobj, start, end, -1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "startswith first arg must be bytes "
-                         "or a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_startswith(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-PyDoc_STRVAR(endswith__doc__,
-"B.endswith(suffix[, start[, end]]) -> bool\n\
-\n\
-Return True if B ends with the specified suffix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-suffix can also be a tuple of bytes to try.");
-
 static PyObject *
 bytearray_endswith(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytearray_tailmatch(self,
-                                      PyTuple_GET_ITEM(subobj, i),
-                                      start, end, +1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytearray_tailmatch(self, subobj, start, end, +1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "endswith first arg must be bytes or "
-                         "a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_endswith(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
 
 /*[clinic input]
 bytearray.translate
 
-    self: self(type="PyByteArrayObject *")
     table: object
         Translation table, which must be a bytes object of length 256.
     [
@@ -1529,7 +1191,7 @@ The remaining characters are mapped through the given translation table.
 static PyObject *
 bytearray_translate_impl(PyByteArrayObject *self, PyObject *table,
                          int group_right_1, PyObject *deletechars)
-/*[clinic end generated code: output=2bebc86a9a1ff083 input=b749ad85f4860824]*/
+/*[clinic end generated code: output=2bebc86a9a1ff083 input=846a01671bccc1c5]*/
 {
     char *input, *output;
     const char *table_chars;
@@ -1572,7 +1234,7 @@ bytearray_translate_impl(PyByteArrayObject *self, PyObject *table,
     result = PyByteArray_FromStringAndSize((char *)NULL, inlen);
     if (result == NULL)
         goto done;
-    output_start = output = PyByteArray_AsString(result);
+    output_start = output = PyByteArray_AS_STRING(result);
     input = PyByteArray_AS_STRING(input_obj);
 
     if (vdel.len == 0 && table_chars != NULL) {
@@ -1642,493 +1304,6 @@ bytearray_maketrans_impl(Py_buffer *frm, Py_buffer *to)
 }
 
 
-/* find and count characters and substrings */
-
-#define findchar(target, target_len, c)                         \
-  ((char *)memchr((const void *)(target), c, target_len))
-
-
-/* Bytes ops must return a string, create a copy */
-Py_LOCAL(PyByteArrayObject *)
-return_self(PyByteArrayObject *self)
-{
-    /* always return a new bytearray */
-    return (PyByteArrayObject *)PyByteArray_FromStringAndSize(
-            PyByteArray_AS_STRING(self),
-            PyByteArray_GET_SIZE(self));
-}
-
-Py_LOCAL_INLINE(Py_ssize_t)
-countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount)
-{
-    Py_ssize_t count=0;
-    const char *start=target;
-    const char *end=target+target_len;
-
-    while ( (start=findchar(start, end-start, c)) != NULL ) {
-        count++;
-        if (count >= maxcount)
-            break;
-        start += 1;
-    }
-    return count;
-}
-
-
-/* Algorithms for different cases of string replacement */
-
-/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_interleave(PyByteArrayObject *self,
-                   const char *to_s, Py_ssize_t to_len,
-                   Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, i;
-    PyByteArrayObject *result;
-
-    self_len = PyByteArray_GET_SIZE(self);
-
-    /* 1 at the end plus 1 after every character;
-       count = min(maxcount, self_len + 1) */
-    if (maxcount <= self_len)
-        count = maxcount;
-    else
-        /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
-        count = self_len + 1;
-
-    /* Check for overflow */
-    /*   result_len = count * to_len + self_len; */
-    assert(count > 0);
-    if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replace string is too long");
-        return NULL;
-    }
-    result_len = count * to_len + self_len;
-
-    if (! (result = (PyByteArrayObject *)
-                     PyByteArray_FromStringAndSize(NULL, result_len)) )
-        return NULL;
-
-    self_s = PyByteArray_AS_STRING(self);
-    result_s = PyByteArray_AS_STRING(result);
-
-    /* TODO: special case single character, which doesn't need memcpy */
-
-    /* Lay the first one down (guaranteed this will occur) */
-    Py_MEMCPY(result_s, to_s, to_len);
-    result_s += to_len;
-    count -= 1;
-
-    for (i=0; i<count; i++) {
-        *result_s++ = *self_s++;
-        Py_MEMCPY(result_s, to_s, to_len);
-        result_s += to_len;
-    }
-
-    /* Copy the rest of the original string */
-    Py_MEMCPY(result_s, self_s, self_len-i);
-
-    return result;
-}
-
-/* Special case for deleting a single character */
-/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_delete_single_character(PyByteArrayObject *self,
-                                char from_c, Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyByteArrayObject *result;
-
-    self_len = PyByteArray_GET_SIZE(self);
-    self_s = PyByteArray_AS_STRING(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        return return_self(self);
-    }
-
-    result_len = self_len - count;  /* from_len == 1 */
-    assert(result_len>=0);
-
-    if ( (result = (PyByteArrayObject *)
-                    PyByteArray_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        Py_MEMCPY(result_s, start, next-start);
-        result_s += (next-start);
-        start = next+1;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
-
-Py_LOCAL(PyByteArrayObject *)
-replace_delete_substring(PyByteArrayObject *self,
-                         const char *from_s, Py_ssize_t from_len,
-                         Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyByteArrayObject *result;
-
-    self_len = PyByteArray_GET_SIZE(self);
-    self_s = PyByteArray_AS_STRING(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches */
-        return return_self(self);
-    }
-
-    result_len = self_len - (count * from_len);
-    assert (result_len>=0);
-
-    if ( (result = (PyByteArrayObject *)
-        PyByteArray_FromStringAndSize(NULL, result_len)) == NULL )
-            return NULL;
-
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start + offset;
-
-        Py_MEMCPY(result_s, start, next-start);
-
-        result_s += (next-start);
-        start = next+from_len;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_single_character_in_place(PyByteArrayObject *self,
-                                  char from_c, char to_c,
-                                  Py_ssize_t maxcount)
-{
-    char *self_s, *result_s, *start, *end, *next;
-    Py_ssize_t self_len;
-    PyByteArrayObject *result;
-
-    /* The result string will be the same size */
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    next = findchar(self_s, self_len, from_c);
-
-    if (next == NULL) {
-        /* No matches; return the original bytes */
-        return return_self(self);
-    }
-
-    /* Need to make a new bytes */
-    result = (PyByteArrayObject *) PyByteArray_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + (next-self_s);
-    *start = to_c;
-    start++;
-    end = result_s + self_len;
-
-    while (--maxcount > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        *next = to_c;
-        start = next+1;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_substring_in_place(PyByteArrayObject *self,
-                           const char *from_s, Py_ssize_t from_len,
-                           const char *to_s, Py_ssize_t to_len,
-                           Py_ssize_t maxcount)
-{
-    char *result_s, *start, *end;
-    char *self_s;
-    Py_ssize_t self_len, offset;
-    PyByteArrayObject *result;
-
-    /* The result bytes will be the same size */
-
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    offset = stringlib_find(self_s, self_len,
-                            from_s, from_len,
-                            0);
-    if (offset == -1) {
-        /* No matches; return the original bytes */
-        return return_self(self);
-    }
-
-    /* Need to make a new bytes */
-    result = (PyByteArrayObject *) PyByteArray_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + offset;
-    Py_MEMCPY(start, to_s, from_len);
-    start += from_len;
-    end = result_s + self_len;
-
-    while ( --maxcount > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset==-1)
-            break;
-        Py_MEMCPY(start+offset, to_s, from_len);
-        start += offset+from_len;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_single_character(PyByteArrayObject *self,
-                         char from_c,
-                         const char *to_s, Py_ssize_t to_len,
-                         Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyByteArrayObject *result;
-
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* use the difference between current and new, hence the "-1" */
-    /*   result_len = self_len + count * (to_len-1)  */
-    assert(count > 0);
-    if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len - 1);
-
-    if ( (result = (PyByteArrayObject *)
-          PyByteArray_FromStringAndSize(NULL, result_len)) == NULL)
-            return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += 1;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+1;
-        }
-    }
-    /* Copy the remainder of the remaining bytes */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_substring(PyByteArrayObject *self,
-                  const char *from_s, Py_ssize_t from_len,
-                  const char *to_s, Py_ssize_t to_len,
-                  Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyByteArrayObject *result;
-
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* Check for overflow */
-    /*    result_len = self_len + count * (to_len-from_len) */
-    assert(count > 0);
-    if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len - from_len);
-
-    if ( (result = (PyByteArrayObject *)
-          PyByteArray_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start+offset;
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += from_len;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+from_len;
-        }
-    }
-    /* Copy the remainder of the remaining bytes */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-
-Py_LOCAL(PyByteArrayObject *)
-replace(PyByteArrayObject *self,
-        const char *from_s, Py_ssize_t from_len,
-        const char *to_s, Py_ssize_t to_len,
-        Py_ssize_t maxcount)
-{
-    if (maxcount < 0) {
-        maxcount = PY_SSIZE_T_MAX;
-    } else if (maxcount == 0 || PyByteArray_GET_SIZE(self) == 0) {
-        /* nothing to do; return the original bytes */
-        return return_self(self);
-    }
-
-    if (maxcount == 0 ||
-        (from_len == 0 && to_len == 0)) {
-        /* nothing to do; return the original bytes */
-        return return_self(self);
-    }
-
-    /* Handle zero-length special cases */
-
-    if (from_len == 0) {
-        /* insert the 'to' bytes everywhere.   */
-        /*    >>> "Python".replace("", ".")     */
-        /*    '.P.y.t.h.o.n.'                   */
-        return replace_interleave(self, to_s, to_len, maxcount);
-    }
-
-    /* Except for "".replace("", "A") == "A" there is no way beyond this */
-    /* point for an empty self bytes to generate a non-empty bytes */
-    /* Special case so the remaining code always gets a non-empty bytes */
-    if (PyByteArray_GET_SIZE(self) == 0) {
-        return return_self(self);
-    }
-
-    if (to_len == 0) {
-        /* delete all occurrences of 'from' bytes */
-        if (from_len == 1) {
-            return replace_delete_single_character(
-                    self, from_s[0], maxcount);
-        } else {
-            return replace_delete_substring(self, from_s, from_len, maxcount);
-        }
-    }
-
-    /* Handle special case where both bytes have the same length */
-
-    if (from_len == to_len) {
-        if (from_len == 1) {
-            return replace_single_character_in_place(
-                    self,
-                    from_s[0],
-                    to_s[0],
-                    maxcount);
-        } else {
-            return replace_substring_in_place(
-                self, from_s, from_len, to_s, to_len, maxcount);
-        }
-    }
-
-    /* Otherwise use the more generic algorithms */
-    if (from_len == 1) {
-        return replace_single_character(self, from_s[0],
-                                        to_s, to_len, maxcount);
-    } else {
-        /* len('from')>=2, len('to')>=1 */
-        return replace_substring(self, from_s, from_len, to_s, to_len, maxcount);
-    }
-}
-
-
 /*[clinic input]
 bytearray.replace
 
@@ -2150,9 +1325,9 @@ bytearray_replace_impl(PyByteArrayObject *self, Py_buffer *old,
                        Py_buffer *new, Py_ssize_t count)
 /*[clinic end generated code: output=d39884c4dc59412a input=aa379d988637c7fb]*/
 {
-    return (PyObject *)replace((PyByteArrayObject *) self,
-                               old->buf, old->len,
-                               new->buf, new->len, count);
+    return stringlib_replace((PyObject *)self,
+                             (const char *)old->buf, old->len,
+                             (const char *)new->buf, new->len, count);
 }
 
 /*[clinic input]
@@ -2200,7 +1375,6 @@ bytearray_split_impl(PyByteArrayObject *self, PyObject *sep,
 /*[clinic input]
 bytearray.partition
 
-    self: self(type="PyByteArrayObject *")
     sep: object
     /
 
@@ -2216,7 +1390,7 @@ bytearray object and two empty bytearray objects.
 
 static PyObject *
 bytearray_partition(PyByteArrayObject *self, PyObject *sep)
-/*[clinic end generated code: output=45d2525ddd35f957 input=7d7fe37b1696d506]*/
+/*[clinic end generated code: output=45d2525ddd35f957 input=86f89223892b70b5]*/
 {
     PyObject *bytesep, *result;
 
@@ -2238,7 +1412,6 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep)
 /*[clinic input]
 bytearray.rpartition
 
-    self: self(type="PyByteArrayObject *")
     sep: object
     /
 
@@ -2254,7 +1427,7 @@ objects and the original bytearray object.
 
 static PyObject *
 bytearray_rpartition(PyByteArrayObject *self, PyObject *sep)
-/*[clinic end generated code: output=440de3c9426115e8 input=9b8cd540c1b75853]*/
+/*[clinic end generated code: output=440de3c9426115e8 input=5f4094f2de87c8f3]*/
 {
     PyObject *bytesep, *result;
 
@@ -2312,14 +1485,12 @@ bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep,
 /*[clinic input]
 bytearray.reverse
 
-    self: self(type="PyByteArrayObject *")
-
 Reverse the order of the values in B in place.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_reverse_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=9f7616f29ab309d3 input=7933a499b8597bd1]*/
+/*[clinic end generated code: output=9f7616f29ab309d3 input=543356319fc78557]*/
 {
     char swap, *head, *tail;
     Py_ssize_t i, j, n = Py_SIZE(self);
@@ -2348,7 +1519,6 @@ class bytesvalue_converter(CConverter):
 /*[clinic input]
 bytearray.insert
 
-    self: self(type="PyByteArrayObject *")
     index: Py_ssize_t
         The index where the value is to be inserted.
     item: bytesvalue
@@ -2360,7 +1530,7 @@ Insert a single item into the bytearray before the given index.
 
 static PyObject *
 bytearray_insert_impl(PyByteArrayObject *self, Py_ssize_t index, int item)
-/*[clinic end generated code: output=76c775a70e7b07b7 input=833766836ba30e1e]*/
+/*[clinic end generated code: output=76c775a70e7b07b7 input=b2b5d07e9de6c070]*/
 {
     Py_ssize_t n = Py_SIZE(self);
     char *buf;
@@ -2390,7 +1560,6 @@ bytearray_insert_impl(PyByteArrayObject *self, Py_ssize_t index, int item)
 /*[clinic input]
 bytearray.append
 
-    self: self(type="PyByteArrayObject *")
     item: bytesvalue
         The item to be appended.
     /
@@ -2400,7 +1569,7 @@ Append a single item to the end of the bytearray.
 
 static PyObject *
 bytearray_append_impl(PyByteArrayObject *self, int item)
-/*[clinic end generated code: output=a154e19ed1886cb6 input=ae56ea87380407cc]*/
+/*[clinic end generated code: output=a154e19ed1886cb6 input=20d6bec3d1340593]*/
 {
     Py_ssize_t n = Py_SIZE(self);
 
@@ -2420,7 +1589,6 @@ bytearray_append_impl(PyByteArrayObject *self, int item)
 /*[clinic input]
 bytearray.extend
 
-    self: self(type="PyByteArrayObject *")
     iterable_of_ints: object
         The iterable of items to append.
     /
@@ -2430,7 +1598,7 @@ Append all the items from the iterator or sequence to the end of the bytearray.
 
 static PyObject *
 bytearray_extend(PyByteArrayObject *self, PyObject *iterable_of_ints)
-/*[clinic end generated code: output=98155dbe249170b1 input=ce83a5d75b70d850]*/
+/*[clinic end generated code: output=98155dbe249170b1 input=c617b3a93249ba28]*/
 {
     PyObject *it, *item, *bytearray_obj;
     Py_ssize_t buf_size = 0, len = 0;
@@ -2515,7 +1683,6 @@ bytearray_extend(PyByteArrayObject *self, PyObject *iterable_of_ints)
 /*[clinic input]
 bytearray.pop
 
-    self: self(type="PyByteArrayObject *")
     index: Py_ssize_t = -1
         The index from where to remove the item.
         -1 (the default value) means remove the last item.
@@ -2528,7 +1695,7 @@ If no index argument is given, will pop the last item.
 
 static PyObject *
 bytearray_pop_impl(PyByteArrayObject *self, Py_ssize_t index)
-/*[clinic end generated code: output=e0ccd401f8021da8 input=0797e6c0ca9d5a85]*/
+/*[clinic end generated code: output=e0ccd401f8021da8 input=3591df2d06c0d237]*/
 {
     int value;
     Py_ssize_t n = Py_SIZE(self);
@@ -2560,7 +1727,6 @@ bytearray_pop_impl(PyByteArrayObject *self, Py_ssize_t index)
 /*[clinic input]
 bytearray.remove
 
-    self: self(type="PyByteArrayObject *")
     value: bytesvalue
         The value to remove.
     /
@@ -2570,20 +1736,20 @@ Remove the first occurrence of a value in the bytearray.
 
 static PyObject *
 bytearray_remove_impl(PyByteArrayObject *self, int value)
-/*[clinic end generated code: output=d659e37866709c13 input=47560b11fd856c24]*/
+/*[clinic end generated code: output=d659e37866709c13 input=121831240cd51ddf]*/
 {
-    Py_ssize_t n = Py_SIZE(self);
+    Py_ssize_t where, n = Py_SIZE(self);
     char *buf = PyByteArray_AS_STRING(self);
-    char *where = memchr(buf, value, n);
 
-    if (!where) {
+    where = stringlib_find_char(buf, n, value);
+    if (where < 0) {
         PyErr_SetString(PyExc_ValueError, "value not found in bytearray");
         return NULL;
     }
     if (!_canresize(self))
         return NULL;
 
-    memmove(where, where + 1, buf + n - where);
+    memmove(buf + where, buf + where + 1, n - where);
     if (PyByteArray_Resize((PyObject *)self, n - 1) < 0)
         return NULL;
 
@@ -2593,8 +1759,8 @@ bytearray_remove_impl(PyByteArrayObject *self, int value)
 /* XXX These two helpers could be optimized if argsize == 1 */
 
 static Py_ssize_t
-lstrip_helper(char *myptr, Py_ssize_t mysize,
-              void *argptr, Py_ssize_t argsize)
+lstrip_helper(const char *myptr, Py_ssize_t mysize,
+              const void *argptr, Py_ssize_t argsize)
 {
     Py_ssize_t i = 0;
     while (i < mysize && memchr(argptr, (unsigned char) myptr[i], argsize))
@@ -2603,8 +1769,8 @@ lstrip_helper(char *myptr, Py_ssize_t mysize,
 }
 
 static Py_ssize_t
-rstrip_helper(char *myptr, Py_ssize_t mysize,
-              void *argptr, Py_ssize_t argsize)
+rstrip_helper(const char *myptr, Py_ssize_t mysize,
+              const void *argptr, Py_ssize_t argsize)
 {
     Py_ssize_t i = mysize - 1;
     while (i >= 0 && memchr(argptr, (unsigned char) myptr[i], argsize))
@@ -2805,27 +1971,10 @@ bytearray_splitlines_impl(PyByteArrayObject *self, int keepends)
         );
 }
 
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
-    if (c >= 128)
-        return -1;
-    if (Py_ISDIGIT(c))
-        return c - '0';
-    else {
-        if (Py_ISUPPER(c))
-            c = Py_TOLOWER(c);
-        if (c >= 'a' && c <= 'f')
-            return c - 'a' + 10;
-    }
-    return -1;
-}
-
 /*[clinic input]
 @classmethod
 bytearray.fromhex
 
-    cls: self(type="PyObject*")
     string: unicode
     /
 
@@ -2836,51 +1985,15 @@ Example: bytearray.fromhex('B9 01EF') -> bytearray(b'\\xb9\\x01\\xef')
 [clinic start generated code]*/
 
 static PyObject *
-bytearray_fromhex_impl(PyObject*cls, PyObject *string)
-/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
+bytearray_fromhex_impl(PyTypeObject *type, PyObject *string)
+/*[clinic end generated code: output=8f0f0b6d30fb3ba0 input=f033a16d1fb21f48]*/
 {
-    PyObject *newbytes;
-    char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
-
-    assert(PyUnicode_Check(string));
-    if (PyUnicode_READY(string))
-        return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
-    hexlen = PyUnicode_GET_LENGTH(string);
-
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
-    if (!newbytes)
-        return NULL;
-    buf = PyByteArray_AS_STRING(newbytes);
-    for (i = j = 0; i < hexlen; i += 2) {
-        /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
-            goto error;
-        }
-        buf[j++] = (top << 4) + bot;
+    PyObject *result = _PyBytes_FromHex(string, type == &PyByteArray_Type);
+    if (type != &PyByteArray_Type && result != NULL) {
+        Py_SETREF(result, PyObject_CallFunctionObjArgs((PyObject *)type,
+                                                       result, NULL));
     }
-    if (PyByteArray_Resize(newbytes, j) < 0)
-        goto error;
-    return newbytes;
-
-  error:
-    Py_DECREF(newbytes);
-    return NULL;
+    return result;
 }
 
 PyDoc_STRVAR(hex__doc__,
@@ -2935,14 +2048,12 @@ _common_reduce(PyByteArrayObject *self, int proto)
 /*[clinic input]
 bytearray.__reduce__ as bytearray_reduce
 
-    self: self(type="PyByteArrayObject *")
-
 Return state information for pickling.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_reduce_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=52bf304086464cab input=fbb07de4d102a03a]*/
+/*[clinic end generated code: output=52bf304086464cab input=44b5737ada62dd3f]*/
 {
     return _common_reduce(self, 2);
 }
@@ -2950,7 +2061,6 @@ bytearray_reduce_impl(PyByteArrayObject *self)
 /*[clinic input]
 bytearray.__reduce_ex__ as bytearray_reduce_ex
 
-    self: self(type="PyByteArrayObject *")
     proto: int = 0
     /
 
@@ -2959,7 +2069,7 @@ Return state information for pickling.
 
 static PyObject *
 bytearray_reduce_ex_impl(PyByteArrayObject *self, int proto)
-/*[clinic end generated code: output=52eac33377197520 input=0e091a42ca6dbd91]*/
+/*[clinic end generated code: output=52eac33377197520 input=f129bc1a1aa151ee]*/
 {
     return _common_reduce(self, proto);
 }
@@ -2967,14 +2077,12 @@ bytearray_reduce_ex_impl(PyByteArrayObject *self, int proto)
 /*[clinic input]
 bytearray.__sizeof__ as bytearray_sizeof
 
-    self: self(type="PyByteArrayObject *")
-
 Returns the size of the bytearray object in memory, in bytes.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_sizeof_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=738abdd17951c427 input=6b23d305362b462b]*/
+/*[clinic end generated code: output=738abdd17951c427 input=e27320fd98a4bc5a]*/
 {
     Py_ssize_t res;
 
@@ -3015,19 +2123,22 @@ bytearray_methods[] = {
     BYTEARRAY_APPEND_METHODDEF
     {"capitalize", (PyCFunction)stringlib_capitalize, METH_NOARGS,
      _Py_capitalize__doc__},
-    {"center", (PyCFunction)stringlib_center, METH_VARARGS, center__doc__},
+    {"center", (PyCFunction)stringlib_center, METH_VARARGS, _Py_center__doc__},
     BYTEARRAY_CLEAR_METHODDEF
     BYTEARRAY_COPY_METHODDEF
-    {"count", (PyCFunction)bytearray_count, METH_VARARGS, count__doc__},
+    {"count", (PyCFunction)bytearray_count, METH_VARARGS,
+     _Py_count__doc__},
     BYTEARRAY_DECODE_METHODDEF
-    {"endswith", (PyCFunction)bytearray_endswith, METH_VARARGS, endswith__doc__},
+    {"endswith", (PyCFunction)bytearray_endswith, METH_VARARGS,
+     _Py_endswith__doc__},
     {"expandtabs", (PyCFunction)stringlib_expandtabs, METH_VARARGS | METH_KEYWORDS,
-     expandtabs__doc__},
+     _Py_expandtabs__doc__},
     BYTEARRAY_EXTEND_METHODDEF
-    {"find", (PyCFunction)bytearray_find, METH_VARARGS, find__doc__},
+    {"find", (PyCFunction)bytearray_find, METH_VARARGS,
+     _Py_find__doc__},
     BYTEARRAY_FROMHEX_METHODDEF
     {"hex", (PyCFunction)bytearray_hex, METH_NOARGS, hex__doc__},
-    {"index", (PyCFunction)bytearray_index, METH_VARARGS, index__doc__},
+    {"index", (PyCFunction)bytearray_index, METH_VARARGS, _Py_index__doc__},
     BYTEARRAY_INSERT_METHODDEF
     {"isalnum", (PyCFunction)stringlib_isalnum, METH_NOARGS,
      _Py_isalnum__doc__},
@@ -3044,7 +2155,7 @@ bytearray_methods[] = {
     {"isupper", (PyCFunction)stringlib_isupper, METH_NOARGS,
      _Py_isupper__doc__},
     BYTEARRAY_JOIN_METHODDEF
-    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__},
+    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, _Py_ljust__doc__},
     {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__},
     BYTEARRAY_LSTRIP_METHODDEF
     BYTEARRAY_MAKETRANS_METHODDEF
@@ -3053,23 +2164,23 @@ bytearray_methods[] = {
     BYTEARRAY_REMOVE_METHODDEF
     BYTEARRAY_REPLACE_METHODDEF
     BYTEARRAY_REVERSE_METHODDEF
-    {"rfind", (PyCFunction)bytearray_rfind, METH_VARARGS, rfind__doc__},
-    {"rindex", (PyCFunction)bytearray_rindex, METH_VARARGS, rindex__doc__},
-    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__},
+    {"rfind", (PyCFunction)bytearray_rfind, METH_VARARGS, _Py_rfind__doc__},
+    {"rindex", (PyCFunction)bytearray_rindex, METH_VARARGS, _Py_rindex__doc__},
+    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, _Py_rjust__doc__},
     BYTEARRAY_RPARTITION_METHODDEF
     BYTEARRAY_RSPLIT_METHODDEF
     BYTEARRAY_RSTRIP_METHODDEF
     BYTEARRAY_SPLIT_METHODDEF
     BYTEARRAY_SPLITLINES_METHODDEF
     {"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS ,
-     startswith__doc__},
+     _Py_startswith__doc__},
     BYTEARRAY_STRIP_METHODDEF
     {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS,
      _Py_swapcase__doc__},
     {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__},
     BYTEARRAY_TRANSLATE_METHODDEF
     {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__},
-    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__},
+    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, _Py_zfill__doc__},
     {NULL}
 };
 
@@ -3078,7 +2189,7 @@ bytearray_mod(PyObject *v, PyObject *w)
 {
     if (!PyByteArray_Check(v))
         Py_RETURN_NOTIMPLEMENTED;
-    return bytearray_format((PyByteArrayObject *)v, w);
+    return _PyBytes_FormatEx(PyByteArray_AS_STRING(v), PyByteArray_GET_SIZE(v), w, 1);
 }
 
 static PyNumberMethods bytearray_as_number = {
diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c
index a299915..d7f061b 100644
--- a/Objects/bytes_methods.c
+++ b/Objects/bytes_methods.c
@@ -1,3 +1,4 @@
+#define PY_SSIZE_T_CLEAN
 #include "Python.h"
 #include "bytes_methods.h"
 
@@ -277,7 +278,7 @@ Return a titlecased version of B, i.e. ASCII words start with uppercase\n\
 characters, all remaining cased characters have lowercase.");
 
 void
-_Py_bytes_title(char *result, char *s, Py_ssize_t len)
+_Py_bytes_title(char *result, const char *s, Py_ssize_t len)
 {
     Py_ssize_t i;
     int previous_is_cased = 0;
@@ -306,7 +307,7 @@ Return a copy of B with only its first character capitalized (ASCII)\n\
 and the rest lower-cased.");
 
 void
-_Py_bytes_capitalize(char *result, char *s, Py_ssize_t len)
+_Py_bytes_capitalize(char *result, const char *s, Py_ssize_t len)
 {
     Py_ssize_t i;
 
@@ -336,7 +337,7 @@ Return a copy of B with uppercase ASCII characters converted\n\
 to lowercase ASCII and vice versa.");
 
 void
-_Py_bytes_swapcase(char *result, char *s, Py_ssize_t len)
+_Py_bytes_swapcase(char *result, const char *s, Py_ssize_t len)
 {
     Py_ssize_t i;
 
@@ -387,3 +388,427 @@ _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to)
 
     return res;
 }
+
+#define FASTSEARCH fastsearch
+#define STRINGLIB(F) stringlib_##F
+#define STRINGLIB_CHAR char
+#define STRINGLIB_SIZEOF_CHAR 1
+
+#include "stringlib/fastsearch.h"
+#include "stringlib/count.h"
+#include "stringlib/find.h"
+
+/*
+Wraps stringlib_parse_args_finds() and additionally checks whether the
+first argument is an integer in range(0, 256).
+
+If this is the case, writes the integer value to the byte parameter
+and sets subobj to NULL. Otherwise, sets the first argument to subobj
+and doesn't touch byte. The other parameters are similar to those of
+stringlib_parse_args_finds().
+*/
+
+Py_LOCAL_INLINE(int)
+parse_args_finds_byte(const char *function_name, PyObject *args,
+                      PyObject **subobj, char *byte,
+                      Py_ssize_t *start, Py_ssize_t *end)
+{
+    PyObject *tmp_subobj;
+    Py_ssize_t ival;
+    PyObject *err;
+
+    if(!stringlib_parse_args_finds(function_name, args, &tmp_subobj,
+                                   start, end))
+        return 0;
+
+    if (!PyNumber_Check(tmp_subobj)) {
+        *subobj = tmp_subobj;
+        return 1;
+    }
+
+    ival = PyNumber_AsSsize_t(tmp_subobj, PyExc_OverflowError);
+    if (ival == -1) {
+        err = PyErr_Occurred();
+        if (err && !PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) {
+            PyErr_Clear();
+            *subobj = tmp_subobj;
+            return 1;
+        }
+    }
+
+    if (ival < 0 || ival > 255) {
+        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
+        return 0;
+    }
+
+    *subobj = NULL;
+    *byte = (char)ival;
+    return 1;
+}
+
+/* helper macro to fixup start/end slice values */
+#define ADJUST_INDICES(start, end, len)         \
+    if (end > len)                          \
+        end = len;                          \
+    else if (end < 0) {                     \
+        end += len;                         \
+        if (end < 0)                        \
+        end = 0;                        \
+    }                                       \
+    if (start < 0) {                        \
+        start += len;                       \
+        if (start < 0)                      \
+        start = 0;                      \
+    }
+
+Py_LOCAL_INLINE(Py_ssize_t)
+find_internal(const char *str, Py_ssize_t len,
+              const char *function_name, PyObject *args, int dir)
+{
+    PyObject *subobj;
+    char byte;
+    Py_buffer subbuf;
+    const char *sub;
+    Py_ssize_t sub_len;
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+    Py_ssize_t res;
+
+    if (!parse_args_finds_byte(function_name, args,
+                               &subobj, &byte, &start, &end))
+        return -2;
+
+    if (subobj) {
+        if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0)
+            return -2;
+
+        sub = subbuf.buf;
+        sub_len = subbuf.len;
+    }
+    else {
+        sub = &byte;
+        sub_len = 1;
+    }
+
+    ADJUST_INDICES(start, end, len);
+    if (end - start < sub_len)
+        res = -1;
+    else if (sub_len == 1) {
+        if (dir > 0)
+            res = stringlib_find_char(
+                str + start, end - start,
+                *sub);
+        else
+            res = stringlib_rfind_char(
+                str + start, end - start,
+                *sub);
+        if (res >= 0)
+            res += start;
+    }
+    else {
+        if (dir > 0)
+            res = stringlib_find_slice(
+                str, len,
+                sub, sub_len, start, end);
+        else
+            res = stringlib_rfind_slice(
+                str, len,
+                sub, sub_len, start, end);
+    }
+
+    if (subobj)
+        PyBuffer_Release(&subbuf);
+
+    return res;
+}
+
+PyDoc_STRVAR_shared(_Py_find__doc__,
+"B.find(sub[, start[, end]]) -> int\n\
+\n\
+Return the lowest index in B where subsection sub is found,\n\
+such that sub is contained within B[start,end].  Optional\n\
+arguments start and end are interpreted as in slice notation.\n\
+\n\
+Return -1 on failure.");
+
+PyObject *
+_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "find", args, +1);
+    if (result == -2)
+        return NULL;
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_index__doc__,
+"B.index(sub[, start[, end]]) -> int\n\
+\n\
+Like B.find() but raise ValueError when the subsection is not found.");
+
+PyObject *
+_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "index", args, +1);
+    if (result == -2)
+        return NULL;
+    if (result == -1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "subsection not found");
+        return NULL;
+    }
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_rfind__doc__,
+"B.rfind(sub[, start[, end]]) -> int\n\
+\n\
+Return the highest index in B where subsection sub is found,\n\
+such that sub is contained within B[start,end].  Optional\n\
+arguments start and end are interpreted as in slice notation.\n\
+\n\
+Return -1 on failure.");
+
+PyObject *
+_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "rfind", args, -1);
+    if (result == -2)
+        return NULL;
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_rindex__doc__,
+"B.rindex(sub[, start[, end]]) -> int\n\
+\n\
+Like B.rfind() but raise ValueError when the subsection is not found.");
+
+PyObject *
+_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "rindex", args, -1);
+    if (result == -2)
+        return NULL;
+    if (result == -1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "subsection not found");
+        return NULL;
+    }
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_count__doc__,
+"B.count(sub[, start[, end]]) -> int\n\
+\n\
+Return the number of non-overlapping occurrences of subsection sub in\n\
+bytes B[start:end].  Optional arguments start and end are interpreted\n\
+as in slice notation.");
+
+PyObject *
+_Py_bytes_count(const char *str, Py_ssize_t len, PyObject *args)
+{
+    PyObject *sub_obj;
+    const char *sub;
+    Py_ssize_t sub_len;
+    char byte;
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+
+    Py_buffer vsub;
+    PyObject *count_obj;
+
+    if (!parse_args_finds_byte("count", args,
+                               &sub_obj, &byte, &start, &end))
+        return NULL;
+
+    if (sub_obj) {
+        if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0)
+            return NULL;
+
+        sub = vsub.buf;
+        sub_len = vsub.len;
+    }
+    else {
+        sub = &byte;
+        sub_len = 1;
+    }
+
+    ADJUST_INDICES(start, end, len);
+
+    count_obj = PyLong_FromSsize_t(
+        stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
+        );
+
+    if (sub_obj)
+        PyBuffer_Release(&vsub);
+
+    return count_obj;
+}
+
+int
+_Py_bytes_contains(const char *str, Py_ssize_t len, PyObject *arg)
+{
+    Py_ssize_t ival = PyNumber_AsSsize_t(arg, NULL);
+    if (ival == -1 && PyErr_Occurred()) {
+        Py_buffer varg;
+        Py_ssize_t pos;
+        PyErr_Clear();
+        if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0)
+            return -1;
+        pos = stringlib_find(str, len,
+                             varg.buf, varg.len, 0);
+        PyBuffer_Release(&varg);
+        return pos >= 0;
+    }
+    if (ival < 0 || ival >= 256) {
+        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
+        return -1;
+    }
+
+    return memchr(str, (int) ival, len) != NULL;
+}
+
+
+/* Matches the end (direction >= 0) or start (direction < 0) of the buffer
+ * against substr, using the start and end arguments. Returns
+ * -1 on error, 0 if not found and 1 if found.
+ */
+Py_LOCAL(int)
+tailmatch(const char *str, Py_ssize_t len, PyObject *substr,
+          Py_ssize_t start, Py_ssize_t end, int direction)
+{
+    Py_buffer sub_view = {NULL, NULL};
+    const char *sub;
+    Py_ssize_t slen;
+
+    if (PyBytes_Check(substr)) {
+        sub = PyBytes_AS_STRING(substr);
+        slen = PyBytes_GET_SIZE(substr);
+    }
+    else {
+        if (PyObject_GetBuffer(substr, &sub_view, PyBUF_SIMPLE) != 0)
+            return -1;
+        sub = sub_view.buf;
+        slen = sub_view.len;
+    }
+
+    ADJUST_INDICES(start, end, len);
+
+    if (direction < 0) {
+        /* startswith */
+        if (start + slen > len)
+            goto notfound;
+    } else {
+        /* endswith */
+        if (end - start < slen || start > len)
+            goto notfound;
+
+        if (end - slen > start)
+            start = end - slen;
+    }
+    if (end - start < slen)
+        goto notfound;
+    if (memcmp(str + start, sub, slen) != 0)
+        goto notfound;
+
+    PyBuffer_Release(&sub_view);
+    return 1;
+
+notfound:
+    PyBuffer_Release(&sub_view);
+    return 0;
+}
+
+Py_LOCAL(PyObject *)
+_Py_bytes_tailmatch(const char *str, Py_ssize_t len,
+                    const char *function_name, PyObject *args,
+                    int direction)
+{
+    Py_ssize_t start = 0;
+    Py_ssize_t end = PY_SSIZE_T_MAX;
+    PyObject *subobj;
+    int result;
+
+    if (!stringlib_parse_args_finds(function_name, args, &subobj, &start, &end))
+        return NULL;
+    if (PyTuple_Check(subobj)) {
+        Py_ssize_t i;
+        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
+            result = tailmatch(str, len, PyTuple_GET_ITEM(subobj, i),
+                               start, end, direction);
+            if (result == -1)
+                return NULL;
+            else if (result) {
+                Py_RETURN_TRUE;
+            }
+        }
+        Py_RETURN_FALSE;
+    }
+    result = tailmatch(str, len, subobj, start, end, direction);
+    if (result == -1) {
+        if (PyErr_ExceptionMatches(PyExc_TypeError))
+            PyErr_Format(PyExc_TypeError,
+                         "%s first arg must be bytes or a tuple of bytes, "
+                         "not %s",
+                         function_name, Py_TYPE(subobj)->tp_name);
+        return NULL;
+    }
+    else
+        return PyBool_FromLong(result);
+}
+
+PyDoc_STRVAR_shared(_Py_startswith__doc__,
+"B.startswith(prefix[, start[, end]]) -> bool\n\
+\n\
+Return True if B starts with the specified prefix, False otherwise.\n\
+With optional start, test B beginning at that position.\n\
+With optional end, stop comparing B at that position.\n\
+prefix can also be a tuple of bytes to try.");
+
+PyObject *
+_Py_bytes_startswith(const char *str, Py_ssize_t len, PyObject *args)
+{
+    return _Py_bytes_tailmatch(str, len, "startswith", args, -1);
+}
+
+PyDoc_STRVAR_shared(_Py_endswith__doc__,
+"B.endswith(suffix[, start[, end]]) -> bool\n\
+\n\
+Return True if B ends with the specified suffix, False otherwise.\n\
+With optional start, test B beginning at that position.\n\
+With optional end, stop comparing B at that position.\n\
+suffix can also be a tuple of bytes to try.");
+
+PyObject *
+_Py_bytes_endswith(const char *str, Py_ssize_t len, PyObject *args)
+{
+    return _Py_bytes_tailmatch(str, len, "endswith", args, +1);
+}
+
+PyDoc_STRVAR_shared(_Py_expandtabs__doc__,
+"B.expandtabs(tabsize=8) -> copy of B\n\
+\n\
+Return a copy of B where all tab characters are expanded using spaces.\n\
+If tabsize is not given, a tab size of 8 characters is assumed.");
+
+PyDoc_STRVAR_shared(_Py_ljust__doc__,
+"B.ljust(width[, fillchar]) -> copy of B\n"
+"\n"
+"Return B left justified in a string of length width. Padding is\n"
+"done using the specified fill character (default is a space).");
+
+PyDoc_STRVAR_shared(_Py_rjust__doc__,
+"B.rjust(width[, fillchar]) -> copy of B\n"
+"\n"
+"Return B right justified in a string of length width. Padding is\n"
+"done using the specified fill character (default is a space)");
+
+PyDoc_STRVAR_shared(_Py_center__doc__,
+"B.center(width[, fillchar]) -> copy of B\n"
+"\n"
+"Return B centered in a string of length width.  Padding is\n"
+"done using the specified fill character (default is a space).");
+
+PyDoc_STRVAR_shared(_Py_zfill__doc__,
+"B.zfill(width) -> copy of B\n"
+"\n"
+"Pad a numeric string B with zeros on the left, to fill a field\n"
+"of the specified width.  B is never truncated.");
+
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 5934336..5f77867 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -9,9 +9,9 @@
 #include <stddef.h>
 
 /*[clinic input]
-class bytes "PyBytesObject*" "&PyBytes_Type"
+class bytes "PyBytesObject *" "&PyBytes_Type"
 [clinic start generated code]*/
-/*[clinic end generated code: output=da39a3ee5e6b4b0d input=1a1d9102afc1b00c]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=7a238f965d64892b]*/
 
 #include "clinic/bytesobject.c.h"
 
@@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
 */
 #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
 
+/* Forward declaration */
+Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
+                                                   char *str);
+
 /*
    For PyBytes_FromString(), the parameter `str' points to a null-terminated
    string containing exactly `size' bytes.
@@ -174,190 +178,184 @@ PyBytes_FromString(const char *str)
 PyObject *
 PyBytes_FromFormatV(const char *format, va_list vargs)
 {
-    va_list count;
-    Py_ssize_t n = 0;
-    const char* f;
     char *s;
-    PyObject* string;
+    const char *f;
+    const char *p;
+    Py_ssize_t prec;
+    int longflag;
+    int size_tflag;
+    /* Longest 64-bit formatted numbers:
+       - "18446744073709551615\0" (21 bytes)
+       - "-9223372036854775808\0" (21 bytes)
+       Decimal takes the most space (it isn't enough for octal.)
+
+       Longest 64-bit pointer representation:
+       "0xffffffffffffffff\0" (19 bytes). */
+    char buffer[21];
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+
+    s = _PyBytesWriter_Alloc(&writer, strlen(format));
+    if (s == NULL)
+        return NULL;
+    writer.overallocate = 1;
+
+#define WRITE_BYTES(str) \
+    do { \
+        s = _PyBytesWriter_WriteBytes(&writer, s, (str), strlen(str)); \
+        if (s == NULL) \
+            goto error; \
+    } while (0)
 
-    Py_VA_COPY(count, vargs);
-    /* step 1: figure out how large a buffer we need */
     for (f = format; *f; f++) {
-        if (*f == '%') {
-            const char* p = f;
-            while (*++f && *f != '%' && !Py_ISALPHA(*f))
-                ;
-
-            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
-             * they don't affect the amount of space we reserve.
-             */
-            if ((*f == 'l' || *f == 'z') &&
-                            (f[1] == 'd' || f[1] == 'u'))
-                ++f;
-
-            switch (*f) {
-            case 'c':
-            {
-                int c = va_arg(count, int);
-                if (c < 0 || c > 255) {
-                    PyErr_SetString(PyExc_OverflowError,
-                                    "PyBytes_FromFormatV(): %c format "
-                                    "expects an integer in range [0; 255]");
-                    return NULL;
-                }
-                n++;
-                break;
+        if (*f != '%') {
+            *s++ = *f;
+            continue;
+        }
+
+        p = f++;
+
+        /* ignore the width (ex: 10 in "%10s") */
+        while (Py_ISDIGIT(*f))
+            f++;
+
+        /* parse the precision (ex: 10 in "%.10s") */
+        prec = 0;
+        if (*f == '.') {
+            f++;
+            for (; Py_ISDIGIT(*f); f++) {
+                prec = (prec * 10) + (*f - '0');
             }
-            case '%':
-                n++;
-                break;
-            case 'd': case 'u': case 'i': case 'x':
-                (void) va_arg(count, int);
-                /* 20 bytes is enough to hold a 64-bit
-                   integer.  Decimal takes the most space.
-                   This isn't enough for octal. */
-                n += 20;
-                break;
-            case 's':
-                s = va_arg(count, char*);
-                n += strlen(s);
-                break;
-            case 'p':
-                (void) va_arg(count, int);
-                /* maximum 64-bit pointer representation:
-                 * 0xffffffffffffffff
-                 * so 19 characters is enough.
-                 * XXX I count 18 -- what's the extra for?
-                 */
-                n += 19;
-                break;
-            default:
-                /* if we stumble upon an unknown
-                   formatting code, copy the rest of
-                   the format string to the output
-                   string. (we cannot just skip the
-                   code, since there's no way to know
-                   what's in the argument list) */
-                n += strlen(p);
-                goto expand;
+        }
+
+        while (*f && *f != '%' && !Py_ISALPHA(*f))
+            f++;
+
+        /* handle the long flag ('l'), but only for %ld and %lu.
+           others can be added when necessary. */
+        longflag = 0;
+        if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
+            longflag = 1;
+            ++f;
+        }
+
+        /* handle the size_t flag ('z'). */
+        size_tflag = 0;
+        if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
+            size_tflag = 1;
+            ++f;
+        }
+
+        /* substract bytes preallocated for the format string
+           (ex: 2 for "%s") */
+        writer.min_size -= (f - p + 1);
+
+        switch (*f) {
+        case 'c':
+        {
+            int c = va_arg(vargs, int);
+            if (c < 0 || c > 255) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "PyBytes_FromFormatV(): %c format "
+                                "expects an integer in range [0; 255]");
+                goto error;
             }
-        } else
-            n++;
-    }
- expand:
-    /* step 2: fill the buffer */
-    /* Since we've analyzed how much space we need for the worst case,
-       use sprintf directly instead of the slower PyOS_snprintf. */
-    string = PyBytes_FromStringAndSize(NULL, n);
-    if (!string)
-        return NULL;
+            writer.min_size++;
+            *s++ = (unsigned char)c;
+            break;
+        }
 
-    s = PyBytes_AsString(string);
+        case 'd':
+            if (longflag)
+                sprintf(buffer, "%ld", va_arg(vargs, long));
+            else if (size_tflag)
+                sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
+                    va_arg(vargs, Py_ssize_t));
+            else
+                sprintf(buffer, "%d", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
 
-    for (f = format; *f; f++) {
-        if (*f == '%') {
-            const char* p = f++;
+        case 'u':
+            if (longflag)
+                sprintf(buffer, "%lu",
+                    va_arg(vargs, unsigned long));
+            else if (size_tflag)
+                sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
+                    va_arg(vargs, size_t));
+            else
+                sprintf(buffer, "%u",
+                    va_arg(vargs, unsigned int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 'i':
+            sprintf(buffer, "%i", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 'x':
+            sprintf(buffer, "%x", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 's':
+        {
             Py_ssize_t i;
-            int longflag = 0;
-            int size_tflag = 0;
-            /* parse the width.precision part (we're only
-               interested in the precision value, if any) */
-            n = 0;
-            while (Py_ISDIGIT(*f))
-                n = (n*10) + *f++ - '0';
-            if (*f == '.') {
-                f++;
-                n = 0;
-                while (Py_ISDIGIT(*f))
-                    n = (n*10) + *f++ - '0';
-            }
-            while (*f && *f != '%' && !Py_ISALPHA(*f))
-                f++;
-            /* handle the long flag, but only for %ld and %lu.
-               others can be added when necessary. */
-            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
-                longflag = 1;
-                ++f;
-            }
-            /* handle the size_t flag. */
-            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
-                size_tflag = 1;
-                ++f;
-            }
 
-            switch (*f) {
-            case 'c':
-            {
-                int c = va_arg(vargs, int);
-                /* c has been checked for overflow in the first step */
-                *s++ = (unsigned char)c;
-                break;
+            p = va_arg(vargs, const char*);
+            i = strlen(p);
+            if (prec > 0 && i > prec)
+                i = prec;
+            s = _PyBytesWriter_WriteBytes(&writer, s, p, i);
+            if (s == NULL)
+                goto error;
+            break;
+        }
+
+        case 'p':
+            sprintf(buffer, "%p", va_arg(vargs, void*));
+            assert(strlen(buffer) < sizeof(buffer));
+            /* %p is ill-defined:  ensure leading 0x. */
+            if (buffer[1] == 'X')
+                buffer[1] = 'x';
+            else if (buffer[1] != 'x') {
+                memmove(buffer+2, buffer, strlen(buffer)+1);
+                buffer[0] = '0';
+                buffer[1] = 'x';
             }
-            case 'd':
-                if (longflag)
-                    sprintf(s, "%ld", va_arg(vargs, long));
-                else if (size_tflag)
-                    sprintf(s, "%" PY_FORMAT_SIZE_T "d",
-                        va_arg(vargs, Py_ssize_t));
-                else
-                    sprintf(s, "%d", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 'u':
-                if (longflag)
-                    sprintf(s, "%lu",
-                        va_arg(vargs, unsigned long));
-                else if (size_tflag)
-                    sprintf(s, "%" PY_FORMAT_SIZE_T "u",
-                        va_arg(vargs, size_t));
-                else
-                    sprintf(s, "%u",
-                        va_arg(vargs, unsigned int));
-                s += strlen(s);
-                break;
-            case 'i':
-                sprintf(s, "%i", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 'x':
-                sprintf(s, "%x", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 's':
-                p = va_arg(vargs, char*);
-                i = strlen(p);
-                if (n > 0 && i > n)
-                    i = n;
-                Py_MEMCPY(s, p, i);
-                s += i;
-                break;
-            case 'p':
-                sprintf(s, "%p", va_arg(vargs, void*));
-                /* %p is ill-defined:  ensure leading 0x. */
-                if (s[1] == 'X')
-                    s[1] = 'x';
-                else if (s[1] != 'x') {
-                    memmove(s+2, s, strlen(s)+1);
-                    s[0] = '0';
-                    s[1] = 'x';
-                }
-                s += strlen(s);
-                break;
-            case '%':
-                *s++ = '%';
-                break;
-            default:
-                strcpy(s, p);
-                s += strlen(s);
-                goto end;
+            WRITE_BYTES(buffer);
+            break;
+
+        case '%':
+            writer.min_size++;
+            *s++ = '%';
+            break;
+
+        default:
+            if (*f == 0) {
+                /* fix min_size if we reached the end of the format string */
+                writer.min_size++;
             }
-        } else
-            *s++ = *f;
+
+            /* invalid format string: copy unformatted string and exit */
+            WRITE_BYTES(p);
+            return _PyBytesWriter_Finish(&writer, s);
+        }
     }
 
- end:
-    _PyBytes_Resize(&string, s - PyBytes_AS_STRING(string));
-    return string;
+#undef WRITE_BYTES
+
+    return _PyBytesWriter_Finish(&writer, s);
+
+ error:
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
 }
 
 PyObject *
@@ -409,12 +407,14 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
 
 /* Returns a new reference to a PyBytes object, or NULL on failure. */
 
-static PyObject *
-formatfloat(PyObject *v, int flags, int prec, int type)
+static char*
+formatfloat(PyObject *v, int flags, int prec, int type,
+            PyObject **p_result, _PyBytesWriter *writer, char *str)
 {
     char *p;
     PyObject *result;
     double x;
+    size_t len;
 
     x = PyFloat_AsDouble(v);
     if (x == -1.0 && PyErr_Occurred()) {
@@ -431,9 +431,22 @@ formatfloat(PyObject *v, int flags, int prec, int type)
 
     if (p == NULL)
         return NULL;
-    result = PyBytes_FromStringAndSize(p, strlen(p));
+
+    len = strlen(p);
+    if (writer != NULL) {
+        str = _PyBytesWriter_Prepare(writer, str, len);
+        if (str == NULL)
+            return NULL;
+        Py_MEMCPY(str, p, len);
+        PyMem_Free(p);
+        str += len;
+        return str;
+    }
+
+    result = PyBytes_FromStringAndSize(p, len);
     PyMem_Free(p);
-    return result;
+    *p_result = result;
+    return str;
 }
 
 static PyObject *
@@ -473,11 +486,11 @@ formatlong(PyObject *v, int flags, int prec, int type)
 static int
 byte_converter(PyObject *arg, char *p)
 {
-    if (PyBytes_Check(arg) && PyBytes_Size(arg) == 1) {
+    if (PyBytes_Check(arg) && PyBytes_GET_SIZE(arg) == 1) {
         *p = PyBytes_AS_STRING(arg)[0];
         return 1;
     }
-    else if (PyByteArray_Check(arg) && PyByteArray_Size(arg) == 1) {
+    else if (PyByteArray_Check(arg) && PyByteArray_GET_SIZE(arg) == 1) {
         *p = PyByteArray_AS_STRING(arg)[0];
         return 1;
     }
@@ -557,36 +570,36 @@ format_obj(PyObject *v, const char **pbuf, Py_ssize_t *plen)
     return NULL;
 }
 
-/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
-
-   FORMATBUFLEN is the length of the buffer in which the ints &
-   chars are formatted. XXX This is a magic number. Each formatting
-   routine does bounds checking to ensure no overflow, but a better
-   solution may be to malloc a buffer of appropriate size for each
-   format. For now, the current solution is sufficient.
-*/
-#define FORMATBUFLEN (size_t)120
+/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) */
 
 PyObject *
-_PyBytes_Format(PyObject *format, PyObject *args)
+_PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
+                  PyObject *args, int use_bytearray)
 {
-    char *fmt, *res;
+    const char *fmt;
+    char *res;
     Py_ssize_t arglen, argidx;
-    Py_ssize_t reslen, rescnt, fmtcnt;
+    Py_ssize_t fmtcnt;
     int args_owned = 0;
-    PyObject *result;
     PyObject *dict = NULL;
-    if (format == NULL || !PyBytes_Check(format) || args == NULL) {
+    _PyBytesWriter writer;
+
+    if (args == NULL) {
         PyErr_BadInternalCall();
         return NULL;
     }
-    fmt = PyBytes_AS_STRING(format);
-    fmtcnt = PyBytes_GET_SIZE(format);
-    reslen = rescnt = fmtcnt + 100;
-    result = PyBytes_FromStringAndSize((char *)NULL, reslen);
-    if (result == NULL)
+    fmt = format;
+    fmtcnt = format_len;
+
+    _PyBytesWriter_Init(&writer);
+    writer.use_bytearray = use_bytearray;
+
+    res = _PyBytesWriter_Alloc(&writer, fmtcnt);
+    if (res == NULL)
         return NULL;
-    res = PyBytes_AsString(result);
+    if (!use_bytearray)
+        writer.overallocate = 1;
+
     if (PyTuple_Check(args)) {
         arglen = PyTuple_GET_SIZE(args);
         argidx = 0;
@@ -600,18 +613,23 @@ _PyBytes_Format(PyObject *format, PyObject *args)
         !PyByteArray_Check(args)) {
             dict = args;
     }
+
     while (--fmtcnt >= 0) {
         if (*fmt != '%') {
-            if (--rescnt < 0) {
-                rescnt = fmtcnt + 100;
-                reslen += rescnt;
-                if (_PyBytes_Resize(&result, reslen))
-                    return NULL;
-                res = PyBytes_AS_STRING(result)
-                    + reslen - rescnt;
-                --rescnt;
-            }
-            *res++ = *fmt++;
+            Py_ssize_t len;
+            char *pos;
+
+            pos = strchr(fmt + 1, '%');
+            if (pos != NULL)
+                len = pos - fmt;
+            else
+                len = format_len - (fmt - format);
+            assert(len != 0);
+
+            Py_MEMCPY(res, fmt, len);
+            res += len;
+            fmt += len;
+            fmtcnt -= (len - 1);
         }
         else {
             /* Got a format specifier */
@@ -626,10 +644,14 @@ _PyBytes_Format(PyObject *format, PyObject *args)
             int sign;
             Py_ssize_t len = 0;
             char onechar; /* For byte_converter() */
+            Py_ssize_t alloc;
+#ifdef Py_DEBUG
+            char *before;
+#endif
 
             fmt++;
             if (*fmt == '(') {
-                char *keystart;
+                const char *keystart;
                 Py_ssize_t keylen;
                 PyObject *key;
                 int pcount = 1;
@@ -673,6 +695,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 arglen = -1;
                 argidx = -2;
             }
+
+            /* Parse flags. Example: "%+i" => flags=F_SIGN. */
             while (--fmtcnt >= 0) {
                 switch (c = *fmt++) {
                 case '-': flags |= F_LJUST; continue;
@@ -683,6 +707,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 }
                 break;
             }
+
+            /* Parse width. Example: "%10s" => width=10 */
             if (c == '*') {
                 v = getnextarg(args, arglen, &argidx);
                 if (v == NULL)
@@ -717,6 +743,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     width = width*10 + (c - '0');
                 }
             }
+
+            /* Parse precision. Example: "%.3f" => prec=3 */
             if (c == '.') {
                 prec = 0;
                 if (--fmtcnt >= 0)
@@ -771,13 +799,19 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (v == NULL)
                     goto error;
             }
+
+            if (fmtcnt < 0) {
+                /* last writer: disable writer overallocation */
+                writer.overallocate = 0;
+            }
+
             sign = 0;
             fill = ' ';
             switch (c) {
             case '%':
-                pbuf = "%";
-                len = 1;
-                break;
+                *res++ = '%';
+                continue;
+
             case 'r':
                 // %r is only for 2/3 code; 3 only code should use %a
             case 'a':
@@ -790,6 +824,7 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (prec >= 0 && len > prec)
                     len = prec;
                 break;
+
             case 's':
                 // %s is only for 2/3 code; 3 only code should use %b
             case 'b':
@@ -799,12 +834,49 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (prec >= 0 && len > prec)
                     len = prec;
                 break;
+
             case 'i':
             case 'd':
             case 'u':
             case 'o':
             case 'x':
             case 'X':
+                if (PyLong_CheckExact(v)
+                    && width == -1 && prec == -1
+                    && !(flags & (F_SIGN | F_BLANK))
+                    && c != 'X')
+                {
+                    /* Fast path */
+                    int alternate = flags & F_ALT;
+                    int base;
+
+                    switch(c)
+                    {
+                        default:
+                            assert(0 && "'type' not in [diuoxX]");
+                        case 'd':
+                        case 'i':
+                        case 'u':
+                            base = 10;
+                            break;
+                        case 'o':
+                            base = 8;
+                            break;
+                        case 'x':
+                        case 'X':
+                            base = 16;
+                            break;
+                    }
+
+                    /* Fast path */
+                    writer.min_size -= 2; /* size preallocated for "%d" */
+                    res = _PyLong_FormatBytesWriter(&writer, res,
+                                                    v, base, alternate);
+                    if (res == NULL)
+                        goto error;
+                    continue;
+                }
+
                 temp = formatlong(v, flags, prec, c);
                 if (!temp)
                     goto error;
@@ -815,14 +887,25 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (flags & F_ZERO)
                     fill = '0';
                 break;
+
             case 'e':
             case 'E':
             case 'f':
             case 'F':
             case 'g':
             case 'G':
-                temp = formatfloat(v, flags, prec, c);
-                if (temp == NULL)
+                if (width == -1 && prec == -1
+                    && !(flags & (F_SIGN | F_BLANK)))
+                {
+                    /* Fast path */
+                    writer.min_size -= 2; /* size preallocated for "%f" */
+                    res = formatfloat(v, flags, prec, c, NULL, &writer, res);
+                    if (res == NULL)
+                        goto error;
+                    continue;
+                }
+
+                if (!formatfloat(v, flags, prec, c, &temp, NULL, res))
                     goto error;
                 pbuf = PyBytes_AS_STRING(temp);
                 len = PyBytes_GET_SIZE(temp);
@@ -830,21 +913,28 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (flags & F_ZERO)
                     fill = '0';
                 break;
+
             case 'c':
                 pbuf = &onechar;
                 len = byte_converter(v, &onechar);
                 if (!len)
                     goto error;
+                if (width == -1) {
+                    /* Fast path */
+                    *res++ = onechar;
+                    continue;
+                }
                 break;
+
             default:
                 PyErr_Format(PyExc_ValueError,
                   "unsupported format character '%c' (0x%x) "
                   "at index %zd",
                   c, c,
-                  (Py_ssize_t)(fmt - 1 -
-                               PyBytes_AsString(format)));
+                  (Py_ssize_t)(fmt - 1 - format));
                 goto error;
             }
+
             if (sign) {
                 if (*pbuf == '-' || *pbuf == '+') {
                     sign = *pbuf++;
@@ -859,29 +949,31 @@ _PyBytes_Format(PyObject *format, PyObject *args)
             }
             if (width < len)
                 width = len;
-            if (rescnt - (sign != 0) < width) {
-                reslen -= rescnt;
-                rescnt = width + fmtcnt + 100;
-                reslen += rescnt;
-                if (reslen < 0) {
-                    Py_DECREF(result);
-                    Py_XDECREF(temp);
-                    return PyErr_NoMemory();
-                }
-                if (_PyBytes_Resize(&result, reslen)) {
-                    Py_XDECREF(temp);
-                    return NULL;
-                }
-                res = PyBytes_AS_STRING(result)
-                    + reslen - rescnt;
+
+            alloc = width;
+            if (sign != 0 && len == width)
+                alloc++;
+            /* 2: size preallocated for %s */
+            if (alloc > 2) {
+                res = _PyBytesWriter_Prepare(&writer, res, alloc - 2);
+                if (res == NULL)
+                    goto error;
             }
+#ifdef Py_DEBUG
+            before = res;
+#endif
+
+            /* Write the sign if needed */
             if (sign) {
                 if (fill != ' ')
                     *res++ = sign;
-                rescnt--;
                 if (width > len)
                     width--;
             }
+
+            /* Write the numeric prefix for "x", "X" and "o" formats
+               if the alternate form is used.
+               For example, write "0x" for the "%#x" format. */
             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                 assert(pbuf[0] == '0');
                 assert(pbuf[1] == c);
@@ -889,18 +981,21 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     *res++ = *pbuf++;
                     *res++ = *pbuf++;
                 }
-                rescnt -= 2;
                 width -= 2;
                 if (width < 0)
                     width = 0;
                 len -= 2;
             }
+
+            /* Pad left with the fill character if needed */
             if (width > len && !(flags & F_LJUST)) {
-                do {
-                    --rescnt;
-                    *res++ = fill;
-                } while (--width > len);
+                memset(res, fill, width - len);
+                res += (width - len);
+                width = len;
             }
+
+            /* If padding with spaces: write sign if needed and/or numeric
+               prefix if the alternate form is used */
             if (fill == ' ') {
                 if (sign)
                     *res++ = sign;
@@ -912,13 +1007,17 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     *res++ = *pbuf++;
                 }
             }
+
+            /* Copy bytes */
             Py_MEMCPY(res, pbuf, len);
             res += len;
-            rescnt -= len;
-            while (--width >= len) {
-                --rescnt;
-                *res++ = ' ';
+
+            /* Pad right with the fill character if needed */
+            if (width > len) {
+                memset(res, ' ', width - len);
+                res += (width - len);
             }
+
             if (dict && (argidx < arglen) && c != '%') {
                 PyErr_SetString(PyExc_TypeError,
                            "not all arguments converted during bytes formatting");
@@ -926,22 +1025,31 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 goto error;
             }
             Py_XDECREF(temp);
+
+#ifdef Py_DEBUG
+            /* check that we computed the exact size for this write */
+            assert((res - before) == alloc);
+#endif
         } /* '%' */
+
+        /* If overallocation was disabled, ensure that it was the last
+           write. Otherwise, we missed an optimization */
+        assert(writer.overallocate || fmtcnt < 0 || use_bytearray);
     } /* until end */
+
     if (argidx < arglen && !dict) {
         PyErr_SetString(PyExc_TypeError,
                         "not all arguments converted during bytes formatting");
         goto error;
     }
+
     if (args_owned) {
         Py_DECREF(args);
     }
-    if (_PyBytes_Resize(&result, reslen - rescnt))
-        return NULL;
-    return result;
+    return _PyBytesWriter_Finish(&writer, res);
 
  error:
-    Py_DECREF(result);
+    _PyBytesWriter_Dealloc(&writer);
     if (args_owned) {
         Py_DECREF(args);
     }
@@ -961,6 +1069,42 @@ bytes_dealloc(PyObject *op)
    the string is UTF-8 encoded and should be re-encoded in the
    specified encoding.  */
 
+static char *
+_PyBytes_DecodeEscapeRecode(const char **s, const char *end,
+                            const char *errors, const char *recode_encoding,
+                            _PyBytesWriter *writer, char *p)
+{
+    PyObject *u, *w;
+    const char* t;
+
+    t = *s;
+    /* Decode non-ASCII bytes as UTF-8. */
+    while (t < end && (*t & 0x80))
+        t++;
+    u = PyUnicode_DecodeUTF8(*s, t - *s, errors);
+    if (u == NULL)
+        return NULL;
+
+    /* Recode them in target encoding. */
+    w = PyUnicode_AsEncodedString(u, recode_encoding, errors);
+    Py_DECREF(u);
+    if  (w == NULL)
+        return NULL;
+    assert(PyBytes_Check(w));
+
+    /* Append bytes to output buffer. */
+    writer->min_size--;   /* substract 1 preallocated byte */
+    p = _PyBytesWriter_WriteBytes(writer, p,
+                                  PyBytes_AS_STRING(w),
+                                  PyBytes_GET_SIZE(w));
+    Py_DECREF(w);
+    if (p == NULL)
+        return NULL;
+
+    *s = t;
+    return p;
+}
+
 PyObject *PyBytes_DecodeEscape(const char *s,
                                 Py_ssize_t len,
                                 const char *errors,
@@ -968,54 +1112,42 @@ PyObject *PyBytes_DecodeEscape(const char *s,
                                 const char *recode_encoding)
 {
     int c;
-    char *p, *buf;
+    char *p;
     const char *end;
-    PyObject *v;
-    Py_ssize_t newlen = recode_encoding ? 4*len:len;
-    v = PyBytes_FromStringAndSize((char *)NULL, newlen);
-    if (v == NULL)
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+
+    p = _PyBytesWriter_Alloc(&writer, len);
+    if (p == NULL)
         return NULL;
-    p = buf = PyBytes_AsString(v);
+    writer.overallocate = 1;
+
     end = s + len;
     while (s < end) {
         if (*s != '\\') {
           non_esc:
-            if (recode_encoding && (*s & 0x80)) {
-                PyObject *u, *w;
-                char *r;
-                const char* t;
-                Py_ssize_t rn;
-                t = s;
-                /* Decode non-ASCII bytes as UTF-8. */
-                while (t < end && (*t & 0x80)) t++;
-                u = PyUnicode_DecodeUTF8(s, t - s, errors);
-                if(!u) goto failed;
-
-                /* Recode them in target encoding. */
-                w = PyUnicode_AsEncodedString(
-                    u, recode_encoding, errors);
-                Py_DECREF(u);
-                if (!w)                 goto failed;
-
-                /* Append bytes to output buffer. */
-                assert(PyBytes_Check(w));
-                r = PyBytes_AS_STRING(w);
-                rn = PyBytes_GET_SIZE(w);
-                Py_MEMCPY(p, r, rn);
-                p += rn;
-                Py_DECREF(w);
-                s = t;
-            } else {
+            if (!(recode_encoding && (*s & 0x80))) {
                 *p++ = *s++;
             }
+            else {
+                /* non-ASCII character and need to recode */
+                p = _PyBytes_DecodeEscapeRecode(&s, end,
+                                                errors, recode_encoding,
+                                                &writer, p);
+                if (p == NULL)
+                    goto failed;
+            }
             continue;
         }
+
         s++;
-        if (s==end) {
+        if (s == end) {
             PyErr_SetString(PyExc_ValueError,
                             "Trailing \\ in string");
             goto failed;
         }
+
         switch (*s++) {
         /* XXX This assumes ASCII! */
         case '\n': break;
@@ -1040,28 +1172,18 @@ PyObject *PyBytes_DecodeEscape(const char *s,
             *p++ = c;
             break;
         case 'x':
-            if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) {
-                unsigned int x = 0;
-                c = Py_CHARMASK(*s);
-                s++;
-                if (Py_ISDIGIT(c))
-                    x = c - '0';
-                else if (Py_ISLOWER(c))
-                    x = 10 + c - 'a';
-                else
-                    x = 10 + c - 'A';
-                x = x << 4;
-                c = Py_CHARMASK(*s);
-                s++;
-                if (Py_ISDIGIT(c))
-                    x += c - '0';
-                else if (Py_ISLOWER(c))
-                    x += 10 + c - 'a';
-                else
-                    x += 10 + c - 'A';
-                *p++ = x;
-                break;
+            if (s+1 < end) {
+                int digit1, digit2;
+                digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])];
+                digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])];
+                if (digit1 < 16 && digit2 < 16) {
+                    *p++ = (unsigned char)((digit1 << 4) + digit2);
+                    s += 2;
+                    break;
+                }
             }
+            /* invalid hexadecimal digits */
+
             if (!errors || strcmp(errors, "strict") == 0) {
                 PyErr_Format(PyExc_ValueError,
                              "invalid \\x escape at position %d",
@@ -1083,6 +1205,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
             if (s < end && Py_ISXDIGIT(s[0]))
                 s++; /* and a hexdigit */
             break;
+
         default:
             *p++ = '\\';
             s--;
@@ -1090,11 +1213,11 @@ PyObject *PyBytes_DecodeEscape(const char *s,
                              UTF-8 bytes may follow. */
         }
     }
-    if (p-buf < newlen)
-        _PyBytes_Resize(&v, p - buf);
-    return v;
+
+    return _PyBytesWriter_Finish(&writer, p);
+
   failed:
-    Py_DECREF(v);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 }
 
@@ -1363,24 +1486,7 @@ bytes_repeat(PyBytesObject *a, Py_ssize_t n)
 static int
 bytes_contains(PyObject *self, PyObject *arg)
 {
-    Py_ssize_t ival = PyNumber_AsSsize_t(arg, PyExc_ValueError);
-    if (ival == -1 && PyErr_Occurred()) {
-        Py_buffer varg;
-        Py_ssize_t pos;
-        PyErr_Clear();
-        if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0)
-            return -1;
-        pos = stringlib_find(PyBytes_AS_STRING(self), Py_SIZE(self),
-                             varg.buf, varg.len, 0);
-        PyBuffer_Release(&varg);
-        return pos >= 0;
-    }
-    if (ival < 0 || ival >= 256) {
-        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
-        return -1;
-    }
-
-    return memchr(PyBytes_AS_STRING(self), (int) ival, Py_SIZE(self)) != NULL;
+    return _Py_bytes_contains(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), arg);
 }
 
 static PyObject *
@@ -1627,8 +1733,8 @@ Return a list of the sections in the bytes, using sep as the delimiter.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit)
-/*[clinic end generated code: output=8bde44dacb36ef2e input=8b809b39074abbfa]*/
+bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
+/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n;
     const char *s = PyBytes_AS_STRING(self), *sub;
@@ -1652,7 +1758,6 @@ bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit)
 /*[clinic input]
 bytes.partition
 
-    self: self(type="PyBytesObject *")
     sep: Py_buffer
     /
 
@@ -1668,7 +1773,7 @@ object and two empty bytes objects.
 
 static PyObject *
 bytes_partition_impl(PyBytesObject *self, Py_buffer *sep)
-/*[clinic end generated code: output=f532b392a17ff695 input=bc855dc63ca949de]*/
+/*[clinic end generated code: output=f532b392a17ff695 input=61cca95519406099]*/
 {
     return stringlib_partition(
         (PyObject*) self,
@@ -1680,7 +1785,6 @@ bytes_partition_impl(PyBytesObject *self, Py_buffer *sep)
 /*[clinic input]
 bytes.rpartition
 
-    self: self(type="PyBytesObject *")
     sep: Py_buffer
     /
 
@@ -1696,7 +1800,7 @@ objects and the original bytes object.
 
 static PyObject *
 bytes_rpartition_impl(PyBytesObject *self, Py_buffer *sep)
-/*[clinic end generated code: output=191b114cbb028e50 input=6588fff262a9170e]*/
+/*[clinic end generated code: output=191b114cbb028e50 input=67f689e63a62d478]*/
 {
     return stringlib_rpartition(
         (PyObject*) self,
@@ -1714,8 +1818,8 @@ Splitting is done starting at the end of the bytes and working to the front.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_rsplit_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit)
-/*[clinic end generated code: output=0b6570b977911d88 input=0f86c9f28f7d7b7b]*/
+bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
+/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n;
     const char *s = PyBytes_AS_STRING(self), *sub;
@@ -1753,8 +1857,8 @@ Example: b'.'.join([b'ab', b'pq', b'rs']) -> b'ab.pq.rs'.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_join(PyBytesObject*self, PyObject *iterable_of_bytes)
-/*[clinic end generated code: output=634aff14764ff997 input=7fe377b95bd549d2]*/
+bytes_join(PyBytesObject *self, PyObject *iterable_of_bytes)
+/*[clinic end generated code: output=a046f379f626f6f8 input=7fe377b95bd549d2]*/
 {
     return stringlib_bytes_join((PyObject*)self, iterable_of_bytes);
 }
@@ -1767,158 +1871,30 @@ _PyBytes_Join(PyObject *sep, PyObject *x)
     return bytes_join((PyBytesObject*)sep, x);
 }
 
-/* helper macro to fixup start/end slice values */
-#define ADJUST_INDICES(start, end, len)         \
-    if (end > len)                          \
-        end = len;                          \
-    else if (end < 0) {                     \
-        end += len;                         \
-        if (end < 0)                        \
-        end = 0;                        \
-    }                                       \
-    if (start < 0) {                        \
-        start += len;                       \
-        if (start < 0)                      \
-        start = 0;                      \
-    }
-
-Py_LOCAL_INLINE(Py_ssize_t)
-bytes_find_internal(PyBytesObject *self, PyObject *args, int dir)
-{
-    PyObject *subobj;
-    char byte;
-    Py_buffer subbuf;
-    const char *sub;
-    Py_ssize_t len, sub_len;
-    Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
-    Py_ssize_t res;
-
-    if (!stringlib_parse_args_finds_byte("find/rfind/index/rindex",
-                                         args, &subobj, &byte, &start, &end))
-        return -2;
-
-    if (subobj) {
-        if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0)
-            return -2;
-
-        sub = subbuf.buf;
-        sub_len = subbuf.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-    len = PyBytes_GET_SIZE(self);
-
-    ADJUST_INDICES(start, end, len);
-    if (end - start < sub_len)
-        res = -1;
-    else if (sub_len == 1
-#ifndef HAVE_MEMRCHR
-            && dir > 0
-#endif
-    ) {
-        unsigned char needle = *sub;
-        int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH;
-        res = stringlib_fastsearch_memchr_1char(
-            PyBytes_AS_STRING(self) + start, end - start,
-            needle, needle, mode);
-        if (res >= 0)
-            res += start;
-    }
-    else {
-        if (dir > 0)
-            res = stringlib_find_slice(
-                PyBytes_AS_STRING(self), len,
-                sub, sub_len, start, end);
-        else
-            res = stringlib_rfind_slice(
-                PyBytes_AS_STRING(self), len,
-                sub, sub_len, start, end);
-    }
-
-    if (subobj)
-        PyBuffer_Release(&subbuf);
-
-    return res;
-}
-
-
-PyDoc_STRVAR(find__doc__,
-"B.find(sub[, start[, end]]) -> int\n\
-\n\
-Return the lowest index in B where substring sub is found,\n\
-such that sub is contained within B[start:end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytes_find(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_find(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(index__doc__,
-"B.index(sub[, start[, end]]) -> int\n\
-\n\
-Like B.find() but raise ValueError when the substring is not found.");
-
 static PyObject *
 bytes_index(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "substring not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_index(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
-PyDoc_STRVAR(rfind__doc__,
-"B.rfind(sub[, start[, end]]) -> int\n\
-\n\
-Return the highest index in B where substring sub is found,\n\
-such that sub is contained within B[start:end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytes_rfind(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rfind(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
-PyDoc_STRVAR(rindex__doc__,
-"B.rindex(sub[, start[, end]]) -> int\n\
-\n\
-Like B.rfind() but raise ValueError when the substring is not found.");
-
 static PyObject *
 bytes_rindex(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "substring not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rindex(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
@@ -2005,7 +1981,6 @@ do_argstrip(PyBytesObject *self, int striptype, PyObject *bytes)
 /*[clinic input]
 bytes.strip
 
-    self: self(type="PyBytesObject *")
     bytes: object = None
     /
 
@@ -2016,7 +1991,7 @@ If the argument is omitted or None, strip leading and trailing ASCII whitespace.
 
 static PyObject *
 bytes_strip_impl(PyBytesObject *self, PyObject *bytes)
-/*[clinic end generated code: output=c7c228d3bd104a1b input=37daa5fad1395d95]*/
+/*[clinic end generated code: output=c7c228d3bd104a1b input=8a354640e4e0b3ef]*/
 {
     return do_argstrip(self, BOTHSTRIP, bytes);
 }
@@ -2024,7 +1999,6 @@ bytes_strip_impl(PyBytesObject *self, PyObject *bytes)
 /*[clinic input]
 bytes.lstrip
 
-    self: self(type="PyBytesObject *")
     bytes: object = None
     /
 
@@ -2035,7 +2009,7 @@ If the argument is omitted or None, strip leading  ASCII whitespace.
 
 static PyObject *
 bytes_lstrip_impl(PyBytesObject *self, PyObject *bytes)
-/*[clinic end generated code: output=28602e586f524e82 input=88811b09dfbc2988]*/
+/*[clinic end generated code: output=28602e586f524e82 input=9baff4398c3f6857]*/
 {
     return do_argstrip(self, LEFTSTRIP, bytes);
 }
@@ -2043,7 +2017,6 @@ bytes_lstrip_impl(PyBytesObject *self, PyObject *bytes)
 /*[clinic input]
 bytes.rstrip
 
-    self: self(type="PyBytesObject *")
     bytes: object = None
     /
 
@@ -2054,64 +2027,22 @@ If the argument is omitted or None, strip trailing ASCII whitespace.
 
 static PyObject *
 bytes_rstrip_impl(PyBytesObject *self, PyObject *bytes)
-/*[clinic end generated code: output=547e3815c95447da input=8f93c9cd361f0140]*/
+/*[clinic end generated code: output=547e3815c95447da input=b78af445c727e32b]*/
 {
     return do_argstrip(self, RIGHTSTRIP, bytes);
 }
 
 
-PyDoc_STRVAR(count__doc__,
-"B.count(sub[, start[, end]]) -> int\n\
-\n\
-Return the number of non-overlapping occurrences of substring sub in\n\
-string B[start:end].  Optional arguments start and end are interpreted\n\
-as in slice notation.");
-
 static PyObject *
 bytes_count(PyBytesObject *self, PyObject *args)
 {
-    PyObject *sub_obj;
-    const char *str = PyBytes_AS_STRING(self), *sub;
-    Py_ssize_t sub_len;
-    char byte;
-    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
-
-    Py_buffer vsub;
-    PyObject *count_obj;
-
-    if (!stringlib_parse_args_finds_byte("count", args, &sub_obj, &byte,
-                                         &start, &end))
-        return NULL;
-
-    if (sub_obj) {
-        if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0)
-            return NULL;
-
-        sub = vsub.buf;
-        sub_len = vsub.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-
-    ADJUST_INDICES(start, end, PyBytes_GET_SIZE(self));
-
-    count_obj = PyLong_FromSsize_t(
-        stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
-        );
-
-    if (sub_obj)
-        PyBuffer_Release(&vsub);
-
-    return count_obj;
+    return _Py_bytes_count(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
 /*[clinic input]
 bytes.translate
 
-    self: self(type="PyBytesObject *")
     table: object
         Translation table, which must be a bytes object of length 256.
     [
@@ -2128,7 +2059,7 @@ The remaining characters are mapped through the given translation table.
 static PyObject *
 bytes_translate_impl(PyBytesObject *self, PyObject *table, int group_right_1,
                      PyObject *deletechars)
-/*[clinic end generated code: output=233df850eb50bf8d input=d8fa5519d7cc4be7]*/
+/*[clinic end generated code: output=233df850eb50bf8d input=ca20edf39d780d49]*/
 {
     char *input, *output;
     Py_buffer table_view = {NULL, NULL};
@@ -2189,7 +2120,7 @@ bytes_translate_impl(PyBytesObject *self, PyObject *table, int group_right_1,
         PyBuffer_Release(&table_view);
         return NULL;
     }
-    output_start = output = PyBytes_AsString(result);
+    output_start = output = PyBytes_AS_STRING(result);
     input = PyBytes_AS_STRING(input_obj);
 
     if (dellen == 0 && table_chars != NULL) {
@@ -2265,498 +2196,6 @@ bytes_maketrans_impl(Py_buffer *frm, Py_buffer *to)
     return _Py_bytes_maketrans(frm, to);
 }
 
-/* find and count characters and substrings */
-
-#define findchar(target, target_len, c)                         \
-  ((char *)memchr((const void *)(target), c, target_len))
-
-/* String ops must return a string.  */
-/* If the object is subclass of string, create a copy */
-Py_LOCAL(PyBytesObject *)
-return_self(PyBytesObject *self)
-{
-    if (PyBytes_CheckExact(self)) {
-        Py_INCREF(self);
-        return self;
-    }
-    return (PyBytesObject *)PyBytes_FromStringAndSize(
-        PyBytes_AS_STRING(self),
-        PyBytes_GET_SIZE(self));
-}
-
-Py_LOCAL_INLINE(Py_ssize_t)
-countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount)
-{
-    Py_ssize_t count=0;
-    const char *start=target;
-    const char *end=target+target_len;
-
-    while ( (start=findchar(start, end-start, c)) != NULL ) {
-        count++;
-        if (count >= maxcount)
-            break;
-        start += 1;
-    }
-    return count;
-}
-
-
-/* Algorithms for different cases of string replacement */
-
-/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_interleave(PyBytesObject *self,
-                   const char *to_s, Py_ssize_t to_len,
-                   Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, i;
-    PyBytesObject *result;
-
-    self_len = PyBytes_GET_SIZE(self);
-
-    /* 1 at the end plus 1 after every character;
-       count = min(maxcount, self_len + 1) */
-    if (maxcount <= self_len)
-        count = maxcount;
-    else
-        /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
-        count = self_len + 1;
-
-    /* Check for overflow */
-    /*   result_len = count * to_len + self_len; */
-    assert(count > 0);
-    if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replacement bytes are too long");
-        return NULL;
-    }
-    result_len = count * to_len + self_len;
-
-    if (! (result = (PyBytesObject *)
-                     PyBytes_FromStringAndSize(NULL, result_len)) )
-        return NULL;
-
-    self_s = PyBytes_AS_STRING(self);
-    result_s = PyBytes_AS_STRING(result);
-
-    /* TODO: special case single character, which doesn't need memcpy */
-
-    /* Lay the first one down (guaranteed this will occur) */
-    Py_MEMCPY(result_s, to_s, to_len);
-    result_s += to_len;
-    count -= 1;
-
-    for (i=0; i<count; i++) {
-        *result_s++ = *self_s++;
-        Py_MEMCPY(result_s, to_s, to_len);
-        result_s += to_len;
-    }
-
-    /* Copy the rest of the original string */
-    Py_MEMCPY(result_s, self_s, self_len-i);
-
-    return result;
-}
-
-/* Special case for deleting a single character */
-/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_delete_single_character(PyBytesObject *self,
-                                char from_c, Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyBytesObject *result;
-
-    self_len = PyBytes_GET_SIZE(self);
-    self_s = PyBytes_AS_STRING(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        return return_self(self);
-    }
-
-    result_len = self_len - count;  /* from_len == 1 */
-    assert(result_len>=0);
-
-    if ( (result = (PyBytesObject *)
-                    PyBytes_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        Py_MEMCPY(result_s, start, next-start);
-        result_s += (next-start);
-        start = next+1;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
-
-Py_LOCAL(PyBytesObject *)
-replace_delete_substring(PyBytesObject *self,
-                         const char *from_s, Py_ssize_t from_len,
-                         Py_ssize_t maxcount) {
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyBytesObject *result;
-
-    self_len = PyBytes_GET_SIZE(self);
-    self_s = PyBytes_AS_STRING(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches */
-        return return_self(self);
-    }
-
-    result_len = self_len - (count * from_len);
-    assert (result_len>=0);
-
-    if ( (result = (PyBytesObject *)
-          PyBytes_FromStringAndSize(NULL, result_len)) == NULL )
-        return NULL;
-
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start + offset;
-
-        Py_MEMCPY(result_s, start, next-start);
-
-        result_s += (next-start);
-        start = next+from_len;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_single_character_in_place(PyBytesObject *self,
-                                  char from_c, char to_c,
-                                  Py_ssize_t maxcount)
-{
-    char *self_s, *result_s, *start, *end, *next;
-    Py_ssize_t self_len;
-    PyBytesObject *result;
-
-    /* The result string will be the same size */
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    next = findchar(self_s, self_len, from_c);
-
-    if (next == NULL) {
-        /* No matches; return the original string */
-        return return_self(self);
-    }
-
-    /* Need to make a new string */
-    result = (PyBytesObject *) PyBytes_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + (next-self_s);
-    *start = to_c;
-    start++;
-    end = result_s + self_len;
-
-    while (--maxcount > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        *next = to_c;
-        start = next+1;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_substring_in_place(PyBytesObject *self,
-                           const char *from_s, Py_ssize_t from_len,
-                           const char *to_s, Py_ssize_t to_len,
-                           Py_ssize_t maxcount)
-{
-    char *result_s, *start, *end;
-    char *self_s;
-    Py_ssize_t self_len, offset;
-    PyBytesObject *result;
-
-    /* The result string will be the same size */
-
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    offset = stringlib_find(self_s, self_len,
-                            from_s, from_len,
-                            0);
-    if (offset == -1) {
-        /* No matches; return the original string */
-        return return_self(self);
-    }
-
-    /* Need to make a new string */
-    result = (PyBytesObject *) PyBytes_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + offset;
-    Py_MEMCPY(start, to_s, from_len);
-    start += from_len;
-    end = result_s + self_len;
-
-    while ( --maxcount > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset==-1)
-            break;
-        Py_MEMCPY(start+offset, to_s, from_len);
-        start += offset+from_len;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_single_character(PyBytesObject *self,
-                         char from_c,
-                         const char *to_s, Py_ssize_t to_len,
-                         Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyBytesObject *result;
-
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* use the difference between current and new, hence the "-1" */
-    /*   result_len = self_len + count * (to_len-1)  */
-    assert(count > 0);
-    if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replacement bytes are too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len - 1);
-
-    if ( (result = (PyBytesObject *)
-          PyBytes_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += 1;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+1;
-        }
-    }
-    /* Copy the remainder of the remaining string */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_substring(PyBytesObject *self,
-                  const char *from_s, Py_ssize_t from_len,
-                  const char *to_s, Py_ssize_t to_len,
-                  Py_ssize_t maxcount) {
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyBytesObject *result;
-
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* Check for overflow */
-    /*    result_len = self_len + count * (to_len-from_len) */
-    assert(count > 0);
-    if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replacement bytes are too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len-from_len);
-
-    if ( (result = (PyBytesObject *)
-          PyBytes_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start+offset;
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += from_len;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+from_len;
-        }
-    }
-    /* Copy the remainder of the remaining string */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-
-Py_LOCAL(PyBytesObject *)
-replace(PyBytesObject *self,
-    const char *from_s, Py_ssize_t from_len,
-    const char *to_s, Py_ssize_t to_len,
-    Py_ssize_t maxcount)
-{
-    if (maxcount < 0) {
-        maxcount = PY_SSIZE_T_MAX;
-    } else if (maxcount == 0 || PyBytes_GET_SIZE(self) == 0) {
-        /* nothing to do; return the original string */
-        return return_self(self);
-    }
-
-    if (maxcount == 0 ||
-        (from_len == 0 && to_len == 0)) {
-        /* nothing to do; return the original string */
-        return return_self(self);
-    }
-
-    /* Handle zero-length special cases */
-
-    if (from_len == 0) {
-        /* insert the 'to' string everywhere.   */
-        /*    >>> "Python".replace("", ".")     */
-        /*    '.P.y.t.h.o.n.'                   */
-        return replace_interleave(self, to_s, to_len, maxcount);
-    }
-
-    /* Except for "".replace("", "A") == "A" there is no way beyond this */
-    /* point for an empty self string to generate a non-empty string */
-    /* Special case so the remaining code always gets a non-empty string */
-    if (PyBytes_GET_SIZE(self) == 0) {
-        return return_self(self);
-    }
-
-    if (to_len == 0) {
-        /* delete all occurrences of 'from' string */
-        if (from_len == 1) {
-            return replace_delete_single_character(
-                self, from_s[0], maxcount);
-        } else {
-            return replace_delete_substring(self, from_s,
-                                            from_len, maxcount);
-        }
-    }
-
-    /* Handle special case where both strings have the same length */
-
-    if (from_len == to_len) {
-        if (from_len == 1) {
-            return replace_single_character_in_place(
-                self,
-                from_s[0],
-                to_s[0],
-                maxcount);
-        } else {
-            return replace_substring_in_place(
-                self, from_s, from_len, to_s, to_len,
-                maxcount);
-        }
-    }
-
-    /* Otherwise use the more generic algorithms */
-    if (from_len == 1) {
-        return replace_single_character(self, from_s[0],
-                                        to_s, to_len, maxcount);
-    } else {
-        /* len('from')>=2, len('to')>=1 */
-        return replace_substring(self, from_s, from_len, to_s, to_len,
-                                 maxcount);
-    }
-}
-
 
 /*[clinic input]
 bytes.replace
@@ -2775,156 +2214,28 @@ replaced.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_replace_impl(PyBytesObject*self, Py_buffer *old, Py_buffer *new,
+bytes_replace_impl(PyBytesObject *self, Py_buffer *old, Py_buffer *new,
                    Py_ssize_t count)
-/*[clinic end generated code: output=403dc9d7a83c5a1d input=b2fbbf0bf04de8e5]*/
+/*[clinic end generated code: output=994fa588b6b9c104 input=b2fbbf0bf04de8e5]*/
 {
-    return (PyObject *)replace((PyBytesObject *) self,
-                               (const char *)old->buf, old->len,
-                               (const char *)new->buf, new->len, count);
+    return stringlib_replace((PyObject *)self,
+                             (const char *)old->buf, old->len,
+                             (const char *)new->buf, new->len, count);
 }
 
 /** End DALKE **/
 
-/* Matches the end (direction >= 0) or start (direction < 0) of self
- * against substr, using the start and end arguments. Returns
- * -1 on error, 0 if not found and 1 if found.
- */
-Py_LOCAL(int)
-_bytes_tailmatch(PyBytesObject *self, PyObject *substr, Py_ssize_t start,
-                  Py_ssize_t end, int direction)
-{
-    Py_ssize_t len = PyBytes_GET_SIZE(self);
-    Py_ssize_t slen;
-    Py_buffer sub_view = {NULL, NULL};
-    const char* sub;
-    const char* str;
-
-    if (PyBytes_Check(substr)) {
-        sub = PyBytes_AS_STRING(substr);
-        slen = PyBytes_GET_SIZE(substr);
-    }
-    else {
-        if (PyObject_GetBuffer(substr, &sub_view, PyBUF_SIMPLE) != 0)
-            return -1;
-        sub = sub_view.buf;
-        slen = sub_view.len;
-    }
-    str = PyBytes_AS_STRING(self);
-
-    ADJUST_INDICES(start, end, len);
-
-    if (direction < 0) {
-        /* startswith */
-        if (start+slen > len)
-            goto notfound;
-    } else {
-        /* endswith */
-        if (end-start < slen || start > len)
-            goto notfound;
-
-        if (end-slen > start)
-            start = end - slen;
-    }
-    if (end-start < slen)
-        goto notfound;
-    if (memcmp(str+start, sub, slen) != 0)
-        goto notfound;
-
-    PyBuffer_Release(&sub_view);
-    return 1;
-
-notfound:
-    PyBuffer_Release(&sub_view);
-    return 0;
-}
-
-
-PyDoc_STRVAR(startswith__doc__,
-"B.startswith(prefix[, start[, end]]) -> bool\n\
-\n\
-Return True if B starts with the specified prefix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-prefix can also be a tuple of bytes to try.");
 
 static PyObject *
 bytes_startswith(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytes_tailmatch(self,
-                            PyTuple_GET_ITEM(subobj, i),
-                            start, end, -1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytes_tailmatch(self, subobj, start, end, -1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "startswith first arg must be bytes "
-                         "or a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_startswith(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(endswith__doc__,
-"B.endswith(suffix[, start[, end]]) -> bool\n\
-\n\
-Return True if B ends with the specified suffix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-suffix can also be a tuple of bytes to try.");
-
 static PyObject *
 bytes_endswith(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytes_tailmatch(self,
-                            PyTuple_GET_ITEM(subobj, i),
-                            start, end, +1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytes_tailmatch(self, subobj, start, end, +1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "endswith first arg must be bytes or "
-                         "a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_endswith(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
@@ -2944,9 +2255,9 @@ Decode the bytes using the codec registered for encoding.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_decode_impl(PyBytesObject*self, const char *encoding,
+bytes_decode_impl(PyBytesObject *self, const char *encoding,
                   const char *errors)
-/*[clinic end generated code: output=2d2016ff8e0bb176 input=958174769d2a40ca]*/
+/*[clinic end generated code: output=5649a53dde27b314 input=958174769d2a40ca]*/
 {
     return PyUnicode_FromEncodedObject((PyObject*)self, encoding, errors);
 }
@@ -2964,8 +2275,8 @@ true.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_splitlines_impl(PyBytesObject*self, int keepends)
-/*[clinic end generated code: output=995c3598f7833cad input=7f4aac67144f9944]*/
+bytes_splitlines_impl(PyBytesObject *self, int keepends)
+/*[clinic end generated code: output=3484149a5d880ffb input=7f4aac67144f9944]*/
 {
     return stringlib_splitlines(
         (PyObject*) self, PyBytes_AS_STRING(self),
@@ -2973,22 +2284,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
         );
 }
 
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
-    if (c >= 128)
-        return -1;
-    if (Py_ISDIGIT(c))
-        return c - '0';
-    else {
-        if (Py_ISUPPER(c))
-            c = Py_TOLOWER(c);
-        if (c >= 'a' && c <= 'f')
-            return c - 'a' + 10;
-    }
-    return -1;
-}
-
 /*[clinic input]
 @classmethod
 bytes.fromhex
@@ -3006,47 +2301,88 @@ static PyObject *
 bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
 /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
 {
-    PyObject *newstring;
+    PyObject *result = _PyBytes_FromHex(string, 0);
+    if (type != &PyBytes_Type && result != NULL) {
+        Py_SETREF(result, PyObject_CallFunctionObjArgs((PyObject *)type,
+                                                       result, NULL));
+    }
+    return result;
+}
+
+PyObject*
+_PyBytes_FromHex(PyObject *string, int use_bytearray)
+{
     char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
+    Py_ssize_t hexlen, invalid_char;
+    unsigned int top, bot;
+    Py_UCS1 *str, *end;
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+    writer.use_bytearray = use_bytearray;
 
     assert(PyUnicode_Check(string));
     if (PyUnicode_READY(string))
         return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
     hexlen = PyUnicode_GET_LENGTH(string);
 
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newstring = PyBytes_FromStringAndSize(NULL, byteslen);
-    if (!newstring)
+    if (!PyUnicode_IS_ASCII(string)) {
+        void *data = PyUnicode_DATA(string);
+        unsigned int kind = PyUnicode_KIND(string);
+        Py_ssize_t i;
+
+        /* search for the first non-ASCII character */
+        for (i = 0; i < hexlen; i++) {
+            if (PyUnicode_READ(kind, data, i) >= 128)
+                break;
+        }
+        invalid_char = i;
+        goto error;
+    }
+
+    assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
+    str = PyUnicode_1BYTE_DATA(string);
+
+    /* This overestimates if there are spaces */
+    buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
+    if (buf == NULL)
         return NULL;
-    buf = PyBytes_AS_STRING(newstring);
-    for (i = j = 0; i < hexlen; i += 2) {
+
+    end = str + hexlen;
+    while (str < end) {
         /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
+        if (*str == ' ') {
+            do {
+                str++;
+            } while (*str == ' ');
+            if (str >= end)
+                break;
+        }
+
+        top = _PyLong_DigitValue[*str];
+        if (top >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
+            goto error;
+        }
+        str++;
+
+        bot = _PyLong_DigitValue[*str];
+        if (bot >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
             goto error;
         }
-        buf[j++] = (top << 4) + bot;
+        str++;
+
+        *buf++ = (unsigned char)((top << 4) + bot);
     }
-    if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
-        goto error;
-    return newstring;
+
+    return _PyBytesWriter_Finish(&writer, buf);
 
   error:
-    Py_XDECREF(newstring);
+    PyErr_Format(PyExc_ValueError,
+                 "non-hexadecimal number found in "
+                 "fromhex() arg at position %zd", invalid_char);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 }
 
@@ -3076,17 +2412,20 @@ bytes_methods[] = {
     {"__getnewargs__",          (PyCFunction)bytes_getnewargs,  METH_NOARGS},
     {"capitalize", (PyCFunction)stringlib_capitalize, METH_NOARGS,
      _Py_capitalize__doc__},
-    {"center", (PyCFunction)stringlib_center, METH_VARARGS, center__doc__},
-    {"count", (PyCFunction)bytes_count, METH_VARARGS, count__doc__},
+    {"center", (PyCFunction)stringlib_center, METH_VARARGS,
+     _Py_center__doc__},
+    {"count", (PyCFunction)bytes_count, METH_VARARGS,
+     _Py_count__doc__},
     BYTES_DECODE_METHODDEF
     {"endswith", (PyCFunction)bytes_endswith, METH_VARARGS,
-     endswith__doc__},
+     _Py_endswith__doc__},
     {"expandtabs", (PyCFunction)stringlib_expandtabs, METH_VARARGS | METH_KEYWORDS,
-     expandtabs__doc__},
-    {"find", (PyCFunction)bytes_find, METH_VARARGS, find__doc__},
+     _Py_expandtabs__doc__},
+    {"find", (PyCFunction)bytes_find, METH_VARARGS,
+     _Py_find__doc__},
     BYTES_FROMHEX_METHODDEF
     {"hex", (PyCFunction)bytes_hex, METH_NOARGS, hex__doc__},
-    {"index", (PyCFunction)bytes_index, METH_VARARGS, index__doc__},
+    {"index", (PyCFunction)bytes_index, METH_VARARGS, _Py_index__doc__},
     {"isalnum", (PyCFunction)stringlib_isalnum, METH_NOARGS,
      _Py_isalnum__doc__},
     {"isalpha", (PyCFunction)stringlib_isalpha, METH_NOARGS,
@@ -3102,38 +2441,40 @@ bytes_methods[] = {
     {"isupper", (PyCFunction)stringlib_isupper, METH_NOARGS,
      _Py_isupper__doc__},
     BYTES_JOIN_METHODDEF
-    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__},
+    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, _Py_ljust__doc__},
     {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__},
     BYTES_LSTRIP_METHODDEF
     BYTES_MAKETRANS_METHODDEF
     BYTES_PARTITION_METHODDEF
     BYTES_REPLACE_METHODDEF
-    {"rfind", (PyCFunction)bytes_rfind, METH_VARARGS, rfind__doc__},
-    {"rindex", (PyCFunction)bytes_rindex, METH_VARARGS, rindex__doc__},
-    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__},
+    {"rfind", (PyCFunction)bytes_rfind, METH_VARARGS, _Py_rfind__doc__},
+    {"rindex", (PyCFunction)bytes_rindex, METH_VARARGS, _Py_rindex__doc__},
+    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, _Py_rjust__doc__},
     BYTES_RPARTITION_METHODDEF
     BYTES_RSPLIT_METHODDEF
     BYTES_RSTRIP_METHODDEF
     BYTES_SPLIT_METHODDEF
     BYTES_SPLITLINES_METHODDEF
     {"startswith", (PyCFunction)bytes_startswith, METH_VARARGS,
-     startswith__doc__},
+     _Py_startswith__doc__},
     BYTES_STRIP_METHODDEF
     {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS,
      _Py_swapcase__doc__},
     {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__},
     BYTES_TRANSLATE_METHODDEF
     {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__},
-    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__},
+    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, _Py_zfill__doc__},
     {NULL,     NULL}                         /* sentinel */
 };
 
 static PyObject *
-bytes_mod(PyObject *v, PyObject *w)
+bytes_mod(PyObject *self, PyObject *arg)
 {
-    if (!PyBytes_Check(v))
+    if (!PyBytes_Check(self)) {
         Py_RETURN_NOTIMPLEMENTED;
-    return _PyBytes_Format(v, w);
+    }
+    return _PyBytes_FormatEx(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self),
+                             arg, 0);
 }
 
 static PyNumberMethods bytes_as_number = {
@@ -3242,108 +2583,93 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
     return PyBytes_FromObject(x);
 }
 
-PyObject *
-PyBytes_FromObject(PyObject *x)
+static PyObject*
+_PyBytes_FromBuffer(PyObject *x)
 {
-    PyObject *new, *it;
-    Py_ssize_t i, size;
+    PyObject *new;
+    Py_buffer view;
 
-    if (x == NULL) {
-        PyErr_BadInternalCall();
+    if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
         return NULL;
-    }
 
-    if (PyBytes_CheckExact(x)) {
-        Py_INCREF(x);
-        return x;
-    }
+    new = PyBytes_FromStringAndSize(NULL, view.len);
+    if (!new)
+        goto fail;
+    if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval,
+                &view, view.len, 'C') < 0)
+        goto fail;
+    PyBuffer_Release(&view);
+    return new;
 
-    /* Use the modern buffer interface */
-    if (PyObject_CheckBuffer(x)) {
-        Py_buffer view;
-        if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
-            return NULL;
-        new = PyBytes_FromStringAndSize(NULL, view.len);
-        if (!new)
-            goto fail;
-        if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval,
-                                  &view, view.len, 'C') < 0)
-            goto fail;
-        PyBuffer_Release(&view);
-        return new;
-      fail:
-        Py_XDECREF(new);
-        PyBuffer_Release(&view);
-        return NULL;
-    }
-    if (PyUnicode_Check(x)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "cannot convert unicode object to bytes");
-        return NULL;
-    }
+fail:
+    Py_XDECREF(new);
+    PyBuffer_Release(&view);
+    return NULL;
+}
 
-    if (PyList_CheckExact(x)) {
-        new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));
-        if (new == NULL)
-            return NULL;
-        for (i = 0; i < Py_SIZE(x); i++) {
-            Py_ssize_t value = PyNumber_AsSsize_t(
-                PyList_GET_ITEM(x, i), PyExc_ValueError);
-            if (value == -1 && PyErr_Occurred()) {
-                Py_DECREF(new);
-                return NULL;
-            }
-            if (value < 0 || value >= 256) {
-                PyErr_SetString(PyExc_ValueError,
-                                "bytes must be in range(0, 256)");
-                Py_DECREF(new);
-                return NULL;
-            }
-            ((PyBytesObject *)new)->ob_sval[i] = (char) value;
-        }
-        return new;
-    }
-    if (PyTuple_CheckExact(x)) {
-        new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));
-        if (new == NULL)
-            return NULL;
-        for (i = 0; i < Py_SIZE(x); i++) {
-            Py_ssize_t value = PyNumber_AsSsize_t(
-                PyTuple_GET_ITEM(x, i), PyExc_ValueError);
-            if (value == -1 && PyErr_Occurred()) {
-                Py_DECREF(new);
-                return NULL;
-            }
-            if (value < 0 || value >= 256) {
-                PyErr_SetString(PyExc_ValueError,
-                                "bytes must be in range(0, 256)");
-                Py_DECREF(new);
-                return NULL;
-            }
-            ((PyBytesObject *)new)->ob_sval[i] = (char) value;
-        }
-        return new;
-    }
+#define _PyBytes_FROM_LIST_BODY(x, GET_ITEM)                                \
+    do {                                                                    \
+        PyObject *bytes;                                                    \
+        Py_ssize_t i;                                                       \
+        Py_ssize_t value;                                                   \
+        char *str;                                                          \
+        PyObject *item;                                                     \
+                                                                            \
+        bytes = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));                \
+        if (bytes == NULL)                                                  \
+            return NULL;                                                    \
+        str = ((PyBytesObject *)bytes)->ob_sval;                            \
+                                                                            \
+        for (i = 0; i < Py_SIZE(x); i++) {                                  \
+            item = GET_ITEM((x), i);                                        \
+            value = PyNumber_AsSsize_t(item, NULL);                         \
+            if (value == -1 && PyErr_Occurred())                            \
+                goto error;                                                 \
+                                                                            \
+            if (value < 0 || value >= 256) {                                \
+                PyErr_SetString(PyExc_ValueError,                           \
+                                "bytes must be in range(0, 256)");          \
+                goto error;                                                 \
+            }                                                               \
+            *str++ = (char) value;                                          \
+        }                                                                   \
+        return bytes;                                                       \
+                                                                            \
+    error:                                                                  \
+        Py_DECREF(bytes);                                                   \
+        return NULL;                                                        \
+    } while (0)
+
+static PyObject*
+_PyBytes_FromList(PyObject *x)
+{
+    _PyBytes_FROM_LIST_BODY(x, PyList_GET_ITEM);
+}
+
+static PyObject*
+_PyBytes_FromTuple(PyObject *x)
+{
+    _PyBytes_FROM_LIST_BODY(x, PyTuple_GET_ITEM);
+}
+
+static PyObject *
+_PyBytes_FromIterator(PyObject *it, PyObject *x)
+{
+    char *str;
+    Py_ssize_t i, size;
+    _PyBytesWriter writer;
 
     /* For iterator version, create a string object and resize as needed */
     size = PyObject_LengthHint(x, 64);
     if (size == -1 && PyErr_Occurred())
         return NULL;
-    /* Allocate an extra byte to prevent PyBytes_FromStringAndSize() from
-       returning a shared empty bytes string. This required because we
-       want to call _PyBytes_Resize() the returned object, which we can
-       only do on bytes objects with refcount == 1. */
-    if (size == 0)
-        size = 1;
-    new = PyBytes_FromStringAndSize(NULL, size);
-    if (new == NULL)
-        return NULL;
-    assert(Py_REFCNT(new) == 1);
 
-    /* Get the iterator */
-    it = PyObject_GetIter(x);
-    if (it == NULL)
-        goto error;
+    _PyBytesWriter_Init(&writer);
+    str = _PyBytesWriter_Alloc(&writer, size);
+    if (str == NULL)
+        return NULL;
+    writer.overallocate = 1;
+    size = writer.allocated;
 
     /* Run the iterator to exhaustion */
     for (i = 0; ; i++) {
@@ -3359,7 +2685,7 @@ PyBytes_FromObject(PyObject *x)
         }
 
         /* Interpret it as an int (__index__) */
-        value = PyNumber_AsSsize_t(item, PyExc_ValueError);
+        value = PyNumber_AsSsize_t(item, NULL);
         Py_DECREF(item);
         if (value == -1 && PyErr_Occurred())
             goto error;
@@ -3373,21 +2699,58 @@ PyBytes_FromObject(PyObject *x)
 
         /* Append the byte */
         if (i >= size) {
-            size = 2 * size + 1;
-            if (_PyBytes_Resize(&new, size) < 0)
-                goto error;
+            str = _PyBytesWriter_Resize(&writer, str, size+1);
+            if (str == NULL)
+                return NULL;
+            size = writer.allocated;
         }
-        ((PyBytesObject *)new)->ob_sval[i] = (char) value;
+        *str++ = (char) value;
     }
-    _PyBytes_Resize(&new, i);
 
-    /* Clean up and return success */
-    Py_DECREF(it);
-    return new;
+    return _PyBytesWriter_Finish(&writer, str);
 
   error:
-    Py_XDECREF(it);
-    Py_XDECREF(new);
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
+}
+
+PyObject *
+PyBytes_FromObject(PyObject *x)
+{
+    PyObject *it, *result;
+
+    if (x == NULL) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+
+    if (PyBytes_CheckExact(x)) {
+        Py_INCREF(x);
+        return x;
+    }
+
+    /* Use the modern buffer interface */
+    if (PyObject_CheckBuffer(x))
+        return _PyBytes_FromBuffer(x);
+
+    if (PyList_CheckExact(x))
+        return _PyBytes_FromList(x);
+
+    if (PyTuple_CheckExact(x))
+        return _PyBytes_FromTuple(x);
+
+    if (!PyUnicode_Check(x)) {
+        it = PyObject_GetIter(x);
+        if (it != NULL) {
+            result = _PyBytes_FromIterator(it, x);
+            Py_DECREF(it);
+            return result;
+        }
+    }
+
+    PyErr_Format(PyExc_TypeError,
+                 "cannot convert '%.200s' object to bytes",
+                 x->ob_type->tp_name);
     return NULL;
 }
 
@@ -3738,3 +3101,282 @@ bytes_iter(PyObject *seq)
     _PyObject_GC_TRACK(it);
     return (PyObject *)it;
 }
+
+
+/* _PyBytesWriter API */
+
+#ifdef MS_WINDOWS
+   /* On Windows, overallocate by 50% is the best factor */
+#  define OVERALLOCATE_FACTOR 2
+#else
+   /* On Linux, overallocate by 25% is the best factor */
+#  define OVERALLOCATE_FACTOR 4
+#endif
+
+void
+_PyBytesWriter_Init(_PyBytesWriter *writer)
+{
+    /* Set all attributes before small_buffer to 0 */
+    memset(writer, 0, offsetof(_PyBytesWriter, small_buffer));
+#ifdef Py_DEBUG
+    memset(writer->small_buffer, 0xCB, sizeof(writer->small_buffer));
+#endif
+}
+
+void
+_PyBytesWriter_Dealloc(_PyBytesWriter *writer)
+{
+    Py_CLEAR(writer->buffer);
+}
+
+Py_LOCAL_INLINE(char*)
+_PyBytesWriter_AsString(_PyBytesWriter *writer)
+{
+    if (writer->use_small_buffer) {
+        assert(writer->buffer == NULL);
+        return writer->small_buffer;
+    }
+    else if (writer->use_bytearray) {
+        assert(writer->buffer != NULL);
+        return PyByteArray_AS_STRING(writer->buffer);
+    }
+    else {
+        assert(writer->buffer != NULL);
+        return PyBytes_AS_STRING(writer->buffer);
+    }
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
+{
+    char *start = _PyBytesWriter_AsString(writer);
+    assert(str != NULL);
+    assert(str >= start);
+    assert(str - start <= writer->allocated);
+    return str - start;
+}
+
+Py_LOCAL_INLINE(void)
+_PyBytesWriter_CheckConsistency(_PyBytesWriter *writer, char *str)
+{
+#ifdef Py_DEBUG
+    char *start, *end;
+
+    if (writer->use_small_buffer) {
+        assert(writer->buffer == NULL);
+    }
+    else {
+        assert(writer->buffer != NULL);
+        if (writer->use_bytearray)
+            assert(PyByteArray_CheckExact(writer->buffer));
+        else
+            assert(PyBytes_CheckExact(writer->buffer));
+        assert(Py_REFCNT(writer->buffer) == 1);
+    }
+
+    if (writer->use_bytearray) {
+        /* bytearray has its own overallocation algorithm,
+           writer overallocation must be disabled */
+        assert(!writer->overallocate);
+    }
+
+    assert(0 <= writer->allocated);
+    assert(0 <= writer->min_size && writer->min_size <= writer->allocated);
+    /* the last byte must always be null */
+    start = _PyBytesWriter_AsString(writer);
+    assert(start[writer->allocated] == 0);
+
+    end = start + writer->allocated;
+    assert(str != NULL);
+    assert(start <= str && str <= end);
+#endif
+}
+
+void*
+_PyBytesWriter_Resize(_PyBytesWriter *writer, void *str, Py_ssize_t size)
+{
+    Py_ssize_t allocated, pos;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+    assert(writer->allocated < size);
+
+    allocated = size;
+    if (writer->overallocate
+        && allocated <= (PY_SSIZE_T_MAX - allocated / OVERALLOCATE_FACTOR)) {
+        /* overallocate to limit the number of realloc() */
+        allocated += allocated / OVERALLOCATE_FACTOR;
+    }
+
+    pos = _PyBytesWriter_GetSize(writer, str);
+    if (!writer->use_small_buffer) {
+        if (writer->use_bytearray) {
+            if (PyByteArray_Resize(writer->buffer, allocated))
+                goto error;
+            /* writer->allocated can be smaller than writer->buffer->ob_alloc,
+               but we cannot use ob_alloc because bytes may need to be moved
+               to use the whole buffer. bytearray uses an internal optimization
+               to avoid moving or copying bytes when bytes are removed at the
+               beginning (ex: del bytearray[:1]). */
+        }
+        else {
+            if (_PyBytes_Resize(&writer->buffer, allocated))
+                goto error;
+        }
+    }
+    else {
+        /* convert from stack buffer to bytes object buffer */
+        assert(writer->buffer == NULL);
+
+        if (writer->use_bytearray)
+            writer->buffer = PyByteArray_FromStringAndSize(NULL, allocated);
+        else
+            writer->buffer = PyBytes_FromStringAndSize(NULL, allocated);
+        if (writer->buffer == NULL)
+            goto error;
+
+        if (pos != 0) {
+            char *dest;
+            if (writer->use_bytearray)
+                dest = PyByteArray_AS_STRING(writer->buffer);
+            else
+                dest = PyBytes_AS_STRING(writer->buffer);
+            Py_MEMCPY(dest,
+                      writer->small_buffer,
+                      pos);
+        }
+
+        writer->use_small_buffer = 0;
+#ifdef Py_DEBUG
+        memset(writer->small_buffer, 0xDB, sizeof(writer->small_buffer));
+#endif
+    }
+    writer->allocated = allocated;
+
+    str = _PyBytesWriter_AsString(writer) + pos;
+    _PyBytesWriter_CheckConsistency(writer, str);
+    return str;
+
+error:
+    _PyBytesWriter_Dealloc(writer);
+    return NULL;
+}
+
+void*
+_PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
+{
+    Py_ssize_t new_min_size;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+    assert(size >= 0);
+
+    if (size == 0) {
+        /* nothing to do */
+        return str;
+    }
+
+    if (writer->min_size > PY_SSIZE_T_MAX - size) {
+        PyErr_NoMemory();
+        _PyBytesWriter_Dealloc(writer);
+        return NULL;
+    }
+    new_min_size = writer->min_size + size;
+
+    if (new_min_size > writer->allocated)
+        str = _PyBytesWriter_Resize(writer, str, new_min_size);
+
+    writer->min_size = new_min_size;
+    return str;
+}
+
+/* Allocate the buffer to write size bytes.
+   Return the pointer to the beginning of buffer data.
+   Raise an exception and return NULL on error. */
+void*
+_PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
+{
+    /* ensure that _PyBytesWriter_Alloc() is only called once */
+    assert(writer->min_size == 0 && writer->buffer == NULL);
+    assert(size >= 0);
+
+    writer->use_small_buffer = 1;
+#ifdef Py_DEBUG
+    writer->allocated = sizeof(writer->small_buffer) - 1;
+    /* In debug mode, don't use the full small buffer because it is less
+       efficient than bytes and bytearray objects to detect buffer underflow
+       and buffer overflow. Use 10 bytes of the small buffer to test also
+       code using the smaller buffer in debug mode.
+
+       Don't modify the _PyBytesWriter structure (use a shorter small buffer)
+       in debug mode to also be able to detect stack overflow when running
+       tests in debug mode. The _PyBytesWriter is large (more than 512 bytes),
+       if Py_EnterRecursiveCall() is not used in deep C callback, we may hit a
+       stack overflow. */
+    writer->allocated = Py_MIN(writer->allocated, 10);
+    /* _PyBytesWriter_CheckConsistency() requires the last byte to be 0,
+       to detect buffer overflow */
+    writer->small_buffer[writer->allocated] = 0;
+#else
+    writer->allocated = sizeof(writer->small_buffer);
+#endif
+    return _PyBytesWriter_Prepare(writer, writer->small_buffer, size);
+}
+
+PyObject *
+_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
+{
+    Py_ssize_t size;
+    PyObject *result;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+
+    size = _PyBytesWriter_GetSize(writer, str);
+    if (size == 0 && !writer->use_bytearray) {
+        Py_CLEAR(writer->buffer);
+        /* Get the empty byte string singleton */
+        result = PyBytes_FromStringAndSize(NULL, 0);
+    }
+    else if (writer->use_small_buffer) {
+        if (writer->use_bytearray) {
+            result = PyByteArray_FromStringAndSize(writer->small_buffer, size);
+        }
+        else {
+            result = PyBytes_FromStringAndSize(writer->small_buffer, size);
+        }
+    }
+    else {
+        result = writer->buffer;
+        writer->buffer = NULL;
+
+        if (size != writer->allocated) {
+            if (writer->use_bytearray) {
+                if (PyByteArray_Resize(result, size)) {
+                    Py_DECREF(result);
+                    return NULL;
+                }
+            }
+            else {
+                if (_PyBytes_Resize(&result, size)) {
+                    assert(result == NULL);
+                    return NULL;
+                }
+            }
+        }
+    }
+    return result;
+}
+
+void*
+_PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *ptr,
+                          const void *bytes, Py_ssize_t size)
+{
+    char *str = (char *)ptr;
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    Py_MEMCPY(str, bytes, size);
+    str += size;
+
+    return str;
+}
diff --git a/Objects/clinic/bytearrayobject.c.h b/Objects/clinic/bytearrayobject.c.h
index e87a221..e1cf03b 100644
--- a/Objects/clinic/bytearrayobject.c.h
+++ b/Objects/clinic/bytearrayobject.c.h
@@ -65,12 +65,14 @@ bytearray_translate(PyByteArrayObject *self, PyObject *args)
 
     switch (PyTuple_GET_SIZE(args)) {
         case 1:
-            if (!PyArg_ParseTuple(args, "O:translate", &table))
+            if (!PyArg_ParseTuple(args, "O:translate", &table)) {
                 goto exit;
+            }
             break;
         case 2:
-            if (!PyArg_ParseTuple(args, "OO:translate", &table, &deletechars))
+            if (!PyArg_ParseTuple(args, "OO:translate", &table, &deletechars)) {
                 goto exit;
+            }
             group_right_1 = 1;
             break;
         default:
@@ -108,17 +110,20 @@ bytearray_maketrans(void *null, PyObject *args)
     Py_buffer to = {NULL, NULL};
 
     if (!PyArg_ParseTuple(args, "y*y*:maketrans",
-        &frm, &to))
+        &frm, &to)) {
         goto exit;
+    }
     return_value = bytearray_maketrans_impl(&frm, &to);
 
 exit:
     /* Cleanup for frm */
-    if (frm.obj)
+    if (frm.obj) {
        PyBuffer_Release(&frm);
+    }
     /* Cleanup for to */
-    if (to.obj)
+    if (to.obj) {
        PyBuffer_Release(&to);
+    }
 
     return return_value;
 }
@@ -152,17 +157,20 @@ bytearray_replace(PyByteArrayObject *self, PyObject *args)
     Py_ssize_t count = -1;
 
     if (!PyArg_ParseTuple(args, "y*y*|n:replace",
-        &old, &new, &count))
+        &old, &new, &count)) {
         goto exit;
+    }
     return_value = bytearray_replace_impl(self, &old, &new, count);
 
 exit:
     /* Cleanup for old */
-    if (old.obj)
+    if (old.obj) {
        PyBuffer_Release(&old);
+    }
     /* Cleanup for new */
-    if (new.obj)
+    if (new.obj) {
        PyBuffer_Release(&new);
+    }
 
     return return_value;
 }
@@ -197,8 +205,9 @@ bytearray_split(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
     Py_ssize_t maxsplit = -1;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:split", _keywords,
-        &sep, &maxsplit))
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytearray_split_impl(self, sep, maxsplit);
 
 exit:
@@ -269,8 +278,9 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
     Py_ssize_t maxsplit = -1;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:rsplit", _keywords,
-        &sep, &maxsplit))
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytearray_rsplit_impl(self, sep, maxsplit);
 
 exit:
@@ -320,8 +330,9 @@ bytearray_insert(PyByteArrayObject *self, PyObject *args)
     int item;
 
     if (!PyArg_ParseTuple(args, "nO&:insert",
-        &index, _getbytevalue, &item))
+        &index, _getbytevalue, &item)) {
         goto exit;
+    }
     return_value = bytearray_insert_impl(self, index, item);
 
 exit:
@@ -349,8 +360,9 @@ bytearray_append(PyByteArrayObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     int item;
 
-    if (!PyArg_Parse(arg, "O&:append", _getbytevalue, &item))
+    if (!PyArg_Parse(arg, "O&:append", _getbytevalue, &item)) {
         goto exit;
+    }
     return_value = bytearray_append_impl(self, item);
 
 exit:
@@ -394,8 +406,9 @@ bytearray_pop(PyByteArrayObject *self, PyObject *args)
     Py_ssize_t index = -1;
 
     if (!PyArg_ParseTuple(args, "|n:pop",
-        &index))
+        &index)) {
         goto exit;
+    }
     return_value = bytearray_pop_impl(self, index);
 
 exit:
@@ -423,8 +436,9 @@ bytearray_remove(PyByteArrayObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     int value;
 
-    if (!PyArg_Parse(arg, "O&:remove", _getbytevalue, &value))
+    if (!PyArg_Parse(arg, "O&:remove", _getbytevalue, &value)) {
         goto exit;
+    }
     return_value = bytearray_remove_impl(self, value);
 
 exit:
@@ -453,8 +467,9 @@ bytearray_strip(PyByteArrayObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "strip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytearray_strip_impl(self, bytes);
 
 exit:
@@ -483,8 +498,9 @@ bytearray_lstrip(PyByteArrayObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "lstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytearray_lstrip_impl(self, bytes);
 
 exit:
@@ -513,8 +529,9 @@ bytearray_rstrip(PyByteArrayObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "rstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytearray_rstrip_impl(self, bytes);
 
 exit:
@@ -552,8 +569,9 @@ bytearray_decode(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
     const char *errors = NULL;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", _keywords,
-        &encoding, &errors))
+        &encoding, &errors)) {
         goto exit;
+    }
     return_value = bytearray_decode_impl(self, encoding, errors);
 
 exit:
@@ -596,8 +614,9 @@ bytearray_splitlines(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
     int keepends = 0;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i:splitlines", _keywords,
-        &keepends))
+        &keepends)) {
         goto exit;
+    }
     return_value = bytearray_splitlines_impl(self, keepends);
 
 exit:
@@ -617,17 +636,18 @@ PyDoc_STRVAR(bytearray_fromhex__doc__,
     {"fromhex", (PyCFunction)bytearray_fromhex, METH_O|METH_CLASS, bytearray_fromhex__doc__},
 
 static PyObject *
-bytearray_fromhex_impl(PyObject*cls, PyObject *string);
+bytearray_fromhex_impl(PyTypeObject *type, PyObject *string);
 
 static PyObject *
-bytearray_fromhex(PyTypeObject *cls, PyObject *arg)
+bytearray_fromhex(PyTypeObject *type, PyObject *arg)
 {
     PyObject *return_value = NULL;
     PyObject *string;
 
-    if (!PyArg_Parse(arg, "U:fromhex", &string))
+    if (!PyArg_Parse(arg, "U:fromhex", &string)) {
         goto exit;
-    return_value = bytearray_fromhex_impl((PyObject*)cls, string);
+    }
+    return_value = bytearray_fromhex_impl(type, string);
 
 exit:
     return return_value;
@@ -670,8 +690,9 @@ bytearray_reduce_ex(PyByteArrayObject *self, PyObject *args)
     int proto = 0;
 
     if (!PyArg_ParseTuple(args, "|i:__reduce_ex__",
-        &proto))
+        &proto)) {
         goto exit;
+    }
     return_value = bytearray_reduce_ex_impl(self, proto);
 
 exit:
@@ -695,4 +716,4 @@ bytearray_sizeof(PyByteArrayObject *self, PyObject *Py_UNUSED(ignored))
 {
     return bytearray_sizeof_impl(self);
 }
-/*[clinic end generated code: output=966c15ff22c5e243 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=a32f183ebef159cc input=a9049054013a1b77]*/
diff --git a/Objects/clinic/bytesobject.c.h b/Objects/clinic/bytesobject.c.h
index 5a1a5e9..0cced8e 100644
--- a/Objects/clinic/bytesobject.c.h
+++ b/Objects/clinic/bytesobject.c.h
@@ -20,10 +20,10 @@ PyDoc_STRVAR(bytes_split__doc__,
     {"split", (PyCFunction)bytes_split, METH_VARARGS|METH_KEYWORDS, bytes_split__doc__},
 
 static PyObject *
-bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit);
+bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit);
 
 static PyObject *
-bytes_split(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_split(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
     static char *_keywords[] = {"sep", "maxsplit", NULL};
@@ -31,8 +31,9 @@ bytes_split(PyBytesObject*self, PyObject *args, PyObject *kwargs)
     Py_ssize_t maxsplit = -1;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:split", _keywords,
-        &sep, &maxsplit))
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytes_split_impl(self, sep, maxsplit);
 
 exit:
@@ -64,14 +65,16 @@ bytes_partition(PyBytesObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     Py_buffer sep = {NULL, NULL};
 
-    if (!PyArg_Parse(arg, "y*:partition", &sep))
+    if (!PyArg_Parse(arg, "y*:partition", &sep)) {
         goto exit;
+    }
     return_value = bytes_partition_impl(self, &sep);
 
 exit:
     /* Cleanup for sep */
-    if (sep.obj)
+    if (sep.obj) {
        PyBuffer_Release(&sep);
+    }
 
     return return_value;
 }
@@ -101,14 +104,16 @@ bytes_rpartition(PyBytesObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     Py_buffer sep = {NULL, NULL};
 
-    if (!PyArg_Parse(arg, "y*:rpartition", &sep))
+    if (!PyArg_Parse(arg, "y*:rpartition", &sep)) {
         goto exit;
+    }
     return_value = bytes_rpartition_impl(self, &sep);
 
 exit:
     /* Cleanup for sep */
-    if (sep.obj)
+    if (sep.obj) {
        PyBuffer_Release(&sep);
+    }
 
     return return_value;
 }
@@ -133,10 +138,10 @@ PyDoc_STRVAR(bytes_rsplit__doc__,
     {"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS|METH_KEYWORDS, bytes_rsplit__doc__},
 
 static PyObject *
-bytes_rsplit_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit);
+bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit);
 
 static PyObject *
-bytes_rsplit(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_rsplit(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
     static char *_keywords[] = {"sep", "maxsplit", NULL};
@@ -144,8 +149,9 @@ bytes_rsplit(PyBytesObject*self, PyObject *args, PyObject *kwargs)
     Py_ssize_t maxsplit = -1;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:rsplit", _keywords,
-        &sep, &maxsplit))
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytes_rsplit_impl(self, sep, maxsplit);
 
 exit:
@@ -189,8 +195,9 @@ bytes_strip(PyBytesObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "strip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytes_strip_impl(self, bytes);
 
 exit:
@@ -219,8 +226,9 @@ bytes_lstrip(PyBytesObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "lstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytes_lstrip_impl(self, bytes);
 
 exit:
@@ -249,8 +257,9 @@ bytes_rstrip(PyBytesObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "rstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytes_rstrip_impl(self, bytes);
 
 exit:
@@ -284,12 +293,14 @@ bytes_translate(PyBytesObject *self, PyObject *args)
 
     switch (PyTuple_GET_SIZE(args)) {
         case 1:
-            if (!PyArg_ParseTuple(args, "O:translate", &table))
+            if (!PyArg_ParseTuple(args, "O:translate", &table)) {
                 goto exit;
+            }
             break;
         case 2:
-            if (!PyArg_ParseTuple(args, "OO:translate", &table, &deletechars))
+            if (!PyArg_ParseTuple(args, "OO:translate", &table, &deletechars)) {
                 goto exit;
+            }
             group_right_1 = 1;
             break;
         default:
@@ -327,17 +338,20 @@ bytes_maketrans(void *null, PyObject *args)
     Py_buffer to = {NULL, NULL};
 
     if (!PyArg_ParseTuple(args, "y*y*:maketrans",
-        &frm, &to))
+        &frm, &to)) {
         goto exit;
+    }
     return_value = bytes_maketrans_impl(&frm, &to);
 
 exit:
     /* Cleanup for frm */
-    if (frm.obj)
+    if (frm.obj) {
        PyBuffer_Release(&frm);
+    }
     /* Cleanup for to */
-    if (to.obj)
+    if (to.obj) {
        PyBuffer_Release(&to);
+    }
 
     return return_value;
 }
@@ -359,11 +373,11 @@ PyDoc_STRVAR(bytes_replace__doc__,
     {"replace", (PyCFunction)bytes_replace, METH_VARARGS, bytes_replace__doc__},
 
 static PyObject *
-bytes_replace_impl(PyBytesObject*self, Py_buffer *old, Py_buffer *new,
+bytes_replace_impl(PyBytesObject *self, Py_buffer *old, Py_buffer *new,
                    Py_ssize_t count);
 
 static PyObject *
-bytes_replace(PyBytesObject*self, PyObject *args)
+bytes_replace(PyBytesObject *self, PyObject *args)
 {
     PyObject *return_value = NULL;
     Py_buffer old = {NULL, NULL};
@@ -371,17 +385,20 @@ bytes_replace(PyBytesObject*self, PyObject *args)
     Py_ssize_t count = -1;
 
     if (!PyArg_ParseTuple(args, "y*y*|n:replace",
-        &old, &new, &count))
+        &old, &new, &count)) {
         goto exit;
+    }
     return_value = bytes_replace_impl(self, &old, &new, count);
 
 exit:
     /* Cleanup for old */
-    if (old.obj)
+    if (old.obj) {
        PyBuffer_Release(&old);
+    }
     /* Cleanup for new */
-    if (new.obj)
+    if (new.obj) {
        PyBuffer_Release(&new);
+    }
 
     return return_value;
 }
@@ -405,11 +422,11 @@ PyDoc_STRVAR(bytes_decode__doc__,
     {"decode", (PyCFunction)bytes_decode, METH_VARARGS|METH_KEYWORDS, bytes_decode__doc__},
 
 static PyObject *
-bytes_decode_impl(PyBytesObject*self, const char *encoding,
+bytes_decode_impl(PyBytesObject *self, const char *encoding,
                   const char *errors);
 
 static PyObject *
-bytes_decode(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_decode(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
     static char *_keywords[] = {"encoding", "errors", NULL};
@@ -417,8 +434,9 @@ bytes_decode(PyBytesObject*self, PyObject *args, PyObject *kwargs)
     const char *errors = NULL;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", _keywords,
-        &encoding, &errors))
+        &encoding, &errors)) {
         goto exit;
+    }
     return_value = bytes_decode_impl(self, encoding, errors);
 
 exit:
@@ -438,18 +456,19 @@ PyDoc_STRVAR(bytes_splitlines__doc__,
     {"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS|METH_KEYWORDS, bytes_splitlines__doc__},
 
 static PyObject *
-bytes_splitlines_impl(PyBytesObject*self, int keepends);
+bytes_splitlines_impl(PyBytesObject *self, int keepends);
 
 static PyObject *
-bytes_splitlines(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_splitlines(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
     static char *_keywords[] = {"keepends", NULL};
     int keepends = 0;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i:splitlines", _keywords,
-        &keepends))
+        &keepends)) {
         goto exit;
+    }
     return_value = bytes_splitlines_impl(self, keepends);
 
 exit:
@@ -477,11 +496,12 @@ bytes_fromhex(PyTypeObject *type, PyObject *arg)
     PyObject *return_value = NULL;
     PyObject *string;
 
-    if (!PyArg_Parse(arg, "U:fromhex", &string))
+    if (!PyArg_Parse(arg, "U:fromhex", &string)) {
         goto exit;
+    }
     return_value = bytes_fromhex_impl(type, string);
 
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=bd0ce8f25d7e18f4 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=6fe884a74e7d49cf input=a9049054013a1b77]*/
diff --git a/Objects/clinic/dictobject.c.h b/Objects/clinic/dictobject.c.h
index 5288b9a..d0cdfc3 100644
--- a/Objects/clinic/dictobject.c.h
+++ b/Objects/clinic/dictobject.c.h
@@ -23,8 +23,9 @@ dict_fromkeys(PyTypeObject *type, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "fromkeys",
         1, 2,
-        &iterable, &value))
+        &iterable, &value)) {
         goto exit;
+    }
     return_value = dict_fromkeys_impl(type, iterable, value);
 
 exit:
@@ -39,4 +40,4 @@ PyDoc_STRVAR(dict___contains____doc__,
 
 #define DICT___CONTAINS___METHODDEF    \
     {"__contains__", (PyCFunction)dict___contains__, METH_O|METH_COEXIST, dict___contains____doc__},
-/*[clinic end generated code: output=fe74d676332fdba6 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=926326109e3d9839 input=a9049054013a1b77]*/
diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h
index d42a700..891e90c 100644
--- a/Objects/clinic/unicodeobject.c.h
+++ b/Objects/clinic/unicodeobject.c.h
@@ -31,11 +31,12 @@ unicode_maketrans(void *null, PyObject *args)
     PyObject *z = NULL;
 
     if (!PyArg_ParseTuple(args, "O|UU:maketrans",
-        &x, &y, &z))
+        &x, &y, &z)) {
         goto exit;
+    }
     return_value = unicode_maketrans_impl(x, y, z);
 
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=94affdff5b2daff5 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=4a86dd108d92d104 input=a9049054013a1b77]*/
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 964ae62..f089f75 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -694,7 +694,8 @@ PyCode_Addr2Line(PyCodeObject *co, int addrq)
         addr += *p++;
         if (addr > addrq)
             break;
-        line += *p++;
+        line += (signed char)*p;
+        p++;
     }
     return line;
 }
@@ -729,17 +730,19 @@ _PyCode_CheckLineNumber(PyCodeObject* co, int lasti, PyAddrPair *bounds)
         if (addr + *p > lasti)
             break;
         addr += *p++;
-        if (*p)
+        if ((signed char)*p)
             bounds->ap_lower = addr;
-        line += *p++;
+        line += (signed char)*p;
+        p++;
         --size;
     }
 
     if (size > 0) {
         while (--size >= 0) {
             addr += *p++;
-            if (*p++)
+            if ((signed char)*p)
                 break;
+            p++;
         }
         bounds->ap_upper = addr;
     }
diff --git a/Objects/descrobject.c b/Objects/descrobject.c
index da68e3b..4bc73b9 100644
--- a/Objects/descrobject.c
+++ b/Objects/descrobject.c
@@ -22,7 +22,7 @@ descr_name(PyDescrObject *descr)
 }
 
 static PyObject *
-descr_repr(PyDescrObject *descr, char *format)
+descr_repr(PyDescrObject *descr, const char *format)
 {
     PyObject *name = NULL;
     if (descr->d_name != NULL && PyUnicode_Check(descr->d_name))
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index e04ab2b..7a3ed42 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -321,7 +321,7 @@ static PyDictKeysObject *new_keys_object(Py_ssize_t size)
 
     assert(size >= PyDict_MINSIZE_SPLIT);
     assert(IS_POWER_OF_2(size));
-    dk = PyMem_MALLOC(sizeof(PyDictKeysObject) +
+    dk = PyObject_MALLOC(sizeof(PyDictKeysObject) +
                       sizeof(PyDictKeyEntry) * (size-1));
     if (dk == NULL) {
         PyErr_NoMemory();
@@ -350,7 +350,7 @@ free_keys_object(PyDictKeysObject *keys)
         Py_XDECREF(entries[i].me_key);
         Py_XDECREF(entries[i].me_value);
     }
-    PyMem_FREE(keys);
+    PyObject_FREE(keys);
 }
 
 #define new_values(size) PyMem_NEW(PyObject *, size)
@@ -961,7 +961,7 @@ dictresize(PyDictObject *mp, Py_ssize_t minused)
             }
         }
         assert(oldkeys->dk_refcnt == 1);
-        DK_DEBUG_DECREF PyMem_FREE(oldkeys);
+        DK_DEBUG_DECREF PyObject_FREE(oldkeys);
     }
     return 0;
 }
@@ -1160,39 +1160,42 @@ _PyDict_GetItemIdWithError(PyObject *dp, struct _Py_Identifier *key)
     return PyDict_GetItemWithError(dp, kv);
 }
 
-/* Fast version of global value lookup.
+/* Fast version of global value lookup (LOAD_GLOBAL).
  * Lookup in globals, then builtins.
+ *
+ * Raise an exception and return NULL if an error occurred (ex: computing the
+ * key hash failed, key comparison failed, ...). Return NULL if the key doesn't
+ * exist. Return the value if the key exists.
  */
 PyObject *
 _PyDict_LoadGlobal(PyDictObject *globals, PyDictObject *builtins, PyObject *key)
 {
-    PyObject *x;
-    if (PyUnicode_CheckExact(key)) {
-        PyObject **value_addr;
-        Py_hash_t hash = ((PyASCIIObject *)key)->hash;
-        if (hash != -1) {
-            PyDictKeyEntry *e;
-            e = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
-            if (e == NULL) {
-                return NULL;
-            }
-            x = *value_addr;
-            if (x != NULL)
-                return x;
-            e = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
-            if (e == NULL) {
-                return NULL;
-            }
-            x = *value_addr;
-            return x;
-        }
+    Py_hash_t hash;
+    PyDictKeyEntry *entry;
+    PyObject **value_addr;
+    PyObject *value;
+
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1)
+    {
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            return NULL;
     }
-    x = PyDict_GetItemWithError((PyObject *)globals, key);
-    if (x != NULL)
-        return x;
-    if (PyErr_Occurred())
+
+    /* namespace 1: globals */
+    entry = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
+    if (entry == NULL)
         return NULL;
-    return PyDict_GetItemWithError((PyObject *)builtins, key);
+    value = *value_addr;
+    if (value != NULL)
+        return value;
+
+    /* namespace 2: builtins */
+    entry = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
+    if (entry == NULL)
+        return NULL;
+    return *value_addr;
 }
 
 /* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
@@ -1917,7 +1920,7 @@ dict_fromkeys_impl(PyTypeObject *type, PyObject *iterable, PyObject *value)
 }
 
 static int
-dict_update_common(PyObject *self, PyObject *args, PyObject *kwds, char *methname)
+dict_update_common(PyObject *self, PyObject *args, PyObject *kwds, const char *methname)
 {
     PyObject *arg = NULL;
     int result = 0;
diff --git a/Objects/exceptions.c b/Objects/exceptions.c
index c7cae2e..f829d32 100644
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@@ -59,15 +59,11 @@ BaseException_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 static int
 BaseException_init(PyBaseExceptionObject *self, PyObject *args, PyObject *kwds)
 {
-    PyObject *tmp;
-
     if (!_PyArg_NoKeywords(Py_TYPE(self)->tp_name, kwds))
         return -1;
 
-    tmp = self->args;
-    self->args = args;
-    Py_INCREF(self->args);
-    Py_XDECREF(tmp);
+    Py_INCREF(args);
+    Py_XSETREF(self->args, args);
 
     return 0;
 }
@@ -328,11 +324,10 @@ PyException_GetCause(PyObject *self) {
 
 /* Steals a reference to cause */
 void
-PyException_SetCause(PyObject *self, PyObject *cause) {
-    PyObject *old_cause = ((PyBaseExceptionObject *)self)->cause;
-    ((PyBaseExceptionObject *)self)->cause = cause;
+PyException_SetCause(PyObject *self, PyObject *cause)
+{
     ((PyBaseExceptionObject *)self)->suppress_context = 1;
-    Py_XDECREF(old_cause);
+    Py_XSETREF(((PyBaseExceptionObject *)self)->cause, cause);
 }
 
 PyObject *
@@ -344,10 +339,9 @@ PyException_GetContext(PyObject *self) {
 
 /* Steals a reference to context */
 void
-PyException_SetContext(PyObject *self, PyObject *context) {
-    PyObject *old_context = ((PyBaseExceptionObject *)self)->context;
-    ((PyBaseExceptionObject *)self)->context = context;
-    Py_XDECREF(old_context);
+PyException_SetContext(PyObject *self, PyObject *context)
+{
+    Py_XSETREF(((PyBaseExceptionObject *)self)->context, context);
 }
 
 
@@ -2610,7 +2604,9 @@ _PyExc_Init(PyObject *bltinmod)
     ADD_ERRNO(BlockingIOError, EWOULDBLOCK);
     POST_INIT(BrokenPipeError);
     ADD_ERRNO(BrokenPipeError, EPIPE);
+#ifdef ESHUTDOWN
     ADD_ERRNO(BrokenPipeError, ESHUTDOWN);
+#endif
     POST_INIT(ChildProcessError);
     ADD_ERRNO(ChildProcessError, ECHILD);
     POST_INIT(ConnectionAbortedError);
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index d92bec3..da600f4 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -215,35 +215,49 @@ double
 PyFloat_AsDouble(PyObject *op)
 {
     PyNumberMethods *nb;
-    PyFloatObject *fo;
+    PyObject *res;
     double val;
 
-    if (op && PyFloat_Check(op))
-        return PyFloat_AS_DOUBLE((PyFloatObject*) op);
-
     if (op == NULL) {
         PyErr_BadArgument();
         return -1;
     }
 
-    if ((nb = Py_TYPE(op)->tp_as_number) == NULL || nb->nb_float == NULL) {
-        PyErr_SetString(PyExc_TypeError, "a float is required");
-        return -1;
+    if (PyFloat_Check(op)) {
+        return PyFloat_AS_DOUBLE(op);
     }
 
-    fo = (PyFloatObject*) (*nb->nb_float) (op);
-    if (fo == NULL)
-        return -1;
-    if (!PyFloat_Check(fo)) {
-        Py_DECREF(fo);
-        PyErr_SetString(PyExc_TypeError,
-                        "nb_float should return float object");
+    nb = Py_TYPE(op)->tp_as_number;
+    if (nb == NULL || nb->nb_float == NULL) {
+        PyErr_Format(PyExc_TypeError, "must be real number, not %.50s",
+                     op->ob_type->tp_name);
         return -1;
     }
 
-    val = PyFloat_AS_DOUBLE(fo);
-    Py_DECREF(fo);
+    res = (*nb->nb_float) (op);
+    if (res == NULL) {
+        return -1;
+    }
+    if (!PyFloat_CheckExact(res)) {
+        if (!PyFloat_Check(res)) {
+            PyErr_Format(PyExc_TypeError,
+                         "%.50s.__float__ returned non-float (type %.50s)",
+                         op->ob_type->tp_name, res->ob_type->tp_name);
+            Py_DECREF(res);
+            return -1;
+        }
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "%.50s.__float__ returned non-float (type %.50s).  "
+                "The ability to return an instance of a strict subclass of float "
+                "is deprecated, and may be removed in a future version of Python.",
+                op->ob_type->tp_name, res->ob_type->tp_name)) {
+            Py_DECREF(res);
+            return -1;
+        }
+    }
 
+    val = PyFloat_AS_DOUBLE(res);
+    Py_DECREF(res);
     return val;
 }
 
@@ -1195,7 +1209,7 @@ Return a hexadecimal representation of a floating-point number.\n\
 static PyObject *
 float_fromhex(PyObject *cls, PyObject *arg)
 {
-    PyObject *result_as_float, *result;
+    PyObject *result;
     double x;
     long exp, top_exp, lsb, key_digit;
     char *s, *coeff_start, *s_store, *coeff_end, *exp_start, *s_end;
@@ -1410,11 +1424,10 @@ float_fromhex(PyObject *cls, PyObject *arg)
         s++;
     if (s != s_end)
         goto parse_error;
-    result_as_float = Py_BuildValue("(d)", negate ? -x : x);
-    if (result_as_float == NULL)
-        return NULL;
-    result = PyObject_CallObject(cls, result_as_float);
-    Py_DECREF(result_as_float);
+    result = PyFloat_FromDouble(negate ? -x : x);
+    if (cls != (PyObject *)&PyFloat_Type && result != NULL) {
+        Py_SETREF(result, PyObject_CallFunctionObjArgs(cls, result, NULL));
+    }
     return result;
 
   overflow_error:
@@ -1451,29 +1464,23 @@ float_as_integer_ratio(PyObject *v, PyObject *unused)
     int exponent;
     int i;
 
-    PyObject *prev;
     PyObject *py_exponent = NULL;
     PyObject *numerator = NULL;
     PyObject *denominator = NULL;
     PyObject *result_pair = NULL;
     PyNumberMethods *long_methods = PyLong_Type.tp_as_number;
 
-#define INPLACE_UPDATE(obj, call) \
-    prev = obj; \
-    obj = call; \
-    Py_DECREF(prev); \
-
     CONVERT_TO_DOUBLE(v, self);
 
     if (Py_IS_INFINITY(self)) {
-      PyErr_SetString(PyExc_OverflowError,
-                      "Cannot pass infinity to float.as_integer_ratio.");
-      return NULL;
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot convert Infinity to integer ratio");
+        return NULL;
     }
     if (Py_IS_NAN(self)) {
-      PyErr_SetString(PyExc_ValueError,
-                      "Cannot pass NaN to float.as_integer_ratio.");
-      return NULL;
+        PyErr_SetString(PyExc_ValueError,
+                        "cannot convert NaN to integer ratio");
+        return NULL;
     }
 
     PyFPE_START_PROTECT("as_integer_ratio", goto error);
@@ -1489,29 +1496,31 @@ float_as_integer_ratio(PyObject *v, PyObject *unused)
        to be truncated by PyLong_FromDouble(). */
 
     numerator = PyLong_FromDouble(float_part);
-    if (numerator == NULL) goto error;
+    if (numerator == NULL)
+        goto error;
+    denominator = PyLong_FromLong(1);
+    if (denominator == NULL)
+        goto error;
+    py_exponent = PyLong_FromLong(Py_ABS(exponent));
+    if (py_exponent == NULL)
+        goto error;
 
     /* fold in 2**exponent */
-    denominator = PyLong_FromLong(1);
-    py_exponent = PyLong_FromLong(labs((long)exponent));
-    if (py_exponent == NULL) goto error;
-    INPLACE_UPDATE(py_exponent,
-                   long_methods->nb_lshift(denominator, py_exponent));
-    if (py_exponent == NULL) goto error;
     if (exponent > 0) {
-        INPLACE_UPDATE(numerator,
-                       long_methods->nb_multiply(numerator, py_exponent));
-        if (numerator == NULL) goto error;
+        Py_SETREF(numerator,
+                  long_methods->nb_lshift(numerator, py_exponent));
+        if (numerator == NULL)
+            goto error;
     }
     else {
-        Py_DECREF(denominator);
-        denominator = py_exponent;
-        py_exponent = NULL;
+        Py_SETREF(denominator,
+                  long_methods->nb_lshift(denominator, py_exponent));
+        if (denominator == NULL)
+            goto error;
     }
 
     result_pair = PyTuple_Pack(2, numerator, denominator);
 
-#undef INPLACE_UPDATE
 error:
     Py_XDECREF(py_exponent);
     Py_XDECREF(denominator);
diff --git a/Objects/frameobject.c b/Objects/frameobject.c
index 9aadd61..b115614 100644
--- a/Objects/frameobject.c
+++ b/Objects/frameobject.c
@@ -137,7 +137,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
         new_lasti = -1;
         for (offset = 0; offset < lnotab_len; offset += 2) {
             addr += lnotab[offset];
-            line += lnotab[offset+1];
+            line += (signed char)lnotab[offset+1];
             if (line >= new_lineno) {
                 new_lasti = addr;
                 new_lineno = line;
@@ -189,7 +189,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
     memset(blockstack, '\0', sizeof(blockstack));
     memset(in_finally, '\0', sizeof(in_finally));
     blockstack_top = 0;
-    for (addr = 0; addr < code_len; addr++) {
+    for (addr = 0; addr < code_len; addr += 2) {
         unsigned char op = code[addr];
         switch (op) {
         case SETUP_LOOP:
@@ -251,10 +251,6 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
                 }
             }
         }
-
-        if (op >= HAVE_ARGUMENT) {
-            addr += 2;
-        }
     }
 
     /* Verify that the blockstack tracking code didn't get lost. */
@@ -277,7 +273,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
      * can tell whether the jump goes into any blocks without coming out
      * again - in that case we raise an exception below. */
     delta_iblock = 0;
-    for (addr = min_addr; addr < max_addr; addr++) {
+    for (addr = min_addr; addr < max_addr; addr += 2) {
         unsigned char op = code[addr];
         switch (op) {
         case SETUP_LOOP:
@@ -294,10 +290,6 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
         }
 
         min_delta_iblock = Py_MIN(min_delta_iblock, delta_iblock);
-
-        if (op >= HAVE_ARGUMENT) {
-            addr += 2;
-        }
     }
 
     /* Derive the absolute iblock values from the deltas. */
diff --git a/Objects/funcobject.c b/Objects/funcobject.c
index e6c327d..261c16d 100644
--- a/Objects/funcobject.c
+++ b/Objects/funcobject.c
@@ -249,7 +249,6 @@ func_get_code(PyFunctionObject *op)
 static int
 func_set_code(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
     Py_ssize_t nfree, nclosure;
 
     /* Not legal to del f.func_code or to set it to anything
@@ -270,10 +269,8 @@ func_set_code(PyFunctionObject *op, PyObject *value)
                      nclosure, nfree);
         return -1;
     }
-    tmp = op->func_code;
     Py_INCREF(value);
-    op->func_code = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->func_code, value);
     return 0;
 }
 
@@ -287,8 +284,6 @@ func_get_name(PyFunctionObject *op)
 static int
 func_set_name(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del f.func_name or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -296,10 +291,8 @@ func_set_name(PyFunctionObject *op, PyObject *value)
                         "__name__ must be set to a string object");
         return -1;
     }
-    tmp = op->func_name;
     Py_INCREF(value);
-    op->func_name = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->func_name, value);
     return 0;
 }
 
@@ -313,8 +306,6 @@ func_get_qualname(PyFunctionObject *op)
 static int
 func_set_qualname(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del f.__qualname__ or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -322,10 +313,8 @@ func_set_qualname(PyFunctionObject *op, PyObject *value)
                         "__qualname__ must be set to a string object");
         return -1;
     }
-    tmp = op->func_qualname;
     Py_INCREF(value);
-    op->func_qualname = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->func_qualname, value);
     return 0;
 }
 
@@ -343,8 +332,6 @@ func_get_defaults(PyFunctionObject *op)
 static int
 func_set_defaults(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Legal to del f.func_defaults.
      * Can only set func_defaults to NULL or a tuple. */
     if (value == Py_None)
@@ -354,10 +341,8 @@ func_set_defaults(PyFunctionObject *op, PyObject *value)
                         "__defaults__ must be set to a tuple object");
         return -1;
     }
-    tmp = op->func_defaults;
     Py_XINCREF(value);
-    op->func_defaults = value;
-    Py_XDECREF(tmp);
+    Py_XSETREF(op->func_defaults, value);
     return 0;
 }
 
@@ -375,8 +360,6 @@ func_get_kwdefaults(PyFunctionObject *op)
 static int
 func_set_kwdefaults(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     if (value == Py_None)
         value = NULL;
     /* Legal to del f.func_kwdefaults.
@@ -386,10 +369,8 @@ func_set_kwdefaults(PyFunctionObject *op, PyObject *value)
             "__kwdefaults__ must be set to a dict object");
         return -1;
     }
-    tmp = op->func_kwdefaults;
     Py_XINCREF(value);
-    op->func_kwdefaults = value;
-    Py_XDECREF(tmp);
+    Py_XSETREF(op->func_kwdefaults, value);
     return 0;
 }
 
@@ -408,8 +389,6 @@ func_get_annotations(PyFunctionObject *op)
 static int
 func_set_annotations(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     if (value == Py_None)
         value = NULL;
     /* Legal to del f.func_annotations.
@@ -420,10 +399,8 @@ func_set_annotations(PyFunctionObject *op, PyObject *value)
             "__annotations__ must be set to a dict object");
         return -1;
     }
-    tmp = op->func_annotations;
     Py_XINCREF(value);
-    op->func_annotations = value;
-    Py_XDECREF(tmp);
+    Py_XSETREF(op->func_annotations, value);
     return 0;
 }
 
diff --git a/Objects/genobject.c b/Objects/genobject.c
index b3e0a46..c028db5 100644
--- a/Objects/genobject.c
+++ b/Objects/genobject.c
@@ -187,7 +187,7 @@ gen_send_ex(PyGenObject *gen, PyObject *arg, int exc, int closing)
             /* Pop the exception before issuing a warning. */
             PyErr_Fetch(&exc, &val, &tb);
 
-            if (PyErr_WarnFormat(PyExc_PendingDeprecationWarning, 1,
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                                  "generator '%.50S' raised StopIteration",
                                  gen->gi_qualname)) {
                 /* Warning was converted to an error. */
@@ -277,7 +277,7 @@ _PyGen_yf(PyGenObject *gen)
         PyObject *bytecode = f->f_code->co_code;
         unsigned char *code = (unsigned char *)PyBytes_AS_STRING(bytecode);
 
-        if (code[f->f_lasti + 1] != YIELD_FROM)
+        if (code[f->f_lasti + 2] != YIELD_FROM)
             return NULL;
         yf = f->f_stacktop[-1];
         Py_INCREF(yf);
@@ -376,7 +376,7 @@ gen_throw(PyGenObject *gen, PyObject *args)
             assert(ret == yf);
             Py_DECREF(ret);
             /* Termination repetition of YIELD_FROM */
-            gen->gi_frame->f_lasti++;
+            gen->gi_frame->f_lasti += 2;
             if (_PyGen_FetchStopIterationValue(&val) == 0) {
                 ret = gen_send_ex(gen, val, 0, 0);
                 Py_DECREF(val);
@@ -519,8 +519,6 @@ gen_get_name(PyGenObject *op)
 static int
 gen_set_name(PyGenObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del gen.gi_name or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -528,10 +526,8 @@ gen_set_name(PyGenObject *op, PyObject *value)
                         "__name__ must be set to a string object");
         return -1;
     }
-    tmp = op->gi_name;
     Py_INCREF(value);
-    op->gi_name = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->gi_name, value);
     return 0;
 }
 
@@ -545,8 +541,6 @@ gen_get_qualname(PyGenObject *op)
 static int
 gen_set_qualname(PyGenObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del gen.__qualname__ or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -554,10 +548,8 @@ gen_set_qualname(PyGenObject *op, PyObject *value)
                         "__qualname__ must be set to a string object");
         return -1;
     }
-    tmp = op->gi_qualname;
     Py_INCREF(value);
-    op->gi_qualname = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->gi_qualname, value);
     return 0;
 }
 
diff --git a/Objects/listobject.c b/Objects/listobject.c
index d688179..0b2c8c1 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -82,6 +82,16 @@ static size_t count_reuse = 0;
 static void
 show_alloc(void)
 {
+    PyObject *xoptions, *value;
+    _Py_IDENTIFIER(showalloccount);
+
+    xoptions = PySys_GetXOptions();
+    if (xoptions == NULL)
+        return;
+    value = _PyDict_GetItemId(xoptions, &PyId_showalloccount);
+    if (value != Py_True)
+        return;
+
     fprintf(stderr, "List allocations: %" PY_FORMAT_SIZE_T "d\n",
         count_alloc);
     fprintf(stderr, "List reuse through freelist: %" PY_FORMAT_SIZE_T
@@ -216,7 +226,6 @@ int
 PyList_SetItem(PyObject *op, Py_ssize_t i,
                PyObject *newitem)
 {
-    PyObject *olditem;
     PyObject **p;
     if (!PyList_Check(op)) {
         Py_XDECREF(newitem);
@@ -230,9 +239,7 @@ PyList_SetItem(PyObject *op, Py_ssize_t i,
         return -1;
     }
     p = ((PyListObject *)op) -> ob_item + i;
-    olditem = *p;
-    *p = newitem;
-    Py_XDECREF(olditem);
+    Py_XSETREF(*p, newitem);
     return 0;
 }
 
@@ -251,7 +258,7 @@ ins1(PyListObject *self, Py_ssize_t where, PyObject *v)
         return -1;
     }
 
-    if (list_resize(self, n+1) == -1)
+    if (list_resize(self, n+1) < 0)
         return -1;
 
     if (where < 0) {
@@ -291,7 +298,7 @@ app1(PyListObject *self, PyObject *v)
         return -1;
     }
 
-    if (list_resize(self, n+1) == -1)
+    if (list_resize(self, n+1) < 0)
         return -1;
 
     Py_INCREF(v);
@@ -481,9 +488,9 @@ list_concat(PyListObject *a, PyObject *bb)
         return NULL;
     }
 #define b ((PyListObject *)bb)
-    size = Py_SIZE(a) + Py_SIZE(b);
-    if (size < 0)
+    if (Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b))
         return PyErr_NoMemory();
+    size = Py_SIZE(a) + Py_SIZE(b);
     np = (PyListObject *) PyList_New(size);
     if (np == NULL) {
         return NULL;
@@ -711,7 +718,7 @@ list_inplace_repeat(PyListObject *self, Py_ssize_t n)
         return PyErr_NoMemory();
     }
 
-    if (list_resize(self, size*n) == -1)
+    if (list_resize(self, size*n) < 0)
         return NULL;
 
     p = size;
@@ -730,7 +737,6 @@ list_inplace_repeat(PyListObject *self, Py_ssize_t n)
 static int
 list_ass_item(PyListObject *a, Py_ssize_t i, PyObject *v)
 {
-    PyObject *old_value;
     if (i < 0 || i >= Py_SIZE(a)) {
         PyErr_SetString(PyExc_IndexError,
                         "list assignment index out of range");
@@ -739,9 +745,7 @@ list_ass_item(PyListObject *a, Py_ssize_t i, PyObject *v)
     if (v == NULL)
         return list_ass_slice(a, i, i+1, v);
     Py_INCREF(v);
-    old_value = a->ob_item[i];
-    a->ob_item[i] = v;
-    Py_DECREF(old_value);
+    Py_SETREF(a->ob_item[i], v);
     return 0;
 }
 
@@ -804,7 +808,7 @@ listextend(PyListObject *self, PyObject *b)
             Py_RETURN_NONE;
         }
         m = Py_SIZE(self);
-        if (list_resize(self, m + n) == -1) {
+        if (list_resize(self, m + n) < 0) {
             Py_DECREF(b);
             return NULL;
         }
@@ -832,23 +836,25 @@ listextend(PyListObject *self, PyObject *b)
 
     /* Guess a result list size. */
     n = PyObject_LengthHint(b, 8);
-    if (n == -1) {
+    if (n < 0) {
         Py_DECREF(it);
         return NULL;
     }
     m = Py_SIZE(self);
-    mn = m + n;
-    if (mn >= m) {
+    if (m > PY_SSIZE_T_MAX - n) {
+        /* m + n overflowed; on the chance that n lied, and there really
+         * is enough room, ignore it.  If n was telling the truth, we'll
+         * eventually run out of memory during the loop.
+         */
+    }
+    else {
+        mn = m + n;
         /* Make room. */
-        if (list_resize(self, mn) == -1)
+        if (list_resize(self, mn) < 0)
             goto error;
         /* Make the list sane again. */
         Py_SIZE(self) = m;
     }
-    /* Else m + n overflowed; on the chance that n lied, and there really
-     * is enough room, ignore it.  If n was telling the truth, we'll
-     * eventually run out of memory during the loop.
-     */
 
     /* Run iterator to exhaustion. */
     for (;;) {
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 832e4f2..fef982f 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -486,7 +486,7 @@ sub-run, yet finding such very efficiently when they exist.
 I first learned about the galloping strategy in a related context; see:
 
     "Adaptive Set Intersections, Unions, and Differences" (2000)
-    Erik D. Demaine, Alejandro L�pez-Ortiz, J. Ian Munro
+    Erik D. Demaine, Alejandro López-Ortiz, J. Ian Munro
 
 and its followup(s).  An earlier paper called the same strategy
 "exponential search":
diff --git a/Objects/lnotab_notes.txt b/Objects/lnotab_notes.txt
index d247edd..5153757 100644
--- a/Objects/lnotab_notes.txt
+++ b/Objects/lnotab_notes.txt
@@ -12,42 +12,47 @@ pairs.  The details are important and delicate, best illustrated by example:
         0		    1
         6		    2
        50		    7
-      350                 307
-      361                 308
+      350                 207
+      361                 208
 
 Instead of storing these numbers literally, we compress the list by storing only
-the increments from one row to the next.  Conceptually, the stored list might
+the difference from one row to the next.  Conceptually, the stored list might
 look like:
 
-    0, 1,  6, 1,  44, 5,  300, 300,  11, 1
+    0, 1,  6, 1,  44, 5,  300, 200,  11, 1
 
-The above doesn't really work, but it's a start. Note that an unsigned byte
-can't hold negative values, or values larger than 255, and the above example
-contains two such values. So we make two tweaks:
+The above doesn't really work, but it's a start. An unsigned byte (byte code
+offset) can't hold negative values, or values larger than 255, a signed byte
+(line number) can't hold values larger than 127 or less than -128, and the
+above example contains two such values. So we make two tweaks:
 
- (a) there's a deep assumption that byte code offsets and their corresponding
- line #s both increase monotonically, and
- (b) if at least one column jumps by more than 255 from one row to the next,
- more than one pair is written to the table. In case #b, there's no way to know
- from looking at the table later how many were written.  That's the delicate
- part.  A user of co_lnotab desiring to find the source line number
- corresponding to a bytecode address A should do something like this
+ (a) there's a deep assumption that byte code offsets increase monotonically,
+ and
+ (b) if byte code offset jumps by more than 255 from one row to the next, or if
+ source code line number jumps by more than 127 or less than -128 from one row
+ to the next, more than one pair is written to the table. In case #b,
+ there's no way to know from looking at the table later how many were written.
+ That's the delicate part.  A user of co_lnotab desiring to find the source
+ line number corresponding to a bytecode address A should do something like
+ this:
 
     lineno = addr = 0
     for addr_incr, line_incr in co_lnotab:
         addr += addr_incr
         if addr > A:
             return lineno
+        if line_incr >= 0x80:
+            line_incr -= 0x100
         lineno += line_incr
 
 (In C, this is implemented by PyCode_Addr2Line().)  In order for this to work,
 when the addr field increments by more than 255, the line # increment in each
 pair generated must be 0 until the remaining addr increment is < 256.  So, in
 the example above, assemble_lnotab in compile.c should not (as was actually done
-until 2.2) expand 300, 300 to
+until 2.2) expand 300, 200 to
     255, 255, 45, 45,
 but to
-    255, 0, 45, 255, 0, 45.
+    255, 0, 45, 128, 0, 72.
 
 The above is sufficient to reconstruct line numbers for tracebacks, but not for
 line tracing.  Tracing is handled by PyCode_CheckLineNumber() in codeobject.c
@@ -90,16 +95,16 @@ which compiles to this:
               6 POP_JUMP_IF_FALSE       17
 
   3           9 LOAD_CONST               1 (1)
-             12 PRINT_ITEM          
+             12 PRINT_ITEM
 
-  4          13 BREAK_LOOP          
+  4          13 BREAK_LOOP
              14 JUMP_ABSOLUTE            3
-        >>   17 POP_BLOCK           
+        >>   17 POP_BLOCK
 
   6          18 LOAD_CONST               2 (2)
-             21 PRINT_ITEM          
+             21 PRINT_ITEM
         >>   22 LOAD_CONST               0 (None)
-             25 RETURN_VALUE        
+             25 RETURN_VALUE
 
 If 'a' is false, execution will jump to the POP_BLOCK instruction at offset 17
 and the co_lnotab will claim that execution has moved to line 4, which is wrong.
diff --git a/Objects/longobject.c b/Objects/longobject.c
index cd04e36..d49594d 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -1582,10 +1582,12 @@ divrem1(PyLongObject *a, digit n, digit *prem)
 static int
 long_to_decimal_string_internal(PyObject *aa,
                                 PyObject **p_output,
-                                _PyUnicodeWriter *writer)
+                                _PyUnicodeWriter *writer,
+                                _PyBytesWriter *bytes_writer,
+                                char **bytes_str)
 {
     PyLongObject *scratch, *a;
-    PyObject *str;
+    PyObject *str = NULL;
     Py_ssize_t size, strlen, size_a, i, j;
     digit *pout, *pin, rem, tenpow;
     int negative;
@@ -1662,7 +1664,13 @@ long_to_decimal_string_internal(PyObject *aa,
             return -1;
         }
         kind = writer->kind;
-        str = NULL;
+    }
+    else if (bytes_writer) {
+        *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, strlen);
+        if (*bytes_str == NULL) {
+            Py_DECREF(scratch);
+            return -1;
+        }
     }
     else {
         str = PyUnicode_New(strlen, '9');
@@ -1673,13 +1681,8 @@ long_to_decimal_string_internal(PyObject *aa,
         kind = PyUnicode_KIND(str);
     }
 
-#define WRITE_DIGITS(TYPE)                                            \
+#define WRITE_DIGITS(p)                                               \
     do {                                                              \
-        if (writer)                                                   \
-            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \
-        else                                                          \
-            p = (TYPE*)PyUnicode_DATA(str) + strlen;                  \
-                                                                      \
         /* pout[0] through pout[size-2] contribute exactly            \
            _PyLong_DECIMAL_SHIFT digits each */                       \
         for (i=0; i < size - 1; i++) {                                \
@@ -1699,6 +1702,16 @@ long_to_decimal_string_internal(PyObject *aa,
         /* and sign */                                                \
         if (negative)                                                 \
             *--p = '-';                                               \
+    } while (0)
+
+#define WRITE_UNICODE_DIGITS(TYPE)                                    \
+    do {                                                              \
+        if (writer)                                                   \
+            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \
+        else                                                          \
+            p = (TYPE*)PyUnicode_DATA(str) + strlen;                  \
+                                                                      \
+        WRITE_DIGITS(p);                                              \
                                                                       \
         /* check we've counted correctly */                           \
         if (writer)                                                   \
@@ -1708,25 +1721,34 @@ long_to_decimal_string_internal(PyObject *aa,
     } while (0)
 
     /* fill the string right-to-left */
-    if (kind == PyUnicode_1BYTE_KIND) {
+    if (bytes_writer) {
+        char *p = *bytes_str + strlen;
+        WRITE_DIGITS(p);
+        assert(p == *bytes_str);
+    }
+    else if (kind == PyUnicode_1BYTE_KIND) {
         Py_UCS1 *p;
-        WRITE_DIGITS(Py_UCS1);
+        WRITE_UNICODE_DIGITS(Py_UCS1);
     }
     else if (kind == PyUnicode_2BYTE_KIND) {
         Py_UCS2 *p;
-        WRITE_DIGITS(Py_UCS2);
+        WRITE_UNICODE_DIGITS(Py_UCS2);
     }
     else {
         Py_UCS4 *p;
         assert (kind == PyUnicode_4BYTE_KIND);
-        WRITE_DIGITS(Py_UCS4);
+        WRITE_UNICODE_DIGITS(Py_UCS4);
     }
 #undef WRITE_DIGITS
+#undef WRITE_UNICODE_DIGITS
 
     Py_DECREF(scratch);
     if (writer) {
         writer->pos += strlen;
     }
+    else if (bytes_writer) {
+        (*bytes_str) += strlen;
+    }
     else {
         assert(_PyUnicode_CheckConsistency(str, 1));
         *p_output = (PyObject *)str;
@@ -1738,7 +1760,7 @@ static PyObject *
 long_to_decimal_string(PyObject *aa)
 {
     PyObject *v;
-    if (long_to_decimal_string_internal(aa, &v, NULL) == -1)
+    if (long_to_decimal_string_internal(aa, &v, NULL, NULL, NULL) == -1)
         return NULL;
     return v;
 }
@@ -1750,10 +1772,11 @@ long_to_decimal_string(PyObject *aa)
 
 static int
 long_format_binary(PyObject *aa, int base, int alternate,
-                   PyObject **p_output, _PyUnicodeWriter *writer)
+                   PyObject **p_output, _PyUnicodeWriter *writer,
+                   _PyBytesWriter *bytes_writer, char **bytes_str)
 {
     PyLongObject *a = (PyLongObject *)aa;
-    PyObject *v;
+    PyObject *v = NULL;
     Py_ssize_t sz;
     Py_ssize_t size_a;
     enum PyUnicode_Kind kind;
@@ -1810,7 +1833,11 @@ long_format_binary(PyObject *aa, int base, int alternate,
         if (_PyUnicodeWriter_Prepare(writer, sz, 'x') == -1)
             return -1;
         kind = writer->kind;
-        v = NULL;
+    }
+    else if (bytes_writer) {
+        *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, sz);
+        if (*bytes_str == NULL)
+            return -1;
     }
     else {
         v = PyUnicode_New(sz, 'x');
@@ -1819,13 +1846,8 @@ long_format_binary(PyObject *aa, int base, int alternate,
         kind = PyUnicode_KIND(v);
     }
 
-#define WRITE_DIGITS(TYPE)                                              \
+#define WRITE_DIGITS(p)                                                 \
     do {                                                                \
-        if (writer)                                                     \
-            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \
-        else                                                            \
-            p = (TYPE*)PyUnicode_DATA(v) + sz;                          \
-                                                                        \
         if (size_a == 0) {                                              \
             *--p = '0';                                                 \
         }                                                               \
@@ -1860,30 +1882,50 @@ long_format_binary(PyObject *aa, int base, int alternate,
         }                                                               \
         if (negative)                                                   \
             *--p = '-';                                                 \
+    } while (0)
+
+#define WRITE_UNICODE_DIGITS(TYPE)                                      \
+    do {                                                                \
+        if (writer)                                                     \
+            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \
+        else                                                            \
+            p = (TYPE*)PyUnicode_DATA(v) + sz;                          \
+                                                                        \
+        WRITE_DIGITS(p);                                                \
+                                                                        \
         if (writer)                                                     \
             assert(p == ((TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos)); \
         else                                                            \
             assert(p == (TYPE*)PyUnicode_DATA(v));                      \
     } while (0)
 
-    if (kind == PyUnicode_1BYTE_KIND) {
+    if (bytes_writer) {
+        char *p = *bytes_str + sz;
+        WRITE_DIGITS(p);
+        assert(p == *bytes_str);
+    }
+    else if (kind == PyUnicode_1BYTE_KIND) {
         Py_UCS1 *p;
-        WRITE_DIGITS(Py_UCS1);
+        WRITE_UNICODE_DIGITS(Py_UCS1);
     }
     else if (kind == PyUnicode_2BYTE_KIND) {
         Py_UCS2 *p;
-        WRITE_DIGITS(Py_UCS2);
+        WRITE_UNICODE_DIGITS(Py_UCS2);
     }
     else {
         Py_UCS4 *p;
         assert (kind == PyUnicode_4BYTE_KIND);
-        WRITE_DIGITS(Py_UCS4);
+        WRITE_UNICODE_DIGITS(Py_UCS4);
     }
 #undef WRITE_DIGITS
+#undef WRITE_UNICODE_DIGITS
 
     if (writer) {
         writer->pos += sz;
     }
+    else if (bytes_writer) {
+        (*bytes_str) += sz;
+    }
     else {
         assert(_PyUnicode_CheckConsistency(v, 1));
         *p_output = v;
@@ -1897,9 +1939,9 @@ _PyLong_Format(PyObject *obj, int base)
     PyObject *str;
     int err;
     if (base == 10)
-        err = long_to_decimal_string_internal(obj, &str, NULL);
+        err = long_to_decimal_string_internal(obj, &str, NULL, NULL, NULL);
     else
-        err = long_format_binary(obj, base, 1, &str, NULL);
+        err = long_format_binary(obj, base, 1, &str, NULL, NULL, NULL);
     if (err == -1)
         return NULL;
     return str;
@@ -1911,9 +1953,31 @@ _PyLong_FormatWriter(_PyUnicodeWriter *writer,
                      int base, int alternate)
 {
     if (base == 10)
-        return long_to_decimal_string_internal(obj, NULL, writer);
+        return long_to_decimal_string_internal(obj, NULL, writer,
+                                               NULL, NULL);
+    else
+        return long_format_binary(obj, base, alternate, NULL, writer,
+                                  NULL, NULL);
+}
+
+char*
+_PyLong_FormatBytesWriter(_PyBytesWriter *writer, char *str,
+                          PyObject *obj,
+                          int base, int alternate)
+{
+    char *str2;
+    int res;
+    str2 = str;
+    if (base == 10)
+        res = long_to_decimal_string_internal(obj, NULL, NULL,
+                                              writer, &str2);
     else
-        return long_format_binary(obj, base, alternate, NULL, writer);
+        res = long_format_binary(obj, base, alternate, NULL, NULL,
+                                 writer, &str2);
+    if (res < 0)
+        return NULL;
+    assert(str2 != NULL);
+    return str2;
 }
 
 /* Table of digit values for 8-bit string -> integer conversion.
@@ -2705,6 +2769,13 @@ PyLong_AsDouble(PyObject *v)
         PyErr_SetString(PyExc_TypeError, "an integer is required");
         return -1.0;
     }
+    if (Py_ABS(Py_SIZE(v)) <= 1) {
+        /* Fast path; single digit long (31 bits) will cast safely
+	   to double.  This improves performance of FP/long operations
+	   by 20%.
+        */
+        return (double)MEDIUM_VALUE((PyLongObject *)v);
+    }
     x = _PyLong_Frexp((PyLongObject *)v, &exponent);
     if ((x == -1.0 && PyErr_Occurred()) || exponent > DBL_MAX_EXP) {
         PyErr_SetString(PyExc_OverflowError,
@@ -2951,8 +3022,14 @@ long_add(PyLongObject *a, PyLongObject *b)
     if (Py_SIZE(a) < 0) {
         if (Py_SIZE(b) < 0) {
             z = x_add(a, b);
-            if (z != NULL && Py_SIZE(z) != 0)
+            if (z != NULL) {
+                /* x_add received at least one multiple-digit int,
+                   and thus z must be a multiple-digit int.
+                   That also means z is not an element of
+                   small_ints, so negating it in-place is safe. */
+                assert(Py_REFCNT(z) == 1);
                 Py_SIZE(z) = -(Py_SIZE(z));
+            }
         }
         else
             z = x_sub(b, a);
@@ -2983,8 +3060,10 @@ long_sub(PyLongObject *a, PyLongObject *b)
             z = x_sub(a, b);
         else
             z = x_add(a, b);
-        if (z != NULL && Py_SIZE(z) != 0)
+        if (z != NULL) {
+            assert(Py_SIZE(z) == 0 || Py_REFCNT(z) == 1);
             Py_SIZE(z) = -(Py_SIZE(z));
+        }
     }
     else {
         if (Py_SIZE(b) < 0)
@@ -3431,6 +3510,52 @@ long_mul(PyLongObject *a, PyLongObject *b)
     return (PyObject *)z;
 }
 
+/* Fast modulo division for single-digit longs. */
+static PyObject *
+fast_mod(PyLongObject *a, PyLongObject *b)
+{
+    sdigit left = a->ob_digit[0];
+    sdigit right = b->ob_digit[0];
+    sdigit mod;
+
+    assert(Py_ABS(Py_SIZE(a)) == 1);
+    assert(Py_ABS(Py_SIZE(b)) == 1);
+
+    if (Py_SIZE(a) == Py_SIZE(b)) {
+        /* 'a' and 'b' have the same sign. */
+        mod = left % right;
+    }
+    else {
+        /* Either 'a' or 'b' is negative. */
+        mod = right - 1 - (left - 1) % right;
+    }
+
+    return PyLong_FromLong(mod * (sdigit)Py_SIZE(b));
+}
+
+/* Fast floor division for single-digit longs. */
+static PyObject *
+fast_floor_div(PyLongObject *a, PyLongObject *b)
+{
+    sdigit left = a->ob_digit[0];
+    sdigit right = b->ob_digit[0];
+    sdigit div;
+
+    assert(Py_ABS(Py_SIZE(a)) == 1);
+    assert(Py_ABS(Py_SIZE(b)) == 1);
+
+    if (Py_SIZE(a) == Py_SIZE(b)) {
+        /* 'a' and 'b' have the same sign. */
+        div = left / right;
+    }
+    else {
+        /* Either 'a' or 'b' is negative. */
+        div = -1 - (left - 1) / right;
+    }
+
+    return PyLong_FromLong(div);
+}
+
 /* The / and % operators are now defined in terms of divmod().
    The expression a mod b has the value a - b*floor(a/b).
    The long_divrem function gives the remainder after division of
@@ -3458,6 +3583,30 @@ l_divmod(PyLongObject *v, PyLongObject *w,
 {
     PyLongObject *div, *mod;
 
+    if (Py_ABS(Py_SIZE(v)) == 1 && Py_ABS(Py_SIZE(w)) == 1) {
+        /* Fast path for single-digit longs */
+        div = NULL;
+        if (pdiv != NULL) {
+            div = (PyLongObject *)fast_floor_div(v, w);
+            if (div == NULL) {
+                return -1;
+            }
+        }
+        if (pmod != NULL) {
+            mod = (PyLongObject *)fast_mod(v, w);
+            if (mod == NULL) {
+                Py_XDECREF(div);
+                return -1;
+            }
+            *pmod = mod;
+        }
+        if (pdiv != NULL) {
+            /* We only want to set `*pdiv` when `*pmod` is
+               set successfully. */
+            *pdiv = div;
+        }
+        return 0;
+    }
     if (long_divrem(v, w, &div, &mod) < 0)
         return -1;
     if ((Py_SIZE(mod) < 0 && Py_SIZE(w) > 0) ||
@@ -3502,6 +3651,11 @@ long_div(PyObject *a, PyObject *b)
     PyLongObject *div;
 
     CHECK_BINOP(a, b);
+
+    if (Py_ABS(Py_SIZE(a)) == 1 && Py_ABS(Py_SIZE(b)) == 1) {
+        return fast_floor_div((PyLongObject*)a, (PyLongObject*)b);
+    }
+
     if (l_divmod((PyLongObject*)a, (PyLongObject*)b, &div, NULL) < 0)
         div = NULL;
     return (PyObject *)div;
@@ -3777,6 +3931,10 @@ long_mod(PyObject *a, PyObject *b)
 
     CHECK_BINOP(a, b);
 
+    if (Py_ABS(Py_SIZE(a)) == 1 && Py_ABS(Py_SIZE(b)) == 1) {
+        return fast_mod((PyLongObject*)a, (PyLongObject*)b);
+    }
+
     if (l_divmod((PyLongObject*)a, (PyLongObject*)b, NULL, &mod) < 0)
         mod = NULL;
     return (PyObject *)mod;
diff --git a/Objects/memoryobject.c b/Objects/memoryobject.c
index 10162cb..e355a83 100644
--- a/Objects/memoryobject.c
+++ b/Objects/memoryobject.c
@@ -1133,7 +1133,7 @@ get_native_fmtchar(char *result, const char *fmt)
     return -1;
 }
 
-Py_LOCAL_INLINE(char *)
+Py_LOCAL_INLINE(const char *)
 get_native_fmtstr(const char *fmt)
 {
     int at = 0;
@@ -1221,7 +1221,7 @@ cast_to_1D(PyMemoryViewObject *mv, PyObject *format)
         goto out;
     }
 
-    view->format = get_native_fmtstr(PyBytes_AS_STRING(asciifmt));
+    view->format = (char *)get_native_fmtstr(PyBytes_AS_STRING(asciifmt));
     if (view->format == NULL) {
         /* NOT_REACHED: get_native_fmtchar() already validates the format. */
         PyErr_SetString(PyExc_RuntimeError,
diff --git a/Objects/object.c b/Objects/object.c
index 8024889..559794f 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -109,6 +109,15 @@ void
 dump_counts(FILE* f)
 {
     PyTypeObject *tp;
+    PyObject *xoptions, *value;
+    _Py_IDENTIFIER(showalloccount);
+
+    xoptions = PySys_GetXOptions();
+    if (xoptions == NULL)
+        return;
+    value = _PyDict_GetItemId(xoptions, &PyId_showalloccount);
+    if (value != Py_True)
+        return;
 
     for (tp = type_list; tp; tp = tp->tp_next)
         fprintf(f, "%s alloc'd: %" PY_FORMAT_SIZE_T "d, "
@@ -644,7 +653,7 @@ PyObject_Bytes(PyObject *v)
 /* Map rich comparison operators to their swapped version, e.g. LT <--> GT */
 int _Py_SwappedOp[] = {Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE};
 
-static char *opstrings[] = {"<", "<=", "==", "!=", ">", ">="};
+static const char * const opstrings[] = {"<", "<=", "==", "!=", ">", ">="};
 
 /* Perform a rich comparison, raising TypeError when the requested comparison
    operator is not supported. */
@@ -686,11 +695,10 @@ do_richcompare(PyObject *v, PyObject *w, int op)
         res = (v != w) ? Py_True : Py_False;
         break;
     default:
-        /* XXX Special-case None so it doesn't show as NoneType() */
         PyErr_Format(PyExc_TypeError,
-                     "unorderable types: %.100s() %s %.100s()",
-                     v->ob_type->tp_name,
+                     "'%s' not supported between instances of '%.100s' and '%.100s'",
                      opstrings[op],
+                     v->ob_type->tp_name,
                      w->ob_type->tp_name);
         return NULL;
     }
@@ -1041,8 +1049,7 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict)
                      name->ob_type->tp_name);
         return NULL;
     }
-    else
-        Py_INCREF(name);
+    Py_INCREF(name);
 
     if (tp->tp_dict == NULL) {
         if (PyType_Ready(tp) < 0)
@@ -1050,10 +1057,10 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict)
     }
 
     descr = _PyType_Lookup(tp, name);
-    Py_XINCREF(descr);
 
     f = NULL;
     if (descr != NULL) {
+        Py_INCREF(descr);
         f = descr->ob_type->tp_descr_get;
         if (f != NULL && PyDescr_IsData(descr)) {
             res = f(descr, obj, (PyObject *)obj->ob_type);
@@ -1073,8 +1080,9 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict)
                 if (tsize < 0)
                     tsize = -tsize;
                 size = _PyObject_VAR_SIZE(tp, tsize);
+                assert(size <= PY_SSIZE_T_MAX);
 
-                dictoffset += (long)size;
+                dictoffset += (Py_ssize_t)size;
                 assert(dictoffset > 0);
                 assert(dictoffset % SIZEOF_VOID_P == 0);
             }
@@ -1142,12 +1150,11 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name,
     Py_INCREF(name);
 
     descr = _PyType_Lookup(tp, name);
-    Py_XINCREF(descr);
 
-    f = NULL;
     if (descr != NULL) {
+        Py_INCREF(descr);
         f = descr->ob_type->tp_descr_set;
-        if (f != NULL && PyDescr_IsData(descr)) {
+        if (f != NULL) {
             res = f(descr, obj, value);
             goto done;
         }
@@ -1155,40 +1162,32 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name,
 
     if (dict == NULL) {
         dictptr = _PyObject_GetDictPtr(obj);
-        if (dictptr != NULL) {
-            res = _PyObjectDict_SetItem(Py_TYPE(obj), dictptr, name, value);
-            if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
-                PyErr_SetObject(PyExc_AttributeError, name);
+        if (dictptr == NULL) {
+            if (descr == NULL) {
+                PyErr_Format(PyExc_AttributeError,
+                             "'%.100s' object has no attribute '%U'",
+                             tp->tp_name, name);
+            }
+            else {
+                PyErr_Format(PyExc_AttributeError,
+                             "'%.50s' object attribute '%U' is read-only",
+                             tp->tp_name, name);
+            }
             goto done;
         }
+        res = _PyObjectDict_SetItem(tp, dictptr, name, value);
     }
-    if (dict != NULL) {
+    else {
         Py_INCREF(dict);
         if (value == NULL)
             res = PyDict_DelItem(dict, name);
         else
             res = PyDict_SetItem(dict, name, value);
         Py_DECREF(dict);
-        if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
-            PyErr_SetObject(PyExc_AttributeError, name);
-        goto done;
     }
+    if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
+        PyErr_SetObject(PyExc_AttributeError, name);
 
-    if (f != NULL) {
-        res = f(descr, obj, value);
-        goto done;
-    }
-
-    if (descr == NULL) {
-        PyErr_Format(PyExc_AttributeError,
-                     "'%.100s' object has no attribute '%U'",
-                     tp->tp_name, name);
-        goto done;
-    }
-
-    PyErr_Format(PyExc_AttributeError,
-                 "'%.50s' object attribute '%U' is read-only",
-                 tp->tp_name, name);
   done:
     Py_XDECREF(descr);
     Py_DECREF(name);
@@ -1204,7 +1203,7 @@ PyObject_GenericSetAttr(PyObject *obj, PyObject *name, PyObject *value)
 int
 PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
 {
-    PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
+    PyObject **dictptr = _PyObject_GetDictPtr(obj);
     if (dictptr == NULL) {
         PyErr_SetString(PyExc_AttributeError,
                         "This object has no __dict__");
@@ -1220,10 +1219,8 @@ PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
                      "not a '%.200s'", Py_TYPE(value)->tp_name);
         return -1;
     }
-    dict = *dictptr;
-    Py_XINCREF(value);
-    *dictptr = value;
-    Py_XDECREF(dict);
+    Py_INCREF(value);
+    Py_XSETREF(*dictptr, value);
     return 0;
 }
 
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 7cc889f..3f95133 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1,17 +1,38 @@
 #include "Python.h"
 
+
+/* Defined in tracemalloc.c */
+extern void _PyMem_DumpTraceback(int fd, const void *ptr);
+
+
 /* Python's malloc wrappers (see pymem.h) */
 
-#ifdef PYMALLOC_DEBUG   /* WITH_PYMALLOC && PYMALLOC_DEBUG */
+/*
+ * Basic types
+ * I don't care if these are defined in <sys/types.h> or elsewhere. Axiom.
+ */
+#undef  uchar
+#define uchar   unsigned char   /* assuming == 8 bits  */
+
+#undef  uint
+#define uint    unsigned int    /* assuming >= 16 bits */
+
+#undef uptr
+#define uptr    Py_uintptr_t
+
 /* Forward declaration */
+static void* _PyMem_DebugRawMalloc(void *ctx, size_t size);
+static void* _PyMem_DebugRawCalloc(void *ctx, size_t nelem, size_t elsize);
+static void* _PyMem_DebugRawRealloc(void *ctx, void *ptr, size_t size);
+static void _PyMem_DebugRawFree(void *ctx, void *p);
+
 static void* _PyMem_DebugMalloc(void *ctx, size_t size);
 static void* _PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize);
-static void _PyMem_DebugFree(void *ctx, void *p);
 static void* _PyMem_DebugRealloc(void *ctx, void *ptr, size_t size);
+static void _PyMem_DebugFree(void *ctx, void *p);
 
 static void _PyObject_DebugDumpAddress(const void *p);
 static void _PyMem_DebugCheckAddress(char api_id, const void *p);
-#endif
 
 #if defined(__has_feature)  /* Clang */
  #if __has_feature(address_sanitizer)  /* is ASAN enabled? */
@@ -145,9 +166,8 @@ _PyObject_ArenaFree(void *ctx, void *ptr, size_t size)
 #else
 #  define PYOBJ_FUNCS PYRAW_FUNCS
 #endif
-#define PYMEM_FUNCS PYRAW_FUNCS
+#define PYMEM_FUNCS PYOBJ_FUNCS
 
-#ifdef PYMALLOC_DEBUG
 typedef struct {
     /* We tag each block with an API ID in order to tag API violations */
     char api_id;
@@ -163,19 +183,21 @@ static struct {
     {'o', {NULL, PYOBJ_FUNCS}}
     };
 
-#define PYDBG_FUNCS _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree
-#endif
+#define PYRAWDBG_FUNCS \
+    _PyMem_DebugRawMalloc, _PyMem_DebugRawCalloc, _PyMem_DebugRawRealloc, _PyMem_DebugRawFree
+#define PYDBG_FUNCS \
+    _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree
 
 static PyMemAllocatorEx _PyMem_Raw = {
-#ifdef PYMALLOC_DEBUG
-    &_PyMem_Debug.raw, PYDBG_FUNCS
+#ifdef Py_DEBUG
+    &_PyMem_Debug.raw, PYRAWDBG_FUNCS
 #else
     NULL, PYRAW_FUNCS
 #endif
     };
 
 static PyMemAllocatorEx _PyMem = {
-#ifdef PYMALLOC_DEBUG
+#ifdef Py_DEBUG
     &_PyMem_Debug.mem, PYDBG_FUNCS
 #else
     NULL, PYMEM_FUNCS
@@ -183,16 +205,76 @@ static PyMemAllocatorEx _PyMem = {
     };
 
 static PyMemAllocatorEx _PyObject = {
-#ifdef PYMALLOC_DEBUG
+#ifdef Py_DEBUG
     &_PyMem_Debug.obj, PYDBG_FUNCS
 #else
     NULL, PYOBJ_FUNCS
 #endif
     };
 
+int
+_PyMem_SetupAllocators(const char *opt)
+{
+    if (opt == NULL || *opt == '\0') {
+        /* PYTHONMALLOC is empty or is not set or ignored (-E/-I command line
+           options): use default allocators */
+#ifdef Py_DEBUG
+#  ifdef WITH_PYMALLOC
+        opt = "pymalloc_debug";
+#  else
+        opt = "malloc_debug";
+#  endif
+#else
+   /* !Py_DEBUG */
+#  ifdef WITH_PYMALLOC
+        opt = "pymalloc";
+#  else
+        opt = "malloc";
+#  endif
+#endif
+    }
+
+    if (strcmp(opt, "debug") == 0) {
+        PyMem_SetupDebugHooks();
+    }
+    else if (strcmp(opt, "malloc") == 0 || strcmp(opt, "malloc_debug") == 0)
+    {
+        PyMemAllocatorEx alloc = {NULL, PYRAW_FUNCS};
+
+        PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
+
+        if (strcmp(opt, "malloc_debug") == 0)
+            PyMem_SetupDebugHooks();
+    }
+#ifdef WITH_PYMALLOC
+    else if (strcmp(opt, "pymalloc") == 0
+             || strcmp(opt, "pymalloc_debug") == 0)
+    {
+        PyMemAllocatorEx raw_alloc = {NULL, PYRAW_FUNCS};
+        PyMemAllocatorEx mem_alloc = {NULL, PYMEM_FUNCS};
+        PyMemAllocatorEx obj_alloc = {NULL, PYOBJ_FUNCS};
+
+        PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &raw_alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &mem_alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &obj_alloc);
+
+        if (strcmp(opt, "pymalloc_debug") == 0)
+            PyMem_SetupDebugHooks();
+    }
+#endif
+    else {
+        /* unknown allocator */
+        return -1;
+    }
+    return 0;
+}
+
 #undef PYRAW_FUNCS
 #undef PYMEM_FUNCS
 #undef PYOBJ_FUNCS
+#undef PYRAWDBG_FUNCS
 #undef PYDBG_FUNCS
 
 static PyObjectArenaAllocator _PyObject_Arena = {NULL,
@@ -205,23 +287,46 @@ static PyObjectArenaAllocator _PyObject_Arena = {NULL,
 #endif
     };
 
+#ifdef WITH_PYMALLOC
+static int
+_PyMem_DebugEnabled(void)
+{
+    return (_PyObject.malloc == _PyMem_DebugMalloc);
+}
+
+int
+_PyMem_PymallocEnabled(void)
+{
+    if (_PyMem_DebugEnabled()) {
+        return (_PyMem_Debug.obj.alloc.malloc == _PyObject_Malloc);
+    }
+    else {
+        return (_PyObject.malloc == _PyObject_Malloc);
+    }
+}
+#endif
+
 void
 PyMem_SetupDebugHooks(void)
 {
-#ifdef PYMALLOC_DEBUG
     PyMemAllocatorEx alloc;
 
-    alloc.malloc = _PyMem_DebugMalloc;
-    alloc.calloc = _PyMem_DebugCalloc;
-    alloc.realloc = _PyMem_DebugRealloc;
-    alloc.free = _PyMem_DebugFree;
+    alloc.malloc = _PyMem_DebugRawMalloc;
+    alloc.calloc = _PyMem_DebugRawCalloc;
+    alloc.realloc = _PyMem_DebugRawRealloc;
+    alloc.free = _PyMem_DebugRawFree;
 
-    if (_PyMem_Raw.malloc != _PyMem_DebugMalloc) {
+    if (_PyMem_Raw.malloc != _PyMem_DebugRawMalloc) {
         alloc.ctx = &_PyMem_Debug.raw;
         PyMem_GetAllocator(PYMEM_DOMAIN_RAW, &_PyMem_Debug.raw.alloc);
         PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &alloc);
     }
 
+    alloc.malloc = _PyMem_DebugMalloc;
+    alloc.calloc = _PyMem_DebugCalloc;
+    alloc.realloc = _PyMem_DebugRealloc;
+    alloc.free = _PyMem_DebugFree;
+
     if (_PyMem.malloc != _PyMem_DebugMalloc) {
         alloc.ctx = &_PyMem_Debug.mem;
         PyMem_GetAllocator(PYMEM_DOMAIN_MEM, &_PyMem_Debug.mem.alloc);
@@ -233,7 +338,6 @@ PyMem_SetupDebugHooks(void)
         PyMem_GetAllocator(PYMEM_DOMAIN_OBJ, &_PyMem_Debug.obj.alloc);
         PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
     }
-#endif
 }
 
 void
@@ -264,7 +368,6 @@ PyMem_SetAllocator(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
     case PYMEM_DOMAIN_OBJ: _PyObject = *allocator; break;
     /* ignore unknown domain */
     }
-
 }
 
 void
@@ -642,22 +745,6 @@ static int running_on_valgrind = -1;
 #define SIMPLELOCK_LOCK(lock)   /* acquire released lock */
 #define SIMPLELOCK_UNLOCK(lock) /* release acquired lock */
 
-/*
- * Basic types
- * I don't care if these are defined in <sys/types.h> or elsewhere. Axiom.
- */
-#undef  uchar
-#define uchar   unsigned char   /* assuming == 8 bits  */
-
-#undef  uint
-#define uint    unsigned int    /* assuming >= 16 bits */
-
-#undef  ulong
-#define ulong   unsigned long   /* assuming >= 32 bits */
-
-#undef uptr
-#define uptr    Py_uintptr_t
-
 /* When you say memory, my mind reasons in terms of (pointers to) blocks */
 typedef uchar block;
 
@@ -949,11 +1036,15 @@ new_arena(void)
     struct arena_object* arenaobj;
     uint excess;        /* number of bytes above pool alignment */
     void *address;
+    static int debug_stats = -1;
 
-#ifdef PYMALLOC_DEBUG
-    if (Py_GETENV("PYTHONMALLOCSTATS"))
+    if (debug_stats == -1) {
+        char *opt = Py_GETENV("PYTHONMALLOCSTATS");
+        debug_stats = (opt != NULL && *opt != '\0');
+    }
+    if (debug_stats)
         _PyObject_DebugMallocStats(stderr);
-#endif
+
     if (unused_arena_objects == NULL) {
         uint i;
         uint numarenas;
@@ -1709,7 +1800,7 @@ _Py_GetAllocatedBlocks(void)
 
 #endif /* WITH_PYMALLOC */
 
-#ifdef PYMALLOC_DEBUG
+
 /*==========================================================================*/
 /* A x-platform debugging allocator.  This doesn't manage memory directly,
  * it wraps a real allocator, adding extra debugging info to the memory blocks.
@@ -1767,31 +1858,6 @@ write_size_t(void *p, size_t n)
     }
 }
 
-#ifdef Py_DEBUG
-/* Is target in the list?  The list is traversed via the nextpool pointers.
- * The list may be NULL-terminated, or circular.  Return 1 if target is in
- * list, else 0.
- */
-static int
-pool_is_in_list(const poolp target, poolp list)
-{
-    poolp origlist = list;
-    assert(target != NULL);
-    if (list == NULL)
-        return 0;
-    do {
-        if (target == list)
-            return 1;
-        list = list->nextpool;
-    } while (list != NULL && list != origlist);
-    return 0;
-}
-
-#else
-#define pool_is_in_list(X, Y) 1
-
-#endif  /* Py_DEBUG */
-
 /* Let S = sizeof(size_t).  The debug malloc asks for 4*S extra bytes and
    fills them with useful stuff, here calling the underlying malloc's result p:
 
@@ -1819,7 +1885,7 @@ p[2*S+n+S: 2*S+n+2*S]
 */
 
 static void *
-_PyMem_DebugAlloc(int use_calloc, void *ctx, size_t nbytes)
+_PyMem_DebugRawAlloc(int use_calloc, void *ctx, size_t nbytes)
 {
     debug_alloc_api_t *api = (debug_alloc_api_t *)ctx;
     uchar *p;           /* base address of malloc'ed block */
@@ -1856,18 +1922,18 @@ _PyMem_DebugAlloc(int use_calloc, void *ctx, size_t nbytes)
 }
 
 static void *
-_PyMem_DebugMalloc(void *ctx, size_t nbytes)
+_PyMem_DebugRawMalloc(void *ctx, size_t nbytes)
 {
-    return _PyMem_DebugAlloc(0, ctx, nbytes);
+    return _PyMem_DebugRawAlloc(0, ctx, nbytes);
 }
 
 static void *
-_PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize)
+_PyMem_DebugRawCalloc(void *ctx, size_t nelem, size_t elsize)
 {
     size_t nbytes;
     assert(elsize == 0 || nelem <= PY_SSIZE_T_MAX / elsize);
     nbytes = nelem * elsize;
-    return _PyMem_DebugAlloc(1, ctx, nbytes);
+    return _PyMem_DebugRawAlloc(1, ctx, nbytes);
 }
 
 /* The debug free first checks the 2*SST bytes on each end for sanity (in
@@ -1876,7 +1942,7 @@ _PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize)
    Then calls the underlying free.
 */
 static void
-_PyMem_DebugFree(void *ctx, void *p)
+_PyMem_DebugRawFree(void *ctx, void *p)
 {
     debug_alloc_api_t *api = (debug_alloc_api_t *)ctx;
     uchar *q = (uchar *)p - 2*SST;  /* address returned from malloc */
@@ -1893,7 +1959,7 @@ _PyMem_DebugFree(void *ctx, void *p)
 }
 
 static void *
-_PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes)
+_PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes)
 {
     debug_alloc_api_t *api = (debug_alloc_api_t *)ctx;
     uchar *q = (uchar *)p, *oldq;
@@ -1903,7 +1969,7 @@ _PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes)
     int i;
 
     if (p == NULL)
-        return _PyMem_DebugAlloc(0, ctx, nbytes);
+        return _PyMem_DebugRawAlloc(0, ctx, nbytes);
 
     _PyMem_DebugCheckAddress(api->api_id, p);
     bumpserialno();
@@ -1946,6 +2012,44 @@ _PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes)
     return q;
 }
 
+static void
+_PyMem_DebugCheckGIL(void)
+{
+#ifdef WITH_THREAD
+    if (!PyGILState_Check())
+        Py_FatalError("Python memory allocator called "
+                      "without holding the GIL");
+#endif
+}
+
+static void *
+_PyMem_DebugMalloc(void *ctx, size_t nbytes)
+{
+    _PyMem_DebugCheckGIL();
+    return _PyMem_DebugRawMalloc(ctx, nbytes);
+}
+
+static void *
+_PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize)
+{
+    _PyMem_DebugCheckGIL();
+    return _PyMem_DebugRawCalloc(ctx, nelem, elsize);
+}
+
+static void
+_PyMem_DebugFree(void *ctx, void *ptr)
+{
+    _PyMem_DebugCheckGIL();
+    _PyMem_DebugRawFree(ctx, ptr);
+}
+
+static void *
+_PyMem_DebugRealloc(void *ctx, void *ptr, size_t nbytes)
+{
+    _PyMem_DebugCheckGIL();
+    return _PyMem_DebugRawRealloc(ctx, ptr, nbytes);
+}
+
 /* Check the forbidden bytes on both ends of the memory allocated for p.
  * If anything is wrong, print info to stderr via _PyObject_DebugDumpAddress,
  * and call Py_FatalError to kill the program.
@@ -2104,9 +2208,12 @@ _PyObject_DebugDumpAddress(const void *p)
         }
         fputc('\n', stderr);
     }
+    fputc('\n', stderr);
+
+    fflush(stderr);
+    _PyMem_DumpTraceback(fileno(stderr), p);
 }
 
-#endif  /* PYMALLOC_DEBUG */
 
 static size_t
 printone(FILE *out, const char* msg, size_t value)
@@ -2158,8 +2265,30 @@ _PyDebugAllocatorStats(FILE *out,
     (void)printone(out, buf2, num_blocks * sizeof_block);
 }
 
+
 #ifdef WITH_PYMALLOC
 
+#ifdef Py_DEBUG
+/* Is target in the list?  The list is traversed via the nextpool pointers.
+ * The list may be NULL-terminated, or circular.  Return 1 if target is in
+ * list, else 0.
+ */
+static int
+pool_is_in_list(const poolp target, poolp list)
+{
+    poolp origlist = list;
+    assert(target != NULL);
+    if (list == NULL)
+        return 0;
+    do {
+        if (target == list)
+            return 1;
+        list = list->nextpool;
+    } while (list != NULL && list != origlist);
+    return 0;
+}
+#endif
+
 /* Print summary info to "out" about the state of pymalloc's structures.
  * In Py_DEBUG mode, also perform some expensive internal consistency
  * checks.
@@ -2233,7 +2362,9 @@ _PyObject_DebugMallocStats(FILE *out)
 
             if (p->ref.count == 0) {
                 /* currently unused */
+#ifdef Py_DEBUG
                 assert(pool_is_in_list(p, arenas[i].freepools));
+#endif
                 continue;
             }
             ++numpools[sz];
@@ -2273,9 +2404,8 @@ _PyObject_DebugMallocStats(FILE *out)
         quantization += p * ((POOL_SIZE - POOL_OVERHEAD) % size);
     }
     fputc('\n', out);
-#ifdef PYMALLOC_DEBUG
-    (void)printone(out, "# times object malloc called", serialno);
-#endif
+    if (_PyMem_DebugEnabled())
+        (void)printone(out, "# times object malloc called", serialno);
     (void)printone(out, "# arenas allocated total", ntimes_arena_allocated);
     (void)printone(out, "# arenas reclaimed", ntimes_arena_allocated - narenas);
     (void)printone(out, "# arenas highwater mark", narenas_highwater);
@@ -2303,6 +2433,7 @@ _PyObject_DebugMallocStats(FILE *out)
 
 #endif /* #ifdef WITH_PYMALLOC */
 
+
 #ifdef Py_USING_MEMORY_DEBUGGER
 /* Make this function last so gcc won't inline it since the definition is
  * after the reference.
diff --git a/Objects/odictobject.c b/Objects/odictobject.c
index a6963d7..14be1cd 100644
--- a/Objects/odictobject.c
+++ b/Objects/odictobject.c
@@ -1424,14 +1424,13 @@ static PyMethodDef odict_methods[] = {
  * OrderedDict members
  */
 
-/* tp_members */
+/* tp_getset */
 
-static PyMemberDef odict_members[] = {
-    {"__dict__", T_OBJECT, offsetof(PyODictObject, od_inst_dict), READONLY},
-    {0}
+static PyGetSetDef odict_getset[] = {
+    {"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict},
+    {NULL}
 };
 
-
 /* ----------------------------------------------
  * OrderedDict type slot methods
  */
@@ -1463,7 +1462,7 @@ odict_dealloc(PyODictObject *self)
     ++tstate->trash_delete_nesting;
 
     Py_TRASHCAN_SAFE_END(self)
-};
+}
 
 /* tp_repr */
 
@@ -1540,7 +1539,7 @@ Done:
     Py_XDECREF(pieces);
     Py_ReprLeave((PyObject *)self);
     return result;
-};
+}
 
 /* tp_doc */
 
@@ -1612,7 +1611,7 @@ odict_richcompare(PyObject *v, PyObject *w, int op)
     } else {
         Py_RETURN_NOTIMPLEMENTED;
     }
-};
+}
 
 /* tp_iter */
 
@@ -1620,7 +1619,7 @@ static PyObject *
 odict_iter(PyODictObject *od)
 {
     return odictiter_new(od, _odict_ITER_KEYS);
-};
+}
 
 /* tp_init */
 
@@ -1646,27 +1645,19 @@ odict_init(PyObject *self, PyObject *args, PyObject *kwds)
         Py_DECREF(res);
         return 0;
     }
-};
+}
 
 /* tp_new */
 
 static PyObject *
 odict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
-    PyObject *dict;
     PyODictObject *od;
 
-    dict = PyDict_New();
-    if (dict == NULL)
-        return NULL;
-
     od = (PyODictObject *)PyDict_Type.tp_new(type, args, kwds);
-    if (od == NULL) {
-        Py_DECREF(dict);
+    if (od == NULL)
         return NULL;
-    }
 
-    od->od_inst_dict = dict;
     /* type constructor fills the memory with zeros (see
        PyType_GenericAlloc()), there is no need to set them to zero again */
     if (_odict_resize(od) < 0) {
@@ -1708,8 +1699,8 @@ PyTypeObject PyODict_Type = {
     (getiterfunc)odict_iter,                    /* tp_iter */
     0,                                          /* tp_iternext */
     odict_methods,                              /* tp_methods */
-    odict_members,                              /* tp_members */
-    0,                                          /* tp_getset */
+    0,                                          /* tp_members */
+    odict_getset,                               /* tp_getset */
     &PyDict_Type,                               /* tp_base */
     0,                                          /* tp_dict */
     0,                                          /* tp_descr_get */
@@ -1729,7 +1720,7 @@ PyTypeObject PyODict_Type = {
 PyObject *
 PyODict_New(void) {
     return odict_new(&PyODict_Type, NULL, NULL);
-};
+}
 
 static int
 _PyODict_SetItem_KnownHash(PyObject *od, PyObject *key, PyObject *value,
@@ -1747,7 +1738,7 @@ _PyODict_SetItem_KnownHash(PyObject *od, PyObject *key, PyObject *value,
         }
     }
     return res;
-};
+}
 
 int
 PyODict_SetItem(PyObject *od, PyObject *key, PyObject *value)
@@ -1756,7 +1747,7 @@ PyODict_SetItem(PyObject *od, PyObject *key, PyObject *value)
     if (hash == -1)
         return -1;
     return _PyODict_SetItem_KnownHash(od, key, value, hash);
-};
+}
 
 int
 PyODict_DelItem(PyObject *od, PyObject *key)
@@ -1769,7 +1760,7 @@ PyODict_DelItem(PyObject *od, PyObject *key)
     if (res < 0)
         return -1;
     return _PyDict_DelItem_KnownHash(od, key, hash);
-};
+}
 
 
 /* -------------------------------------------
diff --git a/Objects/rangeobject.c b/Objects/rangeobject.c
index 0e9eb20..284a100 100644
--- a/Objects/rangeobject.c
+++ b/Objects/rangeobject.c
@@ -29,17 +29,10 @@ validate_step(PyObject *step)
         return PyLong_FromLong(1);
 
     step = PyNumber_Index(step);
-    if (step) {
-        Py_ssize_t istep = PyNumber_AsSsize_t(step, NULL);
-        if (istep == -1 && PyErr_Occurred()) {
-            /* Ignore OverflowError, we know the value isn't 0. */
-            PyErr_Clear();
-        }
-        else if (istep == 0) {
-            PyErr_SetString(PyExc_ValueError,
-                            "range() arg 3 must not be zero");
-            Py_CLEAR(step);
-        }
+    if (step && _PyLong_Sign(step) == 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "range() arg 3 must not be zero");
+        Py_CLEAR(step);
     }
 
     return step;
@@ -129,9 +122,9 @@ range_new(PyTypeObject *type, PyObject *args, PyObject *kw)
         return (PyObject *) obj;
 
     /* Failed to create object, release attributes */
-    Py_XDECREF(start);
-    Py_XDECREF(stop);
-    Py_XDECREF(step);
+    Py_DECREF(start);
+    Py_DECREF(stop);
+    Py_DECREF(step);
     return NULL;
 }
 
@@ -196,7 +189,7 @@ compute_range_length(PyObject *start, PyObject *stop, PyObject *step)
     /* if (lo >= hi), return length of 0. */
     cmp_result = PyObject_RichCompareBool(lo, hi, Py_GE);
     if (cmp_result != 0) {
-        Py_XDECREF(step);
+        Py_DECREF(step);
         if (cmp_result < 0)
             return NULL;
         return PyLong_FromLong(0);
@@ -225,9 +218,9 @@ compute_range_length(PyObject *start, PyObject *stop, PyObject *step)
     return result;
 
   Fail:
+    Py_DECREF(step);
     Py_XDECREF(tmp2);
     Py_XDECREF(diff);
-    Py_XDECREF(step);
     Py_XDECREF(tmp1);
     Py_XDECREF(one);
     return NULL;
diff --git a/Objects/setobject.c b/Objects/setobject.c
index 4ef692d..6dd403f 100644
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -26,7 +26,6 @@
 
 #include "Python.h"
 #include "structmember.h"
-#include "stringlib/eq.h"
 
 /* Object used as dummy key to fill deleted entries */
 static PyObject _dummy_struct;
@@ -48,19 +47,20 @@ static PyObject _dummy_struct;
 static setentry *
 set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
 {
-    setentry *table = so->table;
-    setentry *freeslot = NULL;
+    setentry *table;
     setentry *entry;
-    size_t perturb = hash;
+    size_t perturb;
     size_t mask = so->mask;
     size_t i = (size_t)hash & mask; /* Unsigned for defined overflow behavior */
     size_t j;
     int cmp;
 
-    entry = &table[i];
+    entry = &so->table[i];
     if (entry->key == NULL)
         return entry;
 
+    perturb = hash;
+
     while (1) {
         if (entry->hash == hash) {
             PyObject *startkey = entry->key;
@@ -70,8 +70,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                 return entry;
             if (PyUnicode_CheckExact(startkey)
                 && PyUnicode_CheckExact(key)
-                && unicode_eq(startkey, key))
+                && _PyUnicode_EQ(startkey, key))
                 return entry;
+            table = so->table;
             Py_INCREF(startkey);
             cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
             Py_DECREF(startkey);
@@ -83,14 +84,12 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                 return entry;
             mask = so->mask;                 /* help avoid a register spill */
         }
-        if (entry->hash == -1 && freeslot == NULL)
-            freeslot = entry;
 
         if (i + LINEAR_PROBES <= mask) {
             for (j = 0 ; j < LINEAR_PROBES ; j++) {
                 entry++;
-                if (entry->key == NULL)
-                    goto found_null;
+                if (entry->hash == 0 && entry->key == NULL)
+                    return entry;
                 if (entry->hash == hash) {
                     PyObject *startkey = entry->key;
                     assert(startkey != dummy);
@@ -98,8 +97,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                         return entry;
                     if (PyUnicode_CheckExact(startkey)
                         && PyUnicode_CheckExact(key)
-                        && unicode_eq(startkey, key))
+                        && _PyUnicode_EQ(startkey, key))
                         return entry;
+                    table = so->table;
                     Py_INCREF(startkey);
                     cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
                     Py_DECREF(startkey);
@@ -111,7 +111,104 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                         return entry;
                     mask = so->mask;
                 }
-                if (entry->hash == -1 && freeslot == NULL)
+            }
+        }
+
+        perturb >>= PERTURB_SHIFT;
+        i = (i * 5 + 1 + perturb) & mask;
+
+        entry = &so->table[i];
+        if (entry->key == NULL)
+            return entry;
+    }
+}
+
+static int set_table_resize(PySetObject *, Py_ssize_t);
+
+static int
+set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+    setentry *table;
+    setentry *freeslot;
+    setentry *entry;
+    size_t perturb;
+    size_t mask;
+    size_t i;                       /* Unsigned for defined overflow behavior */
+    size_t j;
+    int cmp;
+
+    /* Pre-increment is necessary to prevent arbitrary code in the rich
+       comparison from deallocating the key just before the insertion. */
+    Py_INCREF(key);
+
+  restart:
+
+    mask = so->mask;
+    i = (size_t)hash & mask;
+
+    entry = &so->table[i];
+    if (entry->key == NULL)
+        goto found_unused;
+
+    freeslot = NULL;
+    perturb = hash;
+
+    while (1) {
+        if (entry->hash == hash) {
+            PyObject *startkey = entry->key;
+            /* startkey cannot be a dummy because the dummy hash field is -1 */
+            assert(startkey != dummy);
+            if (startkey == key)
+                goto found_active;
+            if (PyUnicode_CheckExact(startkey)
+                && PyUnicode_CheckExact(key)
+                && _PyUnicode_EQ(startkey, key))
+                goto found_active;
+            table = so->table;
+            Py_INCREF(startkey);
+            cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+            Py_DECREF(startkey);
+            if (cmp > 0)                                          /* likely */
+                goto found_active;
+            if (cmp < 0)
+                goto comparison_error;
+            /* Continuing the search from the current entry only makes
+               sense if the table and entry are unchanged; otherwise,
+               we have to restart from the beginning */
+            if (table != so->table || entry->key != startkey)
+                goto restart;
+            mask = so->mask;                 /* help avoid a register spill */
+        }
+        else if (entry->hash == -1 && freeslot == NULL)
+            freeslot = entry;
+
+        if (i + LINEAR_PROBES <= mask) {
+            for (j = 0 ; j < LINEAR_PROBES ; j++) {
+                entry++;
+                if (entry->hash == 0 && entry->key == NULL)
+                    goto found_unused_or_dummy;
+                if (entry->hash == hash) {
+                    PyObject *startkey = entry->key;
+                    assert(startkey != dummy);
+                    if (startkey == key)
+                        goto found_active;
+                    if (PyUnicode_CheckExact(startkey)
+                        && PyUnicode_CheckExact(key)
+                        && _PyUnicode_EQ(startkey, key))
+                        goto found_active;
+                    table = so->table;
+                    Py_INCREF(startkey);
+                    cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+                    Py_DECREF(startkey);
+                    if (cmp > 0)
+                        goto found_active;
+                    if (cmp < 0)
+                        goto comparison_error;
+                    if (table != so->table || entry->key != startkey)
+                        goto restart;
+                    mask = so->mask;
+                }
+                else if (entry->hash == -1 && freeslot == NULL)
                     freeslot = entry;
             }
         }
@@ -119,29 +216,51 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
         perturb >>= PERTURB_SHIFT;
         i = (i * 5 + 1 + perturb) & mask;
 
-        entry = &table[i];
+        entry = &so->table[i];
         if (entry->key == NULL)
-            goto found_null;
+            goto found_unused_or_dummy;
     }
-  found_null:
-    return freeslot == NULL ? entry : freeslot;
+
+  found_unused_or_dummy:
+    if (freeslot == NULL)
+        goto found_unused;
+    so->used++;
+    freeslot->key = key;
+    freeslot->hash = hash;
+    return 0;
+
+  found_unused:
+    so->fill++;
+    so->used++;
+    entry->key = key;
+    entry->hash = hash;
+    if ((size_t)so->fill*3 < mask*2)
+        return 0;
+    return set_table_resize(so, so->used);
+
+  found_active:
+    Py_DECREF(key);
+    return 0;
+
+  comparison_error:
+    Py_DECREF(key);
+    return -1;
 }
 
 /*
 Internal routine used by set_table_resize() to insert an item which is
 known to be absent from the set.  This routine also assumes that
 the set contains no deleted entries.  Besides the performance benefit,
-using set_insert_clean() in set_table_resize() is dangerous (SF bug #1456209).
-Note that no refcounts are changed by this routine; if needed, the caller
-is responsible for incref'ing `key`.
+there is also safety benefit since using set_add_entry() risks making
+a callback in the middle of a set_table_resize(), see issue 1456209.
+The caller is responsible for updating the key's reference count and
+the setobject's fill and used fields.
 */
 static void
-set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash)
+set_insert_clean(setentry *table, size_t mask, PyObject *key, Py_hash_t hash)
 {
-    setentry *table = so->table;
     setentry *entry;
     size_t perturb = hash;
-    size_t mask = (size_t)so->mask;
     size_t i = (size_t)hash & mask;
     size_t j;
 
@@ -162,45 +281,11 @@ set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash)
   found_null:
     entry->key = key;
     entry->hash = hash;
-    so->fill++;
-    so->used++;
 }
 
 /* ======== End logic for probing the hash table ========================== */
 /* ======================================================================== */
 
-
-/*
-Internal routine to insert a new key into the table.
-Used by the public insert routine.
-Eats a reference to key.
-*/
-static int
-set_insert_key(PySetObject *so, PyObject *key, Py_hash_t hash)
-{
-    setentry *entry;
-
-    entry = set_lookkey(so, key, hash);
-    if (entry == NULL)
-        return -1;
-    if (entry->key == NULL) {
-        /* UNUSED */
-        entry->key = key;
-        entry->hash = hash;
-        so->fill++;
-        so->used++;
-    } else if (entry->key == dummy) {
-        /* DUMMY */
-        entry->key = key;
-        entry->hash = hash;
-        so->used++;
-    } else {
-        /* ACTIVE */
-        Py_DECREF(key);
-    }
-    return 0;
-}
-
 /*
 Restructure the table by allocating a new table and reinserting all
 keys again.  When entries have been deleted, the new table may
@@ -213,10 +298,13 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     setentry *oldtable, *newtable, *entry;
     Py_ssize_t oldfill = so->fill;
     Py_ssize_t oldused = so->used;
+    Py_ssize_t oldmask = so->mask;
+    size_t newmask;
     int is_oldtable_malloced;
     setentry small_copy[PySet_MINSIZE];
 
     assert(minused >= 0);
+    minused = (minused > 50000) ? minused * 2 : minused * 4;
 
     /* Find the smallest table size > minused. */
     /* XXX speed-up with intrinsics */
@@ -264,25 +352,24 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     /* Make the set empty, using the new table. */
     assert(newtable != oldtable);
     memset(newtable, 0, sizeof(setentry) * newsize);
-    so->fill = 0;
-    so->used = 0;
+    so->fill = oldused;
+    so->used = oldused;
     so->mask = newsize - 1;
     so->table = newtable;
 
     /* Copy the data over; this is refcount-neutral for active entries;
        dummy entries aren't copied over, of course */
+    newmask = (size_t)so->mask;
     if (oldfill == oldused) {
-        for (entry = oldtable; oldused > 0; entry++) {
+        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
             if (entry->key != NULL) {
-                oldused--;
-                set_insert_clean(so, entry->key, entry->hash);
+                set_insert_clean(newtable, newmask, entry->key, entry->hash);
             }
         }
     } else {
-        for (entry = oldtable; oldused > 0; entry++) {
+        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
             if (entry->key != NULL && entry->key != dummy) {
-                oldused--;
-                set_insert_clean(so, entry->key, entry->hash);
+                set_insert_clean(newtable, newmask, entry->key, entry->hash);
             }
         }
     }
@@ -292,31 +379,42 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     return 0;
 }
 
-/* CAUTION: set_add_key/entry() must guarantee it won't resize the table */
+static int
+set_contains_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+    setentry *entry;
+
+    entry = set_lookkey(so, key, hash);
+    if (entry != NULL)
+        return entry->key != NULL;
+    return -1;
+}
+
+#define DISCARD_NOTFOUND 0
+#define DISCARD_FOUND 1
 
 static int
-set_add_entry(PySetObject *so, setentry *entry)
+set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
 {
-    Py_ssize_t n_used;
-    PyObject *key = entry->key;
-    Py_hash_t hash = entry->hash;
+    setentry *entry;
+    PyObject *old_key;
 
-    assert(so->fill <= so->mask);  /* at least one empty slot */
-    n_used = so->used;
-    Py_INCREF(key);
-    if (set_insert_key(so, key, hash)) {
-        Py_DECREF(key);
+    entry = set_lookkey(so, key, hash);
+    if (entry == NULL)
         return -1;
-    }
-    if (!(so->used > n_used && so->fill*3 >= (so->mask+1)*2))
-        return 0;
-    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+    if (entry->key == NULL)
+        return DISCARD_NOTFOUND;
+    old_key = entry->key;
+    entry->key = dummy;
+    entry->hash = -1;
+    so->used--;
+    Py_DECREF(old_key);
+    return DISCARD_FOUND;
 }
 
 static int
 set_add_key(PySetObject *so, PyObject *key)
 {
-    setentry entry;
     Py_hash_t hash;
 
     if (!PyUnicode_CheckExact(key) ||
@@ -325,50 +423,35 @@ set_add_key(PySetObject *so, PyObject *key)
         if (hash == -1)
             return -1;
     }
-    entry.key = key;
-    entry.hash = hash;
-    return set_add_entry(so, &entry);
+    return set_add_entry(so, key, hash);
 }
 
-#define DISCARD_NOTFOUND 0
-#define DISCARD_FOUND 1
-
 static int
-set_discard_entry(PySetObject *so, setentry *oldentry)
+set_contains_key(PySetObject *so, PyObject *key)
 {
-    setentry *entry;
-    PyObject *old_key;
+    Py_hash_t hash;
 
-    entry = set_lookkey(so, oldentry->key, oldentry->hash);
-    if (entry == NULL)
-        return -1;
-    if (entry->key == NULL  ||  entry->key == dummy)
-        return DISCARD_NOTFOUND;
-    old_key = entry->key;
-    entry->key = dummy;
-    entry->hash = -1;
-    so->used--;
-    Py_DECREF(old_key);
-    return DISCARD_FOUND;
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1) {
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            return -1;
+    }
+    return set_contains_entry(so, key, hash);
 }
 
 static int
 set_discard_key(PySetObject *so, PyObject *key)
 {
-    setentry entry;
     Py_hash_t hash;
 
-    assert (PyAnySet_Check(so));
-
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
         hash = PyObject_Hash(key);
         if (hash == -1)
             return -1;
     }
-    entry.key = key;
-    entry.hash = hash;
-    return set_discard_entry(so, &entry);
+    return set_discard_entry(so, key, hash);
 }
 
 static void
@@ -449,20 +532,22 @@ set_next(PySetObject *so, Py_ssize_t *pos_ptr, setentry **entry_ptr)
 {
     Py_ssize_t i;
     Py_ssize_t mask;
-    setentry *table;
+    setentry *entry;
 
     assert (PyAnySet_Check(so));
     i = *pos_ptr;
     assert(i >= 0);
-    table = so->table;
     mask = so->mask;
-    while (i <= mask && (table[i].key == NULL || table[i].key == dummy))
+    entry = &so->table[i];
+    while (i <= mask && (entry->key == NULL || entry->key == dummy)) {
         i++;
+        entry++;
+    }
     *pos_ptr = i+1;
     if (i > mask)
         return 0;
-    assert(table[i].key != NULL);
-    *entry_ptr = &table[i];
+    assert(entry != NULL);
+    *entry_ptr = entry;
     return 1;
 }
 
@@ -560,8 +645,8 @@ set_merge(PySetObject *so, PyObject *otherset)
      * incrementally resizing as we insert new keys.  Expect
      * that there will be no (or few) overlapping keys.
      */
-    if ((so->fill + other->used)*3 >= (so->mask+1)*2) {
-       if (set_table_resize(so, (so->used + other->used)*2) != 0)
+    if ((so->fill + other->used)*3 >= so->mask*2) {
+       if (set_table_resize(so, so->used + other->used) != 0)
            return -1;
     }
     so_entry = so->table;
@@ -586,11 +671,15 @@ set_merge(PySetObject *so, PyObject *otherset)
 
     /* If our table is empty, we can use set_insert_clean() */
     if (so->fill == 0) {
-        for (i = 0; i <= other->mask; i++, other_entry++) {
+        setentry *newtable = so->table;
+        size_t newmask = (size_t)so->mask;
+        so->fill = other->used;
+        so->used = other->used;
+        for (i = other->mask + 1; i > 0 ; i--, other_entry++) {
             key = other_entry->key;
             if (key != NULL && key != dummy) {
                 Py_INCREF(key);
-                set_insert_clean(so, key, other_entry->hash);
+                set_insert_clean(newtable, newmask, key, other_entry->hash);
             }
         }
         return 0;
@@ -601,46 +690,13 @@ set_merge(PySetObject *so, PyObject *otherset)
         other_entry = &other->table[i];
         key = other_entry->key;
         if (key != NULL && key != dummy) {
-            Py_INCREF(key);
-            if (set_insert_key(so, key, other_entry->hash)) {
-                Py_DECREF(key);
+            if (set_add_entry(so, key, other_entry->hash))
                 return -1;
-            }
         }
     }
     return 0;
 }
 
-static int
-set_contains_entry(PySetObject *so, setentry *entry)
-{
-    PyObject *key;
-    setentry *lu_entry;
-
-    lu_entry = set_lookkey(so, entry->key, entry->hash);
-    if (lu_entry == NULL)
-        return -1;
-    key = lu_entry->key;
-    return key != NULL && key != dummy;
-}
-
-static int
-set_contains_key(PySetObject *so, PyObject *key)
-{
-    setentry entry;
-    Py_hash_t hash;
-
-    if (!PyUnicode_CheckExact(key) ||
-        (hash = ((PyASCIIObject *) key)->hash) == -1) {
-        hash = PyObject_Hash(key);
-        if (hash == -1)
-            return -1;
-    }
-    entry.key = key;
-    entry.hash = hash;
-    return set_contains_entry(so, &entry);
-}
-
 static PyObject *
 set_pop(PySetObject *so)
 {
@@ -682,43 +738,64 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)
     return 0;
 }
 
-static Py_hash_t
-frozenset_hash(PyObject *self)
+/* Work to increase the bit dispersion for closely spaced hash values.
+   This is important because some use cases have many combinations of a
+   small number of elements with nearby hashes so that many distinct
+   combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
 {
-    /* Most of the constants in this hash algorithm are randomly choosen
-       large primes with "interesting bit patterns" and that passed
-       tests for good collision statistics on a variety of problematic
-       datasets such as:
+    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
 
-          ps = []
-          for r in range(21):
-              ps += itertools.combinations(range(20), r)
-          num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+/* Most of the constants in this hash algorithm are randomly chosen
+   large primes with "interesting bit patterns" and that passed tests
+   for good collision statistics on a variety of problematic datasets
+   including powersets and graph structures (such as David Eppstein's
+   graph recipes in Lib/test/test_set.py) */
 
-    */
+static Py_hash_t
+frozenset_hash(PyObject *self)
+{
     PySetObject *so = (PySetObject *)self;
-    Py_uhash_t h, hash = 1927868237UL;
+    Py_uhash_t hash = 0;
     setentry *entry;
-    Py_ssize_t pos = 0;
 
     if (so->hash != -1)
         return so->hash;
 
-    hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
-    while (set_next(so, &pos, &entry)) {
-        /* Work to increase the bit dispersion for closely spaced hash
-           values.  This is important because some use cases have many
-           combinations of a small number of elements with nearby
-           hashes so that many distinct combinations collapse to only
-           a handful of distinct hash values. */
-        h = entry->hash;
-        hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
-    }
-    /* Make the final result spread-out in a different pattern
-       than the algorithm for tuples or other python objects. */
+    /* Xor-in shuffled bits from every entry's hash field because xor is
+       commutative and a frozenset hash should be independent of order.
+
+       For speed, include null entries and dummy entries and then
+       subtract out their effect afterwards so that the final hash
+       depends only on active entries.  This allows the code to be
+       vectorized by the compiler and it saves the unpredictable
+       branches that would arise when trying to exclude null and dummy
+       entries on every iteration. */
+
+    for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+        hash ^= _shuffle_bits(entry->hash);
+
+    /* Remove the effect of an odd number of NULL entries */
+    if ((so->mask + 1 - so->fill) & 1)
+        hash ^= _shuffle_bits(0);
+
+    /* Remove the effect of an odd number of dummy entries */
+    if ((so->fill - so->used) & 1)
+        hash ^= _shuffle_bits(-1);
+
+    /* Factor in the number of active entries */
+    hash ^= ((Py_uhash_t)PySet_GET_SIZE(self) + 1) * 1927868237UL;
+
+    /* Disperse patterns arising in nested frozensets */
     hash = hash * 69069U + 907133923UL;
+
+    /* -1 is reserved as an error code */
     if (hash == (Py_uhash_t)-1)
         hash = 590923713UL;
+
     so->hash = hash;
     return hash;
 }
@@ -865,7 +942,7 @@ PyTypeObject PySetIter_Type = {
     PyObject_GenericGetAttr,                    /* tp_getattro */
     0,                                          /* tp_setattro */
     0,                                          /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,    /* tp_flags */
     0,                                          /* tp_doc */
     (traverseproc)setiter_traverse,             /* tp_traverse */
     0,                                          /* tp_clear */
@@ -910,18 +987,14 @@ set_update_internal(PySetObject *so, PyObject *other)
         * incrementally resizing as we insert new keys.  Expect
         * that there will be no (or few) overlapping keys.
         */
-        if (dictsize == -1)
+        if (dictsize < 0)
             return -1;
-        if ((so->fill + dictsize)*3 >= (so->mask+1)*2) {
-            if (set_table_resize(so, (so->used + dictsize)*2) != 0)
+        if ((so->fill + dictsize)*3 >= so->mask*2) {
+            if (set_table_resize(so, so->used + dictsize) != 0)
                 return -1;
         }
         while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
-            setentry an_entry;
-
-            an_entry.hash = hash;
-            an_entry.key = key;
-            if (set_add_entry(so, &an_entry))
+            if (set_add_entry(so, key, hash))
                 return -1;
         }
         return 0;
@@ -970,9 +1043,8 @@ PyDoc_STRVAR(update_doc,
 static PyObject *
 make_new_set(PyTypeObject *type, PyObject *iterable)
 {
-    PySetObject *so = NULL;
+    PySetObject *so;
 
-    /* create PySetObject structure */
     so = (PySetObject *)type->tp_alloc(type, 0);
     if (so == NULL)
         return NULL;
@@ -1015,7 +1087,8 @@ frozenset_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
     PyObject *iterable = NULL, *result;
 
-    if (type == &PyFrozenSet_Type && !_PyArg_NoKeywords("frozenset()", kwds))
+    if (kwds != NULL && type == &PyFrozenSet_Type
+        && !_PyArg_NoKeywords("frozenset()", kwds))
         return NULL;
 
     if (!PyArg_UnpackTuple(args, type->tp_name, 0, 1, &iterable))
@@ -1042,24 +1115,9 @@ frozenset_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
     return emptyfrozenset;
 }
 
-int
-PySet_ClearFreeList(void)
-{
-    return 0;
-}
-
-void
-PySet_Fini(void)
-{
-    Py_CLEAR(emptyfrozenset);
-}
-
 static PyObject *
 set_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
-    if (type == &PySet_Type && !_PyArg_NoKeywords("set()", kwds))
-        return NULL;
-
     return make_new_set(type, NULL);
 }
 
@@ -1201,6 +1259,8 @@ set_intersection(PySetObject *so, PyObject *other)
 {
     PySetObject *result;
     PyObject *key, *it, *tmp;
+    Py_hash_t hash;
+    int rv;
 
     if ((PyObject *)so == other)
         return set_copy(so);
@@ -1220,13 +1280,15 @@ set_intersection(PySetObject *so, PyObject *other)
         }
 
         while (set_next((PySetObject *)other, &pos, &entry)) {
-            int rv = set_contains_entry(so, entry);
-            if (rv == -1) {
+            key = entry->key;
+            hash = entry->hash;
+            rv = set_contains_entry(so, key, hash);
+            if (rv < 0) {
                 Py_DECREF(result);
                 return NULL;
             }
             if (rv) {
-                if (set_add_entry(result, entry)) {
+                if (set_add_entry(result, key, hash)) {
                     Py_DECREF(result);
                     return NULL;
                 }
@@ -1242,32 +1304,15 @@ set_intersection(PySetObject *so, PyObject *other)
     }
 
     while ((key = PyIter_Next(it)) != NULL) {
-        int rv;
-        setentry entry;
-        Py_hash_t hash = PyObject_Hash(key);
-
-        if (hash == -1) {
-            Py_DECREF(it);
-            Py_DECREF(result);
-            Py_DECREF(key);
-            return NULL;
-        }
-        entry.hash = hash;
-        entry.key = key;
-        rv = set_contains_entry(so, &entry);
-        if (rv == -1) {
-            Py_DECREF(it);
-            Py_DECREF(result);
-            Py_DECREF(key);
-            return NULL;
-        }
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            goto error;
+        rv = set_contains_entry(so, key, hash);
+        if (rv < 0)
+            goto error;
         if (rv) {
-            if (set_add_entry(result, &entry)) {
-                Py_DECREF(it);
-                Py_DECREF(result);
-                Py_DECREF(key);
-                return NULL;
-            }
+            if (set_add_entry(result, key, hash))
+                goto error;
         }
         Py_DECREF(key);
     }
@@ -1277,6 +1322,11 @@ set_intersection(PySetObject *so, PyObject *other)
         return NULL;
     }
     return (PyObject *)result;
+  error:
+    Py_DECREF(it);
+    Py_DECREF(result);
+    Py_DECREF(key);
+    return NULL;
 }
 
 static PyObject *
@@ -1363,6 +1413,7 @@ static PyObject *
 set_isdisjoint(PySetObject *so, PyObject *other)
 {
     PyObject *key, *it, *tmp;
+    int rv;
 
     if ((PyObject *)so == other) {
         if (PySet_GET_SIZE(so) == 0)
@@ -1381,8 +1432,8 @@ set_isdisjoint(PySetObject *so, PyObject *other)
             other = tmp;
         }
         while (set_next((PySetObject *)other, &pos, &entry)) {
-            int rv = set_contains_entry(so, entry);
-            if (rv == -1)
+            rv = set_contains_entry(so, entry->key, entry->hash);
+            if (rv < 0)
                 return NULL;
             if (rv)
                 Py_RETURN_FALSE;
@@ -1395,8 +1446,6 @@ set_isdisjoint(PySetObject *so, PyObject *other)
         return NULL;
 
     while ((key = PyIter_Next(it)) != NULL) {
-        int rv;
-        setentry entry;
         Py_hash_t hash = PyObject_Hash(key);
 
         if (hash == -1) {
@@ -1404,11 +1453,9 @@ set_isdisjoint(PySetObject *so, PyObject *other)
             Py_DECREF(it);
             return NULL;
         }
-        entry.hash = hash;
-        entry.key = key;
-        rv = set_contains_entry(so, &entry);
+        rv = set_contains_entry(so, key, hash);
         Py_DECREF(key);
-        if (rv == -1) {
+        if (rv < 0) {
             Py_DECREF(it);
             return NULL;
         }
@@ -1437,7 +1484,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
         Py_ssize_t pos = 0;
 
         while (set_next((PySetObject *)other, &pos, &entry))
-            if (set_discard_entry(so, entry) == -1)
+            if (set_discard_entry(so, entry->key, entry->hash) < 0)
                 return -1;
     } else {
         PyObject *key, *it;
@@ -1446,7 +1493,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
             return -1;
 
         while ((key = PyIter_Next(it)) != NULL) {
-            if (set_discard_key(so, key) == -1) {
+            if (set_discard_key(so, key) < 0) {
                 Py_DECREF(it);
                 Py_DECREF(key);
                 return -1;
@@ -1457,10 +1504,10 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
         if (PyErr_Occurred())
             return -1;
     }
-    /* If more than 1/5 are dummies, then resize them away. */
-    if ((so->fill - so->used) * 5 < so->mask)
+    /* If more than 1/4th are dummies, then resize them away. */
+    if ((size_t)(so->fill - so->used) <= (size_t)so->mask / 4)
         return 0;
-    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+    return set_table_resize(so, so->used);
 }
 
 static PyObject *
@@ -1487,7 +1534,7 @@ set_copy_and_difference(PySetObject *so, PyObject *other)
     result = set_copy(so);
     if (result == NULL)
         return NULL;
-    if (set_difference_update_internal((PySetObject *) result, other) != -1)
+    if (set_difference_update_internal((PySetObject *) result, other) == 0)
         return result;
     Py_DECREF(result);
     return NULL;
@@ -1497,8 +1544,11 @@ static PyObject *
 set_difference(PySetObject *so, PyObject *other)
 {
     PyObject *result;
+    PyObject *key;
+    Py_hash_t hash;
     setentry *entry;
     Py_ssize_t pos = 0;
+    int rv;
 
     if (!PyAnySet_Check(other)  && !PyDict_CheckExact(other)) {
         return set_copy_and_difference(so, other);
@@ -1516,17 +1566,15 @@ set_difference(PySetObject *so, PyObject *other)
 
     if (PyDict_CheckExact(other)) {
         while (set_next(so, &pos, &entry)) {
-            setentry entrycopy;
-            int rv;
-            entrycopy.hash = entry->hash;
-            entrycopy.key = entry->key;
-            rv = _PyDict_Contains(other, entry->key, entry->hash);
+            key = entry->key;
+            hash = entry->hash;
+            rv = _PyDict_Contains(other, key, hash);
             if (rv < 0) {
                 Py_DECREF(result);
                 return NULL;
             }
             if (!rv) {
-                if (set_add_entry((PySetObject *)result, &entrycopy)) {
+                if (set_add_entry((PySetObject *)result, key, hash)) {
                     Py_DECREF(result);
                     return NULL;
                 }
@@ -1537,13 +1585,15 @@ set_difference(PySetObject *so, PyObject *other)
 
     /* Iterate over so, checking for common elements in other. */
     while (set_next(so, &pos, &entry)) {
-        int rv = set_contains_entry((PySetObject *)other, entry);
-        if (rv == -1) {
+        key = entry->key;
+        hash = entry->hash;
+        rv = set_contains_entry((PySetObject *)other, key, hash);
+        if (rv < 0) {
             Py_DECREF(result);
             return NULL;
         }
         if (!rv) {
-            if (set_add_entry((PySetObject *)result, entry)) {
+            if (set_add_entry((PySetObject *)result, key, hash)) {
                 Py_DECREF(result);
                 return NULL;
             }
@@ -1605,29 +1655,24 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
     PySetObject *otherset;
     PyObject *key;
     Py_ssize_t pos = 0;
+    Py_hash_t hash;
     setentry *entry;
+    int rv;
 
     if ((PyObject *)so == other)
         return set_clear(so);
 
     if (PyDict_CheckExact(other)) {
         PyObject *value;
-        int rv;
-        Py_hash_t hash;
         while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
-            setentry an_entry;
-
             Py_INCREF(key);
-            an_entry.hash = hash;
-            an_entry.key = key;
-
-            rv = set_discard_entry(so, &an_entry);
-            if (rv == -1) {
+            rv = set_discard_entry(so, key, hash);
+            if (rv < 0) {
                 Py_DECREF(key);
                 return NULL;
             }
             if (rv == DISCARD_NOTFOUND) {
-                if (set_add_entry(so, &an_entry)) {
+                if (set_add_entry(so, key, hash)) {
                     Py_DECREF(key);
                     return NULL;
                 }
@@ -1647,13 +1692,15 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
     }
 
     while (set_next(otherset, &pos, &entry)) {
-        int rv = set_discard_entry(so, entry);
-        if (rv == -1) {
+        key = entry->key;
+        hash = entry->hash;
+        rv = set_discard_entry(so, key, hash);
+        if (rv < 0) {
             Py_DECREF(otherset);
             return NULL;
         }
         if (rv == DISCARD_NOTFOUND) {
-            if (set_add_entry(so, entry)) {
+            if (set_add_entry(so, key, hash)) {
                 Py_DECREF(otherset);
                 return NULL;
             }
@@ -1715,6 +1762,7 @@ set_issubset(PySetObject *so, PyObject *other)
 {
     setentry *entry;
     Py_ssize_t pos = 0;
+    int rv;
 
     if (!PyAnySet_Check(other)) {
         PyObject *tmp, *result;
@@ -1729,8 +1777,8 @@ set_issubset(PySetObject *so, PyObject *other)
         Py_RETURN_FALSE;
 
     while (set_next(so, &pos, &entry)) {
-        int rv = set_contains_entry((PySetObject *)other, entry);
-        if (rv == -1)
+        rv = set_contains_entry((PySetObject *)other, entry->key, entry->hash);
+        if (rv < 0)
             return NULL;
         if (!rv)
             Py_RETURN_FALSE;
@@ -1821,7 +1869,7 @@ set_contains(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_contains_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return -1;
         PyErr_Clear();
@@ -1840,7 +1888,7 @@ set_direct_contains(PySetObject *so, PyObject *key)
     long result;
 
     result = set_contains(so, key);
-    if (result == -1)
+    if (result < 0)
         return NULL;
     return PyBool_FromLong(result);
 }
@@ -1854,7 +1902,7 @@ set_remove(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_discard_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return NULL;
         PyErr_Clear();
@@ -1863,7 +1911,7 @@ set_remove(PySetObject *so, PyObject *key)
             return NULL;
         rv = set_discard_key(so, tmpkey);
         Py_DECREF(tmpkey);
-        if (rv == -1)
+        if (rv < 0)
             return NULL;
     }
 
@@ -1886,7 +1934,7 @@ set_discard(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_discard_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return NULL;
         PyErr_Clear();
@@ -1895,7 +1943,7 @@ set_discard(PySetObject *so, PyObject *key)
             return NULL;
         rv = set_discard_key(so, tmpkey);
         Py_DECREF(tmpkey);
-        if (rv == -1)
+        if (rv < 0)
             return NULL;
     }
     Py_RETURN_NONE;
@@ -1949,13 +1997,12 @@ set_init(PySetObject *self, PyObject *args, PyObject *kwds)
 {
     PyObject *iterable = NULL;
 
-    if (!PyAnySet_Check(self))
-        return -1;
-    if (PySet_Check(self) && !_PyArg_NoKeywords("set()", kwds))
+    if (kwds != NULL && !_PyArg_NoKeywords("set()", kwds))
         return -1;
     if (!PyArg_UnpackTuple(args, Py_TYPE(self)->tp_name, 0, 1, &iterable))
         return -1;
-    set_clear_internal(self);
+    if (self->fill)
+        set_clear_internal(self);
     self->hash = -1;
     if (iterable == NULL)
         return 0;
@@ -2122,7 +2169,7 @@ static PyMethodDef frozenset_methods[] = {
      copy_doc},
     {"difference",      (PyCFunction)set_difference_multi,      METH_VARARGS,
      difference_doc},
-    {"intersection",(PyCFunction)set_intersection_multi,        METH_VARARGS,
+    {"intersection",    (PyCFunction)set_intersection_multi,    METH_VARARGS,
      intersection_doc},
     {"isdisjoint",      (PyCFunction)set_isdisjoint,    METH_O,
      isdisjoint_doc},
@@ -2193,7 +2240,7 @@ PyTypeObject PyFrozenSet_Type = {
     (traverseproc)set_traverse,         /* tp_traverse */
     (inquiry)set_clear_internal,        /* tp_clear */
     (richcmpfunc)set_richcompare,       /* tp_richcompare */
-    offsetof(PySetObject, weakreflist),         /* tp_weaklistoffset */
+    offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
     (getiterfunc)set_iter,              /* tp_iter */
     0,                                  /* tp_iternext */
     frozenset_methods,                  /* tp_methods */
@@ -2277,6 +2324,18 @@ PySet_Add(PyObject *anyset, PyObject *key)
 }
 
 int
+PySet_ClearFreeList(void)
+{
+    return 0;
+}
+
+void
+PySet_Fini(void)
+{
+    Py_CLEAR(emptyfrozenset);
+}
+
+int
 _PySet_NextEntry(PyObject *set, Py_ssize_t *pos, PyObject **key, Py_hash_t *hash)
 {
     setentry *entry;
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 0fc6b58..2846d7e 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -1,6 +1,8 @@
 /* stringlib: codec implementations */
 
-#if STRINGLIB_IS_UNICODE
+#if !STRINGLIB_IS_UNICODE
+# error "codecs.h is specific to Unicode"
+#endif
 
 /* Mask to quickly check whether a C 'long' contains a
    non-ASCII, UTF8-encoded char. */
@@ -263,50 +265,34 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
 
     Py_ssize_t i;                /* index into s of next input byte */
-    PyObject *result;            /* result string object */
     char *p;                     /* next free byte in output buffer */
-    Py_ssize_t nallocated;      /* number of result bytes allocated */
-    Py_ssize_t nneeded;            /* number of result bytes needed */
 #if STRINGLIB_SIZEOF_CHAR > 1
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
     PyObject *rep = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 #endif
 #if STRINGLIB_SIZEOF_CHAR == 1
     const Py_ssize_t max_char_size = 2;
-    char stackbuf[MAX_SHORT_UNICHARS * 2];
 #elif STRINGLIB_SIZEOF_CHAR == 2
     const Py_ssize_t max_char_size = 3;
-    char stackbuf[MAX_SHORT_UNICHARS * 3];
 #else /*  STRINGLIB_SIZEOF_CHAR == 4 */
     const Py_ssize_t max_char_size = 4;
-    char stackbuf[MAX_SHORT_UNICHARS * 4];
 #endif
+    _PyBytesWriter writer;
 
     assert(size >= 0);
+    _PyBytesWriter_Init(&writer);
 
-    if (size <= MAX_SHORT_UNICHARS) {
-        /* Write into the stack buffer; nallocated can't overflow.
-         * At the end, we'll allocate exactly as much heap space as it
-         * turns out we need.
-         */
-        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
-        result = NULL;   /* will allocate after we're done */
-        p = stackbuf;
-    }
-    else {
-        if (size > PY_SSIZE_T_MAX / max_char_size) {
-            /* integer overflow */
-            return PyErr_NoMemory();
-        }
-        /* Overallocate on the heap, and give the excess back at the end. */
-        nallocated = size * max_char_size;
-        result = PyBytes_FromStringAndSize(NULL, nallocated);
-        if (result == NULL)
-            return NULL;
-        p = PyBytes_AS_STRING(result);
+    if (size > PY_SSIZE_T_MAX / max_char_size) {
+        /* integer overflow */
+        return PyErr_NoMemory();
     }
 
+    p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
+    if (p == NULL)
+        return NULL;
+
     for (i = 0; i < size;) {
         Py_UCS4 ch = data[i++];
 
@@ -326,72 +312,118 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
         }
 #if STRINGLIB_SIZEOF_CHAR > 1
         else if (Py_UNICODE_IS_SURROGATE(ch)) {
-            Py_ssize_t newpos;
-            Py_ssize_t repsize, k, startpos;
+            Py_ssize_t startpos, endpos, newpos;
+            Py_ssize_t k;
+            if (error_handler == _Py_ERROR_UNKNOWN)
+                error_handler = get_error_handler(errors);
+
             startpos = i-1;
-            rep = unicode_encode_call_errorhandler(
-                  errors, &errorHandler, "utf-8", "surrogates not allowed",
-                  unicode, &exc, startpos, startpos+1, &newpos);
-            if (!rep)
-                goto error;
-
-            if (PyBytes_Check(rep))
-                repsize = PyBytes_GET_SIZE(rep);
-            else
-                repsize = PyUnicode_GET_LENGTH(rep);
+            endpos = startpos+1;
+
+            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
+                endpos++;
+
+            /* Only overallocate the buffer if it's not the last write */
+            writer.overallocate = (endpos < size);
+
+            switch (error_handler)
+            {
+            case _Py_ERROR_REPLACE:
+                memset(p, '?', endpos - startpos);
+                p += (endpos - startpos);
+                /* fall through the ignore handler */
+            case _Py_ERROR_IGNORE:
+                i += (endpos - startpos - 1);
+                break;
 
-            if (repsize > max_char_size) {
-                Py_ssize_t offset;
+            case _Py_ERROR_SURROGATEPASS:
+                for (k=startpos; k<endpos; k++) {
+                    ch = data[k];
+                    *p++ = (char)(0xe0 | (ch >> 12));
+                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+                    *p++ = (char)(0x80 | (ch & 0x3f));
+                }
+                i += (endpos - startpos - 1);
+                break;
 
-                if (result == NULL)
-                    offset = p - stackbuf;
-                else
-                    offset = p - PyBytes_AS_STRING(result);
+            case _Py_ERROR_BACKSLASHREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= max_char_size * (endpos - startpos);
+                p = backslashreplace(&writer, p,
+                                     unicode, startpos, endpos);
+                if (p == NULL)
+                    goto error;
+                i += (endpos - startpos - 1);
+                break;
 
-                if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
-                    /* integer overflow */
-                    PyErr_NoMemory();
+            case _Py_ERROR_XMLCHARREFREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= max_char_size * (endpos - startpos);
+                p = xmlcharrefreplace(&writer, p,
+                                      unicode, startpos, endpos);
+                if (p == NULL)
                     goto error;
+                i += (endpos - startpos - 1);
+                break;
+
+            case _Py_ERROR_SURROGATEESCAPE:
+                for (k=startpos; k<endpos; k++) {
+                    ch = data[k];
+                    if (!(0xDC80 <= ch && ch <= 0xDCFF))
+                        break;
+                    *p++ = (char)(ch & 0xff);
                 }
-                nallocated += repsize - max_char_size;
-                if (result != NULL) {
-                    if (_PyBytes_Resize(&result, nallocated) < 0)
-                        goto error;
-                } else {
-                    result = PyBytes_FromStringAndSize(NULL, nallocated);
-                    if (result == NULL)
-                        goto error;
-                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
+                if (k >= endpos) {
+                    i += (endpos - startpos - 1);
+                    break;
                 }
-                p = PyBytes_AS_STRING(result) + offset;
-            }
+                startpos = k;
+                assert(startpos < endpos);
+                /* fall through the default handler */
+            default:
+                rep = unicode_encode_call_errorhandler(
+                      errors, &error_handler_obj, "utf-8", "surrogates not allowed",
+                      unicode, &exc, startpos, endpos, &newpos);
+                if (!rep)
+                    goto error;
 
-            if (PyBytes_Check(rep)) {
-                char *prep = PyBytes_AS_STRING(rep);
-                for(k = repsize; k > 0; k--)
-                    *p++ = *prep++;
-            } else /* rep is unicode */ {
-                enum PyUnicode_Kind repkind;
-                void *repdata;
+                /* substract preallocated bytes */
+                writer.min_size -= max_char_size;
 
-                if (PyUnicode_READY(rep) < 0)
-                    goto error;
-                repkind = PyUnicode_KIND(rep);
-                repdata = PyUnicode_DATA(rep);
+                if (PyBytes_Check(rep)) {
+                    p = _PyBytesWriter_WriteBytes(&writer, p,
+                                                  PyBytes_AS_STRING(rep),
+                                                  PyBytes_GET_SIZE(rep));
+                }
+                else {
+                    /* rep is unicode */
+                    if (PyUnicode_READY(rep) < 0)
+                        goto error;
 
-                for(k=0; k<repsize; k++) {
-                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
-                    if (0x80 <= c) {
+                    if (!PyUnicode_IS_ASCII(rep)) {
                         raise_encode_exception(&exc, "utf-8",
                                                unicode,
                                                i-1, i,
                                                "surrogates not allowed");
                         goto error;
                     }
-                    *p++ = (char)c;
+
+                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+                    p = _PyBytesWriter_WriteBytes(&writer, p,
+                                                  PyUnicode_DATA(rep),
+                                                  PyUnicode_GET_LENGTH(rep));
                 }
+
+                if (p == NULL)
+                    goto error;
+                Py_CLEAR(rep);
+
+                i = newpos;
             }
-            Py_CLEAR(rep);
+
+            /* If overallocation was disabled, ensure that it was the last
+               write. Otherwise, we missed an optimization */
+            assert(writer.overallocate || i == size);
         }
         else
 #if STRINGLIB_SIZEOF_CHAR > 2
@@ -416,31 +448,18 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
     }
 
-    if (result == NULL) {
-        /* This was stack allocated. */
-        nneeded = p - stackbuf;
-        assert(nneeded <= nallocated);
-        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
-    }
-    else {
-        /* Cut back to size actually needed. */
-        nneeded = p - PyBytes_AS_STRING(result);
-        assert(nneeded <= nallocated);
-        _PyBytes_Resize(&result, nneeded);
-    }
-
 #if STRINGLIB_SIZEOF_CHAR > 1
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
 #endif
-    return result;
+    return _PyBytesWriter_Finish(&writer, p);
 
 #if STRINGLIB_SIZEOF_CHAR > 1
  error:
     Py_XDECREF(rep);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
-    Py_XDECREF(result);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 #endif
 
@@ -806,5 +825,3 @@ STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
 #undef SWAB4
 
 #endif
-
-#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h
index 739cf3d..f054625 100644
--- a/Objects/stringlib/ctype.h
+++ b/Objects/stringlib/ctype.h
@@ -1,5 +1,6 @@
-/* NOTE: this API is -ONLY- for use with single byte character strings. */
-/* Do not use it with Unicode. */
+#if STRINGLIB_IS_UNICODE
+# error "ctype.h only compatible with byte-wise strings"
+#endif
 
 #include "bytes_methods.h"
 
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index cda68e7..98165ad 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -32,52 +32,98 @@
 #define STRINGLIB_BLOOM(mask, ch)     \
     ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
 
-
 Py_LOCAL_INLINE(Py_ssize_t)
-STRINGLIB(fastsearch_memchr_1char)(const STRINGLIB_CHAR* s, Py_ssize_t n,
-                                   STRINGLIB_CHAR ch, unsigned char needle,
-                                   int mode)
+STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
 {
-    if (mode == FAST_SEARCH) {
-        const STRINGLIB_CHAR *ptr = s;
-        const STRINGLIB_CHAR *e = s + n;
-        while (ptr < e) {
-            void *candidate = memchr((const void *) ptr, needle, (e - ptr) * sizeof(STRINGLIB_CHAR));
-            if (candidate == NULL)
-                return -1;
-            ptr = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
-            if (sizeof(STRINGLIB_CHAR) == 1 || *ptr == ch)
-                return (ptr - s);
-            /* False positive */
-            ptr++;
-        }
+    const STRINGLIB_CHAR *p, *e;
+
+    p = s;
+    e = s + n;
+    if (n > 10) {
+#if STRINGLIB_SIZEOF_CHAR == 1
+        p = memchr(s, ch, n);
+        if (p != NULL)
+            return (p - s);
         return -1;
+#else
+        /* use memchr if we can choose a needle without two many likely
+           false positives */
+        unsigned char needle = ch & 0xff;
+        /* If looking for a multiple of 256, we'd have too
+           many false positives looking for the '\0' byte in UCS2
+           and UCS4 representations. */
+        if (needle != 0) {
+            while (p < e) {
+                void *candidate = memchr(p, needle,
+                                         (e - p) * sizeof(STRINGLIB_CHAR));
+                if (candidate == NULL)
+                    return -1;
+                p = (const STRINGLIB_CHAR *)
+                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
+                if (*p == ch)
+                    return (p - s);
+                /* False positive */
+                p++;
+            }
+            return -1;
+        }
+#endif
     }
+    while (p < e) {
+        if (*p == ch)
+            return (p - s);
+        p++;
+    }
+    return -1;
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
+{
+    const STRINGLIB_CHAR *p;
 #ifdef HAVE_MEMRCHR
     /* memrchr() is a GNU extension, available since glibc 2.1.91.
        it doesn't seem as optimized as memchr(), but is still quite
-       faster than our hand-written loop in FASTSEARCH below */
-    else if (mode == FAST_RSEARCH) {
-        while (n > 0) {
-            const STRINGLIB_CHAR *found;
-            void *candidate = memrchr((const void *) s, needle, n * sizeof(STRINGLIB_CHAR));
-            if (candidate == NULL)
-                return -1;
-            found = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
-            n = found - s;
-            if (sizeof(STRINGLIB_CHAR) == 1 || *found == ch)
-                return n;
-            /* False positive */
-        }
+       faster than our hand-written loop below */
+
+    if (n > 10) {
+#if STRINGLIB_SIZEOF_CHAR == 1
+        p = memrchr(s, ch, n);
+        if (p != NULL)
+            return (p - s);
         return -1;
-    }
+#else
+        /* use memrchr if we can choose a needle without two many likely
+           false positives */
+        unsigned char needle = ch & 0xff;
+        /* If looking for a multiple of 256, we'd have too
+           many false positives looking for the '\0' byte in UCS2
+           and UCS4 representations. */
+        if (needle != 0) {
+            while (n > 0) {
+                void *candidate = memrchr(s, needle,
+                                          n * sizeof(STRINGLIB_CHAR));
+                if (candidate == NULL)
+                    return -1;
+                p = (const STRINGLIB_CHAR *)
+                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
+                n = p - s;
+                if (*p == ch)
+                    return n;
+                /* False positive */
+            }
+            return -1;
+        }
 #endif
-    else {
-        assert(0); /* Should never get here */
-        return 0;
     }
-
-#undef DO_MEMCHR
+#endif  /* HAVE_MEMRCHR */
+    p = s + n;
+    while (p > s) {
+        p--;
+        if (*p == ch)
+            return (p - s);
+    }
+    return -1;
 }
 
 Py_LOCAL_INLINE(Py_ssize_t)
@@ -99,25 +145,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
         if (m <= 0)
             return -1;
         /* use special case for 1-character strings */
-        if (n > 10 && (mode == FAST_SEARCH
-#ifdef HAVE_MEMRCHR
-                    || mode == FAST_RSEARCH
-#endif
-                    )) {
-            /* use memchr if we can choose a needle without two many likely
-               false positives */
-            unsigned char needle;
-            needle = p[0] & 0xff;
-#if STRINGLIB_SIZEOF_CHAR > 1
-            /* If looking for a multiple of 256, we'd have too
-               many false positives looking for the '\0' byte in UCS2
-               and UCS4 representations. */
-            if (needle != 0)
-#endif
-                return STRINGLIB(fastsearch_memchr_1char)
-                       (s, n, p[0], needle, mode);
-        }
-        if (mode == FAST_COUNT) {
+        if (mode == FAST_SEARCH)
+            return STRINGLIB(find_char)(s, n, p[0]);
+        else if (mode == FAST_RSEARCH)
+            return STRINGLIB(rfind_char)(s, n, p[0]);
+        else {  /* FAST_COUNT */
             for (i = 0; i < n; i++)
                 if (s[i] == p[0]) {
                     count++;
@@ -125,14 +157,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
                         return maxcount;
                 }
             return count;
-        } else if (mode == FAST_SEARCH) {
-            for (i = 0; i < n; i++)
-                if (s[i] == p[0])
-                    return i;
-        } else {    /* FAST_RSEARCH */
-            for (i = n - 1; i > -1; i--)
-                if (s[i] == p[0])
-                    return i;
         }
         return -1;
     }
diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h
index 14815f6..509b929 100644
--- a/Objects/stringlib/find.h
+++ b/Objects/stringlib/find.h
@@ -117,85 +117,3 @@ STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args,
 }
 
 #undef FORMAT_BUFFER_SIZE
-
-#if STRINGLIB_IS_UNICODE
-
-/*
-Wraps stringlib_parse_args_finds() and additionally ensures that the
-first argument is a unicode object.
-
-Note that we receive a pointer to the pointer of the substring object,
-so when we create that object in this function we don't DECREF it,
-because it continues living in the caller functions (those functions,
-after finishing using the substring, must DECREF it).
-*/
-
-Py_LOCAL_INLINE(int)
-STRINGLIB(parse_args_finds_unicode)(const char * function_name, PyObject *args,
-                                   PyObject **substring,
-                                   Py_ssize_t *start, Py_ssize_t *end)
-{
-    PyObject *tmp_substring;
-
-    if(STRINGLIB(parse_args_finds)(function_name, args, &tmp_substring,
-                                  start, end)) {
-        tmp_substring = PyUnicode_FromObject(tmp_substring);
-        if (!tmp_substring)
-            return 0;
-        *substring = tmp_substring;
-        return 1;
-    }
-    return 0;
-}
-
-#else /* !STRINGLIB_IS_UNICODE */
-
-/*
-Wraps stringlib_parse_args_finds() and additionally checks whether the
-first argument is an integer in range(0, 256).
-
-If this is the case, writes the integer value to the byte parameter
-and sets subobj to NULL. Otherwise, sets the first argument to subobj
-and doesn't touch byte. The other parameters are similar to those of
-stringlib_parse_args_finds().
-*/
-
-Py_LOCAL_INLINE(int)
-STRINGLIB(parse_args_finds_byte)(const char *function_name, PyObject *args,
-                                 PyObject **subobj, char *byte,
-                                 Py_ssize_t *start, Py_ssize_t *end)
-{
-    PyObject *tmp_subobj;
-    Py_ssize_t ival;
-    PyObject *err;
-
-    if(!STRINGLIB(parse_args_finds)(function_name, args, &tmp_subobj,
-                                    start, end))
-        return 0;
-
-    if (!PyNumber_Check(tmp_subobj)) {
-        *subobj = tmp_subobj;
-        return 1;
-    }
-
-    ival = PyNumber_AsSsize_t(tmp_subobj, PyExc_OverflowError);
-    if (ival == -1) {
-        err = PyErr_Occurred();
-        if (err && !PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) {
-            PyErr_Clear();
-            *subobj = tmp_subobj;
-            return 1;
-        }
-    }
-
-    if (ival < 0 || ival > 255) {
-        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
-        return 0;
-    }
-
-    *subobj = NULL;
-    *byte = (char)ival;
-    return 1;
-}
-
-#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/stringlib/find_max_char.h b/Objects/stringlib/find_max_char.h
index eb3fe88..8ccbc30 100644
--- a/Objects/stringlib/find_max_char.h
+++ b/Objects/stringlib/find_max_char.h
@@ -1,6 +1,8 @@
 /* Finding the optimal width of unicode characters in a buffer */
 
-#if STRINGLIB_IS_UNICODE
+#if !STRINGLIB_IS_UNICODE
+# error "find_max_char.h is specific to Unicode"
+#endif
 
 /* Mask to quickly check whether a C 'long' contains a
    non-ASCII, UTF8-encoded char. */
@@ -129,5 +131,4 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
 #undef MAX_CHAR_UCS4
 
 #endif /* STRINGLIB_SIZEOF_CHAR == 1 */
-#endif /* STRINGLIB_IS_UNICODE */
 
diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h
index cbf81be..90f966d 100644
--- a/Objects/stringlib/join.h
+++ b/Objects/stringlib/join.h
@@ -1,6 +1,6 @@
 /* stringlib: bytes joining implementation */
 
-#if STRINGLIB_SIZEOF_CHAR != 1
+#if STRINGLIB_IS_UNICODE
 #error join.h only compatible with byte-wise strings
 #endif
 
diff --git a/Objects/stringlib/localeutil.h b/Objects/stringlib/localeutil.h
index 6e2f073..df501ed 100644
--- a/Objects/stringlib/localeutil.h
+++ b/Objects/stringlib/localeutil.h
@@ -2,8 +2,8 @@
 
 #include <locale.h>
 
-#ifndef STRINGLIB_IS_UNICODE
-#   error "localeutil is specific to Unicode"
+#if !STRINGLIB_IS_UNICODE
+#   error "localeutil.h is specific to Unicode"
 #endif
 
 typedef struct {
diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h
index b559b53..625507d 100644
--- a/Objects/stringlib/transmogrify.h
+++ b/Objects/stringlib/transmogrify.h
@@ -1,14 +1,21 @@
-/* NOTE: this API is -ONLY- for use with single byte character strings. */
-/* Do not use it with Unicode. */
+#if STRINGLIB_IS_UNICODE
+# error "transmogrify.h only compatible with byte-wise strings"
+#endif
 
 /* the more complicated methods.  parts of these should be pulled out into the
    shared code in bytes_methods.c to cut down on duplicate code bloat.  */
 
-PyDoc_STRVAR(expandtabs__doc__,
-"B.expandtabs(tabsize=8) -> copy of B\n\
-\n\
-Return a copy of B where all tab characters are expanded using spaces.\n\
-If tabsize is not given, a tab size of 8 characters is assumed.");
+Py_LOCAL_INLINE(PyObject *)
+return_self(PyObject *self)
+{
+#if !STRINGLIB_MUTABLE
+    if (STRINGLIB_CHECK_EXACT(self)) {
+        Py_INCREF(self);
+        return self;
+    }
+#endif
+    return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
+}
 
 static PyObject*
 stringlib_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
@@ -93,39 +100,25 @@ pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
     if (right < 0)
         right = 0;
 
-    if (left == 0 && right == 0 && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject *)self;
-#endif /* STRINGLIB_MUTABLE */
+    if (left == 0 && right == 0) {
+        return return_self(self);
     }
 
-    u = STRINGLIB_NEW(NULL,
-				   left + STRINGLIB_LEN(self) + right);
+    u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right);
     if (u) {
         if (left)
             memset(STRINGLIB_STR(u), fill, left);
         Py_MEMCPY(STRINGLIB_STR(u) + left,
-	       STRINGLIB_STR(self),
-	       STRINGLIB_LEN(self));
+               STRINGLIB_STR(self),
+               STRINGLIB_LEN(self));
         if (right)
             memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self),
-		   fill, right);
+                   fill, right);
     }
 
     return u;
 }
 
-PyDoc_STRVAR(ljust__doc__,
-"B.ljust(width[, fillchar]) -> copy of B\n"
-"\n"
-"Return B left justified in a string of length width. Padding is\n"
-"done using the specified fill character (default is a space).");
-
 static PyObject *
 stringlib_ljust(PyObject *self, PyObject *args)
 {
@@ -135,27 +128,14 @@ stringlib_ljust(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))
         return NULL;
 
-    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject*) self;
-#endif
+    if (STRINGLIB_LEN(self) >= width) {
+        return return_self(self);
     }
 
     return pad(self, 0, width - STRINGLIB_LEN(self), fillchar);
 }
 
 
-PyDoc_STRVAR(rjust__doc__,
-"B.rjust(width[, fillchar]) -> copy of B\n"
-"\n"
-"Return B right justified in a string of length width. Padding is\n"
-"done using the specified fill character (default is a space)");
-
 static PyObject *
 stringlib_rjust(PyObject *self, PyObject *args)
 {
@@ -165,27 +145,14 @@ stringlib_rjust(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))
         return NULL;
 
-    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject*) self;
-#endif
+    if (STRINGLIB_LEN(self) >= width) {
+        return return_self(self);
     }
 
     return pad(self, width - STRINGLIB_LEN(self), 0, fillchar);
 }
 
 
-PyDoc_STRVAR(center__doc__,
-"B.center(width[, fillchar]) -> copy of B\n"
-"\n"
-"Return B centered in a string of length width.  Padding is\n"
-"done using the specified fill character (default is a space).");
-
 static PyObject *
 stringlib_center(PyObject *self, PyObject *args)
 {
@@ -196,15 +163,8 @@ stringlib_center(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))
         return NULL;
 
-    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject*) self;
-#endif
+    if (STRINGLIB_LEN(self) >= width) {
+        return return_self(self);
     }
 
     marg = width - STRINGLIB_LEN(self);
@@ -213,12 +173,6 @@ stringlib_center(PyObject *self, PyObject *args)
     return pad(self, left, marg - left, fillchar);
 }
 
-PyDoc_STRVAR(zfill__doc__,
-"B.zfill(width) -> copy of B\n"
-"\n"
-"Pad a numeric string B with zeros on the left, to fill a field\n"
-"of the specified width.  B is never truncated.");
-
 static PyObject *
 stringlib_zfill(PyObject *self, PyObject *args)
 {
@@ -231,21 +185,7 @@ stringlib_zfill(PyObject *self, PyObject *args)
         return NULL;
 
     if (STRINGLIB_LEN(self) >= width) {
-        if (STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-            /* We're defined as returning a copy;  If the object is mutable
-             * that means we must make an identical copy. */
-            return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-            Py_INCREF(self);
-            return (PyObject*) self;
-#endif
-        }
-        else
-            return STRINGLIB_NEW(
-                STRINGLIB_STR(self),
-                STRINGLIB_LEN(self)
-            );
+        return return_self(self);
     }
 
     fill = width - STRINGLIB_LEN(self);
@@ -262,5 +202,500 @@ stringlib_zfill(PyObject *self, PyObject *args)
         p[fill] = '0';
     }
 
-    return (PyObject*) s;
+    return s;
+}
+
+
+/* find and count characters and substrings */
+
+#define findchar(target, target_len, c)                         \
+  ((char *)memchr((const void *)(target), c, target_len))
+
+
+Py_LOCAL_INLINE(Py_ssize_t)
+countchar(const char *target, Py_ssize_t target_len, char c,
+          Py_ssize_t maxcount)
+{
+    Py_ssize_t count = 0;
+    const char *start = target;
+    const char *end = target + target_len;
+
+    while ((start = findchar(start, end - start, c)) != NULL) {
+        count++;
+        if (count >= maxcount)
+            break;
+        start += 1;
+    }
+    return count;
+}
+
+
+/* Algorithms for different cases of string replacement */
+
+/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_interleave(PyObject *self,
+                             const char *to_s, Py_ssize_t to_len,
+                             Py_ssize_t maxcount)
+{
+    const char *self_s;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count, i;
+    PyObject *result;
+
+    self_len = STRINGLIB_LEN(self);
+
+    /* 1 at the end plus 1 after every character;
+       count = min(maxcount, self_len + 1) */
+    if (maxcount <= self_len) {
+        count = maxcount;
+    }
+    else {
+        /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
+        count = self_len + 1;
+    }
+
+    /* Check for overflow */
+    /*   result_len = count * to_len + self_len; */
+    assert(count > 0);
+    if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "replace bytes are too long");
+        return NULL;
+    }
+    result_len = count * to_len + self_len;
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+
+    self_s = STRINGLIB_STR(self);
+    result_s = STRINGLIB_STR(result);
+
+    if (to_len > 1) {
+        /* Lay the first one down (guaranteed this will occur) */
+        Py_MEMCPY(result_s, to_s, to_len);
+        result_s += to_len;
+        count -= 1;
+
+        for (i = 0; i < count; i++) {
+            *result_s++ = *self_s++;
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+        }
+    }
+    else {
+        result_s[0] = to_s[0];
+        result_s += to_len;
+        count -= 1;
+        for (i = 0; i < count; i++) {
+            *result_s++ = *self_s++;
+            result_s[0] = to_s[0];
+            result_s += to_len;
+        }
+    }
+
+    /* Copy the rest of the original string */
+    Py_MEMCPY(result_s, self_s, self_len - i);
+
+    return result;
 }
+
+/* Special case for deleting a single character */
+/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_delete_single_character(PyObject *self,
+                                          char from_c, Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count;
+    PyObject *result;
+
+    self_len = STRINGLIB_LEN(self);
+    self_s = STRINGLIB_STR(self);
+
+    count = countchar(self_s, self_len, from_c, maxcount);
+    if (count == 0) {
+        return return_self(self);
+    }
+
+    result_len = self_len - count;  /* from_len == 1 */
+    assert(result_len>=0);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        next = findchar(start, end - start, from_c);
+        if (next == NULL)
+            break;
+        Py_MEMCPY(result_s, start, next - start);
+        result_s += (next - start);
+        start = next + 1;
+    }
+    Py_MEMCPY(result_s, start, end - start);
+
+    return result;
+}
+
+/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
+
+Py_LOCAL(PyObject *)
+stringlib_replace_delete_substring(PyObject *self,
+                                   const char *from_s, Py_ssize_t from_len,
+                                   Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count, offset;
+    PyObject *result;
+
+    self_len = STRINGLIB_LEN(self);
+    self_s = STRINGLIB_STR(self);
+
+    count = stringlib_count(self_s, self_len,
+                            from_s, from_len,
+                            maxcount);
+
+    if (count == 0) {
+        /* no matches */
+        return return_self(self);
+    }
+
+    result_len = self_len - (count * from_len);
+    assert (result_len>=0);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        offset = stringlib_find(start, end - start,
+                                from_s, from_len,
+                                0);
+        if (offset == -1)
+            break;
+        next = start + offset;
+
+        Py_MEMCPY(result_s, start, next - start);
+
+        result_s += (next - start);
+        start = next + from_len;
+    }
+    Py_MEMCPY(result_s, start, end - start);
+    return result;
+}
+
+/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_single_character_in_place(PyObject *self,
+                                            char from_c, char to_c,
+                                            Py_ssize_t maxcount)
+{
+    const char *self_s, *end;
+    char *result_s, *start, *next;
+    Py_ssize_t self_len;
+    PyObject *result;
+
+    /* The result string will be the same size */
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    next = findchar(self_s, self_len, from_c);
+
+    if (next == NULL) {
+        /* No matches; return the original bytes */
+        return return_self(self);
+    }
+
+    /* Need to make a new bytes */
+    result = STRINGLIB_NEW(NULL, self_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+    Py_MEMCPY(result_s, self_s, self_len);
+
+    /* change everything in-place, starting with this one */
+    start =  result_s + (next - self_s);
+    *start = to_c;
+    start++;
+    end = result_s + self_len;
+
+    while (--maxcount > 0) {
+        next = findchar(start, end - start, from_c);
+        if (next == NULL)
+            break;
+        *next = to_c;
+        start = next + 1;
+    }
+
+    return result;
+}
+
+/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_substring_in_place(PyObject *self,
+                                     const char *from_s, Py_ssize_t from_len,
+                                     const char *to_s, Py_ssize_t to_len,
+                                     Py_ssize_t maxcount)
+{
+    const char *self_s, *end;
+    char *result_s, *start;
+    Py_ssize_t self_len, offset;
+    PyObject *result;
+
+    /* The result bytes will be the same size */
+
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    offset = stringlib_find(self_s, self_len,
+                            from_s, from_len,
+                            0);
+    if (offset == -1) {
+        /* No matches; return the original bytes */
+        return return_self(self);
+    }
+
+    /* Need to make a new bytes */
+    result = STRINGLIB_NEW(NULL, self_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+    Py_MEMCPY(result_s, self_s, self_len);
+
+    /* change everything in-place, starting with this one */
+    start =  result_s + offset;
+    Py_MEMCPY(start, to_s, from_len);
+    start += from_len;
+    end = result_s + self_len;
+
+    while ( --maxcount > 0) {
+        offset = stringlib_find(start, end - start,
+                                from_s, from_len,
+                                0);
+        if (offset == -1)
+            break;
+        Py_MEMCPY(start + offset, to_s, from_len);
+        start += offset + from_len;
+    }
+
+    return result;
+}
+
+/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_single_character(PyObject *self,
+                                   char from_c,
+                                   const char *to_s, Py_ssize_t to_len,
+                                   Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count;
+    PyObject *result;
+
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    count = countchar(self_s, self_len, from_c, maxcount);
+    if (count == 0) {
+        /* no matches, return unchanged */
+        return return_self(self);
+    }
+
+    /* use the difference between current and new, hence the "-1" */
+    /*   result_len = self_len + count * (to_len-1)  */
+    assert(count > 0);
+    if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
+        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
+        return NULL;
+    }
+    result_len = self_len + count * (to_len - 1);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        next = findchar(start, end - start, from_c);
+        if (next == NULL)
+            break;
+
+        if (next == start) {
+            /* replace with the 'to' */
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start += 1;
+        } else {
+            /* copy the unchanged old then the 'to' */
+            Py_MEMCPY(result_s, start, next - start);
+            result_s += (next - start);
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start = next + 1;
+        }
+    }
+    /* Copy the remainder of the remaining bytes */
+    Py_MEMCPY(result_s, start, end - start);
+
+    return result;
+}
+
+/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_substring(PyObject *self,
+                            const char *from_s, Py_ssize_t from_len,
+                            const char *to_s, Py_ssize_t to_len,
+                            Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count, offset;
+    PyObject *result;
+
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    count = stringlib_count(self_s, self_len,
+                            from_s, from_len,
+                            maxcount);
+
+    if (count == 0) {
+        /* no matches, return unchanged */
+        return return_self(self);
+    }
+
+    /* Check for overflow */
+    /*    result_len = self_len + count * (to_len-from_len) */
+    assert(count > 0);
+    if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
+        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
+        return NULL;
+    }
+    result_len = self_len + count * (to_len - from_len);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        offset = stringlib_find(start, end - start,
+                                from_s, from_len,
+                                0);
+        if (offset == -1)
+            break;
+        next = start + offset;
+        if (next == start) {
+            /* replace with the 'to' */
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start += from_len;
+        } else {
+            /* copy the unchanged old then the 'to' */
+            Py_MEMCPY(result_s, start, next - start);
+            result_s += (next - start);
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start = next + from_len;
+        }
+    }
+    /* Copy the remainder of the remaining bytes */
+    Py_MEMCPY(result_s, start, end - start);
+
+    return result;
+}
+
+
+Py_LOCAL(PyObject *)
+stringlib_replace(PyObject *self,
+                  const char *from_s, Py_ssize_t from_len,
+                  const char *to_s, Py_ssize_t to_len,
+                  Py_ssize_t maxcount)
+{
+    if (maxcount < 0) {
+        maxcount = PY_SSIZE_T_MAX;
+    } else if (maxcount == 0 || STRINGLIB_LEN(self) == 0) {
+        /* nothing to do; return the original bytes */
+        return return_self(self);
+    }
+
+    /* Handle zero-length special cases */
+    if (from_len == 0) {
+        if (to_len == 0) {
+            /* nothing to do; return the original bytes */
+            return return_self(self);
+        }
+        /* insert the 'to' bytes everywhere.    */
+        /*    >>> b"Python".replace(b"", b".")  */
+        /*    b'.P.y.t.h.o.n.'                  */
+        return stringlib_replace_interleave(self, to_s, to_len, maxcount);
+    }
+
+    /* Except for b"".replace(b"", b"A") == b"A" there is no way beyond this */
+    /* point for an empty self bytes to generate a non-empty bytes */
+    /* Special case so the remaining code always gets a non-empty bytes */
+    if (STRINGLIB_LEN(self) == 0) {
+        return return_self(self);
+    }
+
+    if (to_len == 0) {
+        /* delete all occurrences of 'from' bytes */
+        if (from_len == 1) {
+            return stringlib_replace_delete_single_character(
+                self, from_s[0], maxcount);
+        } else {
+            return stringlib_replace_delete_substring(
+                self, from_s, from_len, maxcount);
+        }
+    }
+
+    /* Handle special case where both bytes have the same length */
+
+    if (from_len == to_len) {
+        if (from_len == 1) {
+            return stringlib_replace_single_character_in_place(
+                self, from_s[0], to_s[0], maxcount);
+        } else {
+            return stringlib_replace_substring_in_place(
+                self, from_s, from_len, to_s, to_len, maxcount);
+        }
+    }
+
+    /* Otherwise use the more generic algorithms */
+    if (from_len == 1) {
+        return stringlib_replace_single_character(
+            self, from_s[0], to_s, to_len, maxcount);
+    } else {
+        /* len('from')>=2, len('to')>=1 */
+        return stringlib_replace_substring(
+            self, from_s, from_len, to_s, to_len, maxcount);
+    }
+}
+
+#undef findchar
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
index be09b5f..14fa28e 100644
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -67,7 +67,7 @@ SubString_new_object(SubString *str)
     return PyUnicode_Substring(str->str, str->start, str->end);
 }
 
-/* return a new string.  if str->str is NULL, return None */
+/* return a new string.  if str->str is NULL, return a new empty string */
 Py_LOCAL_INLINE(PyObject *)
 SubString_new_object_or_empty(SubString *str)
 {
diff --git a/Objects/structseq.c b/Objects/structseq.c
index 664344b..e315cba 100644
--- a/Objects/structseq.c
+++ b/Objects/structseq.c
@@ -4,9 +4,9 @@
 #include "Python.h"
 #include "structmember.h"
 
-static char visible_length_key[] = "n_sequence_fields";
-static char real_length_key[] = "n_fields";
-static char unnamed_fields_key[] = "n_unnamed_fields";
+static const char visible_length_key[] = "n_sequence_fields";
+static const char real_length_key[] = "n_fields";
+static const char unnamed_fields_key[] = "n_unnamed_fields";
 
 /* Fields with this name have only a field index, not a field name.
    They are only allowed for indices < n_visible_fields. */
@@ -16,14 +16,14 @@ _Py_IDENTIFIER(n_fields);
 _Py_IDENTIFIER(n_unnamed_fields);
 
 #define VISIBLE_SIZE(op) Py_SIZE(op)
-#define VISIBLE_SIZE_TP(tp) PyLong_AsLong( \
+#define VISIBLE_SIZE_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_sequence_fields))
 
-#define REAL_SIZE_TP(tp) PyLong_AsLong( \
+#define REAL_SIZE_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_fields))
 #define REAL_SIZE(op) REAL_SIZE_TP(Py_TYPE(op))
 
-#define UNNAMED_FIELDS_TP(tp) PyLong_AsLong( \
+#define UNNAMED_FIELDS_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_unnamed_fields))
 #define UNNAMED_FIELDS(op) UNNAMED_FIELDS_TP(Py_TYPE(op))
 
@@ -164,7 +164,8 @@ structseq_repr(PyStructSequence *obj)
 #define TYPE_MAXSIZE 100
 
     PyTypeObject *typ = Py_TYPE(obj);
-    int i, removelast = 0;
+    Py_ssize_t i;
+    int removelast = 0;
     Py_ssize_t len;
     char buf[REPR_BUFFER_SIZE];
     char *endofbuf, *pbuf = buf;
@@ -236,8 +237,7 @@ structseq_reduce(PyStructSequence* self)
     PyObject* tup = NULL;
     PyObject* dict = NULL;
     PyObject* result;
-    Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields;
-    int i;
+    Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields, i;
 
     n_fields = REAL_SIZE(self);
     n_visible_fields = VISIBLE_SIZE(self);
@@ -325,7 +325,7 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
 {
     PyObject *dict;
     PyMemberDef* members;
-    int n_members, n_unnamed_members, i, k;
+    Py_ssize_t n_members, n_unnamed_members, i, k;
     PyObject *v;
 
 #ifdef Py_TRACE_REFS
@@ -373,9 +373,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
     Py_INCREF(type);
 
     dict = type->tp_dict;
-#define SET_DICT_FROM_INT(key, value)                           \
+#define SET_DICT_FROM_SIZE(key, value)                          \
     do {                                                        \
-        v = PyLong_FromLong((long) value);                      \
+        v = PyLong_FromSsize_t(value);                          \
         if (v == NULL)                                          \
             return -1;                                          \
         if (PyDict_SetItemString(dict, key, v) < 0) {           \
@@ -385,9 +385,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
         Py_DECREF(v);                                           \
     } while (0)
 
-    SET_DICT_FROM_INT(visible_length_key, desc->n_in_sequence);
-    SET_DICT_FROM_INT(real_length_key, n_members);
-    SET_DICT_FROM_INT(unnamed_fields_key, n_unnamed_members);
+    SET_DICT_FROM_SIZE(visible_length_key, desc->n_in_sequence);
+    SET_DICT_FROM_SIZE(real_length_key, n_members);
+    SET_DICT_FROM_SIZE(unnamed_fields_key, n_unnamed_members);
 
     return 0;
 }
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
index 7920fec..c0ff499 100644
--- a/Objects/tupleobject.c
+++ b/Objects/tupleobject.c
@@ -36,6 +36,16 @@ static Py_ssize_t count_tracked = 0;
 static void
 show_track(void)
 {
+    PyObject *xoptions, *value;
+    _Py_IDENTIFIER(showalloccount);
+
+    xoptions = PySys_GetXOptions();
+    if (xoptions == NULL)
+        return;
+    value = _PyDict_GetItemId(xoptions, &PyId_showalloccount);
+    if (value != Py_True)
+        return;
+
     fprintf(stderr, "Tuples created: %" PY_FORMAT_SIZE_T "d\n",
         count_tracked + count_untracked);
     fprintf(stderr, "Tuples tracked by the GC: %" PY_FORMAT_SIZE_T
@@ -149,7 +159,6 @@ PyTuple_GetItem(PyObject *op, Py_ssize_t i)
 int
 PyTuple_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem)
 {
-    PyObject *olditem;
     PyObject **p;
     if (!PyTuple_Check(op) || op->ob_refcnt != 1) {
         Py_XDECREF(newitem);
@@ -163,9 +172,7 @@ PyTuple_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem)
         return -1;
     }
     p = ((PyTupleObject *)op) -> ob_item + i;
-    olditem = *p;
-    *p = newitem;
-    Py_XDECREF(olditem);
+    Py_XSETREF(*p, newitem);
     return 0;
 }
 
@@ -446,9 +453,9 @@ tupleconcat(PyTupleObject *a, PyObject *bb)
         return NULL;
     }
 #define b ((PyTupleObject *)bb)
-    size = Py_SIZE(a) + Py_SIZE(b);
-    if (size < 0)
+    if (Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b))
         return PyErr_NoMemory();
+    size = Py_SIZE(a) + Py_SIZE(b);
     np = (PyTupleObject *) PyTuple_New(size);
     if (np == NULL) {
         return NULL;
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 9f57a7e..b1686d2 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -53,10 +53,12 @@ _Py_IDENTIFIER(__doc__);
 _Py_IDENTIFIER(__getattribute__);
 _Py_IDENTIFIER(__getitem__);
 _Py_IDENTIFIER(__hash__);
+_Py_IDENTIFIER(__init_subclass__);
 _Py_IDENTIFIER(__len__);
 _Py_IDENTIFIER(__module__);
 _Py_IDENTIFIER(__name__);
 _Py_IDENTIFIER(__new__);
+_Py_IDENTIFIER(__set_name__);
 _Py_IDENTIFIER(__setitem__);
 _Py_IDENTIFIER(builtins);
 
@@ -460,7 +462,7 @@ type_module(PyTypeObject *type, void *context)
             PyErr_Format(PyExc_AttributeError, "__module__");
             return 0;
         }
-        Py_XINCREF(mod);
+        Py_INCREF(mod);
         return mod;
     }
     else {
@@ -500,7 +502,7 @@ type_abstractmethods(PyTypeObject *type, void *context)
             PyErr_SetObject(PyExc_AttributeError, message);
         return NULL;
     }
-    Py_XINCREF(mod);
+    Py_INCREF(mod);
     return mod;
 }
 
@@ -548,7 +550,7 @@ type_get_bases(PyTypeObject *type, void *context)
 static PyTypeObject *best_base(PyObject *);
 static int mro_internal(PyTypeObject *, PyObject **);
 Py_LOCAL_INLINE(int) type_is_subtype_base_chain(PyTypeObject *, PyTypeObject *);
-static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, char *);
+static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, const char *);
 static int add_subclass(PyTypeObject*, PyTypeObject*);
 static int add_all_subclasses(PyTypeObject *type, PyObject *bases);
 static void remove_subclass(PyTypeObject *, PyTypeObject *);
@@ -888,25 +890,33 @@ type_call(PyTypeObject *type, PyObject *args, PyObject *kwds)
 #endif
 
     obj = type->tp_new(type, args, kwds);
-    if (obj != NULL) {
-        /* Ugly exception: when the call was type(something),
-           don't call tp_init on the result. */
-        if (type == &PyType_Type &&
-            PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
-            (kwds == NULL ||
-             (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
-            return obj;
-        /* If the returned object is not an instance of type,
-           it won't be initialized. */
-        if (!PyType_IsSubtype(Py_TYPE(obj), type))
-            return obj;
-        type = Py_TYPE(obj);
-        if (type->tp_init != NULL) {
-            int res = type->tp_init(obj, args, kwds);
-            if (res < 0) {
-                Py_DECREF(obj);
-                obj = NULL;
-            }
+    obj = _Py_CheckFunctionResult((PyObject*)type, obj, NULL);
+    if (obj == NULL)
+        return NULL;
+
+    /* Ugly exception: when the call was type(something),
+       don't call tp_init on the result. */
+    if (type == &PyType_Type &&
+        PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
+        (kwds == NULL ||
+         (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
+        return obj;
+
+    /* If the returned object is not an instance of type,
+       it won't be initialized. */
+    if (!PyType_IsSubtype(Py_TYPE(obj), type))
+        return obj;
+
+    type = Py_TYPE(obj);
+    if (type->tp_init != NULL) {
+        int res = type->tp_init(obj, args, kwds);
+        if (res < 0) {
+            assert(PyErr_Occurred());
+            Py_DECREF(obj);
+            obj = NULL;
+        }
+        else {
+            assert(!PyErr_Occurred());
         }
     }
     return obj;
@@ -1411,7 +1421,7 @@ _PyObject_LookupSpecial(PyObject *self, _Py_Identifier *attrid)
    as lookup_method to cache the interned name string object. */
 
 static PyObject *
-call_method(PyObject *o, _Py_Identifier *nameid, char *format, ...)
+call_method(PyObject *o, _Py_Identifier *nameid, const char *format, ...)
 {
     va_list va;
     PyObject *args, *func = 0, *retval;
@@ -1447,7 +1457,7 @@ call_method(PyObject *o, _Py_Identifier *nameid, char *format, ...)
 /* Clone of call_method() that returns NotImplemented when the lookup fails. */
 
 static PyObject *
-call_maybe(PyObject *o, _Py_Identifier *nameid, char *format, ...)
+call_maybe(PyObject *o, _Py_Identifier *nameid, const char *format, ...)
 {
     va_list va;
     PyObject *args, *func = 0, *retval;
@@ -1526,7 +1536,6 @@ class_name(PyObject *cls)
     PyObject *name = _PyObject_GetAttrId(cls, &PyId___name__);
     if (name == NULL) {
         PyErr_Clear();
-        Py_XDECREF(name);
         name = PyObject_Repr(cls);
     }
     if (name == NULL)
@@ -2020,6 +2029,8 @@ static void object_dealloc(PyObject *);
 static int object_init(PyObject *, PyObject *, PyObject *);
 static int update_slot(PyTypeObject *, PyObject *);
 static void fixup_slot_dispatchers(PyTypeObject *);
+static int set_names(PyTypeObject *);
+static int init_subclass(PyTypeObject *, PyObject *);
 
 /*
  * Helpers for  __dict__ descriptor.  We don't want to expose the dicts
@@ -2084,7 +2095,7 @@ subtype_dict(PyObject *obj, void *context)
 static int
 subtype_setdict(PyObject *obj, PyObject *value, void *context)
 {
-    PyObject *dict, **dictptr;
+    PyObject **dictptr;
     PyTypeObject *base;
 
     base = get_builtin_base_with_dict(Py_TYPE(obj));
@@ -2115,10 +2126,8 @@ subtype_setdict(PyObject *obj, PyObject *value, void *context)
                      "not a '%.200s'", Py_TYPE(value)->tp_name);
         return -1;
     }
-    dict = *dictptr;
     Py_XINCREF(value);
-    *dictptr = value;
-    Py_XDECREF(dict);
+    Py_XSETREF(*dictptr, value);
     return 0;
 }
 
@@ -2197,7 +2206,8 @@ type_init(PyObject *cls, PyObject *args, PyObject *kwds)
     assert(args != NULL && PyTuple_Check(args));
     assert(kwds == NULL || PyDict_Check(kwds));
 
-    if (kwds != NULL && PyDict_Check(kwds) && PyDict_Size(kwds) != 0) {
+    if (kwds != NULL && PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
+        PyDict_Check(kwds) && PyDict_Size(kwds) != 0) {
         PyErr_SetString(PyExc_TypeError,
                         "type.__init__() takes no keyword arguments");
         return -1;
@@ -2264,7 +2274,6 @@ static PyObject *
 type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
 {
     PyObject *name, *bases = NULL, *orig_dict, *dict = NULL;
-    static char *kwlist[] = {"name", "bases", "dict", 0};
     PyObject *qualname, *slots = NULL, *tmp, *newslots;
     PyTypeObject *type = NULL, *base, *tmptype, *winner;
     PyHeapTypeObject *et;
@@ -2291,7 +2300,7 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         /* SF bug 475327 -- if that didn't trigger, we need 3
            arguments. but PyArg_ParseTupleAndKeywords below may give
            a msg saying type() needs exactly 3. */
-        if (nargs + nkwds != 3) {
+        if (nargs != 3) {
             PyErr_SetString(PyExc_TypeError,
                             "type() takes 1 or 3 arguments");
             return NULL;
@@ -2299,10 +2308,8 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
     }
 
     /* Check arguments: (name, bases, dict) */
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "UO!O!:type", kwlist,
-                                     &name,
-                                     &PyTuple_Type, &bases,
-                                     &PyDict_Type, &orig_dict))
+    if (!PyArg_ParseTuple(args, "UO!O!:type", &name, &PyTuple_Type, &bases,
+                          &PyDict_Type, &orig_dict))
         return NULL;
 
     /* Determine the proper metatype to deal with this: */
@@ -2582,6 +2589,20 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         Py_DECREF(tmp);
     }
 
+    /* Special-case __init_subclass__: if it's a plain function,
+       make it a classmethod */
+    tmp = _PyDict_GetItemId(dict, &PyId___init_subclass__);
+    if (tmp != NULL && PyFunction_Check(tmp)) {
+        tmp = PyClassMethod_New(tmp);
+        if (tmp == NULL)
+            goto error;
+        if (_PyDict_SetItemId(dict, &PyId___init_subclass__, tmp) < 0) {
+            Py_DECREF(tmp);
+            goto error;
+        }
+        Py_DECREF(tmp);
+    }
+
     /* Add descriptors for custom slots from __slots__, or for __dict__ */
     mp = PyHeapType_GET_MEMBERS(et);
     slotoffset = base->tp_basicsize;
@@ -2662,6 +2683,12 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         et->ht_cached_keys = _PyDict_NewKeysForClass();
     }
 
+    if (set_names(type) < 0)
+        goto error;
+
+    if (init_subclass(type, kwds) < 0)
+        goto error;
+
     Py_DECREF(dict);
     return (PyObject *)type;
 
@@ -2673,7 +2700,7 @@ error:
     return NULL;
 }
 
-static short slotoffsets[] = {
+static const short slotoffsets[] = {
     -1, /* invalid slot */
 #include "typeslots.inc"
 };
@@ -3592,7 +3619,7 @@ same_slots_added(PyTypeObject *a, PyTypeObject *b)
 }
 
 static int
-compatible_for_assignment(PyTypeObject* oldto, PyTypeObject* newto, char* attr)
+compatible_for_assignment(PyTypeObject* oldto, PyTypeObject* newto, const char* attr)
 {
     PyTypeObject *newbase, *oldbase;
 
@@ -3868,6 +3895,24 @@ _PyObject_GetState(PyObject *obj, int required)
         }
 
         assert(slotnames == Py_None || PyList_Check(slotnames));
+        if (required) {
+            Py_ssize_t basicsize = PyBaseObject_Type.tp_basicsize;
+            if (obj->ob_type->tp_dictoffset)
+                basicsize += sizeof(PyObject *);
+            if (obj->ob_type->tp_weaklistoffset)
+                basicsize += sizeof(PyObject *);
+            if (slotnames != Py_None)
+                basicsize += sizeof(PyObject *) * Py_SIZE(slotnames);
+            if (obj->ob_type->tp_basicsize > basicsize) {
+                Py_DECREF(slotnames);
+                Py_DECREF(state);
+                PyErr_Format(PyExc_TypeError,
+                             "can't pickle %.200s objects",
+                             Py_TYPE(obj)->tp_name);
+                return NULL;
+            }
+        }
+
         if (slotnames != Py_None && Py_SIZE(slotnames) > 0) {
             PyObject *slots;
             Py_ssize_t slotnames_size, i;
@@ -3922,7 +3967,7 @@ _PyObject_GetState(PyObject *obj, int required)
             }
 
             /* If we found some slot attributes, pack them in a tuple along
-               the orginal attribute dictionary. */
+               the original attribute dictionary. */
             if (PyDict_Size(slots) > 0) {
                 PyObject *state2;
 
@@ -4091,7 +4136,7 @@ _PyObject_GetItemsIter(PyObject *obj, PyObject **listitems,
 }
 
 static PyObject *
-reduce_newobj(PyObject *obj, int proto)
+reduce_newobj(PyObject *obj)
 {
     PyObject *args = NULL, *kwargs = NULL;
     PyObject *copyreg;
@@ -4144,7 +4189,7 @@ reduce_newobj(PyObject *obj, int proto)
         }
         Py_XDECREF(args);
     }
-    else if (proto >= 4) {
+    else {
         _Py_IDENTIFIER(__newobj_ex__);
 
         newobj = _PyObject_GetAttrId(copyreg, &PyId___newobj_ex__);
@@ -4162,16 +4207,6 @@ reduce_newobj(PyObject *obj, int proto)
             return NULL;
         }
     }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "must use protocol 4 or greater to copy this "
-                        "object; since __getnewargs_ex__ returned "
-                        "keyword arguments.");
-        Py_DECREF(args);
-        Py_DECREF(kwargs);
-        Py_DECREF(copyreg);
-        return NULL;
-    }
 
     state = _PyObject_GetState(obj,
                 !hasargs && !PyList_Check(obj) && !PyDict_Check(obj));
@@ -4217,7 +4252,7 @@ _common_reduce(PyObject *self, int proto)
     PyObject *copyreg, *res;
 
     if (proto >= 2)
-        return reduce_newobj(self, proto);
+        return reduce_newobj(self);
 
     copyreg = import_copyreg();
     if (!copyreg)
@@ -4299,6 +4334,18 @@ PyDoc_STRVAR(object_subclasshook_doc,
 "NotImplemented, the normal algorithm is used.  Otherwise, it\n"
 "overrides the normal algorithm (and the outcome is cached).\n");
 
+static PyObject *
+object_init_subclass(PyObject *cls, PyObject *arg)
+{
+    Py_RETURN_NONE;
+}
+
+PyDoc_STRVAR(object_init_subclass_doc,
+"This method is called when a class is subclassed.\n"
+"\n"
+"The default implementation does nothing. It may be\n"
+"overridden to extend subclasses.\n");
+
 /*
    from PEP 3101, this code implements:
 
@@ -4403,6 +4450,8 @@ static PyMethodDef object_methods[] = {
      PyDoc_STR("helper for pickle")},
     {"__subclasshook__", object_subclasshook, METH_CLASS | METH_VARARGS,
      object_subclasshook_doc},
+    {"__init_subclass__", object_init_subclass, METH_CLASS | METH_NOARGS,
+     object_init_subclass_doc},
     {"__format__", object_format, METH_VARARGS,
      PyDoc_STR("default object formatter")},
     {"__sizeof__", object_sizeof, METH_NOARGS,
@@ -5348,7 +5397,7 @@ wrap_delitem(PyObject *self, PyObject *args, void *wrapped)
 /* Helper to check for object.__setattr__ or __delattr__ applied to a type.
    This is called the Carlo Verre hack after its discoverer. */
 static int
-hackcheck(PyObject *self, setattrofunc func, char *what)
+hackcheck(PyObject *self, setattrofunc func, const char *what)
 {
     PyTypeObject *type = Py_TYPE(self);
     while (type && type->tp_flags & Py_TPFLAGS_HEAPTYPE)
@@ -5768,8 +5817,8 @@ slot_sq_item(PyObject *self, Py_ssize_t i)
             if (args != NULL) {
                 PyTuple_SET_ITEM(args, 0, ival);
                 retval = PyObject_Call(func, args, NULL);
-                Py_XDECREF(args);
-                Py_XDECREF(func);
+                Py_DECREF(args);
+                Py_DECREF(func);
                 return retval;
             }
         }
@@ -6912,6 +6961,54 @@ update_all_slots(PyTypeObject* type)
     }
 }
 
+/* Call __set_name__ on all descriptors in a newly generated type */
+static int
+set_names(PyTypeObject *type)
+{
+    PyObject *key, *value, *tmp;
+    Py_ssize_t i = 0;
+
+    while (PyDict_Next(type->tp_dict, &i, &key, &value)) {
+        if (PyObject_HasAttr(value, _PyUnicode_FromId(&PyId___set_name__))) {
+            tmp = PyObject_CallMethodObjArgs(
+                value, _PyUnicode_FromId(&PyId___set_name__),
+                type, key, NULL);
+            if (tmp == NULL)
+                return -1;
+            else
+                Py_DECREF(tmp);
+        }
+    }
+
+    return 0;
+}
+
+/* Call __init_subclass__ on the parent of a newly generated type */
+static int
+init_subclass(PyTypeObject *type, PyObject *kwds)
+{
+    PyObject *super, *func, *tmp, *tuple;
+
+    super = PyObject_CallFunctionObjArgs((PyObject *) &PySuper_Type,
+                                         type, type, NULL);
+    func = _PyObject_GetAttrId(super, &PyId___init_subclass__);
+    Py_DECREF(super);
+
+    if (func == NULL)
+        return -1;
+
+    tuple = PyTuple_New(0);
+    tmp = PyObject_Call(func, tuple, kwds);
+    Py_DECREF(tuple);
+    Py_DECREF(func);
+
+    if (tmp == NULL)
+        return -1;
+
+    Py_DECREF(tmp);
+    return 0;
+}
+
 /* recurse_down_subclasses() and update_subclasses() are mutually
    recursive functions to call a callback for all subclasses,
    but refraining from recursing into subclasses that define 'name'. */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1fcc83e..2d31c70 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include "Python.h"
 #include "ucnhash.h"
 #include "bytes_methods.h"
+#include "stringlib/eq.h"
 
 #ifdef MS_WINDOWS
 #include <windows.h>
@@ -162,6 +163,14 @@ extern "C" {
             *_to++ = (to_type) *_iter++;                \
     } while (0)
 
+#ifdef MS_WINDOWS
+   /* On Windows, overallocate by 50% is the best factor */
+#  define OVERALLOCATE_FACTOR 2
+#else
+   /* On Linux, overallocate by 25% is the best factor */
+#  define OVERALLOCATE_FACTOR 4
+#endif
+
 /* This dictionary holds all interned unicode strings.  Note that references
    to strings in this dictionary are *not* counted in the string's ob_refcnt.
    When the interned string reaches a refcnt of 0 the string deallocation
@@ -263,7 +272,7 @@ raise_encode_exception(PyObject **exceptionObject,
                        const char *reason);
 
 /* Same for linebreaks */
-static unsigned char ascii_linebreak[] = {
+static const unsigned char ascii_linebreak[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x000A, * LINE FEED */
 /*         0x000B, * LINE TABULATION */
@@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = {
 
 #include "clinic/unicodeobject.c.h"
 
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_STRICT,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_BACKSLASHREPLACE,
+    _Py_ERROR_SURROGATEPASS,
+    _Py_ERROR_XMLCHARREFREPLACE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    if (errors == NULL || strcmp(errors, "strict") == 0)
+        return _Py_ERROR_STRICT;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_ERROR_SURROGATEESCAPE;
+    if (strcmp(errors, "replace") == 0)
+        return _Py_ERROR_REPLACE;
+    if (strcmp(errors, "ignore") == 0)
+        return _Py_ERROR_IGNORE;
+    if (strcmp(errors, "backslashreplace") == 0)
+        return _Py_ERROR_BACKSLASHREPLACE;
+    if (strcmp(errors, "surrogatepass") == 0)
+        return _Py_ERROR_SURROGATEPASS;
+    if (strcmp(errors, "xmlcharrefreplace") == 0)
+        return _Py_ERROR_XMLCHARREFREPLACE;
+    return _Py_ERROR_OTHER;
+}
+
 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
    This function is kept for backward compatibility with the old API. */
 Py_UNICODE
@@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode)
         return _PyUnicode_Copy(unicode);
 }
 
+/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
+   ASCII, Latin1, UTF-8, etc. */
+static char*
+backslashreplace(_PyBytesWriter *writer, char *str,
+                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x100)
+            incr = 2+2;
+        else if (ch < 0x10000)
+            incr = 2+4;
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+8;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement */
+    for (i = collstart; i < collend; ++i) {
+        ch = PyUnicode_READ(kind, data, i);
+        *str++ = '\\';
+        if (ch >= 0x00010000) {
+            *str++ = 'U';
+            *str++ = Py_hexdigits[(ch>>28)&0xf];
+            *str++ = Py_hexdigits[(ch>>24)&0xf];
+            *str++ = Py_hexdigits[(ch>>20)&0xf];
+            *str++ = Py_hexdigits[(ch>>16)&0xf];
+            *str++ = Py_hexdigits[(ch>>12)&0xf];
+            *str++ = Py_hexdigits[(ch>>8)&0xf];
+        }
+        else if (ch >= 0x100) {
+            *str++ = 'u';
+            *str++ = Py_hexdigits[(ch>>12)&0xf];
+            *str++ = Py_hexdigits[(ch>>8)&0xf];
+        }
+        else
+            *str++ = 'x';
+        *str++ = Py_hexdigits[(ch>>4)&0xf];
+        *str++ = Py_hexdigits[ch&0xf];
+    }
+    return str;
+}
+
+/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
+   ASCII, Latin1, UTF-8, etc. */
+static char*
+xmlcharrefreplace(_PyBytesWriter *writer, char *str,
+                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 10)
+            incr = 2+1+1;
+        else if (ch < 100)
+            incr = 2+2+1;
+        else if (ch < 1000)
+            incr = 2+3+1;
+        else if (ch < 10000)
+            incr = 2+4+1;
+        else if (ch < 100000)
+            incr = 2+5+1;
+        else if (ch < 1000000)
+            incr = 2+6+1;
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+7+1;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement */
+    for (i = collstart; i < collend; ++i) {
+        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
+    }
+    return str;
+}
+
 /* --- Bloom Filters ----------------------------------------------------- */
 
 /* stuff to implement simple "bloom filters" for Unicode characters.
@@ -587,6 +751,18 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
 #undef BLOOM_UPDATE
 }
 
+static int
+ensure_unicode(PyObject *obj)
+{
+    if (!PyUnicode_Check(obj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "must be str, not %.100s",
+                     Py_TYPE(obj)->tp_name);
+        return -1;
+    }
+    return PyUnicode_READY(obj);
+}
+
 /* Compilation of templated routines */
 
 #include "stringlib/asciilib.h"
@@ -647,27 +823,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
                                      Py_ssize_t size, Py_UCS4 ch,
                                      int direction)
 {
-    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
-
     switch (kind) {
     case PyUnicode_1BYTE_KIND:
-        {
-            Py_UCS1 ch1 = (Py_UCS1) ch;
-            if (ch1 == ch)
-                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
-            else
-                return -1;
-        }
+        if ((Py_UCS1) ch != ch)
+            return -1;
+        if (direction > 0)
+            return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
+        else
+            return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
     case PyUnicode_2BYTE_KIND:
-        {
-            Py_UCS2 ch2 = (Py_UCS2) ch;
-            if (ch2 == ch)
-                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
-            else
-                return -1;
-        }
+        if ((Py_UCS2) ch != ch)
+            return -1;
+        if (direction > 0)
+            return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
+        else
+            return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
     case PyUnicode_4BYTE_KIND:
-        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
+        if (direction > 0)
+            return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
+        else
+            return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
     default:
         assert(0);
         return -1;
@@ -2903,7 +3078,7 @@ PyUnicode_FromEncodedObject(PyObject *obj,
     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
         PyErr_Format(PyExc_TypeError,
-                     "coercing to str: need a bytes-like object, %.80s found",
+                     "decoding to str: need a bytes-like object, %.80s found",
                      Py_TYPE(obj)->tp_name);
         return NULL;
     }
@@ -3167,24 +3342,22 @@ wcstombs_errorpos(const wchar_t *wstr)
 static int
 locale_error_handler(const char *errors, int *surrogateescape)
 {
-    if (errors == NULL) {
-        *surrogateescape = 0;
-        return 0;
-    }
-
-    if (strcmp(errors, "strict") == 0) {
+    _Py_error_handler error_handler = get_error_handler(errors);
+    switch (error_handler)
+    {
+    case _Py_ERROR_STRICT:
         *surrogateescape = 0;
         return 0;
-    }
-    if (strcmp(errors, "surrogateescape") == 0) {
+    case _Py_ERROR_SURROGATEESCAPE:
         *surrogateescape = 1;
         return 0;
+    default:
+        PyErr_Format(PyExc_ValueError,
+                     "only 'strict' and 'surrogateescape' error handlers "
+                     "are supported, not '%s'",
+                     errors);
+        return -1;
     }
-    PyErr_Format(PyExc_ValueError,
-                 "only 'strict' and 'surrogateescape' error handlers "
-                 "are supported, not '%s'",
-                 errors);
-    return -1;
 }
 
 PyObject *
@@ -3626,19 +3799,17 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
         output = arg;
         Py_INCREF(output);
     }
-    else {
-        arg = PyUnicode_FromObject(arg);
-        if (!arg)
-            return 0;
+    else if (PyUnicode_Check(arg)) {
         output = PyUnicode_EncodeFSDefault(arg);
-        Py_DECREF(arg);
         if (!output)
             return 0;
-        if (!PyBytes_Check(output)) {
-            Py_DECREF(output);
-            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
-            return 0;
-        }
+        assert(PyBytes_Check(output));
+    }
+    else {
+        PyErr_Format(PyExc_TypeError,
+                     "must be str or bytes, not %.100s",
+                     Py_TYPE(arg)->tp_name);
+        return 0;
     }
     size = PyBytes_GET_SIZE(output);
     data = PyBytes_AS_STRING(output);
@@ -3666,7 +3837,13 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
         output = arg;
         Py_INCREF(output);
     }
-    else if (PyObject_CheckBuffer(arg)) {
+    else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) {
+        if (!PyBytes_Check(arg) &&
+            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+            "path should be string or bytes, not %.200s",
+            Py_TYPE(arg)->tp_name)) {
+            return 0;
+        }
         arg = PyBytes_FromObject(arg);
         if (!arg)
             return 0;
@@ -3675,11 +3852,6 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
         Py_DECREF(arg);
         if (!output)
             return 0;
-        if (!PyUnicode_Check(output)) {
-            Py_DECREF(output);
-            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
-            return 0;
-        }
     }
     else {
         PyErr_Format(PyExc_TypeError,
@@ -3716,7 +3888,7 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
 
     if (PyUnicode_UTF8(unicode) == NULL) {
         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
-        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
+        bytes = _PyUnicode_AsUTF8String(unicode, NULL);
         if (bytes == NULL)
             return NULL;
         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
@@ -3982,7 +4154,7 @@ unicode_decode_call_errorhandler_wchar(
     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
     PyObject **output, Py_ssize_t *outpos)
 {
-    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
 
     PyObject *restuple = NULL;
     PyObject *repunicode = NULL;
@@ -4090,7 +4262,7 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
 {
-    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
 
     PyObject *restuple = NULL;
     PyObject *repunicode = NULL;
@@ -4696,8 +4868,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     const char *errmsg = "";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0) {
         if (consumed)
@@ -4722,6 +4895,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     while (s < end) {
         Py_UCS4 ch;
         int kind = writer.kind;
+
         if (kind == PyUnicode_1BYTE_KIND) {
             if (PyUnicode_IS_ASCII(writer.buffer))
                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4760,24 +4934,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
             continue;
         }
 
-        if (unicode_decode_call_errorhandler_writer(
-                errors, &errorHandler,
-                "utf-8", errmsg,
-                &starts, &end, &startinpos, &endinpos, &exc, &s,
-                &writer))
-            goto onError;
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler) {
+        case _Py_ERROR_IGNORE:
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_REPLACE:
+            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+                goto onError;
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_SURROGATEESCAPE:
+        {
+            Py_ssize_t i;
+
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            for (i=startinpos; i<endinpos; i++) {
+                ch = (Py_UCS4)(unsigned char)(starts[i]);
+                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+                                ch + 0xdc00);
+                writer.pos++;
+            }
+            s += (endinpos - startinpos);
+            break;
+        }
+
+        default:
+            if (unicode_decode_call_errorhandler_writer(
+                    errors, &error_handler_obj,
+                    "utf-8", errmsg,
+                    &starts, &end, &startinpos, &endinpos, &exc, &s,
+                    &writer))
+                goto onError;
+        }
     }
 
 End:
     if (consumed)
         *consumed = s - starts;
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
 onError:
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     _PyUnicodeWriter_Dealloc(&writer);
     return NULL;
@@ -5868,11 +6074,10 @@ PyObject *
 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 {
     Py_ssize_t i, len;
-    PyObject *repr;
     char *p;
     int kind;
     void *data;
-    Py_ssize_t expandsize = 0;
+    _PyBytesWriter writer;
 
     /* Initial allocation is based on the longest-possible character
        escape.
@@ -5888,35 +6093,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
     }
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
+
+    _PyBytesWriter_Init(&writer);
+
     len = PyUnicode_GET_LENGTH(unicode);
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
-    switch (kind) {
-    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
-    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
-    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
-    }
 
-    if (len == 0)
-        return PyBytes_FromStringAndSize(NULL, 0);
-
-    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
-        return PyErr_NoMemory();
-
-    repr = PyBytes_FromStringAndSize(NULL,
-                                     2
-                                     + expandsize*len
-                                     + 1);
-    if (repr == NULL)
-        return NULL;
-
-    p = PyBytes_AS_STRING(repr);
+    p = _PyBytesWriter_Alloc(&writer, len);
+    if (p == NULL)
+        goto error;
+    writer.overallocate = 1;
 
     for (i = 0; i < len; i++) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
 
         /* Escape backslashes */
         if (ch == '\\') {
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = (char) ch;
             continue;
@@ -5925,6 +6123,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
         /* Map 21-bit characters to '\U00xxxxxx' */
         else if (ch >= 0x10000) {
             assert(ch <= MAX_UNICODE);
+
+            p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'U';
             *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -5940,6 +6143,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 
         /* Map 16-bit characters to '\uxxxx' */
         if (ch >= 256) {
+            p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'u';
             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
@@ -5950,20 +6157,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 
         /* Map special whitespace to '\t', \n', '\r' */
         else if (ch == '\t') {
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 't';
         }
         else if (ch == '\n') {
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'n';
         }
         else if (ch == '\r') {
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'r';
         }
 
         /* Map non-printable US ASCII to '\xhh' */
         else if (ch < ' ' || ch >= 0x7F) {
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 4-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'x';
             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
@@ -5975,10 +6199,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
             *p++ = (char) ch;
     }
 
-    assert(p - PyBytes_AS_STRING(repr) > 0);
-    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
-        return NULL;
-    return repr;
+    return _PyBytesWriter_Finish(&writer, p);
+
+error:
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
 }
 
 PyObject *
@@ -6107,13 +6332,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
 PyObject *
 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
 {
-    PyObject *repr;
     char *p;
-    char *q;
-    Py_ssize_t expandsize, pos;
+    Py_ssize_t pos;
     int kind;
     void *data;
     Py_ssize_t len;
+    _PyBytesWriter writer;
 
     if (!PyUnicode_Check(unicode)) {
         PyErr_BadArgument();
@@ -6121,28 +6345,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
     }
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
+
+    _PyBytesWriter_Init(&writer);
+
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
     len = PyUnicode_GET_LENGTH(unicode);
-    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
-       bytes, and 1 byte characters 4. */
-    expandsize = kind * 2 + 2;
-
-    if (len > PY_SSIZE_T_MAX / expandsize)
-        return PyErr_NoMemory();
 
-    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
-    if (repr == NULL)
-        return NULL;
-    if (len == 0)
-        return repr;
+    p = _PyBytesWriter_Alloc(&writer, len);
+    if (p == NULL)
+        goto error;
+    writer.overallocate = 1;
 
-    p = q = PyBytes_AS_STRING(repr);
     for (pos = 0; pos < len; pos++) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
         /* Map 32-bit characters to '\Uxxxxxxxx' */
         if (ch >= 0x10000) {
             assert(ch <= MAX_UNICODE);
+
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'U';
             *p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6156,6 +6381,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
         }
         /* Map 16-bit characters to '\uxxxx' */
         else if (ch >= 256) {
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'u';
             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
@@ -6168,10 +6398,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
             *p++ = (char) ch;
     }
 
-    assert(p > q);
-    if (_PyBytes_Resize(&repr, p - q) < 0)
-        return NULL;
-    return repr;
+    return _PyBytesWriter_Finish(&writer, p);
+
+error:
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
 }
 
 PyObject *
@@ -6348,7 +6579,7 @@ unicode_encode_call_errorhandler(const char *errors,
                                  Py_ssize_t startpos, Py_ssize_t endpos,
                                  Py_ssize_t *newpos)
 {
-    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
+    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
     Py_ssize_t len;
     PyObject *restuple;
     PyObject *resunicode;
@@ -6402,25 +6633,22 @@ unicode_encode_call_errorhandler(const char *errors,
 static PyObject *
 unicode_encode_ucs1(PyObject *unicode,
                     const char *errors,
-                    unsigned int limit)
+                    const Py_UCS4 limit)
 {
     /* input state */
     Py_ssize_t pos=0, size;
     int kind;
     void *data;
-    /* output object */
-    PyObject *res;
     /* pointer into the output */
     char *str;
-    /* current output position */
-    Py_ssize_t ressize;
     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+    PyObject *rep = NULL;
+    /* output object */
+    _PyBytesWriter writer;
 
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
@@ -6431,186 +6659,157 @@ unicode_encode_ucs1(PyObject *unicode,
        replacements, if we need more, we'll resize */
     if (size == 0)
         return PyBytes_FromStringAndSize(NULL, 0);
-    res = PyBytes_FromStringAndSize(NULL, size);
-    if (res == NULL)
+
+    _PyBytesWriter_Init(&writer);
+    str = _PyBytesWriter_Alloc(&writer, size);
+    if (str == NULL)
         return NULL;
-    str = PyBytes_AS_STRING(res);
-    ressize = size;
 
     while (pos < size) {
-        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
+        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
 
         /* can we encode this? */
-        if (c<limit) {
+        if (ch < limit) {
             /* no overflow check, because we know that the space is enough */
-            *str++ = (char)c;
+            *str++ = (char)ch;
             ++pos;
         }
         else {
-            Py_ssize_t requiredsize;
-            PyObject *repunicode;
-            Py_ssize_t repsize, newpos, respos, i;
+            Py_ssize_t newpos, i;
             /* startpos for collecting unencodable chars */
             Py_ssize_t collstart = pos;
-            Py_ssize_t collend = pos;
+            Py_ssize_t collend = collstart + 1;
             /* find all unecodable characters */
+
             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                 ++collend;
+
+            /* Only overallocate the buffer if it's not the last write */
+            writer.overallocate = (collend < size);
+
             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
-            if (known_errorHandler==-1) {
-                if ((errors==NULL) || (!strcmp(errors, "strict")))
-                    known_errorHandler = 1;
-                else if (!strcmp(errors, "replace"))
-                    known_errorHandler = 2;
-                else if (!strcmp(errors, "ignore"))
-                    known_errorHandler = 3;
-                else if (!strcmp(errors, "xmlcharrefreplace"))
-                    known_errorHandler = 4;
-                else
-                    known_errorHandler = 0;
-            }
-            switch (known_errorHandler) {
-            case 1: /* strict */
+            if (error_handler == _Py_ERROR_UNKNOWN)
+                error_handler = get_error_handler(errors);
+
+            switch (error_handler) {
+            case _Py_ERROR_STRICT:
                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                 goto onError;
-            case 2: /* replace */
-                while (collstart++ < collend)
-                    *str++ = '?'; /* fall through */
-            case 3: /* ignore */
+
+            case _Py_ERROR_REPLACE:
+                memset(str, '?', collend - collstart);
+                str += (collend - collstart);
+                /* fall through ignore error handler */
+            case _Py_ERROR_IGNORE:
+                pos = collend;
+                break;
+
+            case _Py_ERROR_BACKSLASHREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= (collend - collstart);
+                str = backslashreplace(&writer, str,
+                                       unicode, collstart, collend);
+                if (str == NULL)
+                    goto onError;
+                pos = collend;
+                break;
+
+            case _Py_ERROR_XMLCHARREFREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= (collend - collstart);
+                str = xmlcharrefreplace(&writer, str,
+                                        unicode, collstart, collend);
+                if (str == NULL)
+                    goto onError;
                 pos = collend;
                 break;
-            case 4: /* xmlcharrefreplace */
-                respos = str - PyBytes_AS_STRING(res);
-                requiredsize = respos;
-                /* determine replacement size */
+
+            case _Py_ERROR_SURROGATEESCAPE:
                 for (i = collstart; i < collend; ++i) {
-                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-                    Py_ssize_t incr;
-                    if (ch < 10)
-                        incr = 2+1+1;
-                    else if (ch < 100)
-                        incr = 2+2+1;
-                    else if (ch < 1000)
-                        incr = 2+3+1;
-                    else if (ch < 10000)
-                        incr = 2+4+1;
-                    else if (ch < 100000)
-                        incr = 2+5+1;
-                    else if (ch < 1000000)
-                        incr = 2+6+1;
-                    else {
-                        assert(ch <= MAX_UNICODE);
-                        incr = 2+7+1;
+                    ch = PyUnicode_READ(kind, data, i);
+                    if (ch < 0xdc80 || 0xdcff < ch) {
+                        /* Not a UTF-8b surrogate */
+                        break;
                     }
-                    if (requiredsize > PY_SSIZE_T_MAX - incr)
-                        goto overflow;
-                    requiredsize += incr;
+                    *str++ = (char)(ch - 0xdc00);
+                    ++pos;
                 }
-                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
-                    goto overflow;
-                requiredsize += size - collend;
-                if (requiredsize > ressize) {
-                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
-                        requiredsize = 2*ressize;
-                    if (_PyBytes_Resize(&res, requiredsize))
-                        goto onError;
-                    str = PyBytes_AS_STRING(res) + respos;
-                    ressize = requiredsize;
-                }
-                /* generate replacement */
-                for (i = collstart; i < collend; ++i) {
-                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
-                }
-                pos = collend;
-                break;
+                if (i >= collend)
+                    break;
+                collstart = pos;
+                assert(collstart != collend);
+                /* fallback to general error handling */
+
             default:
-                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
-                                                              encoding, reason, unicode, &exc,
-                                                              collstart, collend, &newpos);
-                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
-                                           PyUnicode_READY(repunicode) == -1))
+                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
+                                                       encoding, reason, unicode, &exc,
+                                                       collstart, collend, &newpos);
+                if (rep == NULL)
                     goto onError;
-                if (PyBytes_Check(repunicode)) {
+
+                /* substract preallocated bytes */
+                writer.min_size -= 1;
+
+                if (PyBytes_Check(rep)) {
                     /* Directly copy bytes result to output. */
-                    repsize = PyBytes_Size(repunicode);
-                    if (repsize > 1) {
-                        /* Make room for all additional bytes. */
-                        respos = str - PyBytes_AS_STRING(res);
-                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
-                            Py_DECREF(repunicode);
-                            goto overflow;
-                        }
-                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
-                            Py_DECREF(repunicode);
-                            goto onError;
-                        }
-                        str = PyBytes_AS_STRING(res) + respos;
-                        ressize += repsize-1;
-                    }
-                    memcpy(str, PyBytes_AsString(repunicode), repsize);
-                    str += repsize;
-                    pos = newpos;
-                    Py_DECREF(repunicode);
-                    break;
-                }
-                /* need more space? (at least enough for what we
-                   have+the replacement+the rest of the string, so
-                   we won't have to check space for encodable characters) */
-                respos = str - PyBytes_AS_STRING(res);
-                repsize = PyUnicode_GET_LENGTH(repunicode);
-                requiredsize = respos;
-                if (requiredsize > PY_SSIZE_T_MAX - repsize)
-                    goto overflow;
-                requiredsize += repsize;
-                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
-                    goto overflow;
-                requiredsize += size - collend;
-                if (requiredsize > ressize) {
-                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
-                        requiredsize = 2*ressize;
-                    if (_PyBytes_Resize(&res, requiredsize)) {
-                        Py_DECREF(repunicode);
+                    str = _PyBytesWriter_WriteBytes(&writer, str,
+                                                    PyBytes_AS_STRING(rep),
+                                                    PyBytes_GET_SIZE(rep));
+                    if (str == NULL)
                         goto onError;
-                    }
-                    str = PyBytes_AS_STRING(res) + respos;
-                    ressize = requiredsize;
                 }
-                /* check if there is anything unencodable in the replacement
-                   and copy it to the output */
-                for (i = 0; repsize-->0; ++i, ++str) {
-                    c = PyUnicode_READ_CHAR(repunicode, i);
-                    if (c >= limit) {
-                        raise_encode_exception(&exc, encoding, unicode,
-                                               pos, pos+1, reason);
-                        Py_DECREF(repunicode);
+                else {
+                    assert(PyUnicode_Check(rep));
+
+                    if (PyUnicode_READY(rep) < 0)
                         goto onError;
+
+                    if (PyUnicode_IS_ASCII(rep)) {
+                        /* Fast path: all characters are smaller than limit */
+                        assert(limit >= 128);
+                        assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+                        str = _PyBytesWriter_WriteBytes(&writer, str,
+                                                        PyUnicode_DATA(rep),
+                                                        PyUnicode_GET_LENGTH(rep));
+                    }
+                    else {
+                        Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
+
+                        str = _PyBytesWriter_Prepare(&writer, str, repsize);
+                        if (str == NULL)
+                            goto onError;
+
+                        /* check if there is anything unencodable in the
+                           replacement and copy it to the output */
+                        for (i = 0; repsize-->0; ++i, ++str) {
+                            ch = PyUnicode_READ_CHAR(rep, i);
+                            if (ch >= limit) {
+                                raise_encode_exception(&exc, encoding, unicode,
+                                                       pos, pos+1, reason);
+                                goto onError;
+                            }
+                            *str = (char)ch;
+                        }
                     }
-                    *str = (char)c;
                 }
                 pos = newpos;
-                Py_DECREF(repunicode);
+                Py_CLEAR(rep);
             }
+
+            /* If overallocation was disabled, ensure that it was the last
+               write. Otherwise, we missed an optimization */
+            assert(writer.overallocate || pos == size);
         }
     }
-    /* Resize if we allocated to much */
-    size = str - PyBytes_AS_STRING(res);
-    if (size < ressize) { /* If this falls res will be NULL */
-        assert(size >= 0);
-        if (_PyBytes_Resize(&res, size) < 0)
-            goto onError;
-    }
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
-    return res;
-
-  overflow:
-    PyErr_SetString(PyExc_OverflowError,
-                    "encoded result is too long for a Python string");
+    return _PyBytesWriter_Finish(&writer, str);
 
   onError:
-    Py_XDECREF(res);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(rep);
+    _PyBytesWriter_Dealloc(&writer);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }
@@ -6670,8 +6869,9 @@ PyUnicode_DecodeASCII(const char *s,
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6700,12 +6900,42 @@ PyUnicode_DecodeASCII(const char *s,
             PyUnicode_WRITE(kind, data, writer.pos, c);
             writer.pos++;
             ++s;
+            continue;
         }
-        else {
+
+        /* byte outsize range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we may switch to UCS2 at the first write */
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            kind = writer.kind;
+            data = writer.data;
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                     "ascii", "ordinal not in range(128)",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
                     &writer))
@@ -6714,13 +6944,13 @@ PyUnicode_DecodeASCII(const char *s,
             data = writer.data;
         }
     }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
   onError:
     _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }
@@ -6775,7 +7005,7 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #  define WC_ERR_INVALID_CHARS 0x0080
 #endif
 
-static char*
+static const char*
 code_page_name(UINT code_page, PyObject **obj)
 {
     *obj = NULL;
@@ -6883,7 +7113,7 @@ decode_code_page_errors(UINT code_page,
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *encoding_obj = NULL;
-    char *encoding;
+    const char *encoding;
     DWORD err;
     int ret = -1;
 
@@ -7119,7 +7349,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
     BOOL usedDefaultChar = FALSE;
     BOOL *pusedDefaultChar = &usedDefaultChar;
     int outsize;
-    PyObject *exc = NULL;
     wchar_t *p;
     Py_ssize_t size;
     const DWORD flags = encode_code_page_flags(code_page, NULL);
@@ -7228,7 +7457,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *encoding_obj = NULL;
-    char *encoding;
+    const char *encoding;
     Py_ssize_t newpos, newoutsize;
     PyObject *rep;
     int ret = -1;
@@ -8086,7 +8315,7 @@ static int
 charmap_encoding_error(
     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
     PyObject **exceptionObject,
-    int *known_errorHandler, PyObject **errorHandler, const char *errors,
+    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
     PyObject **res, Py_ssize_t *respos)
 {
     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
@@ -8133,23 +8362,15 @@ charmap_encoding_error(
     }
     /* cache callback name lookup
      * (if not done yet, i.e. it's the first error) */
-    if (*known_errorHandler==-1) {
-        if ((errors==NULL) || (!strcmp(errors, "strict")))
-            *known_errorHandler = 1;
-        else if (!strcmp(errors, "replace"))
-            *known_errorHandler = 2;
-        else if (!strcmp(errors, "ignore"))
-            *known_errorHandler = 3;
-        else if (!strcmp(errors, "xmlcharrefreplace"))
-            *known_errorHandler = 4;
-        else
-            *known_errorHandler = 0;
-    }
-    switch (*known_errorHandler) {
-    case 1: /* strict */
+    if (*error_handler == _Py_ERROR_UNKNOWN)
+        *error_handler = get_error_handler(errors);
+
+    switch (*error_handler) {
+    case _Py_ERROR_STRICT:
         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
         return -1;
-    case 2: /* replace */
+
+    case _Py_ERROR_REPLACE:
         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
             x = charmapencode_output('?', mapping, res, respos);
             if (x==enc_EXCEPTION) {
@@ -8161,10 +8382,11 @@ charmap_encoding_error(
             }
         }
         /* fall through */
-    case 3: /* ignore */
+    case _Py_ERROR_IGNORE:
         *inpos = collendpos;
         break;
-    case 4: /* xmlcharrefreplace */
+
+    case _Py_ERROR_XMLCHARREFREPLACE:
         /* generate replacement (temporarily (mis)uses p) */
         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
             char buffer[2+29+1+1];
@@ -8182,8 +8404,9 @@ charmap_encoding_error(
         }
         *inpos = collendpos;
         break;
+
     default:
-        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
+        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                       encoding, reason, unicode, exceptionObject,
                                                       collstartpos, collendpos, &newpos);
         if (repunicode == NULL)
@@ -8246,12 +8469,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
     Py_ssize_t size;
     /* current output position */
     Py_ssize_t respos = 0;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
-     * 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
     void *data;
     int kind;
 
@@ -8282,7 +8502,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
         if (x==enc_FAILED) { /* unencodable character */
             if (charmap_encoding_error(unicode, &inpos, mapping,
                                        &exc,
-                                       &known_errorHandler, &errorHandler, errors,
+                                       &error_handler, &error_handler_obj, errors,
                                        &res, &respos)) {
                 goto onError;
             }
@@ -8298,13 +8518,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
             goto onError;
 
     Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     return res;
 
   onError:
     Py_XDECREF(res);
     Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     return NULL;
 }
 
@@ -8371,7 +8591,7 @@ unicode_translate_call_errorhandler(const char *errors,
                                     Py_ssize_t startpos, Py_ssize_t endpos,
                                     Py_ssize_t *newpos)
 {
-    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
+    static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
 
     Py_ssize_t i_newpos;
     PyObject *restuple;
@@ -8628,7 +8848,7 @@ exit:
     return res;
 }
 
-PyObject *
+static PyObject *
 _PyUnicode_TranslateCharmap(PyObject *input,
                             PyObject *mapping,
                             const char *errors)
@@ -8657,10 +8877,8 @@ _PyUnicode_TranslateCharmap(PyObject *input,
     kind = PyUnicode_KIND(input);
     size = PyUnicode_GET_LENGTH(input);
 
-    if (size == 0) {
-        Py_INCREF(input);
-        return input;
-    }
+    if (size == 0)
+        return PyUnicode_FromObject(input);
 
     /* allocate enough for a simple 1:1 translation without
        replacements, if we need more, we'll resize */
@@ -8771,14 +8989,9 @@ PyUnicode_Translate(PyObject *str,
                     PyObject *mapping,
                     const char *errors)
 {
-    PyObject *result;
-
-    str = PyUnicode_FromObject(str);
-    if (str == NULL)
+    if (ensure_unicode(str) < 0)
         return NULL;
-    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
-    Py_DECREF(str);
-    return result;
+    return _PyUnicode_TranslateCharmap(str, mapping, errors);
 }
 
 static Py_UCS4
@@ -8960,9 +9173,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
     }
 
 static Py_ssize_t
-any_find_slice(int direction, PyObject* s1, PyObject* s2,
+any_find_slice(PyObject* s1, PyObject* s2,
                Py_ssize_t start,
-               Py_ssize_t end)
+               Py_ssize_t end,
+               int direction)
 {
     int kind1, kind2;
     void *buf1, *buf2;
@@ -9131,54 +9345,35 @@ PyUnicode_Count(PyObject *str,
                 Py_ssize_t end)
 {
     Py_ssize_t result;
-    PyObject* str_obj;
-    PyObject* sub_obj;
     int kind1, kind2;
     void *buf1 = NULL, *buf2 = NULL;
     Py_ssize_t len1, len2;
 
-    str_obj = PyUnicode_FromObject(str);
-    if (!str_obj)
-        return -1;
-    sub_obj = PyUnicode_FromObject(substr);
-    if (!sub_obj) {
-        Py_DECREF(str_obj);
-        return -1;
-    }
-    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
-        Py_DECREF(sub_obj);
-        Py_DECREF(str_obj);
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
         return -1;
-    }
 
-    kind1 = PyUnicode_KIND(str_obj);
-    kind2 = PyUnicode_KIND(sub_obj);
-    if (kind1 < kind2) {
-        Py_DECREF(sub_obj);
-        Py_DECREF(str_obj);
+    kind1 = PyUnicode_KIND(str);
+    kind2 = PyUnicode_KIND(substr);
+    if (kind1 < kind2)
         return 0;
-    }
 
-    len1 = PyUnicode_GET_LENGTH(str_obj);
-    len2 = PyUnicode_GET_LENGTH(sub_obj);
+    len1 = PyUnicode_GET_LENGTH(str);
+    len2 = PyUnicode_GET_LENGTH(substr);
     ADJUST_INDICES(start, end, len1);
-    if (end - start < len2) {
-        Py_DECREF(sub_obj);
-        Py_DECREF(str_obj);
+    if (end - start < len2)
         return 0;
-    }
 
-    buf1 = PyUnicode_DATA(str_obj);
-    buf2 = PyUnicode_DATA(sub_obj);
+    buf1 = PyUnicode_DATA(str);
+    buf2 = PyUnicode_DATA(substr);
     if (kind2 != kind1) {
-        buf2 = _PyUnicode_AsKind(sub_obj, kind1);
+        buf2 = _PyUnicode_AsKind(substr, kind1);
         if (!buf2)
             goto onError;
     }
 
     switch (kind1) {
     case PyUnicode_1BYTE_KIND:
-        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
+        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
             result = asciilib_count(
                 ((Py_UCS1*)buf1) + start, end - start,
                 buf2, len2, PY_SSIZE_T_MAX
@@ -9205,16 +9400,11 @@ PyUnicode_Count(PyObject *str,
         assert(0); result = 0;
     }
 
-    Py_DECREF(sub_obj);
-    Py_DECREF(str_obj);
-
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
     return result;
   onError:
-    Py_DECREF(sub_obj);
-    Py_DECREF(str_obj);
     if (kind2 != kind1 && buf2)
         PyMem_Free(buf2);
     return -1;
@@ -9222,35 +9412,15 @@ PyUnicode_Count(PyObject *str,
 
 Py_ssize_t
 PyUnicode_Find(PyObject *str,
-               PyObject *sub,
+               PyObject *substr,
                Py_ssize_t start,
                Py_ssize_t end,
                int direction)
 {
-    Py_ssize_t result;
-
-    str = PyUnicode_FromObject(str);
-    if (!str)
-        return -2;
-    sub = PyUnicode_FromObject(sub);
-    if (!sub) {
-        Py_DECREF(str);
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
         return -2;
-    }
-    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
-        Py_DECREF(sub);
-        Py_DECREF(str);
-        return -2;
-    }
-
-    result = any_find_slice(direction,
-        str, sub, start, end
-        );
-
-    Py_DECREF(str);
-    Py_DECREF(sub);
 
-    return result;
+    return any_find_slice(str, substr, start, end, direction);
 }
 
 Py_ssize_t
@@ -9353,22 +9523,10 @@ PyUnicode_Tailmatch(PyObject *str,
                     Py_ssize_t end,
                     int direction)
 {
-    Py_ssize_t result;
-
-    str = PyUnicode_FromObject(str);
-    if (str == NULL)
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
         return -1;
-    substr = PyUnicode_FromObject(substr);
-    if (substr == NULL) {
-        Py_DECREF(str);
-        return -1;
-    }
 
-    result = tailmatch(str, substr,
-                       start, end, direction);
-    Py_DECREF(str);
-    Py_DECREF(substr);
-    return result;
+    return tailmatch(str, substr, start, end, direction);
 }
 
 /* Apply fixfct filter to the Unicode object self and return a
@@ -9974,13 +10132,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
 {
     PyObject *list;
 
-    string = PyUnicode_FromObject(string);
-    if (string == NULL)
-        return NULL;
-    if (PyUnicode_READY(string) == -1) {
-        Py_DECREF(string);
+    if (ensure_unicode(string) < 0)
         return NULL;
-    }
 
     switch (PyUnicode_KIND(string)) {
     case PyUnicode_1BYTE_KIND:
@@ -10007,7 +10160,6 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
         assert(0);
         list = 0;
     }
-    Py_DECREF(string);
     return list;
 }
 
@@ -10568,28 +10720,27 @@ unicode_casefold(PyObject *self)
 }
 
 
-/* Argument converter.  Coerces to a single unicode character */
+/* Argument converter. Accepts a single Unicode character. */
 
 static int
 convert_uc(PyObject *obj, void *addr)
 {
     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
-    PyObject *uniobj;
 
-    uniobj = PyUnicode_FromObject(obj);
-    if (uniobj == NULL) {
-        PyErr_SetString(PyExc_TypeError,
-                        "The fill character cannot be converted to Unicode");
+    if (!PyUnicode_Check(obj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "The fill character must be a unicode character, "
+                     "not %.100s", Py_TYPE(obj)->tp_name);
         return 0;
     }
-    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
+    if (PyUnicode_READY(obj) < 0)
+        return 0;
+    if (PyUnicode_GET_LENGTH(obj) != 1) {
         PyErr_SetString(PyExc_TypeError,
                         "The fill character must be exactly one character long");
-        Py_DECREF(uniobj);
         return 0;
     }
-    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
-    Py_DECREF(uniobj);
+    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
     return 1;
 }
 
@@ -10905,59 +11056,49 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
 }
 
 int
-PyUnicode_Contains(PyObject *container, PyObject *element)
+_PyUnicode_EQ(PyObject *aa, PyObject *bb)
+{
+    return unicode_eq(aa, bb);
+}
+
+int
+PyUnicode_Contains(PyObject *str, PyObject *substr)
 {
-    PyObject *str, *sub;
     int kind1, kind2;
     void *buf1, *buf2;
     Py_ssize_t len1, len2;
     int result;
 
-    /* Coerce the two arguments */
-    sub = PyUnicode_FromObject(element);
-    if (!sub) {
+    if (!PyUnicode_Check(substr)) {
         PyErr_Format(PyExc_TypeError,
-                     "'in <string>' requires string as left operand, not %s",
-                     element->ob_type->tp_name);
+                     "'in <string>' requires string as left operand, not %.100s",
+                     Py_TYPE(substr)->tp_name);
         return -1;
     }
-
-    str = PyUnicode_FromObject(container);
-    if (!str) {
-        Py_DECREF(sub);
+    if (PyUnicode_READY(substr) == -1)
+        return -1;
+    if (ensure_unicode(str) < 0)
         return -1;
-    }
 
     kind1 = PyUnicode_KIND(str);
-    kind2 = PyUnicode_KIND(sub);
-    if (kind1 < kind2) {
-        Py_DECREF(sub);
-        Py_DECREF(str);
+    kind2 = PyUnicode_KIND(substr);
+    if (kind1 < kind2)
         return 0;
-    }
     len1 = PyUnicode_GET_LENGTH(str);
-    len2 = PyUnicode_GET_LENGTH(sub);
-    if (len1 < len2) {
-        Py_DECREF(sub);
-        Py_DECREF(str);
+    len2 = PyUnicode_GET_LENGTH(substr);
+    if (len1 < len2)
         return 0;
-    }
     buf1 = PyUnicode_DATA(str);
-    buf2 = PyUnicode_DATA(sub);
+    buf2 = PyUnicode_DATA(substr);
     if (len2 == 1) {
         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
-        Py_DECREF(sub);
-        Py_DECREF(str);
         return result;
     }
     if (kind2 != kind1) {
-        buf2 = _PyUnicode_AsKind(sub, kind1);
-        if (!buf2) {
-            Py_DECREF(sub);
-            Py_DECREF(str);
+        buf2 = _PyUnicode_AsKind(substr, kind1);
+        if (!buf2)
             return -1;
-        }
     }
 
     switch (kind1) {
@@ -10975,9 +11116,6 @@ PyUnicode_Contains(PyObject *container, PyObject *element)
         assert(0);
     }
 
-    Py_DECREF(str);
-    Py_DECREF(sub);
-
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
@@ -10989,56 +11127,40 @@ PyUnicode_Contains(PyObject *container, PyObject *element)
 PyObject *
 PyUnicode_Concat(PyObject *left, PyObject *right)
 {
-    PyObject *u = NULL, *v = NULL, *w;
+    PyObject *result;
     Py_UCS4 maxchar, maxchar2;
-    Py_ssize_t u_len, v_len, new_len;
+    Py_ssize_t left_len, right_len, new_len;
 
-    /* Coerce the two arguments */
-    u = PyUnicode_FromObject(left);
-    if (u == NULL)
-        goto onError;
-    v = PyUnicode_FromObject(right);
-    if (v == NULL)
-        goto onError;
+    if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
+        return NULL;
 
     /* Shortcuts */
-    if (v == unicode_empty) {
-        Py_DECREF(v);
-        return u;
-    }
-    if (u == unicode_empty) {
-        Py_DECREF(u);
-        return v;
-    }
+    if (left == unicode_empty)
+        return PyUnicode_FromObject(right);
+    if (right == unicode_empty)
+        return PyUnicode_FromObject(left);
 
-    u_len = PyUnicode_GET_LENGTH(u);
-    v_len = PyUnicode_GET_LENGTH(v);
-    if (u_len > PY_SSIZE_T_MAX - v_len) {
+    left_len = PyUnicode_GET_LENGTH(left);
+    right_len = PyUnicode_GET_LENGTH(right);
+    if (left_len > PY_SSIZE_T_MAX - right_len) {
         PyErr_SetString(PyExc_OverflowError,
                         "strings are too large to concat");
-        goto onError;
+        return NULL;
     }
-    new_len = u_len + v_len;
+    new_len = left_len + right_len;
 
-    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
-    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
+    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
+    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
     maxchar = Py_MAX(maxchar, maxchar2);
 
     /* Concat the two Unicode strings */
-    w = PyUnicode_New(new_len, maxchar);
-    if (w == NULL)
-        goto onError;
-    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
-    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
-    Py_DECREF(u);
-    Py_DECREF(v);
-    assert(_PyUnicode_CheckConsistency(w, 1));
-    return w;
-
-  onError:
-    Py_XDECREF(u);
-    Py_XDECREF(v);
-    return NULL;
+    result = PyUnicode_New(new_len, maxchar);
+    if (result == NULL)
+        return NULL;
+    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
+    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
+    assert(_PyUnicode_CheckConsistency(result, 1));
+    return result;
 }
 
 void
@@ -11129,6 +11251,25 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
     Py_XDECREF(right);
 }
 
+/*
+Wraps stringlib_parse_args_finds() and additionally ensures that the
+first argument is a unicode object.
+*/
+
+Py_LOCAL_INLINE(int)
+parse_args_finds_unicode(const char * function_name, PyObject *args,
+                         PyObject **substring,
+                         Py_ssize_t *start, Py_ssize_t *end)
+{
+    if(stringlib_parse_args_finds(function_name, args, substring,
+                                  start, end)) {
+        if (ensure_unicode(*substring) < 0)
+            return 0;
+        return 1;
+    }
+    return 0;
+}
+
 PyDoc_STRVAR(count__doc__,
              "S.count(sub[, start[, end]]) -> int\n\
 \n\
@@ -11147,31 +11288,26 @@ unicode_count(PyObject *self, PyObject *args)
     void *buf1, *buf2;
     Py_ssize_t len1, len2, iresult;
 
-    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
         return NULL;
 
     kind1 = PyUnicode_KIND(self);
     kind2 = PyUnicode_KIND(substring);
-    if (kind1 < kind2) {
-        Py_DECREF(substring);
+    if (kind1 < kind2)
         return PyLong_FromLong(0);
-    }
+
     len1 = PyUnicode_GET_LENGTH(self);
     len2 = PyUnicode_GET_LENGTH(substring);
     ADJUST_INDICES(start, end, len1);
-    if (end - start < len2) {
-        Py_DECREF(substring);
+    if (end - start < len2)
         return PyLong_FromLong(0);
-    }
+
     buf1 = PyUnicode_DATA(self);
     buf2 = PyUnicode_DATA(substring);
     if (kind2 != kind1) {
         buf2 = _PyUnicode_AsKind(substring, kind1);
-        if (!buf2) {
-            Py_DECREF(substring);
+        if (!buf2)
             return NULL;
-        }
     }
     switch (kind1) {
     case PyUnicode_1BYTE_KIND:
@@ -11201,8 +11337,6 @@ unicode_count(PyObject *self, PyObject *args)
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
-    Py_DECREF(substring);
-
     return result;
 }
 
@@ -11336,22 +11470,13 @@ unicode_find(PyObject *self, PyObject *args)
     Py_ssize_t end = 0;
     Py_ssize_t result;
 
-    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
-
-    result = any_find_slice(1, self, substring, start, end);
 
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, 1);
 
     if (result == -2)
         return NULL;
@@ -11424,22 +11549,13 @@ unicode_index(PyObject *self, PyObject *args)
     Py_ssize_t start = 0;
     Py_ssize_t end = 0;
 
-    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
 
-    result = any_find_slice(1, self, substring, start, end);
-
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, 1);
 
     if (result == -2)
         return NULL;
@@ -11953,7 +12069,7 @@ unicode_lower(PyObject *self)
 #define BOTHSTRIP 2
 
 /* Arrays indexed by above */
-static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
+static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
 
 #define STRIPNAME(i) (stripformat[i]+3)
 
@@ -12248,40 +12364,15 @@ unicode_repeat(PyObject *str, Py_ssize_t len)
 }
 
 PyObject *
-PyUnicode_Replace(PyObject *obj,
-                  PyObject *subobj,
-                  PyObject *replobj,
+PyUnicode_Replace(PyObject *str,
+                  PyObject *substr,
+                  PyObject *replstr,
                   Py_ssize_t maxcount)
 {
-    PyObject *self;
-    PyObject *str1;
-    PyObject *str2;
-    PyObject *result;
-
-    self = PyUnicode_FromObject(obj);
-    if (self == NULL)
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
+            ensure_unicode(replstr) < 0)
         return NULL;
-    str1 = PyUnicode_FromObject(subobj);
-    if (str1 == NULL) {
-        Py_DECREF(self);
-        return NULL;
-    }
-    str2 = PyUnicode_FromObject(replobj);
-    if (str2 == NULL) {
-        Py_DECREF(self);
-        Py_DECREF(str1);
-        return NULL;
-    }
-    if (PyUnicode_READY(self) == -1 ||
-        PyUnicode_READY(str1) == -1 ||
-        PyUnicode_READY(str2) == -1)
-        result = NULL;
-    else
-        result = replace(self, str1, str2, maxcount);
-    Py_DECREF(self);
-    Py_DECREF(str1);
-    Py_DECREF(str2);
-    return result;
+    return replace(str, substr, replstr, maxcount);
 }
 
 PyDoc_STRVAR(replace__doc__,
@@ -12297,28 +12388,12 @@ unicode_replace(PyObject *self, PyObject *args)
     PyObject *str1;
     PyObject *str2;
     Py_ssize_t maxcount = -1;
-    PyObject *result;
 
-    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
+    if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
         return NULL;
     if (PyUnicode_READY(self) == -1)
         return NULL;
-    str1 = PyUnicode_FromObject(str1);
-    if (str1 == NULL)
-        return NULL;
-    str2 = PyUnicode_FromObject(str2);
-    if (str2 == NULL) {
-        Py_DECREF(str1);
-        return NULL;
-    }
-    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
-        result = NULL;
-    else
-        result = replace(self, str1, str2, maxcount);
-
-    Py_DECREF(str1);
-    Py_DECREF(str2);
-    return result;
+    return replace(self, str1, str2, maxcount);
 }
 
 static PyObject *
@@ -12503,22 +12578,13 @@ unicode_rfind(PyObject *self, PyObject *args)
     Py_ssize_t end = 0;
     Py_ssize_t result;
 
-    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
-
-    result = any_find_slice(-1, self, substring, start, end);
 
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, -1);
 
     if (result == -2)
         return NULL;
@@ -12540,22 +12606,13 @@ unicode_rindex(PyObject *self, PyObject *args)
     Py_ssize_t end = 0;
     Py_ssize_t result;
 
-    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
-
-    result = any_find_slice(-1, self, substring, start, end);
 
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, -1);
 
     if (result == -2)
         return NULL;
@@ -12595,24 +12652,10 @@ unicode_rjust(PyObject *self, PyObject *args)
 PyObject *
 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
 {
-    PyObject *result;
-
-    s = PyUnicode_FromObject(s);
-    if (s == NULL)
+    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
         return NULL;
-    if (sep != NULL) {
-        sep = PyUnicode_FromObject(sep);
-        if (sep == NULL) {
-            Py_DECREF(s);
-            return NULL;
-        }
-    }
 
-    result = split(s, sep, maxsplit);
-
-    Py_DECREF(s);
-    Py_XDECREF(sep);
-    return result;
+    return split(s, sep, maxsplit);
 }
 
 PyDoc_STRVAR(split__doc__,
@@ -12637,35 +12680,26 @@ unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
 
     if (substring == Py_None)
         return split(self, NULL, maxcount);
-    else if (PyUnicode_Check(substring))
+
+    if (PyUnicode_Check(substring))
         return split(self, substring, maxcount);
-    else
-        return PyUnicode_Split(self, substring, maxcount);
+
+    PyErr_Format(PyExc_TypeError,
+                 "must be str or None, not %.100s",
+                 Py_TYPE(substring)->tp_name);
+    return NULL;
 }
 
 PyObject *
-PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
+PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
 {
-    PyObject* str_obj;
-    PyObject* sep_obj;
     PyObject* out;
     int kind1, kind2;
     void *buf1, *buf2;
     Py_ssize_t len1, len2;
 
-    str_obj = PyUnicode_FromObject(str_in);
-    if (!str_obj)
-        return NULL;
-    sep_obj = PyUnicode_FromObject(sep_in);
-    if (!sep_obj) {
-        Py_DECREF(str_obj);
-        return NULL;
-    }
-    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
-        Py_DECREF(sep_obj);
-        Py_DECREF(str_obj);
+    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
         return NULL;
-    }
 
     kind1 = PyUnicode_KIND(str_obj);
     kind2 = PyUnicode_KIND(sep_obj);
@@ -12679,8 +12713,6 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
             Py_DECREF(unicode_empty);
         }
-        Py_DECREF(sep_obj);
-        Py_DECREF(str_obj);
         return out;
     }
     buf1 = PyUnicode_DATA(str_obj);
@@ -12688,7 +12720,7 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
     if (kind2 != kind1) {
         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
         if (!buf2)
-            goto onError;
+            return NULL;
     }
 
     switch (kind1) {
@@ -12709,39 +12741,23 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
         out = 0;
     }
 
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
     return out;
-  onError:
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
-    if (kind2 != kind1 && buf2)
-        PyMem_Free(buf2);
-    return NULL;
 }
 
 
 PyObject *
-PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
+PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
 {
-    PyObject* str_obj;
-    PyObject* sep_obj;
     PyObject* out;
     int kind1, kind2;
     void *buf1, *buf2;
     Py_ssize_t len1, len2;
 
-    str_obj = PyUnicode_FromObject(str_in);
-    if (!str_obj)
+    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
         return NULL;
-    sep_obj = PyUnicode_FromObject(sep_in);
-    if (!sep_obj) {
-        Py_DECREF(str_obj);
-        return NULL;
-    }
 
     kind1 = PyUnicode_KIND(str_obj);
     kind2 = PyUnicode_KIND(sep_obj);
@@ -12755,8 +12771,6 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
             Py_DECREF(unicode_empty);
         }
-        Py_DECREF(sep_obj);
-        Py_DECREF(str_obj);
         return out;
     }
     buf1 = PyUnicode_DATA(str_obj);
@@ -12764,7 +12778,7 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
     if (kind2 != kind1) {
         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
         if (!buf2)
-            goto onError;
+            return NULL;
     }
 
     switch (kind1) {
@@ -12785,18 +12799,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
         out = 0;
     }
 
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
     return out;
-  onError:
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
-    if (kind2 != kind1 && buf2)
-        PyMem_Free(buf2);
-    return NULL;
 }
 
 PyDoc_STRVAR(partition__doc__,
@@ -12828,24 +12834,10 @@ unicode_rpartition(PyObject *self, PyObject *separator)
 PyObject *
 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
 {
-    PyObject *result;
-
-    s = PyUnicode_FromObject(s);
-    if (s == NULL)
+    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
         return NULL;
-    if (sep != NULL) {
-        sep = PyUnicode_FromObject(sep);
-        if (sep == NULL) {
-            Py_DECREF(s);
-            return NULL;
-        }
-    }
 
-    result = rsplit(s, sep, maxsplit);
-
-    Py_DECREF(s);
-    Py_XDECREF(sep);
-    return result;
+    return rsplit(s, sep, maxsplit);
 }
 
 PyDoc_STRVAR(rsplit__doc__,
@@ -12870,10 +12862,14 @@ unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
 
     if (substring == Py_None)
         return rsplit(self, NULL, maxcount);
-    else if (PyUnicode_Check(substring))
+
+    if (PyUnicode_Check(substring))
         return rsplit(self, substring, maxcount);
-    else
-        return PyUnicode_RSplit(self, substring, maxcount);
+
+    PyErr_Format(PyExc_TypeError,
+                 "must be str or None, not %.100s",
+                 Py_TYPE(substring)->tp_name);
+    return NULL;
 }
 
 PyDoc_STRVAR(splitlines__doc__,
@@ -13154,11 +13150,15 @@ unicode_startswith(PyObject *self,
     if (PyTuple_Check(subobj)) {
         Py_ssize_t i;
         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
-            if (substring == NULL)
+            substring = PyTuple_GET_ITEM(subobj, i);
+            if (!PyUnicode_Check(substring)) {
+                PyErr_Format(PyExc_TypeError,
+                             "tuple for startswith must only contain str, "
+                             "not %.100s",
+                             Py_TYPE(substring)->tp_name);
                 return NULL;
+            }
             result = tailmatch(self, substring, start, end, -1);
-            Py_DECREF(substring);
             if (result == -1)
                 return NULL;
             if (result) {
@@ -13168,15 +13168,13 @@ unicode_startswith(PyObject *self,
         /* nothing matched */
         Py_RETURN_FALSE;
     }
-    substring = PyUnicode_FromObject(subobj);
-    if (substring == NULL) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
-                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
+    if (!PyUnicode_Check(subobj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "startswith first arg must be str or "
+                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
         return NULL;
     }
-    result = tailmatch(self, substring, start, end, -1);
-    Py_DECREF(substring);
+    result = tailmatch(self, subobj, start, end, -1);
     if (result == -1)
         return NULL;
     return PyBool_FromLong(result);
@@ -13206,12 +13204,15 @@ unicode_endswith(PyObject *self,
     if (PyTuple_Check(subobj)) {
         Py_ssize_t i;
         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            substring = PyUnicode_FromObject(
-                PyTuple_GET_ITEM(subobj, i));
-            if (substring == NULL)
+            substring = PyTuple_GET_ITEM(subobj, i);
+            if (!PyUnicode_Check(substring)) {
+                PyErr_Format(PyExc_TypeError,
+                             "tuple for endswith must only contain str, "
+                             "not %.100s",
+                             Py_TYPE(substring)->tp_name);
                 return NULL;
+            }
             result = tailmatch(self, substring, start, end, +1);
-            Py_DECREF(substring);
             if (result == -1)
                 return NULL;
             if (result) {
@@ -13220,15 +13221,13 @@ unicode_endswith(PyObject *self,
         }
         Py_RETURN_FALSE;
     }
-    substring = PyUnicode_FromObject(subobj);
-    if (substring == NULL) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
-                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
+    if (!PyUnicode_Check(subobj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "endswith first arg must be str or "
+                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
         return NULL;
     }
-    result = tailmatch(self, substring, start, end, +1);
-    Py_DECREF(substring);
+    result = tailmatch(self, subobj, start, end, +1);
     if (result == -1)
         return NULL;
     return PyBool_FromLong(result);
@@ -13237,44 +13236,50 @@ unicode_endswith(PyObject *self,
 Py_LOCAL_INLINE(void)
 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
 {
-    if (!writer->readonly)
+    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+    writer->data = PyUnicode_DATA(writer->buffer);
+
+    if (!writer->readonly) {
+        writer->kind = PyUnicode_KIND(writer->buffer);
         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+    }
     else {
+        /* use a value smaller than PyUnicode_1BYTE_KIND() so
+           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+        writer->kind = PyUnicode_WCHAR_KIND;
+        assert(writer->kind <= PyUnicode_1BYTE_KIND);
+
         /* Copy-on-write mode: set buffer size to 0 so
          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
          * next write. */
         writer->size = 0;
     }
-    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
-    writer->data = PyUnicode_DATA(writer->buffer);
-    writer->kind = PyUnicode_KIND(writer->buffer);
 }
 
 void
 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
 {
     memset(writer, 0, sizeof(*writer));
-#ifdef Py_DEBUG
-    writer->kind = 5;    /* invalid kind */
-#endif
+
+    /* ASCII is the bare minimum */
     writer->min_char = 127;
+
+    /* use a value smaller than PyUnicode_1BYTE_KIND() so
+       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+    writer->kind = PyUnicode_WCHAR_KIND;
+    assert(writer->kind <= PyUnicode_1BYTE_KIND);
 }
 
 int
 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                  Py_ssize_t length, Py_UCS4 maxchar)
 {
-#ifdef MS_WINDOWS
-   /* On Windows, overallocate by 50% is the best factor */
-#  define OVERALLOCATE_FACTOR 2
-#else
-   /* On Linux, overallocate by 25% is the best factor */
-#  define OVERALLOCATE_FACTOR 4
-#endif
     Py_ssize_t newlen;
     PyObject *newbuffer;
 
-    assert(length > 0);
+    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
+    assert((maxchar > writer->maxchar && length >= 0)
+           || length > 0);
 
     if (length > PY_SSIZE_T_MAX - writer->pos) {
         PyErr_NoMemory();
@@ -13340,6 +13345,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
 #undef OVERALLOCATE_FACTOR
 }
 
+int
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+                                     enum PyUnicode_Kind kind)
+{
+    Py_UCS4 maxchar;
+
+    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
+    assert(writer->kind < kind);
+
+    switch (kind)
+    {
+    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
+    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
+    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+    default:
+        assert(0 && "invalid kind");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
+}
+
 Py_LOCAL_INLINE(int)
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
 {
@@ -13510,17 +13537,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
         return str;
     }
-    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
-        PyObject *newbuffer;
-        newbuffer = resize_compact(writer->buffer, writer->pos);
-        if (newbuffer == NULL) {
-            Py_CLEAR(writer->buffer);
-            return NULL;
+    if (writer->pos == 0) {
+        Py_CLEAR(writer->buffer);
+
+        /* Get the empty Unicode string singleton ('') */
+        _Py_INCREF_UNICODE_EMPTY();
+        str  = unicode_empty;
+    }
+    else {
+        str = writer->buffer;
+        writer->buffer = NULL;
+
+        if (PyUnicode_GET_LENGTH(str) != writer->pos) {
+            PyObject *str2;
+            str2 = resize_compact(str, writer->pos);
+            if (str2 == NULL)
+                return NULL;
+            str = str2;
         }
-        writer->buffer = newbuffer;
     }
-    str = writer->buffer;
-    writer->buffer = NULL;
+
     assert(_PyUnicode_CheckConsistency(str, 1));
     return unicode_result_ready(str);
 }
@@ -14661,13 +14697,10 @@ PyUnicode_Format(PyObject *format, PyObject *args)
         return NULL;
     }
 
-    ctx.fmtstr = PyUnicode_FromObject(format);
-    if (ctx.fmtstr == NULL)
+    if (ensure_unicode(format) < 0)
         return NULL;
-    if (PyUnicode_READY(ctx.fmtstr) == -1) {
-        Py_DECREF(ctx.fmtstr);
-        return NULL;
-    }
+
+    ctx.fmtstr = format;
     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
@@ -14727,11 +14760,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)
     if (ctx.args_owned) {
         Py_DECREF(ctx.args);
     }
-    Py_DECREF(ctx.fmtstr);
     return _PyUnicodeWriter_Finish(&ctx.writer);
 
   onError:
-    Py_DECREF(ctx.fmtstr);
     _PyUnicodeWriter_Dealloc(&ctx.writer);
     if (ctx.args_owned) {
         Py_DECREF(ctx.args);
@@ -15009,26 +15040,18 @@ PyUnicode_InternInPlace(PyObject **p)
             return;
         }
     }
-    /* It might be that the GetItem call fails even
-       though the key is present in the dictionary,
-       namely when this happens during a stack overflow. */
     Py_ALLOW_RECURSION
-    t = PyDict_GetItem(interned, s);
+    t = PyDict_SetDefault(interned, s, s);
     Py_END_ALLOW_RECURSION
-
-    if (t) {
-        Py_INCREF(t);
-        Py_SETREF(*p, t);
+    if (t == NULL) {
+        PyErr_Clear();
         return;
     }
-
-    PyThreadState_GET()->recursion_critical = 1;
-    if (PyDict_SetItem(interned, s, s) < 0) {
-        PyErr_Clear();
-        PyThreadState_GET()->recursion_critical = 0;
+    if (t != s) {
+        Py_INCREF(t);
+        Py_SETREF(*p, t);
         return;
     }
-    PyThreadState_GET()->recursion_critical = 0;
     /* The two references in interned are not counted by refcnt.
        The deallocator will take care of this */
     Py_REFCNT(s) -= 2;
diff --git a/Objects/weakrefobject.c b/Objects/weakrefobject.c
index 7e6f364..f75b1e8 100644
--- a/Objects/weakrefobject.c
+++ b/Objects/weakrefobject.c
@@ -265,7 +265,7 @@ insert_head(PyWeakReference *newref, PyWeakReference **list)
 }
 
 static int
-parse_weakref_init_args(char *funcname, PyObject *args, PyObject *kwargs,
+parse_weakref_init_args(const char *funcname, PyObject *args, PyObject *kwargs,
                         PyObject **obp, PyObject **callbackp)
 {
     return PyArg_UnpackTuple(args, funcname, 1, 2, obp, callbackp);