47 files changed, 5832 insertions, 5109 deletions
diff --git a/Objects/abstract.c b/Objects/abstract.c
index 88205bd..d876dc5 100644
--- a/Objects/abstract.c
+++ b/Objects/abstract.c
@@ -32,8 +32,10 @@ PyObject_Type(PyObject *o)
 {
     PyObject *v;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
+
     v = (PyObject *)o->ob_type;
     Py_INCREF(v);
     return v;
@@ -137,12 +139,16 @@ PyObject_GetItem(PyObject *o, PyObject *key)
 {
     PyMappingMethods *m;
 
-    if (o == NULL || key == NULL)
+    if (o == NULL || key == NULL) {
         return null_error();
+    }
 
     m = o->ob_type->tp_as_mapping;
-    if (m && m->mp_subscript)
-        return m->mp_subscript(o, key);
+    if (m && m->mp_subscript) {
+        PyObject *item = m->mp_subscript(o, key);
+        assert((item != NULL) ^ (PyErr_Occurred() != NULL));
+        return item;
+    }
 
     if (o->ob_type->tp_as_sequence) {
         if (PyIndex_Check(key)) {
@@ -669,13 +675,31 @@ PyObject_Format(PyObject *obj, PyObject *format_spec)
     PyObject *result = NULL;
     _Py_IDENTIFIER(__format__);
 
+    if (format_spec != NULL && !PyUnicode_Check(format_spec)) {
+        PyErr_Format(PyExc_SystemError,
+                     "Format specifier must be a string, not %.200s",
+                     Py_TYPE(format_spec)->tp_name);
+        return NULL;
+    }
+
+    /* Fast path for common types. */
+    if (format_spec == NULL || PyUnicode_GET_LENGTH(format_spec) == 0) {
+        if (PyUnicode_CheckExact(obj)) {
+            Py_INCREF(obj);
+            return obj;
+        }
+        if (PyLong_CheckExact(obj)) {
+            return PyObject_Str(obj);
+        }
+    }
+
     /* If no format_spec is provided, use an empty string */
     if (format_spec == NULL) {
         empty = PyUnicode_New(0, 0);
         format_spec = empty;
     }
 
-    /* Find the (unbound!) __format__ method (a borrowed reference) */
+    /* Find the (unbound!) __format__ method */
     meth = _PyObject_LookupSpecial(obj, &PyId___format__);
     if (meth == NULL) {
         if (!PyErr_Occurred())
@@ -1122,8 +1146,10 @@ PyNumber_Negative(PyObject *o)
 {
     PyNumberMethods *m;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
+
     m = o->ob_type->tp_as_number;
     if (m && m->nb_negative)
         return (*m->nb_negative)(o);
@@ -1136,8 +1162,10 @@ PyNumber_Positive(PyObject *o)
 {
     PyNumberMethods *m;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
+
     m = o->ob_type->tp_as_number;
     if (m && m->nb_positive)
         return (*m->nb_positive)(o);
@@ -1150,8 +1178,10 @@ PyNumber_Invert(PyObject *o)
 {
     PyNumberMethods *m;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
+
     m = o->ob_type->tp_as_number;
     if (m && m->nb_invert)
         return (*m->nb_invert)(o);
@@ -1164,8 +1194,10 @@ PyNumber_Absolute(PyObject *o)
 {
     PyNumberMethods *m;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
+
     m = o->ob_type->tp_as_number;
     if (m && m->nb_absolute)
         return m->nb_absolute(o);
@@ -1181,8 +1213,10 @@ PyObject *
 PyNumber_Index(PyObject *item)
 {
     PyObject *result = NULL;
-    if (item == NULL)
+    if (item == NULL) {
         return null_error();
+    }
+
     if (PyLong_Check(item)) {
         Py_INCREF(item);
         return item;
@@ -1265,42 +1299,55 @@ PyNumber_AsSsize_t(PyObject *item, PyObject *err)
 PyObject *
 PyNumber_Long(PyObject *o)
 {
+    PyObject *result;
     PyNumberMethods *m;
     PyObject *trunc_func;
     Py_buffer view;
     _Py_IDENTIFIER(__trunc__);
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
+
     if (PyLong_CheckExact(o)) {
         Py_INCREF(o);
         return o;
     }
     m = o->ob_type->tp_as_number;
     if (m && m->nb_int) { /* This should include subclasses of int */
-        return (PyObject *)_PyLong_FromNbInt(o);
+        result = (PyObject *)_PyLong_FromNbInt(o);
+        if (result != NULL && !PyLong_CheckExact(result)) {
+            Py_SETREF(result, _PyLong_Copy((PyLongObject *)result));
+        }
+        return result;
     }
     trunc_func = _PyObject_LookupSpecial(o, &PyId___trunc__);
     if (trunc_func) {
-        PyObject *truncated = PyEval_CallObject(trunc_func, NULL);
-        PyObject *int_instance;
+        result = PyEval_CallObject(trunc_func, NULL);
         Py_DECREF(trunc_func);
-        if (truncated == NULL || PyLong_Check(truncated))
-            return truncated;
+        if (result == NULL || PyLong_CheckExact(result)) {
+            return result;
+        }
+        if (PyLong_Check(result)) {
+            Py_SETREF(result, _PyLong_Copy((PyLongObject *)result));
+            return result;
+        }
         /* __trunc__ is specified to return an Integral type,
            but int() needs to return an int. */
-        m = truncated->ob_type->tp_as_number;
+        m = result->ob_type->tp_as_number;
         if (m == NULL || m->nb_int == NULL) {
             PyErr_Format(
                 PyExc_TypeError,
                 "__trunc__ returned non-Integral (type %.200s)",
-                truncated->ob_type->tp_name);
-            Py_DECREF(truncated);
+                result->ob_type->tp_name);
+            Py_DECREF(result);
             return NULL;
         }
-        int_instance = (PyObject *)_PyLong_FromNbInt(truncated);
-        Py_DECREF(truncated);
-        return int_instance;
+        Py_SETREF(result, (PyObject *)_PyLong_FromNbInt(result));
+        if (result != NULL && !PyLong_CheckExact(result)) {
+            Py_SETREF(result, _PyLong_Copy((PyLongObject *)result));
+        }
+        return result;
     }
     if (PyErr_Occurred())
         return NULL;
@@ -1322,7 +1369,7 @@ PyNumber_Long(PyObject *o)
                                  PyByteArray_GET_SIZE(o), 10);
 
     if (PyObject_GetBuffer(o, &view, PyBUF_SIMPLE) == 0) {
-        PyObject *result, *bytes;
+        PyObject *bytes;
 
         /* Copy to NUL-terminated buffer. */
         bytes = PyBytes_FromStringAndSize((const char *)view.buf, view.len);
@@ -1346,23 +1393,43 @@ PyNumber_Float(PyObject *o)
 {
     PyNumberMethods *m;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
+
+    if (PyFloat_CheckExact(o)) {
+        Py_INCREF(o);
+        return o;
+    }
     m = o->ob_type->tp_as_number;
     if (m && m->nb_float) { /* This should include subclasses of float */
         PyObject *res = m->nb_float(o);
-        if (res && !PyFloat_Check(res)) {
+        double val;
+        if (!res || PyFloat_CheckExact(res)) {
+            return res;
+        }
+        if (!PyFloat_Check(res)) {
             PyErr_Format(PyExc_TypeError,
-              "__float__ returned non-float (type %.200s)",
-              res->ob_type->tp_name);
+                         "%.50s.__float__ returned non-float (type %.50s)",
+                         o->ob_type->tp_name, res->ob_type->tp_name);
             Py_DECREF(res);
             return NULL;
         }
-        return res;
+        /* Issue #26983: warn if 'res' not of exact type float. */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "%.50s.__float__ returned non-float (type %.50s).  "
+                "The ability to return an instance of a strict subclass of float "
+                "is deprecated, and may be removed in a future version of Python.",
+                o->ob_type->tp_name, res->ob_type->tp_name)) {
+            Py_DECREF(res);
+            return NULL;
+        }
+        val = PyFloat_AS_DOUBLE(res);
+        Py_DECREF(res);
+        return PyFloat_FromDouble(val);
     }
     if (PyFloat_Check(o)) { /* A float subclass with nb_float == NULL */
-        PyFloatObject *po = (PyFloatObject *)o;
-        return PyFloat_FromDouble(po->ob_fval);
+        return PyFloat_FromDouble(PyFloat_AS_DOUBLE(o));
     }
     return PyFloat_FromString(o);
 }
@@ -1430,8 +1497,9 @@ PySequence_Concat(PyObject *s, PyObject *o)
 {
     PySequenceMethods *m;
 
-    if (s == NULL || o == NULL)
+    if (s == NULL || o == NULL) {
         return null_error();
+    }
 
     m = s->ob_type->tp_as_sequence;
     if (m && m->sq_concat)
@@ -1454,8 +1522,9 @@ PySequence_Repeat(PyObject *o, Py_ssize_t count)
 {
     PySequenceMethods *m;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
 
     m = o->ob_type->tp_as_sequence;
     if (m && m->sq_repeat)
@@ -1483,8 +1552,9 @@ PySequence_InPlaceConcat(PyObject *s, PyObject *o)
 {
     PySequenceMethods *m;
 
-    if (s == NULL || o == NULL)
+    if (s == NULL || o == NULL) {
         return null_error();
+    }
 
     m = s->ob_type->tp_as_sequence;
     if (m && m->sq_inplace_concat)
@@ -1507,8 +1577,9 @@ PySequence_InPlaceRepeat(PyObject *o, Py_ssize_t count)
 {
     PySequenceMethods *m;
 
-    if (o == NULL)
+    if (o == NULL) {
         return null_error();
+    }
 
     m = o->ob_type->tp_as_sequence;
     if (m && m->sq_inplace_repeat)
@@ -1536,16 +1607,19 @@ PySequence_GetItem(PyObject *s, Py_ssize_t i)
 {
     PySequenceMethods *m;
 
-    if (s == NULL)
+    if (s == NULL) {
         return null_error();
+    }
 
     m = s->ob_type->tp_as_sequence;
     if (m && m->sq_item) {
         if (i < 0) {
             if (m->sq_length) {
                 Py_ssize_t l = (*m->sq_length)(s);
-                if (l < 0)
+                if (l < 0) {
+                    assert(PyErr_Occurred());
                     return NULL;
+                }
                 i += l;
             }
         }
@@ -1560,7 +1634,9 @@ PySequence_GetSlice(PyObject *s, Py_ssize_t i1, Py_ssize_t i2)
 {
     PyMappingMethods *mp;
 
-    if (!s) return null_error();
+    if (!s) {
+        return null_error();
+    }
 
     mp = s->ob_type->tp_as_mapping;
     if (mp && mp->mp_subscript) {
@@ -1687,8 +1763,9 @@ PySequence_Tuple(PyObject *v)
     PyObject *result = NULL;
     Py_ssize_t j;
 
-    if (v == NULL)
+    if (v == NULL) {
         return null_error();
+    }
 
     /* Special-case the common tuple and list cases, for efficiency. */
     if (PyTuple_CheckExact(v)) {
@@ -1768,8 +1845,9 @@ PySequence_List(PyObject *v)
     PyObject *result;  /* result list */
     PyObject *rv;          /* return value from PyList_Extend */
 
-    if (v == NULL)
+    if (v == NULL) {
         return null_error();
+    }
 
     result = PyList_New(0);
     if (result == NULL)
@@ -1789,8 +1867,9 @@ PySequence_Fast(PyObject *v, const char *m)
 {
     PyObject *it;
 
-    if (v == NULL)
+    if (v == NULL) {
         return null_error();
+    }
 
     if (PyList_CheckExact(v) || PyTuple_CheckExact(v)) {
         Py_INCREF(v);
@@ -1973,8 +2052,9 @@ PyMapping_GetItemString(PyObject *o, const char *key)
 {
     PyObject *okey, *r;
 
-    if (key == NULL)
+    if (key == NULL) {
         return null_error();
+    }
 
     okey = PyUnicode_FromString(key);
     if (okey == NULL)
@@ -2143,7 +2223,7 @@ _Py_CheckFunctionResult(PyObject *func, PyObject *result, const char *where)
 }
 
 PyObject *
-PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw)
+PyObject_Call(PyObject *func, PyObject *args, PyObject *kwargs)
 {
     ternaryfunc call;
     PyObject *result;
@@ -2152,6 +2232,8 @@ PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw)
        because it may clear it (directly or indirectly) and so the
        caller loses its exception */
     assert(!PyErr_Occurred());
+    assert(PyTuple_Check(args));
+    assert(kwargs == NULL || PyDict_Check(kwargs));
 
     call = func->ob_type->tp_call;
     if (call == NULL) {
@@ -2163,108 +2245,225 @@ PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw)
     if (Py_EnterRecursiveCall(" while calling a Python object"))
         return NULL;
 
-    result = (*call)(func, arg, kw);
+    result = (*call)(func, args, kwargs);
 
     Py_LeaveRecursiveCall();
 
     return _Py_CheckFunctionResult(func, result, NULL);
 }
 
-static PyObject*
-call_function_tail(PyObject *callable, PyObject *args)
+PyObject*
+_PyStack_AsTuple(PyObject **stack, Py_ssize_t nargs)
 {
-    PyObject *retval;
+    PyObject *args;
+    Py_ssize_t i;
 
-    if (args == NULL)
+    args = PyTuple_New(nargs);
+    if (args == NULL) {
         return NULL;
+    }
 
-    if (!PyTuple_Check(args)) {
-        PyObject *a;
+    for (i=0; i < nargs; i++) {
+        PyObject *item = stack[i];
+        Py_INCREF(item);
+        PyTuple_SET_ITEM(args, i, item);
+    }
+
+    return args;
+}
+
+PyObject *
+_PyObject_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs,
+                       PyObject *kwargs)
+{
+    ternaryfunc call;
+    PyObject *result = NULL;
+
+    /* _PyObject_FastCallDict() must not be called with an exception set,
+       because it may clear it (directly or indirectly) and so the
+       caller loses its exception */
+    assert(!PyErr_Occurred());
+
+    assert(func != NULL);
+    assert(nargs >= 0);
+    assert(nargs == 0 || args != NULL);
+    assert(kwargs == NULL || PyDict_Check(kwargs));
+
+    if (Py_EnterRecursiveCall(" while calling a Python object")) {
+        return NULL;
+    }
+
+    if (PyFunction_Check(func)) {
+        result = _PyFunction_FastCallDict(func, args, nargs, kwargs);
+    }
+    else if (PyCFunction_Check(func)) {
+        result = _PyCFunction_FastCallDict(func, args, nargs, kwargs);
+    }
+    else {
+        PyObject *tuple;
+
+        /* Slow path: tp_call takes a tuple, so pack args into a temporary one */
+        call = func->ob_type->tp_call;
+        if (call == NULL) {
+            PyErr_Format(PyExc_TypeError, "'%.200s' object is not callable",
+                         func->ob_type->tp_name);
+            goto exit;
+        }
+
+        tuple = _PyStack_AsTuple(args, nargs);
+        if (tuple == NULL) {
+            goto exit;
+        }
+
+        result = (*call)(func, tuple, kwargs);
+        Py_DECREF(tuple);
+
+        result = _Py_CheckFunctionResult(func, result, NULL);
+    }
+
+exit:
+    Py_LeaveRecursiveCall();
+
+    return result;
+}
 
-        a = PyTuple_New(1);
-        if (a == NULL) {
-            Py_DECREF(args);
+/* Positional arguments are obj followed by args. */
+PyObject *
+_PyObject_Call_Prepend(PyObject *func,
+                       PyObject *obj, PyObject *args, PyObject *kwargs)
+{
+    PyObject *small_stack[8];
+    PyObject **stack;
+    Py_ssize_t argcount;
+    PyObject *result;
+
+    assert(PyTuple_Check(args));
+
+    argcount = PyTuple_GET_SIZE(args);
+    if (argcount + 1 <= (Py_ssize_t)Py_ARRAY_LENGTH(small_stack)) {
+        stack = small_stack;
+    }
+    else {
+        stack = PyMem_Malloc((argcount + 1) * sizeof(PyObject *));
+        if (stack == NULL) {
+            PyErr_NoMemory();
             return NULL;
         }
-        PyTuple_SET_ITEM(a, 0, args);
-        args = a;
     }
-    retval = PyObject_Call(callable, args, NULL);
 
-    Py_DECREF(args);
+    /* use borrowed references */
+    stack[0] = obj;
+    Py_MEMCPY(&stack[1],
+              &PyTuple_GET_ITEM(args, 0),
+              argcount * sizeof(PyObject *));
 
-    return retval;
+    result = _PyObject_FastCallDict(func,
+                                    stack, argcount + 1,
+                                    kwargs);
+    if (stack != small_stack) {
+        PyMem_Free(stack);
+    }
+    return result;
+}
+
+static PyObject*
+call_function_tail(PyObject *callable, PyObject *args)
+{
+    PyObject *result;
+
+    assert(args != NULL);
+
+    if (!PyTuple_Check(args)) {
+        result = _PyObject_CallArg1(callable, args);
+    }
+    else {
+        result = PyObject_Call(callable, args, NULL);
+    }
+
+    return result;
 }
 
 PyObject *
 PyObject_CallFunction(PyObject *callable, const char *format, ...)
 {
     va_list va;
-    PyObject *args;
+    PyObject *args, *result;
 
-    if (callable == NULL)
+    if (callable == NULL) {
         return null_error();
+    }
 
-    if (format && *format) {
-        va_start(va, format);
-        args = Py_VaBuildValue(format, va);
-        va_end(va);
+    if (!format || !*format) {
+        return _PyObject_CallNoArg(callable);
     }
-    else
-        args = PyTuple_New(0);
-    if (args == NULL)
+
+    va_start(va, format);
+    args = Py_VaBuildValue(format, va);
+    va_end(va);
+    if (args == NULL) {
         return NULL;
+    }
 
-    return call_function_tail(callable, args);
+    result = call_function_tail(callable, args);
+    Py_DECREF(args);
+    return result;
 }
 
 PyObject *
 _PyObject_CallFunction_SizeT(PyObject *callable, const char *format, ...)
 {
     va_list va;
-    PyObject *args;
+    PyObject *args, *result;
 
-    if (callable == NULL)
+    if (callable == NULL) {
         return null_error();
+    }
 
-    if (format && *format) {
-        va_start(va, format);
-        args = _Py_VaBuildValue_SizeT(format, va);
-        va_end(va);
+    if (!format || !*format) {
+        return _PyObject_CallNoArg(callable);
+    }
+
+    va_start(va, format);
+    args = _Py_VaBuildValue_SizeT(format, va);
+    va_end(va);
+    if (args == NULL) {
+        return NULL;
     }
-    else
-        args = PyTuple_New(0);
 
-    return call_function_tail(callable, args);
+    result = call_function_tail(callable, args);
+    Py_DECREF(args);
+    return result;
 }
 
 static PyObject*
 callmethod(PyObject* func, const char *format, va_list va, int is_size_t)
 {
-    PyObject *retval = NULL;
-    PyObject *args;
+    PyObject *args, *result;
+
+    assert(func != NULL);
 
     if (!PyCallable_Check(func)) {
         type_error("attribute of type '%.200s' is not callable", func);
-        goto exit;
+        return NULL;
     }
 
-    if (format && *format) {
-        if (is_size_t)
-            args = _Py_VaBuildValue_SizeT(format, va);
-        else
-            args = Py_VaBuildValue(format, va);
+    if (!format || !*format) {
+        return _PyObject_CallNoArg(func);
     }
-    else
-        args = PyTuple_New(0);
-
-    retval = call_function_tail(func, args);
 
-  exit:
-    /* args gets consumed in call_function_tail */
-    Py_XDECREF(func);
+    if (is_size_t) {
+        args = _Py_VaBuildValue_SizeT(format, va);
+    }
+    else {
+        args = Py_VaBuildValue(format, va);
+    }
+    if (args == NULL) {
+        return NULL;
+    }
 
-    return retval;
+    result = call_function_tail(func, args);
+    Py_DECREF(args);
+    return result;
 }
 
 PyObject *
@@ -2274,8 +2473,9 @@ PyObject_CallMethod(PyObject *o, const char *name, const char *format, ...)
     PyObject *func = NULL;
     PyObject *retval = NULL;
 
-    if (o == NULL || name == NULL)
+    if (o == NULL || name == NULL) {
         return null_error();
+    }
 
     func = PyObject_GetAttrString(o, name);
     if (func == NULL)
@@ -2284,6 +2484,7 @@ PyObject_CallMethod(PyObject *o, const char *name, const char *format, ...)
     va_start(va, format);
     retval = callmethod(func, format, va, 0);
     va_end(va);
+    Py_DECREF(func);
     return retval;
 }
 
@@ -2295,8 +2496,9 @@ _PyObject_CallMethodId(PyObject *o, _Py_Identifier *name,
     PyObject *func = NULL;
     PyObject *retval = NULL;
 
-    if (o == NULL || name == NULL)
+    if (o == NULL || name == NULL) {
         return null_error();
+    }
 
     func = _PyObject_GetAttrId(o, name);
     if (func == NULL)
@@ -2305,6 +2507,7 @@ _PyObject_CallMethodId(PyObject *o, _Py_Identifier *name,
     va_start(va, format);
     retval = callmethod(func, format, va, 0);
     va_end(va);
+    Py_DECREF(func);
     return retval;
 }
 
@@ -2316,8 +2519,9 @@ _PyObject_CallMethod_SizeT(PyObject *o, const char *name,
     PyObject *func = NULL;
     PyObject *retval;
 
-    if (o == NULL || name == NULL)
+    if (o == NULL || name == NULL) {
         return null_error();
+    }
 
     func = PyObject_GetAttrString(o, name);
     if (func == NULL)
@@ -2325,6 +2529,7 @@ _PyObject_CallMethod_SizeT(PyObject *o, const char *name,
     va_start(va, format);
     retval = callmethod(func, format, va, 1);
     va_end(va);
+    Py_DECREF(func);
     return retval;
 }
 
@@ -2336,8 +2541,9 @@ _PyObject_CallMethodId_SizeT(PyObject *o, _Py_Identifier *name,
     PyObject *func = NULL;
     PyObject *retval;
 
-    if (o == NULL || name == NULL)
+    if (o == NULL || name == NULL) {
         return null_error();
+    }
 
     func = _PyObject_GetAttrId(o, name);
     if (func == NULL) {
@@ -2346,39 +2552,61 @@ _PyObject_CallMethodId_SizeT(PyObject *o, _Py_Identifier *name,
     va_start(va, format);
     retval = callmethod(func, format, va, 1);
     va_end(va);
+    Py_DECREF(func);
     return retval;
 }
 
-static PyObject *
-objargs_mktuple(va_list va)
+static PyObject **
+objargs_mkstack(PyObject **small_stack, Py_ssize_t small_stack_size,
+                va_list va, Py_ssize_t *p_nargs)
 {
-    int i, n = 0;
+    Py_ssize_t i, n;
     va_list countva;
-    PyObject *result, *tmp;
+    PyObject **stack;
 
-        Py_VA_COPY(countva, va);
+    /* Count the number of arguments */
+    Py_VA_COPY(countva, va);
 
-    while (((PyObject *)va_arg(countva, PyObject *)) != NULL)
-        ++n;
-    result = PyTuple_New(n);
-    if (result != NULL && n > 0) {
-        for (i = 0; i < n; ++i) {
-            tmp = (PyObject *)va_arg(va, PyObject *);
-            PyTuple_SET_ITEM(result, i, tmp);
-            Py_INCREF(tmp);
+    n = 0;
+    while (1) {
+        PyObject *arg = (PyObject *)va_arg(countva, PyObject *);
+        if (arg == NULL) {
+            break;
         }
+        n++;
     }
-    return result;
+    *p_nargs = n;
+
+    /* Copy arguments */
+    if (n <= small_stack_size) {
+        stack = small_stack;
+    }
+    else {
+        stack = PyMem_Malloc(n * sizeof(stack[0]));
+        if (stack == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+    }
+
+    for (i = 0; i < n; ++i) {
+        stack[i] = va_arg(va, PyObject *);
+    }
+    return stack;
 }
 
 PyObject *
 PyObject_CallMethodObjArgs(PyObject *callable, PyObject *name, ...)
 {
-    PyObject *args, *tmp;
+    PyObject *small_stack[5];
+    PyObject **stack;
+    Py_ssize_t nargs;
+    PyObject *result;
     va_list vargs;
 
-    if (callable == NULL || name == NULL)
+    if (callable == NULL || name == NULL) {
         return null_error();
+    }
 
     callable = PyObject_GetAttr(callable, name);
     if (callable == NULL)
@@ -2386,28 +2614,36 @@ PyObject_CallMethodObjArgs(PyObject *callable, PyObject *name, ...)
 
     /* count the args */
     va_start(vargs, name);
-    args = objargs_mktuple(vargs);
+    stack = objargs_mkstack(small_stack, Py_ARRAY_LENGTH(small_stack),
+                            vargs, &nargs);
     va_end(vargs);
-    if (args == NULL) {
+    if (stack == NULL) {
         Py_DECREF(callable);
         return NULL;
     }
-    tmp = PyObject_Call(callable, args, NULL);
-    Py_DECREF(args);
+
+    result = _PyObject_FastCall(callable, stack, nargs);
     Py_DECREF(callable);
+    if (stack != small_stack) {
+        PyMem_Free(stack);
+    }
 
-    return tmp;
+    return result;
 }
 
 PyObject *
 _PyObject_CallMethodIdObjArgs(PyObject *callable,
         struct _Py_Identifier *name, ...)
 {
-    PyObject *args, *tmp;
+    PyObject *small_stack[5];
+    PyObject **stack;
+    Py_ssize_t nargs;
+    PyObject *result;
     va_list vargs;
 
-    if (callable == NULL || name == NULL)
+    if (callable == NULL || name == NULL) {
         return null_error();
+    }
 
     callable = _PyObject_GetAttrId(callable, name);
     if (callable == NULL)
@@ -2415,38 +2651,51 @@ _PyObject_CallMethodIdObjArgs(PyObject *callable,
 
     /* count the args */
     va_start(vargs, name);
-    args = objargs_mktuple(vargs);
+    stack = objargs_mkstack(small_stack, Py_ARRAY_LENGTH(small_stack),
+                            vargs, &nargs);
     va_end(vargs);
-    if (args == NULL) {
+    if (stack == NULL) {
         Py_DECREF(callable);
         return NULL;
     }
-    tmp = PyObject_Call(callable, args, NULL);
-    Py_DECREF(args);
+
+    result = _PyObject_FastCall(callable, stack, nargs);
     Py_DECREF(callable);
+    if (stack != small_stack) {
+        PyMem_Free(stack);
+    }
 
-    return tmp;
+    return result;
 }
 
 PyObject *
 PyObject_CallFunctionObjArgs(PyObject *callable, ...)
 {
-    PyObject *args, *tmp;
+    PyObject *small_stack[5];
+    PyObject **stack;
+    Py_ssize_t nargs;
+    PyObject *result;
     va_list vargs;
 
-    if (callable == NULL)
+    if (callable == NULL) {
         return null_error();
+    }
 
     /* count the args */
     va_start(vargs, callable);
-    args = objargs_mktuple(vargs);
+    stack = objargs_mkstack(small_stack, Py_ARRAY_LENGTH(small_stack),
+                            vargs, &nargs);
     va_end(vargs);
-    if (args == NULL)
+    if (stack == NULL) {
         return NULL;
-    tmp = PyObject_Call(callable, args, NULL);
-    Py_DECREF(args);
+    }
 
-    return tmp;
+    result = _PyObject_FastCall(callable, stack, nargs);
+    if (stack != small_stack) {
+        PyMem_Free(stack);
+    }
+
+    return result;
 }
 
 
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
index 6d4c6a1..c6d0707 100644
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -278,31 +278,6 @@ PyByteArray_Concat(PyObject *a, PyObject *b)
     return (PyObject *)result;
 }
 
-static PyObject *
-bytearray_format(PyByteArrayObject *self, PyObject *args)
-{
-    PyObject *bytes_in, *bytes_out, *res;
-    char *bytestring;
-
-    if (self == NULL || !PyByteArray_Check(self) || args == NULL) {
-        PyErr_BadInternalCall();
-        return NULL;
-    }
-    bytestring = PyByteArray_AS_STRING(self);
-    bytes_in = PyBytes_FromString(bytestring);
-    if (bytes_in == NULL)
-        return NULL;
-    bytes_out = _PyBytes_Format(bytes_in, args);
-    Py_DECREF(bytes_in);
-    if (bytes_out == NULL)
-        return NULL;
-    res = PyByteArray_FromObject(bytes_out);
-    Py_DECREF(bytes_out);
-    if (res == NULL)
-        return NULL;
-    return res;
-}
-
 /* Functions stuffed into the type object */
 
 static Py_ssize_t
@@ -820,17 +795,15 @@ bytearray_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
     }
 
     /* Is it an int? */
-    count = PyNumber_AsSsize_t(arg, PyExc_OverflowError);
-    if (count == -1 && PyErr_Occurred()) {
-        if (PyErr_ExceptionMatches(PyExc_OverflowError))
+    if (PyIndex_Check(arg)) {
+        count = PyNumber_AsSsize_t(arg, PyExc_OverflowError);
+        if (count == -1 && PyErr_Occurred()) {
             return -1;
-        PyErr_Clear();
-    }
-    else if (count < 0) {
-        PyErr_SetString(PyExc_ValueError, "negative count");
-        return -1;
-    }
-    else {
+        }
+        if (count < 0) {
+            PyErr_SetString(PyExc_ValueError, "negative count");
+            return -1;
+        }
         if (count > 0) {
             if (PyByteArray_Resize((PyObject *)self, count))
                 return -1;
@@ -1119,161 +1092,27 @@ bytearray_dealloc(PyByteArrayObject *self)
 #include "stringlib/transmogrify.h"
 
 
-/* The following Py_LOCAL_INLINE and Py_LOCAL functions
-were copied from the old char* style string object. */
-
-/* helper macro to fixup start/end slice values */
-#define ADJUST_INDICES(start, end, len)         \
-    if (end > len)                              \
-        end = len;                              \
-    else if (end < 0) {                         \
-        end += len;                             \
-        if (end < 0)                            \
-            end = 0;                            \
-    }                                           \
-    if (start < 0) {                            \
-        start += len;                           \
-        if (start < 0)                          \
-            start = 0;                          \
-    }
-
-Py_LOCAL_INLINE(Py_ssize_t)
-bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir)
-{
-    PyObject *subobj;
-    char byte;
-    Py_buffer subbuf;
-    const char *sub;
-    Py_ssize_t len, sub_len;
-    Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
-    Py_ssize_t res;
-
-    if (!stringlib_parse_args_finds_byte("find/rfind/index/rindex",
-                                         args, &subobj, &byte, &start, &end))
-        return -2;
-
-    if (subobj) {
-        if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0)
-            return -2;
-
-        sub = subbuf.buf;
-        sub_len = subbuf.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-    len = PyByteArray_GET_SIZE(self);
-
-    ADJUST_INDICES(start, end, len);
-    if (end - start < sub_len)
-        res = -1;
-    else if (sub_len == 1
-#ifndef HAVE_MEMRCHR
-            && dir > 0
-#endif
-    ) {
-        unsigned char needle = *sub;
-        int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH;
-        res = stringlib_fastsearch_memchr_1char(
-            PyByteArray_AS_STRING(self) + start, end - start,
-            needle, needle, mode);
-        if (res >= 0)
-            res += start;
-    }
-    else {
-        if (dir > 0)
-            res = stringlib_find_slice(
-                PyByteArray_AS_STRING(self), len,
-                sub, sub_len, start, end);
-        else
-            res = stringlib_rfind_slice(
-                PyByteArray_AS_STRING(self), len,
-                sub, sub_len, start, end);
-    }
-
-    if (subobj)
-        PyBuffer_Release(&subbuf);
-
-    return res;
-}
-
-PyDoc_STRVAR(find__doc__,
-"B.find(sub[, start[, end]]) -> int\n\
-\n\
-Return the lowest index in B where subsection sub is found,\n\
-such that sub is contained within B[start,end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytearray_find(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_find(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-PyDoc_STRVAR(count__doc__,
-"B.count(sub[, start[, end]]) -> int\n\
-\n\
-Return the number of non-overlapping occurrences of subsection sub in\n\
-bytes B[start:end].  Optional arguments start and end are interpreted\n\
-as in slice notation.");
-
 static PyObject *
 bytearray_count(PyByteArrayObject *self, PyObject *args)
 {
-    PyObject *sub_obj;
-    const char *str = PyByteArray_AS_STRING(self), *sub;
-    Py_ssize_t sub_len;
-    char byte;
-    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
-
-    Py_buffer vsub;
-    PyObject *count_obj;
-
-    if (!stringlib_parse_args_finds_byte("count", args, &sub_obj, &byte,
-                                         &start, &end))
-        return NULL;
-
-    if (sub_obj) {
-        if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0)
-            return NULL;
-
-        sub = vsub.buf;
-        sub_len = vsub.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-
-    ADJUST_INDICES(start, end, PyByteArray_GET_SIZE(self));
-
-    count_obj = PyLong_FromSsize_t(
-        stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
-        );
-
-    if (sub_obj)
-        PyBuffer_Release(&vsub);
-
-    return count_obj;
+    return _Py_bytes_count(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
 /*[clinic input]
 bytearray.clear
 
-    self: self(type="PyByteArrayObject *")
-
 Remove all items from the bytearray.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_clear_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=85c2fe6aede0956c input=e524fd330abcdc18]*/
+/*[clinic end generated code: output=85c2fe6aede0956c input=ed6edae9de447ac4]*/
 {
     if (PyByteArray_Resize((PyObject *)self, 0) < 0)
         return NULL;
@@ -1283,253 +1122,72 @@ bytearray_clear_impl(PyByteArrayObject *self)
 /*[clinic input]
 bytearray.copy
 
-    self: self(type="PyByteArrayObject *")
-
 Return a copy of B.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_copy_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=68cfbcfed484c132 input=6d5d2975aa0f33f3]*/
+/*[clinic end generated code: output=68cfbcfed484c132 input=6597b0c01bccaa9e]*/
 {
     return PyByteArray_FromStringAndSize(PyByteArray_AS_STRING((PyObject *)self),
                                          PyByteArray_GET_SIZE(self));
 }
 
-PyDoc_STRVAR(index__doc__,
-"B.index(sub[, start[, end]]) -> int\n\
-\n\
-Like B.find() but raise ValueError when the subsection is not found.");
-
 static PyObject *
 bytearray_index(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "subsection not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_index(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(rfind__doc__,
-"B.rfind(sub[, start[, end]]) -> int\n\
-\n\
-Return the highest index in B where subsection sub is found,\n\
-such that sub is contained within B[start,end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytearray_rfind(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rfind(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(rindex__doc__,
-"B.rindex(sub[, start[, end]]) -> int\n\
-\n\
-Like B.rfind() but raise ValueError when the subsection is not found.");
-
 static PyObject *
 bytearray_rindex(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytearray_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "subsection not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rindex(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-
 static int
 bytearray_contains(PyObject *self, PyObject *arg)
 {
-    Py_ssize_t ival = PyNumber_AsSsize_t(arg, PyExc_ValueError);
-    if (ival == -1 && PyErr_Occurred()) {
-        Py_buffer varg;
-        Py_ssize_t pos;
-        PyErr_Clear();
-        if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0)
-            return -1;
-        pos = stringlib_find(PyByteArray_AS_STRING(self), Py_SIZE(self),
-                             varg.buf, varg.len, 0);
-        PyBuffer_Release(&varg);
-        return pos >= 0;
-    }
-    if (ival < 0 || ival >= 256) {
-        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
-        return -1;
-    }
-
-    return memchr(PyByteArray_AS_STRING(self), (int) ival, Py_SIZE(self)) != NULL;
+    return _Py_bytes_contains(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), arg);
 }
 
-
-/* Matches the end (direction >= 0) or start (direction < 0) of self
- * against substr, using the start and end arguments. Returns
- * -1 on error, 0 if not found and 1 if found.
- */
-Py_LOCAL(int)
-_bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start,
-                 Py_ssize_t end, int direction)
-{
-    Py_ssize_t len = PyByteArray_GET_SIZE(self);
-    const char* str;
-    Py_buffer vsubstr;
-    int rv = 0;
-
-    str = PyByteArray_AS_STRING(self);
-
-    if (PyObject_GetBuffer(substr, &vsubstr, PyBUF_SIMPLE) != 0)
-        return -1;
-
-    ADJUST_INDICES(start, end, len);
-
-    if (direction < 0) {
-        /* startswith */
-        if (start+vsubstr.len > len) {
-            goto done;
-        }
-    } else {
-        /* endswith */
-        if (end-start < vsubstr.len || start > len) {
-            goto done;
-        }
-
-        if (end-vsubstr.len > start)
-            start = end - vsubstr.len;
-    }
-    if (end-start >= vsubstr.len)
-        rv = ! memcmp(str+start, vsubstr.buf, vsubstr.len);
-
-done:
-    PyBuffer_Release(&vsubstr);
-    return rv;
-}
-
-
-PyDoc_STRVAR(startswith__doc__,
-"B.startswith(prefix[, start[, end]]) -> bool\n\
-\n\
-Return True if B starts with the specified prefix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-prefix can also be a tuple of bytes to try.");
-
 static PyObject *
 bytearray_startswith(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytearray_tailmatch(self,
-                                      PyTuple_GET_ITEM(subobj, i),
-                                      start, end, -1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytearray_tailmatch(self, subobj, start, end, -1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "startswith first arg must be bytes "
-                         "or a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_startswith(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
-PyDoc_STRVAR(endswith__doc__,
-"B.endswith(suffix[, start[, end]]) -> bool\n\
-\n\
-Return True if B ends with the specified suffix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-suffix can also be a tuple of bytes to try.");
-
 static PyObject *
 bytearray_endswith(PyByteArrayObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytearray_tailmatch(self,
-                                      PyTuple_GET_ITEM(subobj, i),
-                                      start, end, +1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytearray_tailmatch(self, subobj, start, end, +1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "endswith first arg must be bytes or "
-                         "a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_endswith(PyByteArray_AS_STRING(self), PyByteArray_GET_SIZE(self), args);
 }
 
 
 /*[clinic input]
 bytearray.translate
 
-    self: self(type="PyByteArrayObject *")
     table: object
         Translation table, which must be a bytes object of length 256.
-    [
-    deletechars: object
-    ]
     /
+    delete as deletechars: object(c_default="NULL") = b''
 
 Return a copy with each character mapped by the given translation table.
 
-All characters occurring in the optional argument deletechars are removed.
+All characters occurring in the optional argument delete are removed.
 The remaining characters are mapped through the given translation table.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_translate_impl(PyByteArrayObject *self, PyObject *table,
-                         int group_right_1, PyObject *deletechars)
-/*[clinic end generated code: output=2bebc86a9a1ff083 input=b749ad85f4860824]*/
+                         PyObject *deletechars)
+/*[clinic end generated code: output=b6a8f01c2a74e446 input=cfff956d4d127a9b]*/
 {
     char *input, *output;
     const char *table_chars;
@@ -1572,7 +1230,7 @@ bytearray_translate_impl(PyByteArrayObject *self, PyObject *table,
     result = PyByteArray_FromStringAndSize((char *)NULL, inlen);
     if (result == NULL)
         goto done;
-    output_start = output = PyByteArray_AsString(result);
+    output_start = output = PyByteArray_AS_STRING(result);
     input = PyByteArray_AS_STRING(input_obj);
 
     if (vdel.len == 0 && table_chars != NULL) {
@@ -1598,8 +1256,7 @@ bytearray_translate_impl(PyByteArrayObject *self, PyObject *table,
     for (i = inlen; --i >= 0; ) {
         c = Py_CHARMASK(*input++);
         if (trans_table[c] != -1)
-            if (Py_CHARMASK(*output++ = (char)trans_table[c]) == c)
-                    continue;
+            *output++ = (char)trans_table[c];
     }
     /* Fix the size of the resulting string */
     if (inlen > 0)
@@ -1642,493 +1299,6 @@ bytearray_maketrans_impl(Py_buffer *frm, Py_buffer *to)
 }
 
 
-/* find and count characters and substrings */
-
-#define findchar(target, target_len, c)                         \
-  ((char *)memchr((const void *)(target), c, target_len))
-
-
-/* Bytes ops must return a string, create a copy */
-Py_LOCAL(PyByteArrayObject *)
-return_self(PyByteArrayObject *self)
-{
-    /* always return a new bytearray */
-    return (PyByteArrayObject *)PyByteArray_FromStringAndSize(
-            PyByteArray_AS_STRING(self),
-            PyByteArray_GET_SIZE(self));
-}
-
-Py_LOCAL_INLINE(Py_ssize_t)
-countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount)
-{
-    Py_ssize_t count=0;
-    const char *start=target;
-    const char *end=target+target_len;
-
-    while ( (start=findchar(start, end-start, c)) != NULL ) {
-        count++;
-        if (count >= maxcount)
-            break;
-        start += 1;
-    }
-    return count;
-}
-
-
-/* Algorithms for different cases of string replacement */
-
-/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_interleave(PyByteArrayObject *self,
-                   const char *to_s, Py_ssize_t to_len,
-                   Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, i;
-    PyByteArrayObject *result;
-
-    self_len = PyByteArray_GET_SIZE(self);
-
-    /* 1 at the end plus 1 after every character;
-       count = min(maxcount, self_len + 1) */
-    if (maxcount <= self_len)
-        count = maxcount;
-    else
-        /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
-        count = self_len + 1;
-
-    /* Check for overflow */
-    /*   result_len = count * to_len + self_len; */
-    assert(count > 0);
-    if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replace string is too long");
-        return NULL;
-    }
-    result_len = count * to_len + self_len;
-
-    if (! (result = (PyByteArrayObject *)
-                     PyByteArray_FromStringAndSize(NULL, result_len)) )
-        return NULL;
-
-    self_s = PyByteArray_AS_STRING(self);
-    result_s = PyByteArray_AS_STRING(result);
-
-    /* TODO: special case single character, which doesn't need memcpy */
-
-    /* Lay the first one down (guaranteed this will occur) */
-    Py_MEMCPY(result_s, to_s, to_len);
-    result_s += to_len;
-    count -= 1;
-
-    for (i=0; i<count; i++) {
-        *result_s++ = *self_s++;
-        Py_MEMCPY(result_s, to_s, to_len);
-        result_s += to_len;
-    }
-
-    /* Copy the rest of the original string */
-    Py_MEMCPY(result_s, self_s, self_len-i);
-
-    return result;
-}
-
-/* Special case for deleting a single character */
-/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_delete_single_character(PyByteArrayObject *self,
-                                char from_c, Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyByteArrayObject *result;
-
-    self_len = PyByteArray_GET_SIZE(self);
-    self_s = PyByteArray_AS_STRING(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        return return_self(self);
-    }
-
-    result_len = self_len - count;  /* from_len == 1 */
-    assert(result_len>=0);
-
-    if ( (result = (PyByteArrayObject *)
-                    PyByteArray_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        Py_MEMCPY(result_s, start, next-start);
-        result_s += (next-start);
-        start = next+1;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
-
-Py_LOCAL(PyByteArrayObject *)
-replace_delete_substring(PyByteArrayObject *self,
-                         const char *from_s, Py_ssize_t from_len,
-                         Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyByteArrayObject *result;
-
-    self_len = PyByteArray_GET_SIZE(self);
-    self_s = PyByteArray_AS_STRING(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches */
-        return return_self(self);
-    }
-
-    result_len = self_len - (count * from_len);
-    assert (result_len>=0);
-
-    if ( (result = (PyByteArrayObject *)
-        PyByteArray_FromStringAndSize(NULL, result_len)) == NULL )
-            return NULL;
-
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start + offset;
-
-        Py_MEMCPY(result_s, start, next-start);
-
-        result_s += (next-start);
-        start = next+from_len;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_single_character_in_place(PyByteArrayObject *self,
-                                  char from_c, char to_c,
-                                  Py_ssize_t maxcount)
-{
-    char *self_s, *result_s, *start, *end, *next;
-    Py_ssize_t self_len;
-    PyByteArrayObject *result;
-
-    /* The result string will be the same size */
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    next = findchar(self_s, self_len, from_c);
-
-    if (next == NULL) {
-        /* No matches; return the original bytes */
-        return return_self(self);
-    }
-
-    /* Need to make a new bytes */
-    result = (PyByteArrayObject *) PyByteArray_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + (next-self_s);
-    *start = to_c;
-    start++;
-    end = result_s + self_len;
-
-    while (--maxcount > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        *next = to_c;
-        start = next+1;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_substring_in_place(PyByteArrayObject *self,
-                           const char *from_s, Py_ssize_t from_len,
-                           const char *to_s, Py_ssize_t to_len,
-                           Py_ssize_t maxcount)
-{
-    char *result_s, *start, *end;
-    char *self_s;
-    Py_ssize_t self_len, offset;
-    PyByteArrayObject *result;
-
-    /* The result bytes will be the same size */
-
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    offset = stringlib_find(self_s, self_len,
-                            from_s, from_len,
-                            0);
-    if (offset == -1) {
-        /* No matches; return the original bytes */
-        return return_self(self);
-    }
-
-    /* Need to make a new bytes */
-    result = (PyByteArrayObject *) PyByteArray_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + offset;
-    Py_MEMCPY(start, to_s, from_len);
-    start += from_len;
-    end = result_s + self_len;
-
-    while ( --maxcount > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset==-1)
-            break;
-        Py_MEMCPY(start+offset, to_s, from_len);
-        start += offset+from_len;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_single_character(PyByteArrayObject *self,
-                         char from_c,
-                         const char *to_s, Py_ssize_t to_len,
-                         Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyByteArrayObject *result;
-
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* use the difference between current and new, hence the "-1" */
-    /*   result_len = self_len + count * (to_len-1)  */
-    assert(count > 0);
-    if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len - 1);
-
-    if ( (result = (PyByteArrayObject *)
-          PyByteArray_FromStringAndSize(NULL, result_len)) == NULL)
-            return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += 1;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+1;
-        }
-    }
-    /* Copy the remainder of the remaining bytes */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyByteArrayObject *)
-replace_substring(PyByteArrayObject *self,
-                  const char *from_s, Py_ssize_t from_len,
-                  const char *to_s, Py_ssize_t to_len,
-                  Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyByteArrayObject *result;
-
-    self_s = PyByteArray_AS_STRING(self);
-    self_len = PyByteArray_GET_SIZE(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* Check for overflow */
-    /*    result_len = self_len + count * (to_len-from_len) */
-    assert(count > 0);
-    if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len - from_len);
-
-    if ( (result = (PyByteArrayObject *)
-          PyByteArray_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyByteArray_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start+offset;
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += from_len;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+from_len;
-        }
-    }
-    /* Copy the remainder of the remaining bytes */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-
-Py_LOCAL(PyByteArrayObject *)
-replace(PyByteArrayObject *self,
-        const char *from_s, Py_ssize_t from_len,
-        const char *to_s, Py_ssize_t to_len,
-        Py_ssize_t maxcount)
-{
-    if (maxcount < 0) {
-        maxcount = PY_SSIZE_T_MAX;
-    } else if (maxcount == 0 || PyByteArray_GET_SIZE(self) == 0) {
-        /* nothing to do; return the original bytes */
-        return return_self(self);
-    }
-
-    if (maxcount == 0 ||
-        (from_len == 0 && to_len == 0)) {
-        /* nothing to do; return the original bytes */
-        return return_self(self);
-    }
-
-    /* Handle zero-length special cases */
-
-    if (from_len == 0) {
-        /* insert the 'to' bytes everywhere.   */
-        /*    >>> "Python".replace("", ".")     */
-        /*    '.P.y.t.h.o.n.'                   */
-        return replace_interleave(self, to_s, to_len, maxcount);
-    }
-
-    /* Except for "".replace("", "A") == "A" there is no way beyond this */
-    /* point for an empty self bytes to generate a non-empty bytes */
-    /* Special case so the remaining code always gets a non-empty bytes */
-    if (PyByteArray_GET_SIZE(self) == 0) {
-        return return_self(self);
-    }
-
-    if (to_len == 0) {
-        /* delete all occurrences of 'from' bytes */
-        if (from_len == 1) {
-            return replace_delete_single_character(
-                    self, from_s[0], maxcount);
-        } else {
-            return replace_delete_substring(self, from_s, from_len, maxcount);
-        }
-    }
-
-    /* Handle special case where both bytes have the same length */
-
-    if (from_len == to_len) {
-        if (from_len == 1) {
-            return replace_single_character_in_place(
-                    self,
-                    from_s[0],
-                    to_s[0],
-                    maxcount);
-        } else {
-            return replace_substring_in_place(
-                self, from_s, from_len, to_s, to_len, maxcount);
-        }
-    }
-
-    /* Otherwise use the more generic algorithms */
-    if (from_len == 1) {
-        return replace_single_character(self, from_s[0],
-                                        to_s, to_len, maxcount);
-    } else {
-        /* len('from')>=2, len('to')>=1 */
-        return replace_substring(self, from_s, from_len, to_s, to_len, maxcount);
-    }
-}
-
-
 /*[clinic input]
 bytearray.replace
 
@@ -2150,9 +1320,9 @@ bytearray_replace_impl(PyByteArrayObject *self, Py_buffer *old,
                        Py_buffer *new, Py_ssize_t count)
 /*[clinic end generated code: output=d39884c4dc59412a input=aa379d988637c7fb]*/
 {
-    return (PyObject *)replace((PyByteArrayObject *) self,
-                               old->buf, old->len,
-                               new->buf, new->len, count);
+    return stringlib_replace((PyObject *)self,
+                             (const char *)old->buf, old->len,
+                             (const char *)new->buf, new->len, count);
 }
 
 /*[clinic input]
@@ -2200,7 +1370,6 @@ bytearray_split_impl(PyByteArrayObject *self, PyObject *sep,
 /*[clinic input]
 bytearray.partition
 
-    self: self(type="PyByteArrayObject *")
     sep: object
     /
 
@@ -2216,7 +1385,7 @@ bytearray object and two empty bytearray objects.
 
 static PyObject *
 bytearray_partition(PyByteArrayObject *self, PyObject *sep)
-/*[clinic end generated code: output=45d2525ddd35f957 input=7d7fe37b1696d506]*/
+/*[clinic end generated code: output=45d2525ddd35f957 input=86f89223892b70b5]*/
 {
     PyObject *bytesep, *result;
 
@@ -2238,7 +1407,6 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep)
 /*[clinic input]
 bytearray.rpartition
 
-    self: self(type="PyByteArrayObject *")
     sep: object
     /
 
@@ -2254,7 +1422,7 @@ objects and the original bytearray object.
 
 static PyObject *
 bytearray_rpartition(PyByteArrayObject *self, PyObject *sep)
-/*[clinic end generated code: output=440de3c9426115e8 input=9b8cd540c1b75853]*/
+/*[clinic end generated code: output=440de3c9426115e8 input=5f4094f2de87c8f3]*/
 {
     PyObject *bytesep, *result;
 
@@ -2312,14 +1480,12 @@ bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep,
 /*[clinic input]
 bytearray.reverse
 
-    self: self(type="PyByteArrayObject *")
-
 Reverse the order of the values in B in place.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_reverse_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=9f7616f29ab309d3 input=7933a499b8597bd1]*/
+/*[clinic end generated code: output=9f7616f29ab309d3 input=543356319fc78557]*/
 {
     char swap, *head, *tail;
     Py_ssize_t i, j, n = Py_SIZE(self);
@@ -2348,7 +1514,6 @@ class bytesvalue_converter(CConverter):
 /*[clinic input]
 bytearray.insert
 
-    self: self(type="PyByteArrayObject *")
     index: Py_ssize_t
         The index where the value is to be inserted.
     item: bytesvalue
@@ -2360,7 +1525,7 @@ Insert a single item into the bytearray before the given index.
 
 static PyObject *
 bytearray_insert_impl(PyByteArrayObject *self, Py_ssize_t index, int item)
-/*[clinic end generated code: output=76c775a70e7b07b7 input=833766836ba30e1e]*/
+/*[clinic end generated code: output=76c775a70e7b07b7 input=b2b5d07e9de6c070]*/
 {
     Py_ssize_t n = Py_SIZE(self);
     char *buf;
@@ -2390,7 +1555,6 @@ bytearray_insert_impl(PyByteArrayObject *self, Py_ssize_t index, int item)
 /*[clinic input]
 bytearray.append
 
-    self: self(type="PyByteArrayObject *")
     item: bytesvalue
         The item to be appended.
     /
@@ -2400,7 +1564,7 @@ Append a single item to the end of the bytearray.
 
 static PyObject *
 bytearray_append_impl(PyByteArrayObject *self, int item)
-/*[clinic end generated code: output=a154e19ed1886cb6 input=ae56ea87380407cc]*/
+/*[clinic end generated code: output=a154e19ed1886cb6 input=20d6bec3d1340593]*/
 {
     Py_ssize_t n = Py_SIZE(self);
 
@@ -2420,7 +1584,6 @@ bytearray_append_impl(PyByteArrayObject *self, int item)
 /*[clinic input]
 bytearray.extend
 
-    self: self(type="PyByteArrayObject *")
     iterable_of_ints: object
         The iterable of items to append.
     /
@@ -2430,7 +1593,7 @@ Append all the items from the iterator or sequence to the end of the bytearray.
 
 static PyObject *
 bytearray_extend(PyByteArrayObject *self, PyObject *iterable_of_ints)
-/*[clinic end generated code: output=98155dbe249170b1 input=ce83a5d75b70d850]*/
+/*[clinic end generated code: output=98155dbe249170b1 input=c617b3a93249ba28]*/
 {
     PyObject *it, *item, *bytearray_obj;
     Py_ssize_t buf_size = 0, len = 0;
@@ -2515,7 +1678,6 @@ bytearray_extend(PyByteArrayObject *self, PyObject *iterable_of_ints)
 /*[clinic input]
 bytearray.pop
 
-    self: self(type="PyByteArrayObject *")
     index: Py_ssize_t = -1
         The index from where to remove the item.
         -1 (the default value) means remove the last item.
@@ -2528,7 +1690,7 @@ If no index argument is given, will pop the last item.
 
 static PyObject *
 bytearray_pop_impl(PyByteArrayObject *self, Py_ssize_t index)
-/*[clinic end generated code: output=e0ccd401f8021da8 input=0797e6c0ca9d5a85]*/
+/*[clinic end generated code: output=e0ccd401f8021da8 input=3591df2d06c0d237]*/
 {
     int value;
     Py_ssize_t n = Py_SIZE(self);
@@ -2560,7 +1722,6 @@ bytearray_pop_impl(PyByteArrayObject *self, Py_ssize_t index)
 /*[clinic input]
 bytearray.remove
 
-    self: self(type="PyByteArrayObject *")
     value: bytesvalue
         The value to remove.
     /
@@ -2570,20 +1731,20 @@ Remove the first occurrence of a value in the bytearray.
 
 static PyObject *
 bytearray_remove_impl(PyByteArrayObject *self, int value)
-/*[clinic end generated code: output=d659e37866709c13 input=47560b11fd856c24]*/
+/*[clinic end generated code: output=d659e37866709c13 input=121831240cd51ddf]*/
 {
-    Py_ssize_t n = Py_SIZE(self);
+    Py_ssize_t where, n = Py_SIZE(self);
     char *buf = PyByteArray_AS_STRING(self);
-    char *where = memchr(buf, value, n);
 
-    if (!where) {
+    where = stringlib_find_char(buf, n, value);
+    if (where < 0) {
         PyErr_SetString(PyExc_ValueError, "value not found in bytearray");
         return NULL;
     }
     if (!_canresize(self))
         return NULL;
 
-    memmove(where, where + 1, buf + n - where);
+    memmove(buf + where, buf + where + 1, n - where);
     if (PyByteArray_Resize((PyObject *)self, n - 1) < 0)
         return NULL;
 
@@ -2593,8 +1754,8 @@ bytearray_remove_impl(PyByteArrayObject *self, int value)
 /* XXX These two helpers could be optimized if argsize == 1 */
 
 static Py_ssize_t
-lstrip_helper(char *myptr, Py_ssize_t mysize,
-              void *argptr, Py_ssize_t argsize)
+lstrip_helper(const char *myptr, Py_ssize_t mysize,
+              const void *argptr, Py_ssize_t argsize)
 {
     Py_ssize_t i = 0;
     while (i < mysize && memchr(argptr, (unsigned char) myptr[i], argsize))
@@ -2603,8 +1764,8 @@ lstrip_helper(char *myptr, Py_ssize_t mysize,
 }
 
 static Py_ssize_t
-rstrip_helper(char *myptr, Py_ssize_t mysize,
-              void *argptr, Py_ssize_t argsize)
+rstrip_helper(const char *myptr, Py_ssize_t mysize,
+              const void *argptr, Py_ssize_t argsize)
 {
     Py_ssize_t i = mysize - 1;
     while (i >= 0 && memchr(argptr, (unsigned char) myptr[i], argsize))
@@ -2805,27 +1966,10 @@ bytearray_splitlines_impl(PyByteArrayObject *self, int keepends)
         );
 }
 
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
-    if (c >= 128)
-        return -1;
-    if (Py_ISDIGIT(c))
-        return c - '0';
-    else {
-        if (Py_ISUPPER(c))
-            c = Py_TOLOWER(c);
-        if (c >= 'a' && c <= 'f')
-            return c - 'a' + 10;
-    }
-    return -1;
-}
-
 /*[clinic input]
 @classmethod
 bytearray.fromhex
 
-    cls: self(type="PyObject*")
     string: unicode
     /
 
@@ -2836,51 +1980,15 @@ Example: bytearray.fromhex('B9 01EF') -> bytearray(b'\\xb9\\x01\\xef')
 [clinic start generated code]*/
 
 static PyObject *
-bytearray_fromhex_impl(PyObject*cls, PyObject *string)
-/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
+bytearray_fromhex_impl(PyTypeObject *type, PyObject *string)
+/*[clinic end generated code: output=8f0f0b6d30fb3ba0 input=f033a16d1fb21f48]*/
 {
-    PyObject *newbytes;
-    char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
-
-    assert(PyUnicode_Check(string));
-    if (PyUnicode_READY(string))
-        return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
-    hexlen = PyUnicode_GET_LENGTH(string);
-
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
-    if (!newbytes)
-        return NULL;
-    buf = PyByteArray_AS_STRING(newbytes);
-    for (i = j = 0; i < hexlen; i += 2) {
-        /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
-            goto error;
-        }
-        buf[j++] = (top << 4) + bot;
+    PyObject *result = _PyBytes_FromHex(string, type == &PyByteArray_Type);
+    if (type != &PyByteArray_Type && result != NULL) {
+        Py_SETREF(result, PyObject_CallFunctionObjArgs((PyObject *)type,
+                                                       result, NULL));
     }
-    if (PyByteArray_Resize(newbytes, j) < 0)
-        goto error;
-    return newbytes;
-
-  error:
-    Py_DECREF(newbytes);
-    return NULL;
+    return result;
 }
 
 PyDoc_STRVAR(hex__doc__,
@@ -2935,14 +2043,12 @@ _common_reduce(PyByteArrayObject *self, int proto)
 /*[clinic input]
 bytearray.__reduce__ as bytearray_reduce
 
-    self: self(type="PyByteArrayObject *")
-
 Return state information for pickling.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_reduce_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=52bf304086464cab input=fbb07de4d102a03a]*/
+/*[clinic end generated code: output=52bf304086464cab input=44b5737ada62dd3f]*/
 {
     return _common_reduce(self, 2);
 }
@@ -2950,7 +2056,6 @@ bytearray_reduce_impl(PyByteArrayObject *self)
 /*[clinic input]
 bytearray.__reduce_ex__ as bytearray_reduce_ex
 
-    self: self(type="PyByteArrayObject *")
     proto: int = 0
     /
 
@@ -2959,7 +2064,7 @@ Return state information for pickling.
 
 static PyObject *
 bytearray_reduce_ex_impl(PyByteArrayObject *self, int proto)
-/*[clinic end generated code: output=52eac33377197520 input=0e091a42ca6dbd91]*/
+/*[clinic end generated code: output=52eac33377197520 input=f129bc1a1aa151ee]*/
 {
     return _common_reduce(self, proto);
 }
@@ -2967,14 +2072,12 @@ bytearray_reduce_ex_impl(PyByteArrayObject *self, int proto)
 /*[clinic input]
 bytearray.__sizeof__ as bytearray_sizeof
 
-    self: self(type="PyByteArrayObject *")
-
 Returns the size of the bytearray object in memory, in bytes.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_sizeof_impl(PyByteArrayObject *self)
-/*[clinic end generated code: output=738abdd17951c427 input=6b23d305362b462b]*/
+/*[clinic end generated code: output=738abdd17951c427 input=e27320fd98a4bc5a]*/
 {
     Py_ssize_t res;
 
@@ -3015,19 +2118,22 @@ bytearray_methods[] = {
     BYTEARRAY_APPEND_METHODDEF
     {"capitalize", (PyCFunction)stringlib_capitalize, METH_NOARGS,
      _Py_capitalize__doc__},
-    {"center", (PyCFunction)stringlib_center, METH_VARARGS, center__doc__},
+    {"center", (PyCFunction)stringlib_center, METH_VARARGS, _Py_center__doc__},
     BYTEARRAY_CLEAR_METHODDEF
     BYTEARRAY_COPY_METHODDEF
-    {"count", (PyCFunction)bytearray_count, METH_VARARGS, count__doc__},
+    {"count", (PyCFunction)bytearray_count, METH_VARARGS,
+     _Py_count__doc__},
     BYTEARRAY_DECODE_METHODDEF
-    {"endswith", (PyCFunction)bytearray_endswith, METH_VARARGS, endswith__doc__},
+    {"endswith", (PyCFunction)bytearray_endswith, METH_VARARGS,
+     _Py_endswith__doc__},
     {"expandtabs", (PyCFunction)stringlib_expandtabs, METH_VARARGS | METH_KEYWORDS,
-     expandtabs__doc__},
+     _Py_expandtabs__doc__},
     BYTEARRAY_EXTEND_METHODDEF
-    {"find", (PyCFunction)bytearray_find, METH_VARARGS, find__doc__},
+    {"find", (PyCFunction)bytearray_find, METH_VARARGS,
+     _Py_find__doc__},
     BYTEARRAY_FROMHEX_METHODDEF
     {"hex", (PyCFunction)bytearray_hex, METH_NOARGS, hex__doc__},
-    {"index", (PyCFunction)bytearray_index, METH_VARARGS, index__doc__},
+    {"index", (PyCFunction)bytearray_index, METH_VARARGS, _Py_index__doc__},
     BYTEARRAY_INSERT_METHODDEF
     {"isalnum", (PyCFunction)stringlib_isalnum, METH_NOARGS,
      _Py_isalnum__doc__},
@@ -3044,7 +2150,7 @@ bytearray_methods[] = {
     {"isupper", (PyCFunction)stringlib_isupper, METH_NOARGS,
      _Py_isupper__doc__},
     BYTEARRAY_JOIN_METHODDEF
-    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__},
+    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, _Py_ljust__doc__},
     {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__},
     BYTEARRAY_LSTRIP_METHODDEF
     BYTEARRAY_MAKETRANS_METHODDEF
@@ -3053,23 +2159,23 @@ bytearray_methods[] = {
     BYTEARRAY_REMOVE_METHODDEF
     BYTEARRAY_REPLACE_METHODDEF
     BYTEARRAY_REVERSE_METHODDEF
-    {"rfind", (PyCFunction)bytearray_rfind, METH_VARARGS, rfind__doc__},
-    {"rindex", (PyCFunction)bytearray_rindex, METH_VARARGS, rindex__doc__},
-    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__},
+    {"rfind", (PyCFunction)bytearray_rfind, METH_VARARGS, _Py_rfind__doc__},
+    {"rindex", (PyCFunction)bytearray_rindex, METH_VARARGS, _Py_rindex__doc__},
+    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, _Py_rjust__doc__},
     BYTEARRAY_RPARTITION_METHODDEF
     BYTEARRAY_RSPLIT_METHODDEF
     BYTEARRAY_RSTRIP_METHODDEF
     BYTEARRAY_SPLIT_METHODDEF
     BYTEARRAY_SPLITLINES_METHODDEF
     {"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS ,
-     startswith__doc__},
+     _Py_startswith__doc__},
     BYTEARRAY_STRIP_METHODDEF
     {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS,
      _Py_swapcase__doc__},
     {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__},
     BYTEARRAY_TRANSLATE_METHODDEF
     {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__},
-    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__},
+    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, _Py_zfill__doc__},
     {NULL}
 };
 
@@ -3078,7 +2184,7 @@ bytearray_mod(PyObject *v, PyObject *w)
 {
     if (!PyByteArray_Check(v))
         Py_RETURN_NOTIMPLEMENTED;
-    return bytearray_format((PyByteArrayObject *)v, w);
+    return _PyBytes_FormatEx(PyByteArray_AS_STRING(v), PyByteArray_GET_SIZE(v), w, 1);
 }
 
 static PyNumberMethods bytearray_as_number = {
diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c
index a299915..d7f061b 100644
--- a/Objects/bytes_methods.c
+++ b/Objects/bytes_methods.c
@@ -1,3 +1,4 @@
+#define PY_SSIZE_T_CLEAN
 #include "Python.h"
 #include "bytes_methods.h"
 
@@ -277,7 +278,7 @@ Return a titlecased version of B, i.e. ASCII words start with uppercase\n\
 characters, all remaining cased characters have lowercase.");
 
 void
-_Py_bytes_title(char *result, char *s, Py_ssize_t len)
+_Py_bytes_title(char *result, const char *s, Py_ssize_t len)
 {
     Py_ssize_t i;
     int previous_is_cased = 0;
@@ -306,7 +307,7 @@ Return a copy of B with only its first character capitalized (ASCII)\n\
 and the rest lower-cased.");
 
 void
-_Py_bytes_capitalize(char *result, char *s, Py_ssize_t len)
+_Py_bytes_capitalize(char *result, const char *s, Py_ssize_t len)
 {
     Py_ssize_t i;
 
@@ -336,7 +337,7 @@ Return a copy of B with uppercase ASCII characters converted\n\
 to lowercase ASCII and vice versa.");
 
 void
-_Py_bytes_swapcase(char *result, char *s, Py_ssize_t len)
+_Py_bytes_swapcase(char *result, const char *s, Py_ssize_t len)
 {
     Py_ssize_t i;
 
@@ -387,3 +388,427 @@ _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to)
 
     return res;
 }
+
+#define FASTSEARCH fastsearch
+#define STRINGLIB(F) stringlib_##F
+#define STRINGLIB_CHAR char
+#define STRINGLIB_SIZEOF_CHAR 1
+
+#include "stringlib/fastsearch.h"
+#include "stringlib/count.h"
+#include "stringlib/find.h"
+
+/*
+Wraps stringlib_parse_args_finds() and additionally checks whether the
+first argument is an integer in range(0, 256).
+
+If this is the case, writes the integer value to the byte parameter
+and sets subobj to NULL. Otherwise, stores the first argument in subobj
+and doesn't touch byte. The other parameters are similar to those of
+stringlib_parse_args_finds().
+*/
+
+Py_LOCAL_INLINE(int)
+parse_args_finds_byte(const char *function_name, PyObject *args,
+                      PyObject **subobj, char *byte,
+                      Py_ssize_t *start, Py_ssize_t *end)
+{
+    PyObject *tmp_subobj;
+    Py_ssize_t ival;
+    PyObject *err;
+
+    if(!stringlib_parse_args_finds(function_name, args, &tmp_subobj,
+                                   start, end))
+        return 0;
+
+    if (!PyNumber_Check(tmp_subobj)) {
+        *subobj = tmp_subobj;
+        return 1;
+    }
+
+    ival = PyNumber_AsSsize_t(tmp_subobj, PyExc_OverflowError);
+    if (ival == -1) {
+        err = PyErr_Occurred();
+        if (err && !PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) {
+            PyErr_Clear();
+            *subobj = tmp_subobj;
+            return 1;
+        }
+    }
+
+    if (ival < 0 || ival > 255) {
+        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
+        return 0;
+    }
+
+    *subobj = NULL;
+    *byte = (char)ival;
+    return 1;
+}
+
+/* helper macro to fixup start/end slice values */
+#define ADJUST_INDICES(start, end, len)         \
+    if (end > len)                          \
+        end = len;                          \
+    else if (end < 0) {                     \
+        end += len;                         \
+        if (end < 0)                        \
+        end = 0;                        \
+    }                                       \
+    if (start < 0) {                        \
+        start += len;                       \
+        if (start < 0)                      \
+        start = 0;                      \
+    }
+
+Py_LOCAL_INLINE(Py_ssize_t)
+find_internal(const char *str, Py_ssize_t len,
+              const char *function_name, PyObject *args, int dir)
+{
+    PyObject *subobj;
+    char byte;
+    Py_buffer subbuf;
+    const char *sub;
+    Py_ssize_t sub_len;
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+    Py_ssize_t res;
+
+    if (!parse_args_finds_byte(function_name, args,
+                               &subobj, &byte, &start, &end))
+        return -2;
+
+    if (subobj) {
+        if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0)
+            return -2;
+
+        sub = subbuf.buf;
+        sub_len = subbuf.len;
+    }
+    else {
+        sub = &byte;
+        sub_len = 1;
+    }
+
+    ADJUST_INDICES(start, end, len);
+    if (end - start < sub_len)
+        res = -1;
+    else if (sub_len == 1) {
+        if (dir > 0)
+            res = stringlib_find_char(
+                str + start, end - start,
+                *sub);
+        else
+            res = stringlib_rfind_char(
+                str + start, end - start,
+                *sub);
+        if (res >= 0)
+            res += start;
+    }
+    else {
+        if (dir > 0)
+            res = stringlib_find_slice(
+                str, len,
+                sub, sub_len, start, end);
+        else
+            res = stringlib_rfind_slice(
+                str, len,
+                sub, sub_len, start, end);
+    }
+
+    if (subobj)
+        PyBuffer_Release(&subbuf);
+
+    return res;
+}
+
+PyDoc_STRVAR_shared(_Py_find__doc__,
+"B.find(sub[, start[, end]]) -> int\n\
+\n\
+Return the lowest index in B where subsection sub is found,\n\
+such that sub is contained within B[start:end].  Optional\n\
+arguments start and end are interpreted as in slice notation.\n\
+\n\
+Return -1 on failure.");
+
+PyObject *
+_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "find", args, +1);
+    if (result == -2)
+        return NULL;
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_index__doc__,
+"B.index(sub[, start[, end]]) -> int\n\
+\n\
+Like B.find() but raise ValueError when the subsection is not found.");
+
+PyObject *
+_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "index", args, +1);
+    if (result == -2)
+        return NULL;
+    if (result == -1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "subsection not found");
+        return NULL;
+    }
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_rfind__doc__,
+"B.rfind(sub[, start[, end]]) -> int\n\
+\n\
+Return the highest index in B where subsection sub is found,\n\
+such that sub is contained within B[start:end].  Optional\n\
+arguments start and end are interpreted as in slice notation.\n\
+\n\
+Return -1 on failure.");
+
+PyObject *
+_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "rfind", args, -1);
+    if (result == -2)
+        return NULL;
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_rindex__doc__,
+"B.rindex(sub[, start[, end]]) -> int\n\
+\n\
+Like B.rfind() but raise ValueError when the subsection is not found.");
+
+PyObject *
+_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *args)
+{
+    Py_ssize_t result = find_internal(str, len, "rindex", args, -1);
+    if (result == -2)
+        return NULL;
+    if (result == -1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "subsection not found");
+        return NULL;
+    }
+    return PyLong_FromSsize_t(result);
+}
+
+PyDoc_STRVAR_shared(_Py_count__doc__,
+"B.count(sub[, start[, end]]) -> int\n\
+\n\
+Return the number of non-overlapping occurrences of subsection sub in\n\
+bytes B[start:end].  Optional arguments start and end are interpreted\n\
+as in slice notation.");
+
+PyObject *
+_Py_bytes_count(const char *str, Py_ssize_t len, PyObject *args)
+{
+    PyObject *sub_obj;
+    const char *sub;
+    Py_ssize_t sub_len;
+    char byte;
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+
+    Py_buffer vsub;
+    PyObject *count_obj;
+
+    if (!parse_args_finds_byte("count", args,
+                               &sub_obj, &byte, &start, &end))
+        return NULL;
+
+    if (sub_obj) {
+        if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0)
+            return NULL;
+
+        sub = vsub.buf;
+        sub_len = vsub.len;
+    }
+    else {
+        sub = &byte;
+        sub_len = 1;
+    }
+
+    ADJUST_INDICES(start, end, len);
+
+    count_obj = PyLong_FromSsize_t(
+        stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
+        );
+
+    if (sub_obj)
+        PyBuffer_Release(&vsub);
+
+    return count_obj;
+}
+
+int
+_Py_bytes_contains(const char *str, Py_ssize_t len, PyObject *arg)
+{
+    Py_ssize_t ival = PyNumber_AsSsize_t(arg, NULL);
+    if (ival == -1 && PyErr_Occurred()) {
+        Py_buffer varg;
+        Py_ssize_t pos;
+        PyErr_Clear();
+        if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0)
+            return -1;
+        pos = stringlib_find(str, len,
+                             varg.buf, varg.len, 0);
+        PyBuffer_Release(&varg);
+        return pos >= 0;
+    }
+    if (ival < 0 || ival >= 256) {
+        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
+        return -1;
+    }
+
+    return memchr(str, (int) ival, len) != NULL;
+}
+
+
+/* Matches the end (direction >= 0) or start (direction < 0) of the buffer
+ * against substr, using the start and end arguments. Returns
+ * -1 on error, 0 if not found and 1 if found.
+ */
+Py_LOCAL(int)
+tailmatch(const char *str, Py_ssize_t len, PyObject *substr,
+          Py_ssize_t start, Py_ssize_t end, int direction)
+{
+    Py_buffer sub_view = {NULL, NULL};
+    const char *sub;
+    Py_ssize_t slen;
+
+    if (PyBytes_Check(substr)) {
+        sub = PyBytes_AS_STRING(substr);
+        slen = PyBytes_GET_SIZE(substr);
+    }
+    else {
+        if (PyObject_GetBuffer(substr, &sub_view, PyBUF_SIMPLE) != 0)
+            return -1;
+        sub = sub_view.buf;
+        slen = sub_view.len;
+    }
+
+    ADJUST_INDICES(start, end, len);
+
+    if (direction < 0) {
+        /* startswith */
+        if (start + slen > len)
+            goto notfound;
+    } else {
+        /* endswith */
+        if (end - start < slen || start > len)
+            goto notfound;
+
+        if (end - slen > start)
+            start = end - slen;
+    }
+    if (end - start < slen)
+        goto notfound;
+    if (memcmp(str + start, sub, slen) != 0)
+        goto notfound;
+
+    PyBuffer_Release(&sub_view);
+    return 1;
+
+notfound:
+    PyBuffer_Release(&sub_view);
+    return 0;
+}
+
+Py_LOCAL(PyObject *)
+_Py_bytes_tailmatch(const char *str, Py_ssize_t len,
+                    const char *function_name, PyObject *args,
+                    int direction)
+{
+    Py_ssize_t start = 0;
+    Py_ssize_t end = PY_SSIZE_T_MAX;
+    PyObject *subobj;
+    int result;
+
+    if (!stringlib_parse_args_finds(function_name, args, &subobj, &start, &end))
+        return NULL;
+    if (PyTuple_Check(subobj)) {
+        Py_ssize_t i;
+        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
+            result = tailmatch(str, len, PyTuple_GET_ITEM(subobj, i),
+                               start, end, direction);
+            if (result == -1)
+                return NULL;
+            else if (result) {
+                Py_RETURN_TRUE;
+            }
+        }
+        Py_RETURN_FALSE;
+    }
+    result = tailmatch(str, len, subobj, start, end, direction);
+    if (result == -1) {
+        if (PyErr_ExceptionMatches(PyExc_TypeError))
+            PyErr_Format(PyExc_TypeError,
+                         "%s first arg must be bytes or a tuple of bytes, "
+                         "not %s",
+                         function_name, Py_TYPE(subobj)->tp_name);
+        return NULL;
+    }
+    else
+        return PyBool_FromLong(result);
+}
+
+PyDoc_STRVAR_shared(_Py_startswith__doc__,
+"B.startswith(prefix[, start[, end]]) -> bool\n\
+\n\
+Return True if B starts with the specified prefix, False otherwise.\n\
+With optional start, test B beginning at that position.\n\
+With optional end, stop comparing B at that position.\n\
+prefix can also be a tuple of bytes to try.");
+
+PyObject *
+_Py_bytes_startswith(const char *str, Py_ssize_t len, PyObject *args)
+{
+    return _Py_bytes_tailmatch(str, len, "startswith", args, -1);
+}
+
+PyDoc_STRVAR_shared(_Py_endswith__doc__,
+"B.endswith(suffix[, start[, end]]) -> bool\n\
+\n\
+Return True if B ends with the specified suffix, False otherwise.\n\
+With optional start, test B beginning at that position.\n\
+With optional end, stop comparing B at that position.\n\
+suffix can also be a tuple of bytes to try.");
+
+PyObject *
+_Py_bytes_endswith(const char *str, Py_ssize_t len, PyObject *args)
+{
+    return _Py_bytes_tailmatch(str, len, "endswith", args, +1);
+}
+
+PyDoc_STRVAR_shared(_Py_expandtabs__doc__,
+"B.expandtabs(tabsize=8) -> copy of B\n\
+\n\
+Return a copy of B where all tab characters are expanded using spaces.\n\
+If tabsize is not given, a tab size of 8 characters is assumed.");
+
+PyDoc_STRVAR_shared(_Py_ljust__doc__,
+"B.ljust(width[, fillchar]) -> copy of B\n"
+"\n"
+"Return B left justified in a string of length width. Padding is\n"
+"done using the specified fill character (default is a space).");
+
+PyDoc_STRVAR_shared(_Py_rjust__doc__,
+"B.rjust(width[, fillchar]) -> copy of B\n"
+"\n"
+"Return B right justified in a string of length width. Padding is\n"
+"done using the specified fill character (default is a space).");
+
+PyDoc_STRVAR_shared(_Py_center__doc__,
+"B.center(width[, fillchar]) -> copy of B\n"
+"\n"
+"Return B centered in a string of length width.  Padding is\n"
+"done using the specified fill character (default is a space).");
+
+PyDoc_STRVAR_shared(_Py_zfill__doc__,
+"B.zfill(width) -> copy of B\n"
+"\n"
+"Pad a numeric string B with zeros on the left, to fill a field\n"
+"of the specified width.  B is never truncated.");
+
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 5934336..b0d9b39 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -9,9 +9,9 @@
 #include <stddef.h>
 
 /*[clinic input]
-class bytes "PyBytesObject*" "&PyBytes_Type"
+class bytes "PyBytesObject *" "&PyBytes_Type"
 [clinic start generated code]*/
-/*[clinic end generated code: output=da39a3ee5e6b4b0d input=1a1d9102afc1b00c]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=7a238f965d64892b]*/
 
 #include "clinic/bytesobject.c.h"
 
@@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
 */
 #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
 
+/* Forward declaration */
+Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
+                                                   char *str);
+
 /*
    For PyBytes_FromString(), the parameter `str' points to a null-terminated
    string containing exactly `size' bytes.
@@ -174,190 +178,184 @@ PyBytes_FromString(const char *str)
 PyObject *
 PyBytes_FromFormatV(const char *format, va_list vargs)
 {
-    va_list count;
-    Py_ssize_t n = 0;
-    const char* f;
     char *s;
-    PyObject* string;
+    const char *f;
+    const char *p;
+    Py_ssize_t prec;
+    int longflag;
+    int size_tflag;
+    /* Longest 64-bit formatted numbers:
+       - "18446744073709551615\0" (21 bytes)
+       - "-9223372036854775808\0" (21 bytes)
+       Decimal takes the most space (21 bytes would not be enough for octal).
+
+       Longest 64-bit pointer representation:
+       "0xffffffffffffffff\0" (19 bytes). */
+    char buffer[21];
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+
+    s = _PyBytesWriter_Alloc(&writer, strlen(format));
+    if (s == NULL)
+        return NULL;
+    writer.overallocate = 1;
+
+#define WRITE_BYTES(str) \
+    do { \
+        s = _PyBytesWriter_WriteBytes(&writer, s, (str), strlen(str)); \
+        if (s == NULL) \
+            goto error; \
+    } while (0)
 
-    Py_VA_COPY(count, vargs);
-    /* step 1: figure out how large a buffer we need */
     for (f = format; *f; f++) {
-        if (*f == '%') {
-            const char* p = f;
-            while (*++f && *f != '%' && !Py_ISALPHA(*f))
-                ;
-
-            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
-             * they don't affect the amount of space we reserve.
-             */
-            if ((*f == 'l' || *f == 'z') &&
-                            (f[1] == 'd' || f[1] == 'u'))
-                ++f;
-
-            switch (*f) {
-            case 'c':
-            {
-                int c = va_arg(count, int);
-                if (c < 0 || c > 255) {
-                    PyErr_SetString(PyExc_OverflowError,
-                                    "PyBytes_FromFormatV(): %c format "
-                                    "expects an integer in range [0; 255]");
-                    return NULL;
-                }
-                n++;
-                break;
+        if (*f != '%') {
+            *s++ = *f;
+            continue;
+        }
+
+        p = f++;
+
+        /* ignore the width (ex: 10 in "%10s") */
+        while (Py_ISDIGIT(*f))
+            f++;
+
+        /* parse the precision (ex: 10 in "%.10s") */
+        prec = 0;
+        if (*f == '.') {
+            f++;
+            for (; Py_ISDIGIT(*f); f++) {
+                prec = (prec * 10) + (*f - '0');
             }
-            case '%':
-                n++;
-                break;
-            case 'd': case 'u': case 'i': case 'x':
-                (void) va_arg(count, int);
-                /* 20 bytes is enough to hold a 64-bit
-                   integer.  Decimal takes the most space.
-                   This isn't enough for octal. */
-                n += 20;
-                break;
-            case 's':
-                s = va_arg(count, char*);
-                n += strlen(s);
-                break;
-            case 'p':
-                (void) va_arg(count, int);
-                /* maximum 64-bit pointer representation:
-                 * 0xffffffffffffffff
-                 * so 19 characters is enough.
-                 * XXX I count 18 -- what's the extra for?
-                 */
-                n += 19;
-                break;
-            default:
-                /* if we stumble upon an unknown
-                   formatting code, copy the rest of
-                   the format string to the output
-                   string. (we cannot just skip the
-                   code, since there's no way to know
-                   what's in the argument list) */
-                n += strlen(p);
-                goto expand;
+        }
+
+        while (*f && *f != '%' && !Py_ISALPHA(*f))
+            f++;
+
+        /* handle the long flag ('l'), but only for %ld and %lu.
+           others can be added when necessary. */
+        longflag = 0;
+        if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
+            longflag = 1;
+            ++f;
+        }
+
+        /* handle the size_t flag ('z'). */
+        size_tflag = 0;
+        if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
+            size_tflag = 1;
+            ++f;
+        }
+
+        /* subtract bytes preallocated for the format string
+           (ex: 2 for "%s") */
+        writer.min_size -= (f - p + 1);
+
+        switch (*f) {
+        case 'c':
+        {
+            int c = va_arg(vargs, int);
+            if (c < 0 || c > 255) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "PyBytes_FromFormatV(): %c format "
+                                "expects an integer in range [0; 255]");
+                goto error;
             }
-        } else
-            n++;
-    }
- expand:
-    /* step 2: fill the buffer */
-    /* Since we've analyzed how much space we need for the worst case,
-       use sprintf directly instead of the slower PyOS_snprintf. */
-    string = PyBytes_FromStringAndSize(NULL, n);
-    if (!string)
-        return NULL;
+            writer.min_size++;
+            *s++ = (unsigned char)c;
+            break;
+        }
 
-    s = PyBytes_AsString(string);
+        case 'd':
+            if (longflag)
+                sprintf(buffer, "%ld", va_arg(vargs, long));
+            else if (size_tflag)
+                sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
+                    va_arg(vargs, Py_ssize_t));
+            else
+                sprintf(buffer, "%d", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
 
-    for (f = format; *f; f++) {
-        if (*f == '%') {
-            const char* p = f++;
+        case 'u':
+            if (longflag)
+                sprintf(buffer, "%lu",
+                    va_arg(vargs, unsigned long));
+            else if (size_tflag)
+                sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
+                    va_arg(vargs, size_t));
+            else
+                sprintf(buffer, "%u",
+                    va_arg(vargs, unsigned int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 'i':
+            sprintf(buffer, "%i", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 'x':
+            sprintf(buffer, "%x", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 's':
+        {
             Py_ssize_t i;
-            int longflag = 0;
-            int size_tflag = 0;
-            /* parse the width.precision part (we're only
-               interested in the precision value, if any) */
-            n = 0;
-            while (Py_ISDIGIT(*f))
-                n = (n*10) + *f++ - '0';
-            if (*f == '.') {
-                f++;
-                n = 0;
-                while (Py_ISDIGIT(*f))
-                    n = (n*10) + *f++ - '0';
-            }
-            while (*f && *f != '%' && !Py_ISALPHA(*f))
-                f++;
-            /* handle the long flag, but only for %ld and %lu.
-               others can be added when necessary. */
-            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
-                longflag = 1;
-                ++f;
-            }
-            /* handle the size_t flag. */
-            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
-                size_tflag = 1;
-                ++f;
-            }
 
-            switch (*f) {
-            case 'c':
-            {
-                int c = va_arg(vargs, int);
-                /* c has been checked for overflow in the first step */
-                *s++ = (unsigned char)c;
-                break;
+            p = va_arg(vargs, const char*);
+            i = strlen(p);
+            if (prec > 0 && i > prec)
+                i = prec;
+            s = _PyBytesWriter_WriteBytes(&writer, s, p, i);
+            if (s == NULL)
+                goto error;
+            break;
+        }
+
+        case 'p':
+            sprintf(buffer, "%p", va_arg(vargs, void*));
+            assert(strlen(buffer) < sizeof(buffer));
+            /* %p is ill-defined:  ensure leading 0x. */
+            if (buffer[1] == 'X')
+                buffer[1] = 'x';
+            else if (buffer[1] != 'x') {
+                memmove(buffer+2, buffer, strlen(buffer)+1);
+                buffer[0] = '0';
+                buffer[1] = 'x';
             }
-            case 'd':
-                if (longflag)
-                    sprintf(s, "%ld", va_arg(vargs, long));
-                else if (size_tflag)
-                    sprintf(s, "%" PY_FORMAT_SIZE_T "d",
-                        va_arg(vargs, Py_ssize_t));
-                else
-                    sprintf(s, "%d", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 'u':
-                if (longflag)
-                    sprintf(s, "%lu",
-                        va_arg(vargs, unsigned long));
-                else if (size_tflag)
-                    sprintf(s, "%" PY_FORMAT_SIZE_T "u",
-                        va_arg(vargs, size_t));
-                else
-                    sprintf(s, "%u",
-                        va_arg(vargs, unsigned int));
-                s += strlen(s);
-                break;
-            case 'i':
-                sprintf(s, "%i", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 'x':
-                sprintf(s, "%x", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 's':
-                p = va_arg(vargs, char*);
-                i = strlen(p);
-                if (n > 0 && i > n)
-                    i = n;
-                Py_MEMCPY(s, p, i);
-                s += i;
-                break;
-            case 'p':
-                sprintf(s, "%p", va_arg(vargs, void*));
-                /* %p is ill-defined:  ensure leading 0x. */
-                if (s[1] == 'X')
-                    s[1] = 'x';
-                else if (s[1] != 'x') {
-                    memmove(s+2, s, strlen(s)+1);
-                    s[0] = '0';
-                    s[1] = 'x';
-                }
-                s += strlen(s);
-                break;
-            case '%':
-                *s++ = '%';
-                break;
-            default:
-                strcpy(s, p);
-                s += strlen(s);
-                goto end;
+            WRITE_BYTES(buffer);
+            break;
+
+        case '%':
+            writer.min_size++;
+            *s++ = '%';
+            break;
+
+        default:
+            if (*f == 0) {
+                /* fix min_size if we reached the end of the format string */
+                writer.min_size++;
             }
-        } else
-            *s++ = *f;
+
+            /* invalid format string: copy unformatted string and exit */
+            WRITE_BYTES(p);
+            return _PyBytesWriter_Finish(&writer, s);
+        }
     }
 
- end:
-    _PyBytes_Resize(&string, s - PyBytes_AS_STRING(string));
-    return string;
+#undef WRITE_BYTES
+
+    return _PyBytesWriter_Finish(&writer, s);
+
+ error:
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
 }
 
 PyObject *
@@ -409,12 +407,14 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
 
 /* Returns a new reference to a PyBytes object, or NULL on failure. */
 
-static PyObject *
-formatfloat(PyObject *v, int flags, int prec, int type)
+static char*
+formatfloat(PyObject *v, int flags, int prec, int type,
+            PyObject **p_result, _PyBytesWriter *writer, char *str)
 {
     char *p;
     PyObject *result;
     double x;
+    size_t len;
 
     x = PyFloat_AsDouble(v);
     if (x == -1.0 && PyErr_Occurred()) {
@@ -431,9 +431,22 @@ formatfloat(PyObject *v, int flags, int prec, int type)
 
     if (p == NULL)
         return NULL;
-    result = PyBytes_FromStringAndSize(p, strlen(p));
+
+    len = strlen(p);
+    if (writer != NULL) {
+        str = _PyBytesWriter_Prepare(writer, str, len);
+        if (str != NULL) {
+            Py_MEMCPY(str, p, len);
+            str += len;
+        }
+        PyMem_Free(p);  /* released on the failure path too (was leaked) */
+        return str;
+    }
+    /* No writer: return a new bytes object through *p_result */
+    result = PyBytes_FromStringAndSize(p, len);
     PyMem_Free(p);
-    return result;
+    *p_result = result;
+    return result != NULL ? str : NULL;  /* NULL on failure */
 }
 
 static PyObject *
@@ -473,11 +486,11 @@ formatlong(PyObject *v, int flags, int prec, int type)
 static int
 byte_converter(PyObject *arg, char *p)
 {
-    if (PyBytes_Check(arg) && PyBytes_Size(arg) == 1) {
+    if (PyBytes_Check(arg) && PyBytes_GET_SIZE(arg) == 1) {
         *p = PyBytes_AS_STRING(arg)[0];
         return 1;
     }
-    else if (PyByteArray_Check(arg) && PyByteArray_Size(arg) == 1) {
+    else if (PyByteArray_Check(arg) && PyByteArray_GET_SIZE(arg) == 1) {
         *p = PyByteArray_AS_STRING(arg)[0];
         return 1;
     }
@@ -557,36 +570,36 @@ format_obj(PyObject *v, const char **pbuf, Py_ssize_t *plen)
     return NULL;
 }
 
-/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
-
-   FORMATBUFLEN is the length of the buffer in which the ints &
-   chars are formatted. XXX This is a magic number. Each formatting
-   routine does bounds checking to ensure no overflow, but a better
-   solution may be to malloc a buffer of appropriate size for each
-   format. For now, the current solution is sufficient.
-*/
-#define FORMATBUFLEN (size_t)120
+/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) */
 
 PyObject *
-_PyBytes_Format(PyObject *format, PyObject *args)
+_PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
+                  PyObject *args, int use_bytearray)
 {
-    char *fmt, *res;
+    const char *fmt;
+    char *res;
     Py_ssize_t arglen, argidx;
-    Py_ssize_t reslen, rescnt, fmtcnt;
+    Py_ssize_t fmtcnt;
     int args_owned = 0;
-    PyObject *result;
     PyObject *dict = NULL;
-    if (format == NULL || !PyBytes_Check(format) || args == NULL) {
+    _PyBytesWriter writer;
+
+    if (args == NULL) {
         PyErr_BadInternalCall();
         return NULL;
     }
-    fmt = PyBytes_AS_STRING(format);
-    fmtcnt = PyBytes_GET_SIZE(format);
-    reslen = rescnt = fmtcnt + 100;
-    result = PyBytes_FromStringAndSize((char *)NULL, reslen);
-    if (result == NULL)
+    fmt = format;
+    fmtcnt = format_len;
+
+    _PyBytesWriter_Init(&writer);
+    writer.use_bytearray = use_bytearray;
+
+    res = _PyBytesWriter_Alloc(&writer, fmtcnt);
+    if (res == NULL)
         return NULL;
-    res = PyBytes_AsString(result);
+    if (!use_bytearray)
+        writer.overallocate = 1;
+
     if (PyTuple_Check(args)) {
         arglen = PyTuple_GET_SIZE(args);
         argidx = 0;
@@ -600,18 +613,23 @@ _PyBytes_Format(PyObject *format, PyObject *args)
         !PyByteArray_Check(args)) {
             dict = args;
     }
+
     while (--fmtcnt >= 0) {
         if (*fmt != '%') {
-            if (--rescnt < 0) {
-                rescnt = fmtcnt + 100;
-                reslen += rescnt;
-                if (_PyBytes_Resize(&result, reslen))
-                    return NULL;
-                res = PyBytes_AS_STRING(result)
-                    + reslen - rescnt;
-                --rescnt;
-            }
-            *res++ = *fmt++;
+            Py_ssize_t len;
+            char *pos;
+            /* memchr(), not strchr(): the format may contain null bytes */
+            pos = (char *)memchr(fmt + 1, '%', fmtcnt);
+            if (pos != NULL)
+                len = pos - fmt;
+            else
+                len = format_len - (fmt - format);
+            assert(len != 0);
+
+            Py_MEMCPY(res, fmt, len);
+            res += len;
+            fmt += len;
+            fmtcnt -= (len - 1);
         }
         else {
             /* Got a format specifier */
@@ -626,10 +644,14 @@ _PyBytes_Format(PyObject *format, PyObject *args)
             int sign;
             Py_ssize_t len = 0;
             char onechar; /* For byte_converter() */
+            Py_ssize_t alloc;
+#ifdef Py_DEBUG
+            char *before;
+#endif
 
             fmt++;
             if (*fmt == '(') {
-                char *keystart;
+                const char *keystart;
                 Py_ssize_t keylen;
                 PyObject *key;
                 int pcount = 1;
@@ -673,6 +695,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 arglen = -1;
                 argidx = -2;
             }
+
+            /* Parse flags. Example: "%+i" => flags=F_SIGN. */
             while (--fmtcnt >= 0) {
                 switch (c = *fmt++) {
                 case '-': flags |= F_LJUST; continue;
@@ -683,6 +707,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 }
                 break;
             }
+
+            /* Parse width. Example: "%10s" => width=10 */
             if (c == '*') {
                 v = getnextarg(args, arglen, &argidx);
                 if (v == NULL)
@@ -717,6 +743,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     width = width*10 + (c - '0');
                 }
             }
+
+            /* Parse precision. Example: "%.3f" => prec=3 */
             if (c == '.') {
                 prec = 0;
                 if (--fmtcnt >= 0)
@@ -771,13 +799,19 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (v == NULL)
                     goto error;
             }
+
+            if (fmtcnt < 0) {
+                /* last write: disable writer overallocation */
+                writer.overallocate = 0;
+            }
+
             sign = 0;
             fill = ' ';
             switch (c) {
             case '%':
-                pbuf = "%";
-                len = 1;
-                break;
+                *res++ = '%';
+                continue;
+
             case 'r':
                 // %r is only for 2/3 code; 3 only code should use %a
             case 'a':
@@ -790,6 +824,7 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (prec >= 0 && len > prec)
                     len = prec;
                 break;
+
             case 's':
                 // %s is only for 2/3 code; 3 only code should use %b
             case 'b':
@@ -799,12 +834,49 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (prec >= 0 && len > prec)
                     len = prec;
                 break;
+
             case 'i':
             case 'd':
             case 'u':
             case 'o':
             case 'x':
             case 'X':
+                if (PyLong_CheckExact(v)
+                    && width == -1 && prec == -1
+                    && !(flags & (F_SIGN | F_BLANK))
+                    && c != 'X')
+                {
+                    /* Fast path */
+                    int alternate = flags & F_ALT;
+                    int base;
+
+                    switch(c)
+                    {
+                        default:
+                            assert(0 && "'type' not in [diuoxX]");
+                        case 'd':
+                        case 'i':
+                        case 'u':
+                            base = 10;
+                            break;
+                        case 'o':
+                            base = 8;
+                            break;
+                        case 'x':
+                        case 'X':
+                            base = 16;
+                            break;
+                    }
+
+                    /* Fast path */
+                    writer.min_size -= 2; /* size preallocated for "%d" */
+                    res = _PyLong_FormatBytesWriter(&writer, res,
+                                                    v, base, alternate);
+                    if (res == NULL)
+                        goto error;
+                    continue;
+                }
+
                 temp = formatlong(v, flags, prec, c);
                 if (!temp)
                     goto error;
@@ -815,14 +887,25 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (flags & F_ZERO)
                     fill = '0';
                 break;
+
             case 'e':
             case 'E':
             case 'f':
             case 'F':
             case 'g':
             case 'G':
-                temp = formatfloat(v, flags, prec, c);
-                if (temp == NULL)
+                if (width == -1 && prec == -1
+                    && !(flags & (F_SIGN | F_BLANK)))
+                {
+                    /* Fast path */
+                    writer.min_size -= 2; /* size preallocated for "%f" */
+                    res = formatfloat(v, flags, prec, c, NULL, &writer, res);
+                    if (res == NULL)
+                        goto error;
+                    continue;
+                }
+
+                if (!formatfloat(v, flags, prec, c, &temp, NULL, res))
                     goto error;
                 pbuf = PyBytes_AS_STRING(temp);
                 len = PyBytes_GET_SIZE(temp);
@@ -830,21 +913,28 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (flags & F_ZERO)
                     fill = '0';
                 break;
+
             case 'c':
                 pbuf = &onechar;
                 len = byte_converter(v, &onechar);
                 if (!len)
                     goto error;
+                if (width == -1) {
+                    /* Fast path */
+                    *res++ = onechar;
+                    continue;
+                }
                 break;
+
             default:
                 PyErr_Format(PyExc_ValueError,
                   "unsupported format character '%c' (0x%x) "
                   "at index %zd",
                   c, c,
-                  (Py_ssize_t)(fmt - 1 -
-                               PyBytes_AsString(format)));
+                  (Py_ssize_t)(fmt - 1 - format));
                 goto error;
             }
+
             if (sign) {
                 if (*pbuf == '-' || *pbuf == '+') {
                     sign = *pbuf++;
@@ -859,29 +949,31 @@ _PyBytes_Format(PyObject *format, PyObject *args)
             }
             if (width < len)
                 width = len;
-            if (rescnt - (sign != 0) < width) {
-                reslen -= rescnt;
-                rescnt = width + fmtcnt + 100;
-                reslen += rescnt;
-                if (reslen < 0) {
-                    Py_DECREF(result);
-                    Py_XDECREF(temp);
-                    return PyErr_NoMemory();
-                }
-                if (_PyBytes_Resize(&result, reslen)) {
-                    Py_XDECREF(temp);
-                    return NULL;
-                }
-                res = PyBytes_AS_STRING(result)
-                    + reslen - rescnt;
+
+            alloc = width;
+            if (sign != 0 && len == width)
+                alloc++;
+            /* 2: size preallocated for %s */
+            if (alloc > 2) {
+                res = _PyBytesWriter_Prepare(&writer, res, alloc - 2);
+                if (res == NULL)
+                    goto error;
             }
+#ifdef Py_DEBUG
+            before = res;
+#endif
+
+            /* Write the sign if needed */
             if (sign) {
                 if (fill != ' ')
                     *res++ = sign;
-                rescnt--;
                 if (width > len)
                     width--;
             }
+
+            /* Write the numeric prefix for "x", "X" and "o" formats
+               if the alternate form is used.
+               For example, write "0x" for the "%#x" format. */
             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                 assert(pbuf[0] == '0');
                 assert(pbuf[1] == c);
@@ -889,18 +981,21 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     *res++ = *pbuf++;
                     *res++ = *pbuf++;
                 }
-                rescnt -= 2;
                 width -= 2;
                 if (width < 0)
                     width = 0;
                 len -= 2;
             }
+
+            /* Pad left with the fill character if needed */
             if (width > len && !(flags & F_LJUST)) {
-                do {
-                    --rescnt;
-                    *res++ = fill;
-                } while (--width > len);
+                memset(res, fill, width - len);
+                res += (width - len);
+                width = len;
             }
+
+            /* If padding with spaces: write sign if needed and/or numeric
+               prefix if the alternate form is used */
             if (fill == ' ') {
                 if (sign)
                     *res++ = sign;
@@ -912,13 +1007,17 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     *res++ = *pbuf++;
                 }
             }
+
+            /* Copy bytes */
             Py_MEMCPY(res, pbuf, len);
             res += len;
-            rescnt -= len;
-            while (--width >= len) {
-                --rescnt;
-                *res++ = ' ';
+
+            /* Pad right with spaces if needed */
+            if (width > len) {
+                memset(res, ' ', width - len);
+                res += (width - len);
             }
+
             if (dict && (argidx < arglen) && c != '%') {
                 PyErr_SetString(PyExc_TypeError,
                            "not all arguments converted during bytes formatting");
@@ -926,22 +1025,31 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 goto error;
             }
             Py_XDECREF(temp);
+
+#ifdef Py_DEBUG
+            /* check that we computed the exact size for this write */
+            assert((res - before) == alloc);
+#endif
         } /* '%' */
+
+        /* If overallocation was disabled, ensure that it was the last
+           write. Otherwise, we missed an optimization */
+        assert(writer.overallocate || fmtcnt < 0 || use_bytearray);
     } /* until end */
+
     if (argidx < arglen && !dict) {
         PyErr_SetString(PyExc_TypeError,
                         "not all arguments converted during bytes formatting");
         goto error;
     }
+
     if (args_owned) {
         Py_DECREF(args);
     }
-    if (_PyBytes_Resize(&result, reslen - rescnt))
-        return NULL;
-    return result;
+    return _PyBytesWriter_Finish(&writer, res);
 
  error:
-    Py_DECREF(result);
+    _PyBytesWriter_Dealloc(&writer);
     if (args_owned) {
         Py_DECREF(args);
     }
@@ -961,6 +1069,42 @@ bytes_dealloc(PyObject *op)
    the string is UTF-8 encoded and should be re-encoded in the
    specified encoding.  */
 
+static char *
+_PyBytes_DecodeEscapeRecode(const char **s, const char *end,
+                            const char *errors, const char *recode_encoding,
+                            _PyBytesWriter *writer, char *p)
+{
+    PyObject *u, *w;
+    const char* t;
+
+    t = *s;
+    /* Decode non-ASCII bytes as UTF-8. */
+    while (t < end && (*t & 0x80))
+        t++;
+    u = PyUnicode_DecodeUTF8(*s, t - *s, errors);
+    if (u == NULL)
+        return NULL;
+
+    /* Recode them in target encoding. */
+    w = PyUnicode_AsEncodedString(u, recode_encoding, errors);
+    Py_DECREF(u);
+    if  (w == NULL)
+        return NULL;
+    assert(PyBytes_Check(w));
+
+    /* Append bytes to output buffer. */
+    writer->min_size--;   /* subtract 1 preallocated byte */
+    p = _PyBytesWriter_WriteBytes(writer, p,
+                                  PyBytes_AS_STRING(w),
+                                  PyBytes_GET_SIZE(w));
+    Py_DECREF(w);
+    if (p == NULL)
+        return NULL;
+
+    *s = t;
+    return p;
+}
+
 PyObject *PyBytes_DecodeEscape(const char *s,
                                 Py_ssize_t len,
                                 const char *errors,
@@ -968,54 +1112,42 @@ PyObject *PyBytes_DecodeEscape(const char *s,
                                 const char *recode_encoding)
 {
     int c;
-    char *p, *buf;
+    char *p;
     const char *end;
-    PyObject *v;
-    Py_ssize_t newlen = recode_encoding ? 4*len:len;
-    v = PyBytes_FromStringAndSize((char *)NULL, newlen);
-    if (v == NULL)
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+
+    p = _PyBytesWriter_Alloc(&writer, len);
+    if (p == NULL)
         return NULL;
-    p = buf = PyBytes_AsString(v);
+    writer.overallocate = 1;
+
     end = s + len;
     while (s < end) {
         if (*s != '\\') {
           non_esc:
-            if (recode_encoding && (*s & 0x80)) {
-                PyObject *u, *w;
-                char *r;
-                const char* t;
-                Py_ssize_t rn;
-                t = s;
-                /* Decode non-ASCII bytes as UTF-8. */
-                while (t < end && (*t & 0x80)) t++;
-                u = PyUnicode_DecodeUTF8(s, t - s, errors);
-                if(!u) goto failed;
-
-                /* Recode them in target encoding. */
-                w = PyUnicode_AsEncodedString(
-                    u, recode_encoding, errors);
-                Py_DECREF(u);
-                if (!w)                 goto failed;
-
-                /* Append bytes to output buffer. */
-                assert(PyBytes_Check(w));
-                r = PyBytes_AS_STRING(w);
-                rn = PyBytes_GET_SIZE(w);
-                Py_MEMCPY(p, r, rn);
-                p += rn;
-                Py_DECREF(w);
-                s = t;
-            } else {
+            if (!(recode_encoding && (*s & 0x80))) {
                 *p++ = *s++;
             }
+            else {
+                /* non-ASCII character and need to recode */
+                p = _PyBytes_DecodeEscapeRecode(&s, end,
+                                                errors, recode_encoding,
+                                                &writer, p);
+                if (p == NULL)
+                    goto failed;
+            }
             continue;
         }
+
         s++;
-        if (s==end) {
+        if (s == end) {
             PyErr_SetString(PyExc_ValueError,
                             "Trailing \\ in string");
             goto failed;
         }
+
         switch (*s++) {
         /* XXX This assumes ASCII! */
         case '\n': break;
@@ -1040,28 +1172,18 @@ PyObject *PyBytes_DecodeEscape(const char *s,
             *p++ = c;
             break;
         case 'x':
-            if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) {
-                unsigned int x = 0;
-                c = Py_CHARMASK(*s);
-                s++;
-                if (Py_ISDIGIT(c))
-                    x = c - '0';
-                else if (Py_ISLOWER(c))
-                    x = 10 + c - 'a';
-                else
-                    x = 10 + c - 'A';
-                x = x << 4;
-                c = Py_CHARMASK(*s);
-                s++;
-                if (Py_ISDIGIT(c))
-                    x += c - '0';
-                else if (Py_ISLOWER(c))
-                    x += 10 + c - 'a';
-                else
-                    x += 10 + c - 'A';
-                *p++ = x;
-                break;
+            if (s+1 < end) {
+                int digit1, digit2;
+                digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])];
+                digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])];
+                if (digit1 < 16 && digit2 < 16) {
+                    *p++ = (unsigned char)((digit1 << 4) + digit2);
+                    s += 2;
+                    break;
+                }
             }
+            /* invalid hexadecimal digits */
+
             if (!errors || strcmp(errors, "strict") == 0) {
                 PyErr_Format(PyExc_ValueError,
                              "invalid \\x escape at position %d",
@@ -1083,6 +1205,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
             if (s < end && Py_ISXDIGIT(s[0]))
                 s++; /* and a hexdigit */
             break;
+
         default:
             *p++ = '\\';
             s--;
@@ -1090,11 +1213,11 @@ PyObject *PyBytes_DecodeEscape(const char *s,
                              UTF-8 bytes may follow. */
         }
     }
-    if (p-buf < newlen)
-        _PyBytes_Resize(&v, p - buf);
-    return v;
+
+    return _PyBytesWriter_Finish(&writer, p);
+
   failed:
-    Py_DECREF(v);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 }
 
@@ -1363,24 +1486,7 @@ bytes_repeat(PyBytesObject *a, Py_ssize_t n)
 static int
 bytes_contains(PyObject *self, PyObject *arg)
 {
-    Py_ssize_t ival = PyNumber_AsSsize_t(arg, PyExc_ValueError);
-    if (ival == -1 && PyErr_Occurred()) {
-        Py_buffer varg;
-        Py_ssize_t pos;
-        PyErr_Clear();
-        if (PyObject_GetBuffer(arg, &varg, PyBUF_SIMPLE) != 0)
-            return -1;
-        pos = stringlib_find(PyBytes_AS_STRING(self), Py_SIZE(self),
-                             varg.buf, varg.len, 0);
-        PyBuffer_Release(&varg);
-        return pos >= 0;
-    }
-    if (ival < 0 || ival >= 256) {
-        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
-        return -1;
-    }
-
-    return memchr(PyBytes_AS_STRING(self), (int) ival, Py_SIZE(self)) != NULL;
+    return _Py_bytes_contains(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), arg);
 }
 
 static PyObject *
@@ -1627,8 +1733,8 @@ Return a list of the sections in the bytes, using sep as the delimiter.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit)
-/*[clinic end generated code: output=8bde44dacb36ef2e input=8b809b39074abbfa]*/
+bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
+/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n;
     const char *s = PyBytes_AS_STRING(self), *sub;
@@ -1652,7 +1758,6 @@ bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit)
 /*[clinic input]
 bytes.partition
 
-    self: self(type="PyBytesObject *")
     sep: Py_buffer
     /
 
@@ -1668,7 +1773,7 @@ object and two empty bytes objects.
 
 static PyObject *
 bytes_partition_impl(PyBytesObject *self, Py_buffer *sep)
-/*[clinic end generated code: output=f532b392a17ff695 input=bc855dc63ca949de]*/
+/*[clinic end generated code: output=f532b392a17ff695 input=61cca95519406099]*/
 {
     return stringlib_partition(
         (PyObject*) self,
@@ -1680,7 +1785,6 @@ bytes_partition_impl(PyBytesObject *self, Py_buffer *sep)
 /*[clinic input]
 bytes.rpartition
 
-    self: self(type="PyBytesObject *")
     sep: Py_buffer
     /
 
@@ -1696,7 +1800,7 @@ objects and the original bytes object.
 
 static PyObject *
 bytes_rpartition_impl(PyBytesObject *self, Py_buffer *sep)
-/*[clinic end generated code: output=191b114cbb028e50 input=6588fff262a9170e]*/
+/*[clinic end generated code: output=191b114cbb028e50 input=67f689e63a62d478]*/
 {
     return stringlib_rpartition(
         (PyObject*) self,
@@ -1714,8 +1818,8 @@ Splitting is done starting at the end of the bytes and working to the front.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_rsplit_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit)
-/*[clinic end generated code: output=0b6570b977911d88 input=0f86c9f28f7d7b7b]*/
+bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
+/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n;
     const char *s = PyBytes_AS_STRING(self), *sub;
@@ -1753,8 +1857,8 @@ Example: b'.'.join([b'ab', b'pq', b'rs']) -> b'ab.pq.rs'.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_join(PyBytesObject*self, PyObject *iterable_of_bytes)
-/*[clinic end generated code: output=634aff14764ff997 input=7fe377b95bd549d2]*/
+bytes_join(PyBytesObject *self, PyObject *iterable_of_bytes)
+/*[clinic end generated code: output=a046f379f626f6f8 input=7fe377b95bd549d2]*/
 {
     return stringlib_bytes_join((PyObject*)self, iterable_of_bytes);
 }
@@ -1767,158 +1871,30 @@ _PyBytes_Join(PyObject *sep, PyObject *x)
     return bytes_join((PyBytesObject*)sep, x);
 }
 
-/* helper macro to fixup start/end slice values */
-#define ADJUST_INDICES(start, end, len)         \
-    if (end > len)                          \
-        end = len;                          \
-    else if (end < 0) {                     \
-        end += len;                         \
-        if (end < 0)                        \
-        end = 0;                        \
-    }                                       \
-    if (start < 0) {                        \
-        start += len;                       \
-        if (start < 0)                      \
-        start = 0;                      \
-    }
-
-Py_LOCAL_INLINE(Py_ssize_t)
-bytes_find_internal(PyBytesObject *self, PyObject *args, int dir)
-{
-    PyObject *subobj;
-    char byte;
-    Py_buffer subbuf;
-    const char *sub;
-    Py_ssize_t len, sub_len;
-    Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
-    Py_ssize_t res;
-
-    if (!stringlib_parse_args_finds_byte("find/rfind/index/rindex",
-                                         args, &subobj, &byte, &start, &end))
-        return -2;
-
-    if (subobj) {
-        if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0)
-            return -2;
-
-        sub = subbuf.buf;
-        sub_len = subbuf.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-    len = PyBytes_GET_SIZE(self);
-
-    ADJUST_INDICES(start, end, len);
-    if (end - start < sub_len)
-        res = -1;
-    else if (sub_len == 1
-#ifndef HAVE_MEMRCHR
-            && dir > 0
-#endif
-    ) {
-        unsigned char needle = *sub;
-        int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH;
-        res = stringlib_fastsearch_memchr_1char(
-            PyBytes_AS_STRING(self) + start, end - start,
-            needle, needle, mode);
-        if (res >= 0)
-            res += start;
-    }
-    else {
-        if (dir > 0)
-            res = stringlib_find_slice(
-                PyBytes_AS_STRING(self), len,
-                sub, sub_len, start, end);
-        else
-            res = stringlib_rfind_slice(
-                PyBytes_AS_STRING(self), len,
-                sub, sub_len, start, end);
-    }
-
-    if (subobj)
-        PyBuffer_Release(&subbuf);
-
-    return res;
-}
-
-
-PyDoc_STRVAR(find__doc__,
-"B.find(sub[, start[, end]]) -> int\n\
-\n\
-Return the lowest index in B where substring sub is found,\n\
-such that sub is contained within B[start:end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytes_find(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_find(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(index__doc__,
-"B.index(sub[, start[, end]]) -> int\n\
-\n\
-Like B.find() but raise ValueError when the substring is not found.");
-
 static PyObject *
 bytes_index(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, +1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "substring not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_index(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
-PyDoc_STRVAR(rfind__doc__,
-"B.rfind(sub[, start[, end]]) -> int\n\
-\n\
-Return the highest index in B where substring sub is found,\n\
-such that sub is contained within B[start:end].  Optional\n\
-arguments start and end are interpreted as in slice notation.\n\
-\n\
-Return -1 on failure.");
-
 static PyObject *
 bytes_rfind(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rfind(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
-PyDoc_STRVAR(rindex__doc__,
-"B.rindex(sub[, start[, end]]) -> int\n\
-\n\
-Like B.rfind() but raise ValueError when the substring is not found.");
-
 static PyObject *
 bytes_rindex(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t result = bytes_find_internal(self, args, -1);
-    if (result == -2)
-        return NULL;
-    if (result == -1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "substring not found");
-        return NULL;
-    }
-    return PyLong_FromSsize_t(result);
+    return _Py_bytes_rindex(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
@@ -2005,7 +1981,6 @@ do_argstrip(PyBytesObject *self, int striptype, PyObject *bytes)
 /*[clinic input]
 bytes.strip
 
-    self: self(type="PyBytesObject *")
     bytes: object = None
     /
 
@@ -2016,7 +1991,7 @@ If the argument is omitted or None, strip leading and trailing ASCII whitespace.
 
 static PyObject *
 bytes_strip_impl(PyBytesObject *self, PyObject *bytes)
-/*[clinic end generated code: output=c7c228d3bd104a1b input=37daa5fad1395d95]*/
+/*[clinic end generated code: output=c7c228d3bd104a1b input=8a354640e4e0b3ef]*/
 {
     return do_argstrip(self, BOTHSTRIP, bytes);
 }
@@ -2024,7 +1999,6 @@ bytes_strip_impl(PyBytesObject *self, PyObject *bytes)
 /*[clinic input]
 bytes.lstrip
 
-    self: self(type="PyBytesObject *")
     bytes: object = None
     /
 
@@ -2035,7 +2009,7 @@ If the argument is omitted or None, strip leading  ASCII whitespace.
 
 static PyObject *
 bytes_lstrip_impl(PyBytesObject *self, PyObject *bytes)
-/*[clinic end generated code: output=28602e586f524e82 input=88811b09dfbc2988]*/
+/*[clinic end generated code: output=28602e586f524e82 input=9baff4398c3f6857]*/
 {
     return do_argstrip(self, LEFTSTRIP, bytes);
 }
@@ -2043,7 +2017,6 @@ bytes_lstrip_impl(PyBytesObject *self, PyObject *bytes)
 /*[clinic input]
 bytes.rstrip
 
-    self: self(type="PyBytesObject *")
     bytes: object = None
     /
 
@@ -2054,81 +2027,37 @@ If the argument is omitted or None, strip trailing ASCII whitespace.
 
 static PyObject *
 bytes_rstrip_impl(PyBytesObject *self, PyObject *bytes)
-/*[clinic end generated code: output=547e3815c95447da input=8f93c9cd361f0140]*/
+/*[clinic end generated code: output=547e3815c95447da input=b78af445c727e32b]*/
 {
     return do_argstrip(self, RIGHTSTRIP, bytes);
 }
 
 
-PyDoc_STRVAR(count__doc__,
-"B.count(sub[, start[, end]]) -> int\n\
-\n\
-Return the number of non-overlapping occurrences of substring sub in\n\
-string B[start:end].  Optional arguments start and end are interpreted\n\
-as in slice notation.");
-
 static PyObject *
 bytes_count(PyBytesObject *self, PyObject *args)
 {
-    PyObject *sub_obj;
-    const char *str = PyBytes_AS_STRING(self), *sub;
-    Py_ssize_t sub_len;
-    char byte;
-    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
-
-    Py_buffer vsub;
-    PyObject *count_obj;
-
-    if (!stringlib_parse_args_finds_byte("count", args, &sub_obj, &byte,
-                                         &start, &end))
-        return NULL;
-
-    if (sub_obj) {
-        if (PyObject_GetBuffer(sub_obj, &vsub, PyBUF_SIMPLE) != 0)
-            return NULL;
-
-        sub = vsub.buf;
-        sub_len = vsub.len;
-    }
-    else {
-        sub = &byte;
-        sub_len = 1;
-    }
-
-    ADJUST_INDICES(start, end, PyBytes_GET_SIZE(self));
-
-    count_obj = PyLong_FromSsize_t(
-        stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
-        );
-
-    if (sub_obj)
-        PyBuffer_Release(&vsub);
-
-    return count_obj;
+    return _Py_bytes_count(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
 /*[clinic input]
 bytes.translate
 
-    self: self(type="PyBytesObject *")
     table: object
         Translation table, which must be a bytes object of length 256.
-    [
-    deletechars: object
-    ]
     /
+    delete as deletechars: object(c_default="NULL") = b''
 
 Return a copy with each character mapped by the given translation table.
 
-All characters occurring in the optional argument deletechars are removed.
+All characters occurring in the optional argument delete are removed.
 The remaining characters are mapped through the given translation table.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_translate_impl(PyBytesObject *self, PyObject *table, int group_right_1,
+bytes_translate_impl(PyBytesObject *self, PyObject *table,
                      PyObject *deletechars)
-/*[clinic end generated code: output=233df850eb50bf8d input=d8fa5519d7cc4be7]*/
+/*[clinic end generated code: output=43be3437f1956211 input=0ecdf159f654233c]*/
 {
     char *input, *output;
     Py_buffer table_view = {NULL, NULL};
@@ -2189,7 +2118,7 @@ bytes_translate_impl(PyBytesObject *self, PyObject *table, int group_right_1,
         PyBuffer_Release(&table_view);
         return NULL;
     }
-    output_start = output = PyBytes_AsString(result);
+    output_start = output = PyBytes_AS_STRING(result);
     input = PyBytes_AS_STRING(input_obj);
 
     if (dellen == 0 && table_chars != NULL) {
@@ -2265,498 +2194,6 @@ bytes_maketrans_impl(Py_buffer *frm, Py_buffer *to)
     return _Py_bytes_maketrans(frm, to);
 }
 
-/* find and count characters and substrings */
-
-#define findchar(target, target_len, c)                         \
-  ((char *)memchr((const void *)(target), c, target_len))
-
-/* String ops must return a string.  */
-/* If the object is subclass of string, create a copy */
-Py_LOCAL(PyBytesObject *)
-return_self(PyBytesObject *self)
-{
-    if (PyBytes_CheckExact(self)) {
-        Py_INCREF(self);
-        return self;
-    }
-    return (PyBytesObject *)PyBytes_FromStringAndSize(
-        PyBytes_AS_STRING(self),
-        PyBytes_GET_SIZE(self));
-}
-
-Py_LOCAL_INLINE(Py_ssize_t)
-countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount)
-{
-    Py_ssize_t count=0;
-    const char *start=target;
-    const char *end=target+target_len;
-
-    while ( (start=findchar(start, end-start, c)) != NULL ) {
-        count++;
-        if (count >= maxcount)
-            break;
-        start += 1;
-    }
-    return count;
-}
-
-
-/* Algorithms for different cases of string replacement */
-
-/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_interleave(PyBytesObject *self,
-                   const char *to_s, Py_ssize_t to_len,
-                   Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, i;
-    PyBytesObject *result;
-
-    self_len = PyBytes_GET_SIZE(self);
-
-    /* 1 at the end plus 1 after every character;
-       count = min(maxcount, self_len + 1) */
-    if (maxcount <= self_len)
-        count = maxcount;
-    else
-        /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
-        count = self_len + 1;
-
-    /* Check for overflow */
-    /*   result_len = count * to_len + self_len; */
-    assert(count > 0);
-    if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replacement bytes are too long");
-        return NULL;
-    }
-    result_len = count * to_len + self_len;
-
-    if (! (result = (PyBytesObject *)
-                     PyBytes_FromStringAndSize(NULL, result_len)) )
-        return NULL;
-
-    self_s = PyBytes_AS_STRING(self);
-    result_s = PyBytes_AS_STRING(result);
-
-    /* TODO: special case single character, which doesn't need memcpy */
-
-    /* Lay the first one down (guaranteed this will occur) */
-    Py_MEMCPY(result_s, to_s, to_len);
-    result_s += to_len;
-    count -= 1;
-
-    for (i=0; i<count; i++) {
-        *result_s++ = *self_s++;
-        Py_MEMCPY(result_s, to_s, to_len);
-        result_s += to_len;
-    }
-
-    /* Copy the rest of the original string */
-    Py_MEMCPY(result_s, self_s, self_len-i);
-
-    return result;
-}
-
-/* Special case for deleting a single character */
-/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_delete_single_character(PyBytesObject *self,
-                                char from_c, Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyBytesObject *result;
-
-    self_len = PyBytes_GET_SIZE(self);
-    self_s = PyBytes_AS_STRING(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        return return_self(self);
-    }
-
-    result_len = self_len - count;  /* from_len == 1 */
-    assert(result_len>=0);
-
-    if ( (result = (PyBytesObject *)
-                    PyBytes_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        Py_MEMCPY(result_s, start, next-start);
-        result_s += (next-start);
-        start = next+1;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
-
-Py_LOCAL(PyBytesObject *)
-replace_delete_substring(PyBytesObject *self,
-                         const char *from_s, Py_ssize_t from_len,
-                         Py_ssize_t maxcount) {
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyBytesObject *result;
-
-    self_len = PyBytes_GET_SIZE(self);
-    self_s = PyBytes_AS_STRING(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches */
-        return return_self(self);
-    }
-
-    result_len = self_len - (count * from_len);
-    assert (result_len>=0);
-
-    if ( (result = (PyBytesObject *)
-          PyBytes_FromStringAndSize(NULL, result_len)) == NULL )
-        return NULL;
-
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start + offset;
-
-        Py_MEMCPY(result_s, start, next-start);
-
-        result_s += (next-start);
-        start = next+from_len;
-    }
-    Py_MEMCPY(result_s, start, end-start);
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_single_character_in_place(PyBytesObject *self,
-                                  char from_c, char to_c,
-                                  Py_ssize_t maxcount)
-{
-    char *self_s, *result_s, *start, *end, *next;
-    Py_ssize_t self_len;
-    PyBytesObject *result;
-
-    /* The result string will be the same size */
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    next = findchar(self_s, self_len, from_c);
-
-    if (next == NULL) {
-        /* No matches; return the original string */
-        return return_self(self);
-    }
-
-    /* Need to make a new string */
-    result = (PyBytesObject *) PyBytes_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + (next-self_s);
-    *start = to_c;
-    start++;
-    end = result_s + self_len;
-
-    while (--maxcount > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-        *next = to_c;
-        start = next+1;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_substring_in_place(PyBytesObject *self,
-                           const char *from_s, Py_ssize_t from_len,
-                           const char *to_s, Py_ssize_t to_len,
-                           Py_ssize_t maxcount)
-{
-    char *result_s, *start, *end;
-    char *self_s;
-    Py_ssize_t self_len, offset;
-    PyBytesObject *result;
-
-    /* The result string will be the same size */
-
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    offset = stringlib_find(self_s, self_len,
-                            from_s, from_len,
-                            0);
-    if (offset == -1) {
-        /* No matches; return the original string */
-        return return_self(self);
-    }
-
-    /* Need to make a new string */
-    result = (PyBytesObject *) PyBytes_FromStringAndSize(NULL, self_len);
-    if (result == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-    Py_MEMCPY(result_s, self_s, self_len);
-
-    /* change everything in-place, starting with this one */
-    start =  result_s + offset;
-    Py_MEMCPY(start, to_s, from_len);
-    start += from_len;
-    end = result_s + self_len;
-
-    while ( --maxcount > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset==-1)
-            break;
-        Py_MEMCPY(start+offset, to_s, from_len);
-        start += offset+from_len;
-    }
-
-    return result;
-}
-
-/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_single_character(PyBytesObject *self,
-                         char from_c,
-                         const char *to_s, Py_ssize_t to_len,
-                         Py_ssize_t maxcount)
-{
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count;
-    PyBytesObject *result;
-
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    count = countchar(self_s, self_len, from_c, maxcount);
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* use the difference between current and new, hence the "-1" */
-    /*   result_len = self_len + count * (to_len-1)  */
-    assert(count > 0);
-    if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replacement bytes are too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len - 1);
-
-    if ( (result = (PyBytesObject *)
-          PyBytes_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        next = findchar(start, end-start, from_c);
-        if (next == NULL)
-            break;
-
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += 1;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+1;
-        }
-    }
-    /* Copy the remainder of the remaining string */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
-Py_LOCAL(PyBytesObject *)
-replace_substring(PyBytesObject *self,
-                  const char *from_s, Py_ssize_t from_len,
-                  const char *to_s, Py_ssize_t to_len,
-                  Py_ssize_t maxcount) {
-    char *self_s, *result_s;
-    char *start, *next, *end;
-    Py_ssize_t self_len, result_len;
-    Py_ssize_t count, offset;
-    PyBytesObject *result;
-
-    self_s = PyBytes_AS_STRING(self);
-    self_len = PyBytes_GET_SIZE(self);
-
-    count = stringlib_count(self_s, self_len,
-                            from_s, from_len,
-                            maxcount);
-
-    if (count == 0) {
-        /* no matches, return unchanged */
-        return return_self(self);
-    }
-
-    /* Check for overflow */
-    /*    result_len = self_len + count * (to_len-from_len) */
-    assert(count > 0);
-    if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "replacement bytes are too long");
-        return NULL;
-    }
-    result_len = self_len + count * (to_len-from_len);
-
-    if ( (result = (PyBytesObject *)
-          PyBytes_FromStringAndSize(NULL, result_len)) == NULL)
-        return NULL;
-    result_s = PyBytes_AS_STRING(result);
-
-    start = self_s;
-    end = self_s + self_len;
-    while (count-- > 0) {
-        offset = stringlib_find(start, end-start,
-                                from_s, from_len,
-                                0);
-        if (offset == -1)
-            break;
-        next = start+offset;
-        if (next == start) {
-            /* replace with the 'to' */
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start += from_len;
-        } else {
-            /* copy the unchanged old then the 'to' */
-            Py_MEMCPY(result_s, start, next-start);
-            result_s += (next-start);
-            Py_MEMCPY(result_s, to_s, to_len);
-            result_s += to_len;
-            start = next+from_len;
-        }
-    }
-    /* Copy the remainder of the remaining string */
-    Py_MEMCPY(result_s, start, end-start);
-
-    return result;
-}
-
-
-Py_LOCAL(PyBytesObject *)
-replace(PyBytesObject *self,
-    const char *from_s, Py_ssize_t from_len,
-    const char *to_s, Py_ssize_t to_len,
-    Py_ssize_t maxcount)
-{
-    if (maxcount < 0) {
-        maxcount = PY_SSIZE_T_MAX;
-    } else if (maxcount == 0 || PyBytes_GET_SIZE(self) == 0) {
-        /* nothing to do; return the original string */
-        return return_self(self);
-    }
-
-    if (maxcount == 0 ||
-        (from_len == 0 && to_len == 0)) {
-        /* nothing to do; return the original string */
-        return return_self(self);
-    }
-
-    /* Handle zero-length special cases */
-
-    if (from_len == 0) {
-        /* insert the 'to' string everywhere.   */
-        /*    >>> "Python".replace("", ".")     */
-        /*    '.P.y.t.h.o.n.'                   */
-        return replace_interleave(self, to_s, to_len, maxcount);
-    }
-
-    /* Except for "".replace("", "A") == "A" there is no way beyond this */
-    /* point for an empty self string to generate a non-empty string */
-    /* Special case so the remaining code always gets a non-empty string */
-    if (PyBytes_GET_SIZE(self) == 0) {
-        return return_self(self);
-    }
-
-    if (to_len == 0) {
-        /* delete all occurrences of 'from' string */
-        if (from_len == 1) {
-            return replace_delete_single_character(
-                self, from_s[0], maxcount);
-        } else {
-            return replace_delete_substring(self, from_s,
-                                            from_len, maxcount);
-        }
-    }
-
-    /* Handle special case where both strings have the same length */
-
-    if (from_len == to_len) {
-        if (from_len == 1) {
-            return replace_single_character_in_place(
-                self,
-                from_s[0],
-                to_s[0],
-                maxcount);
-        } else {
-            return replace_substring_in_place(
-                self, from_s, from_len, to_s, to_len,
-                maxcount);
-        }
-    }
-
-    /* Otherwise use the more generic algorithms */
-    if (from_len == 1) {
-        return replace_single_character(self, from_s[0],
-                                        to_s, to_len, maxcount);
-    } else {
-        /* len('from')>=2, len('to')>=1 */
-        return replace_substring(self, from_s, from_len, to_s, to_len,
-                                 maxcount);
-    }
-}
-
 
 /*[clinic input]
 bytes.replace
@@ -2775,156 +2212,28 @@ replaced.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_replace_impl(PyBytesObject*self, Py_buffer *old, Py_buffer *new,
+bytes_replace_impl(PyBytesObject *self, Py_buffer *old, Py_buffer *new,
                    Py_ssize_t count)
-/*[clinic end generated code: output=403dc9d7a83c5a1d input=b2fbbf0bf04de8e5]*/
+/*[clinic end generated code: output=994fa588b6b9c104 input=b2fbbf0bf04de8e5]*/
 {
-    return (PyObject *)replace((PyBytesObject *) self,
-                               (const char *)old->buf, old->len,
-                               (const char *)new->buf, new->len, count);
+    return stringlib_replace((PyObject *)self,
+                             (const char *)old->buf, old->len,
+                             (const char *)new->buf, new->len, count);
 }
 
 /** End DALKE **/
 
-/* Matches the end (direction >= 0) or start (direction < 0) of self
- * against substr, using the start and end arguments. Returns
- * -1 on error, 0 if not found and 1 if found.
- */
-Py_LOCAL(int)
-_bytes_tailmatch(PyBytesObject *self, PyObject *substr, Py_ssize_t start,
-                  Py_ssize_t end, int direction)
-{
-    Py_ssize_t len = PyBytes_GET_SIZE(self);
-    Py_ssize_t slen;
-    Py_buffer sub_view = {NULL, NULL};
-    const char* sub;
-    const char* str;
-
-    if (PyBytes_Check(substr)) {
-        sub = PyBytes_AS_STRING(substr);
-        slen = PyBytes_GET_SIZE(substr);
-    }
-    else {
-        if (PyObject_GetBuffer(substr, &sub_view, PyBUF_SIMPLE) != 0)
-            return -1;
-        sub = sub_view.buf;
-        slen = sub_view.len;
-    }
-    str = PyBytes_AS_STRING(self);
-
-    ADJUST_INDICES(start, end, len);
-
-    if (direction < 0) {
-        /* startswith */
-        if (start+slen > len)
-            goto notfound;
-    } else {
-        /* endswith */
-        if (end-start < slen || start > len)
-            goto notfound;
-
-        if (end-slen > start)
-            start = end - slen;
-    }
-    if (end-start < slen)
-        goto notfound;
-    if (memcmp(str+start, sub, slen) != 0)
-        goto notfound;
-
-    PyBuffer_Release(&sub_view);
-    return 1;
-
-notfound:
-    PyBuffer_Release(&sub_view);
-    return 0;
-}
-
-
-PyDoc_STRVAR(startswith__doc__,
-"B.startswith(prefix[, start[, end]]) -> bool\n\
-\n\
-Return True if B starts with the specified prefix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-prefix can also be a tuple of bytes to try.");
 
 static PyObject *
 bytes_startswith(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytes_tailmatch(self,
-                            PyTuple_GET_ITEM(subobj, i),
-                            start, end, -1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytes_tailmatch(self, subobj, start, end, -1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "startswith first arg must be bytes "
-                         "or a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_startswith(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
-
-PyDoc_STRVAR(endswith__doc__,
-"B.endswith(suffix[, start[, end]]) -> bool\n\
-\n\
-Return True if B ends with the specified suffix, False otherwise.\n\
-With optional start, test B beginning at that position.\n\
-With optional end, stop comparing B at that position.\n\
-suffix can also be a tuple of bytes to try.");
-
 static PyObject *
 bytes_endswith(PyBytesObject *self, PyObject *args)
 {
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    PyObject *subobj;
-    int result;
-
-    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
-        return NULL;
-    if (PyTuple_Check(subobj)) {
-        Py_ssize_t i;
-        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            result = _bytes_tailmatch(self,
-                            PyTuple_GET_ITEM(subobj, i),
-                            start, end, +1);
-            if (result == -1)
-                return NULL;
-            else if (result) {
-                Py_RETURN_TRUE;
-            }
-        }
-        Py_RETURN_FALSE;
-    }
-    result = _bytes_tailmatch(self, subobj, start, end, +1);
-    if (result == -1) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "endswith first arg must be bytes or "
-                         "a tuple of bytes, not %s", Py_TYPE(subobj)->tp_name);
-        return NULL;
-    }
-    else
-        return PyBool_FromLong(result);
+    return _Py_bytes_endswith(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), args);
 }
 
 
@@ -2944,9 +2253,9 @@ Decode the bytes using the codec registered for encoding.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_decode_impl(PyBytesObject*self, const char *encoding,
+bytes_decode_impl(PyBytesObject *self, const char *encoding,
                   const char *errors)
-/*[clinic end generated code: output=2d2016ff8e0bb176 input=958174769d2a40ca]*/
+/*[clinic end generated code: output=5649a53dde27b314 input=958174769d2a40ca]*/
 {
     return PyUnicode_FromEncodedObject((PyObject*)self, encoding, errors);
 }
@@ -2964,8 +2273,8 @@ true.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_splitlines_impl(PyBytesObject*self, int keepends)
-/*[clinic end generated code: output=995c3598f7833cad input=7f4aac67144f9944]*/
+bytes_splitlines_impl(PyBytesObject *self, int keepends)
+/*[clinic end generated code: output=3484149a5d880ffb input=7f4aac67144f9944]*/
 {
     return stringlib_splitlines(
         (PyObject*) self, PyBytes_AS_STRING(self),
@@ -2973,22 +2282,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
         );
 }
 
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
-    if (c >= 128)
-        return -1;
-    if (Py_ISDIGIT(c))
-        return c - '0';
-    else {
-        if (Py_ISUPPER(c))
-            c = Py_TOLOWER(c);
-        if (c >= 'a' && c <= 'f')
-            return c - 'a' + 10;
-    }
-    return -1;
-}
-
 /*[clinic input]
 @classmethod
 bytes.fromhex
@@ -3006,47 +2299,88 @@ static PyObject *
 bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
 /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
 {
-    PyObject *newstring;
+    PyObject *result = _PyBytes_FromHex(string, 0);
+    if (type != &PyBytes_Type && result != NULL) {
+        Py_SETREF(result, PyObject_CallFunctionObjArgs((PyObject *)type,
+                                                       result, NULL));
+    }
+    return result;
+}
+
+PyObject*
+_PyBytes_FromHex(PyObject *string, int use_bytearray)
+{
     char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
+    Py_ssize_t hexlen, invalid_char;
+    unsigned int top, bot;
+    Py_UCS1 *str, *end;
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+    writer.use_bytearray = use_bytearray;
 
     assert(PyUnicode_Check(string));
     if (PyUnicode_READY(string))
         return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
     hexlen = PyUnicode_GET_LENGTH(string);
 
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newstring = PyBytes_FromStringAndSize(NULL, byteslen);
-    if (!newstring)
+    if (!PyUnicode_IS_ASCII(string)) {
+        void *data = PyUnicode_DATA(string);
+        unsigned int kind = PyUnicode_KIND(string);
+        Py_ssize_t i;
+
+        /* search for the first non-ASCII character */
+        for (i = 0; i < hexlen; i++) {
+            if (PyUnicode_READ(kind, data, i) >= 128)
+                break;
+        }
+        invalid_char = i;
+        goto error;
+    }
+
+    assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
+    str = PyUnicode_1BYTE_DATA(string);
+
+    /* This overestimates if there are spaces */
+    buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
+    if (buf == NULL)
         return NULL;
-    buf = PyBytes_AS_STRING(newstring);
-    for (i = j = 0; i < hexlen; i += 2) {
+
+    end = str + hexlen;
+    while (str < end) {
         /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
+        if (*str == ' ') {
+            do {
+                str++;
+            } while (*str == ' ');
+            if (str >= end)
+                break;
+        }
+
+        top = _PyLong_DigitValue[*str];
+        if (top >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
             goto error;
         }
-        buf[j++] = (top << 4) + bot;
+        str++;
+
+        bot = _PyLong_DigitValue[*str];
+        if (bot >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
+            goto error;
+        }
+        str++;
+
+        *buf++ = (unsigned char)((top << 4) + bot);
     }
-    if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
-        goto error;
-    return newstring;
+
+    return _PyBytesWriter_Finish(&writer, buf);
 
   error:
-    Py_XDECREF(newstring);
+    PyErr_Format(PyExc_ValueError,
+                 "non-hexadecimal number found in "
+                 "fromhex() arg at position %zd", invalid_char);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 }
 
@@ -3076,17 +2410,20 @@ bytes_methods[] = {
     {"__getnewargs__",          (PyCFunction)bytes_getnewargs,  METH_NOARGS},
     {"capitalize", (PyCFunction)stringlib_capitalize, METH_NOARGS,
      _Py_capitalize__doc__},
-    {"center", (PyCFunction)stringlib_center, METH_VARARGS, center__doc__},
-    {"count", (PyCFunction)bytes_count, METH_VARARGS, count__doc__},
+    {"center", (PyCFunction)stringlib_center, METH_VARARGS,
+     _Py_center__doc__},
+    {"count", (PyCFunction)bytes_count, METH_VARARGS,
+     _Py_count__doc__},
     BYTES_DECODE_METHODDEF
     {"endswith", (PyCFunction)bytes_endswith, METH_VARARGS,
-     endswith__doc__},
+     _Py_endswith__doc__},
     {"expandtabs", (PyCFunction)stringlib_expandtabs, METH_VARARGS | METH_KEYWORDS,
-     expandtabs__doc__},
-    {"find", (PyCFunction)bytes_find, METH_VARARGS, find__doc__},
+     _Py_expandtabs__doc__},
+    {"find", (PyCFunction)bytes_find, METH_VARARGS,
+     _Py_find__doc__},
     BYTES_FROMHEX_METHODDEF
     {"hex", (PyCFunction)bytes_hex, METH_NOARGS, hex__doc__},
-    {"index", (PyCFunction)bytes_index, METH_VARARGS, index__doc__},
+    {"index", (PyCFunction)bytes_index, METH_VARARGS, _Py_index__doc__},
     {"isalnum", (PyCFunction)stringlib_isalnum, METH_NOARGS,
      _Py_isalnum__doc__},
     {"isalpha", (PyCFunction)stringlib_isalpha, METH_NOARGS,
@@ -3102,38 +2439,40 @@ bytes_methods[] = {
     {"isupper", (PyCFunction)stringlib_isupper, METH_NOARGS,
      _Py_isupper__doc__},
     BYTES_JOIN_METHODDEF
-    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__},
+    {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, _Py_ljust__doc__},
     {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__},
     BYTES_LSTRIP_METHODDEF
     BYTES_MAKETRANS_METHODDEF
     BYTES_PARTITION_METHODDEF
     BYTES_REPLACE_METHODDEF
-    {"rfind", (PyCFunction)bytes_rfind, METH_VARARGS, rfind__doc__},
-    {"rindex", (PyCFunction)bytes_rindex, METH_VARARGS, rindex__doc__},
-    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__},
+    {"rfind", (PyCFunction)bytes_rfind, METH_VARARGS, _Py_rfind__doc__},
+    {"rindex", (PyCFunction)bytes_rindex, METH_VARARGS, _Py_rindex__doc__},
+    {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, _Py_rjust__doc__},
     BYTES_RPARTITION_METHODDEF
     BYTES_RSPLIT_METHODDEF
     BYTES_RSTRIP_METHODDEF
     BYTES_SPLIT_METHODDEF
     BYTES_SPLITLINES_METHODDEF
     {"startswith", (PyCFunction)bytes_startswith, METH_VARARGS,
-     startswith__doc__},
+     _Py_startswith__doc__},
     BYTES_STRIP_METHODDEF
     {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS,
      _Py_swapcase__doc__},
     {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__},
     BYTES_TRANSLATE_METHODDEF
     {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__},
-    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__},
+    {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, _Py_zfill__doc__},
     {NULL,     NULL}                         /* sentinel */
 };
 
 static PyObject *
-bytes_mod(PyObject *v, PyObject *w)
+bytes_mod(PyObject *self, PyObject *arg)
 {
-    if (!PyBytes_Check(v))
+    if (!PyBytes_Check(self)) {
         Py_RETURN_NOTIMPLEMENTED;
-    return _PyBytes_Format(v, w);
+    }
+    return _PyBytes_FormatEx(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self),
+                             arg, 0);
 }
 
 static PyNumberMethods bytes_as_number = {
@@ -3222,17 +2561,15 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         return NULL;
     }
     /* Is it an integer? */
-    size = PyNumber_AsSsize_t(x, PyExc_OverflowError);
-    if (size == -1 && PyErr_Occurred()) {
-        if (PyErr_ExceptionMatches(PyExc_OverflowError))
+    if (PyIndex_Check(x)) {
+        size = PyNumber_AsSsize_t(x, PyExc_OverflowError);
+        if (size == -1 && PyErr_Occurred()) {
             return NULL;
-        PyErr_Clear();
-    }
-    else if (size < 0) {
-        PyErr_SetString(PyExc_ValueError, "negative count");
-        return NULL;
-    }
-    else {
+        }
+        if (size < 0) {
+            PyErr_SetString(PyExc_ValueError, "negative count");
+            return NULL;
+        }
         new = _PyBytes_FromSize(size, 1);
         if (new == NULL)
             return NULL;
@@ -3242,108 +2579,93 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
     return PyBytes_FromObject(x);
 }
 
-PyObject *
-PyBytes_FromObject(PyObject *x)
+static PyObject*
+_PyBytes_FromBuffer(PyObject *x)
 {
-    PyObject *new, *it;
-    Py_ssize_t i, size;
+    PyObject *new;
+    Py_buffer view;
 
-    if (x == NULL) {
-        PyErr_BadInternalCall();
+    if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
         return NULL;
-    }
 
-    if (PyBytes_CheckExact(x)) {
-        Py_INCREF(x);
-        return x;
-    }
+    new = PyBytes_FromStringAndSize(NULL, view.len);
+    if (!new)
+        goto fail;
+    if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval,
+                &view, view.len, 'C') < 0)
+        goto fail;
+    PyBuffer_Release(&view);
+    return new;
 
-    /* Use the modern buffer interface */
-    if (PyObject_CheckBuffer(x)) {
-        Py_buffer view;
-        if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
-            return NULL;
-        new = PyBytes_FromStringAndSize(NULL, view.len);
-        if (!new)
-            goto fail;
-        if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval,
-                                  &view, view.len, 'C') < 0)
-            goto fail;
-        PyBuffer_Release(&view);
-        return new;
-      fail:
-        Py_XDECREF(new);
-        PyBuffer_Release(&view);
-        return NULL;
-    }
-    if (PyUnicode_Check(x)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "cannot convert unicode object to bytes");
-        return NULL;
-    }
+fail:
+    Py_XDECREF(new);
+    PyBuffer_Release(&view);
+    return NULL;
+}
 
-    if (PyList_CheckExact(x)) {
-        new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));
-        if (new == NULL)
-            return NULL;
-        for (i = 0; i < Py_SIZE(x); i++) {
-            Py_ssize_t value = PyNumber_AsSsize_t(
-                PyList_GET_ITEM(x, i), PyExc_ValueError);
-            if (value == -1 && PyErr_Occurred()) {
-                Py_DECREF(new);
-                return NULL;
-            }
-            if (value < 0 || value >= 256) {
-                PyErr_SetString(PyExc_ValueError,
-                                "bytes must be in range(0, 256)");
-                Py_DECREF(new);
-                return NULL;
-            }
-            ((PyBytesObject *)new)->ob_sval[i] = (char) value;
-        }
-        return new;
-    }
-    if (PyTuple_CheckExact(x)) {
-        new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));
-        if (new == NULL)
-            return NULL;
-        for (i = 0; i < Py_SIZE(x); i++) {
-            Py_ssize_t value = PyNumber_AsSsize_t(
-                PyTuple_GET_ITEM(x, i), PyExc_ValueError);
-            if (value == -1 && PyErr_Occurred()) {
-                Py_DECREF(new);
-                return NULL;
-            }
-            if (value < 0 || value >= 256) {
-                PyErr_SetString(PyExc_ValueError,
-                                "bytes must be in range(0, 256)");
-                Py_DECREF(new);
-                return NULL;
-            }
-            ((PyBytesObject *)new)->ob_sval[i] = (char) value;
-        }
-        return new;
-    }
+#define _PyBytes_FROM_LIST_BODY(x, GET_ITEM)                                \
+    do {                                                                    \
+        PyObject *bytes;                                                    \
+        Py_ssize_t i;                                                       \
+        Py_ssize_t value;                                                   \
+        char *str;                                                          \
+        PyObject *item;                                                     \
+                                                                            \
+        bytes = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));                \
+        if (bytes == NULL)                                                  \
+            return NULL;                                                    \
+        str = ((PyBytesObject *)bytes)->ob_sval;                            \
+                                                                            \
+        for (i = 0; i < Py_SIZE(x); i++) {                                  \
+            item = GET_ITEM((x), i);                                        \
+            value = PyNumber_AsSsize_t(item, NULL);                         \
+            if (value == -1 && PyErr_Occurred())                            \
+                goto error;                                                 \
+                                                                            \
+            if (value < 0 || value >= 256) {                                \
+                PyErr_SetString(PyExc_ValueError,                           \
+                                "bytes must be in range(0, 256)");          \
+                goto error;                                                 \
+            }                                                               \
+            *str++ = (char) value;                                          \
+        }                                                                   \
+        return bytes;                                                       \
+                                                                            \
+    error:                                                                  \
+        Py_DECREF(bytes);                                                   \
+        return NULL;                                                        \
+    } while (0)
+
+static PyObject*
+_PyBytes_FromList(PyObject *x)
+{
+    _PyBytes_FROM_LIST_BODY(x, PyList_GET_ITEM);
+}
+
+static PyObject*
+_PyBytes_FromTuple(PyObject *x)
+{
+    _PyBytes_FROM_LIST_BODY(x, PyTuple_GET_ITEM);
+}
+
+static PyObject *
+_PyBytes_FromIterator(PyObject *it, PyObject *x)
+{
+    char *str;
+    Py_ssize_t i, size;
+    _PyBytesWriter writer;
 
     /* For iterator version, create a string object and resize as needed */
     size = PyObject_LengthHint(x, 64);
     if (size == -1 && PyErr_Occurred())
         return NULL;
-    /* Allocate an extra byte to prevent PyBytes_FromStringAndSize() from
-       returning a shared empty bytes string. This required because we
-       want to call _PyBytes_Resize() the returned object, which we can
-       only do on bytes objects with refcount == 1. */
-    if (size == 0)
-        size = 1;
-    new = PyBytes_FromStringAndSize(NULL, size);
-    if (new == NULL)
-        return NULL;
-    assert(Py_REFCNT(new) == 1);
 
-    /* Get the iterator */
-    it = PyObject_GetIter(x);
-    if (it == NULL)
-        goto error;
+    _PyBytesWriter_Init(&writer);
+    str = _PyBytesWriter_Alloc(&writer, size);
+    if (str == NULL)
+        return NULL;
+    writer.overallocate = 1;
+    size = writer.allocated;
 
     /* Run the iterator to exhaustion */
     for (i = 0; ; i++) {
@@ -3359,7 +2681,7 @@ PyBytes_FromObject(PyObject *x)
         }
 
         /* Interpret it as an int (__index__) */
-        value = PyNumber_AsSsize_t(item, PyExc_ValueError);
+        value = PyNumber_AsSsize_t(item, NULL);
         Py_DECREF(item);
         if (value == -1 && PyErr_Occurred())
             goto error;
@@ -3373,21 +2695,68 @@ PyBytes_FromObject(PyObject *x)
 
         /* Append the byte */
         if (i >= size) {
-            size = 2 * size + 1;
-            if (_PyBytes_Resize(&new, size) < 0)
-                goto error;
+            str = _PyBytesWriter_Resize(&writer, str, size+1);
+            if (str == NULL)
+                return NULL;
+            size = writer.allocated;
         }
-        ((PyBytesObject *)new)->ob_sval[i] = (char) value;
+        *str++ = (char) value;
     }
-    _PyBytes_Resize(&new, i);
 
-    /* Clean up and return success */
-    Py_DECREF(it);
-    return new;
+    return _PyBytesWriter_Finish(&writer, str);
 
   error:
-    Py_XDECREF(it);
-    Py_XDECREF(new);
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
+}
+
+/* Build a bytes object from x: an exact bytes object is returned as is
+   (new reference); buffer exporters, exact lists and tuples, and other
+   non-str iterables are copied.  Return NULL with an exception set on
+   failure. */
+PyObject *
+PyBytes_FromObject(PyObject *x)
+{
+    PyObject *it, *result;
+
+    if (x == NULL) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+
+    if (PyBytes_CheckExact(x)) {
+        Py_INCREF(x);
+        return x;
+    }
+
+    /* Use the modern buffer interface */
+    if (PyObject_CheckBuffer(x))
+        return _PyBytes_FromBuffer(x);
+
+    if (PyList_CheckExact(x))
+        return _PyBytes_FromList(x);
+
+    if (PyTuple_CheckExact(x))
+        return _PyBytes_FromTuple(x);
+
+    if (!PyUnicode_Check(x)) {
+        it = PyObject_GetIter(x);
+        if (it != NULL) {
+            result = _PyBytes_FromIterator(it, x);
+            Py_DECREF(it);
+            return result;
+        }
+        /* PyObject_GetIter() failed.  Only a TypeError ("not iterable") is
+           replaced by the generic conversion error below; any other
+           exception raised while getting the iterator is propagated. */
+        if (!PyErr_ExceptionMatches(PyExc_TypeError)) {
+            return NULL;
+        }
+    }
+
+    PyErr_Format(PyExc_TypeError,
+                 "cannot convert '%.200s' object to bytes",
+                 x->ob_type->tp_name);
     return NULL;
 }
 
@@ -3738,3 +3097,282 @@ bytes_iter(PyObject *seq)
     _PyObject_GC_TRACK(it);
     return (PyObject *)it;
 }
+
+
+/* _PyBytesWriter API */
+
+#ifdef MS_WINDOWS
+   /* On Windows, overallocating by 50% is the best factor */
+#  define OVERALLOCATE_FACTOR 2
+#else
+   /* On Linux, overallocating by 25% is the best factor */
+#  define OVERALLOCATE_FACTOR 4
+#endif
+
+void
+_PyBytesWriter_Init(_PyBytesWriter *writer)
+{
+    /* Set all attributes before small_buffer to 0 */
+    memset(writer, 0, offsetof(_PyBytesWriter, small_buffer));
+#ifdef Py_DEBUG
+    memset(writer->small_buffer, 0xCB, sizeof(writer->small_buffer));
+#endif
+}
+
+void
+_PyBytesWriter_Dealloc(_PyBytesWriter *writer)
+{
+    Py_CLEAR(writer->buffer);
+}
+
+Py_LOCAL_INLINE(char*)
+_PyBytesWriter_AsString(_PyBytesWriter *writer)
+{
+    if (writer->use_small_buffer) {
+        assert(writer->buffer == NULL);
+        return writer->small_buffer;
+    }
+    else if (writer->use_bytearray) {
+        assert(writer->buffer != NULL);
+        return PyByteArray_AS_STRING(writer->buffer);
+    }
+    else {
+        assert(writer->buffer != NULL);
+        return PyBytes_AS_STRING(writer->buffer);
+    }
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
+{
+    char *start = _PyBytesWriter_AsString(writer);
+    assert(str != NULL);
+    assert(str >= start);
+    assert(str - start <= writer->allocated);
+    return str - start;
+}
+
+Py_LOCAL_INLINE(void)
+_PyBytesWriter_CheckConsistency(_PyBytesWriter *writer, char *str)
+{
+#ifdef Py_DEBUG
+    char *start, *end;
+
+    if (writer->use_small_buffer) {
+        assert(writer->buffer == NULL);
+    }
+    else {
+        assert(writer->buffer != NULL);
+        if (writer->use_bytearray)
+            assert(PyByteArray_CheckExact(writer->buffer));
+        else
+            assert(PyBytes_CheckExact(writer->buffer));
+        assert(Py_REFCNT(writer->buffer) == 1);
+    }
+
+    if (writer->use_bytearray) {
+        /* bytearray has its own overallocation algorithm,
+           writer overallocation must be disabled */
+        assert(!writer->overallocate);
+    }
+
+    assert(0 <= writer->allocated);
+    assert(0 <= writer->min_size && writer->min_size <= writer->allocated);
+    /* the last byte must always be null */
+    start = _PyBytesWriter_AsString(writer);
+    assert(start[writer->allocated] == 0);
+
+    end = start + writer->allocated;
+    assert(str != NULL);
+    assert(start <= str && str <= end);
+#endif
+}
+
+void*
+_PyBytesWriter_Resize(_PyBytesWriter *writer, void *str, Py_ssize_t size)
+{
+    Py_ssize_t allocated, pos;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+    assert(writer->allocated < size);
+
+    allocated = size;
+    if (writer->overallocate
+        && allocated <= (PY_SSIZE_T_MAX - allocated / OVERALLOCATE_FACTOR)) {
+        /* overallocate to limit the number of realloc() */
+        allocated += allocated / OVERALLOCATE_FACTOR;
+    }
+
+    pos = _PyBytesWriter_GetSize(writer, str);
+    if (!writer->use_small_buffer) {
+        if (writer->use_bytearray) {
+            if (PyByteArray_Resize(writer->buffer, allocated))
+                goto error;
+            /* writer->allocated can be smaller than writer->buffer->ob_alloc,
+               but we cannot use ob_alloc because bytes may need to be moved
+               to use the whole buffer. bytearray uses an internal optimization
+               to avoid moving or copying bytes when bytes are removed at the
+               beginning (ex: del bytearray[:1]). */
+        }
+        else {
+            if (_PyBytes_Resize(&writer->buffer, allocated))
+                goto error;
+        }
+    }
+    else {
+        /* convert from stack buffer to bytes object buffer */
+        assert(writer->buffer == NULL);
+
+        if (writer->use_bytearray)
+            writer->buffer = PyByteArray_FromStringAndSize(NULL, allocated);
+        else
+            writer->buffer = PyBytes_FromStringAndSize(NULL, allocated);
+        if (writer->buffer == NULL)
+            goto error;
+
+        if (pos != 0) {
+            char *dest;
+            if (writer->use_bytearray)
+                dest = PyByteArray_AS_STRING(writer->buffer);
+            else
+                dest = PyBytes_AS_STRING(writer->buffer);
+            Py_MEMCPY(dest,
+                      writer->small_buffer,
+                      pos);
+        }
+
+        writer->use_small_buffer = 0;
+#ifdef Py_DEBUG
+        memset(writer->small_buffer, 0xDB, sizeof(writer->small_buffer));
+#endif
+    }
+    writer->allocated = allocated;
+
+    str = _PyBytesWriter_AsString(writer) + pos;
+    _PyBytesWriter_CheckConsistency(writer, str);
+    return str;
+
+error:
+    _PyBytesWriter_Dealloc(writer);
+    return NULL;
+}
+
+void*
+_PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
+{
+    Py_ssize_t new_min_size;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+    assert(size >= 0);
+
+    if (size == 0) {
+        /* nothing to do */
+        return str;
+    }
+
+    if (writer->min_size > PY_SSIZE_T_MAX - size) {
+        PyErr_NoMemory();
+        _PyBytesWriter_Dealloc(writer);
+        return NULL;
+    }
+    new_min_size = writer->min_size + size;
+
+    if (new_min_size > writer->allocated)
+        str = _PyBytesWriter_Resize(writer, str, new_min_size);
+
+    writer->min_size = new_min_size;
+    return str;
+}
+
+/* Allocate the buffer to write size bytes.
+   Return the pointer to the beginning of buffer data.
+   Raise an exception and return NULL on error. */
+void*
+_PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
+{
+    /* ensure that _PyBytesWriter_Alloc() is only called once */
+    assert(writer->min_size == 0 && writer->buffer == NULL);
+    assert(size >= 0);
+
+    writer->use_small_buffer = 1;
+#ifdef Py_DEBUG
+    writer->allocated = sizeof(writer->small_buffer) - 1;
+    /* In debug mode, don't use the full small buffer: it is less effective
+       than bytes and bytearray objects at detecting buffer underflow and
+       buffer overflow. Still use 10 bytes of it, so that code paths relying
+       on the small buffer are also exercised in debug mode.
+
+       The _PyBytesWriter structure itself is left unchanged in debug mode
+       (rather than declaring a shorter small buffer) so that stack overflows
+       can still be detected when running tests in debug mode: the structure
+       is large (more than 512 bytes), and a deep C callback that does not
+       use Py_EnterRecursiveCall() may hit a stack overflow. */
+    writer->allocated = Py_MIN(writer->allocated, 10);
+    /* _PyBytesWriter_CheckConsistency() requires the last byte to be 0,
+       to detect buffer overflow */
+    writer->small_buffer[writer->allocated] = 0;
+#else
+    writer->allocated = sizeof(writer->small_buffer);
+#endif
+    return _PyBytesWriter_Prepare(writer, writer->small_buffer, size);
+}
+
+PyObject *
+_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
+{
+    Py_ssize_t size;
+    PyObject *result;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+
+    size = _PyBytesWriter_GetSize(writer, str);
+    if (size == 0 && !writer->use_bytearray) {
+        Py_CLEAR(writer->buffer);
+        /* Get the empty byte string singleton */
+        result = PyBytes_FromStringAndSize(NULL, 0);
+    }
+    else if (writer->use_small_buffer) {
+        if (writer->use_bytearray) {
+            result = PyByteArray_FromStringAndSize(writer->small_buffer, size);
+        }
+        else {
+            result = PyBytes_FromStringAndSize(writer->small_buffer, size);
+        }
+    }
+    else {
+        result = writer->buffer;
+        writer->buffer = NULL;
+
+        if (size != writer->allocated) {
+            if (writer->use_bytearray) {
+                if (PyByteArray_Resize(result, size)) {
+                    Py_DECREF(result);
+                    return NULL;
+                }
+            }
+            else {
+                if (_PyBytes_Resize(&result, size)) {
+                    assert(result == NULL);
+                    return NULL;
+                }
+            }
+        }
+    }
+    return result;
+}
+
+void*
+_PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *ptr,
+                          const void *bytes, Py_ssize_t size)
+{
+    char *str = (char *)ptr;
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    Py_MEMCPY(str, bytes, size);
+    str += size;
+
+    return str;
+}
diff --git a/Objects/classobject.c b/Objects/classobject.c
index 5e8ac59..b0ed023 100644
--- a/Objects/classobject.c
+++ b/Objects/classobject.c
@@ -302,34 +302,19 @@ method_traverse(PyMethodObject *im, visitproc visit, void *arg)
 }
 
 static PyObject *
-method_call(PyObject *func, PyObject *arg, PyObject *kw)
+method_call(PyObject *method, PyObject *args, PyObject *kwargs)
 {
-    PyObject *self = PyMethod_GET_SELF(func);
-    PyObject *result;
+    PyObject *self, *func;
 
-    func = PyMethod_GET_FUNCTION(func);
+    self = PyMethod_GET_SELF(method);
     if (self == NULL) {
         PyErr_BadInternalCall();
         return NULL;
     }
-    else {
-        Py_ssize_t argcount = PyTuple_Size(arg);
-        PyObject *newarg = PyTuple_New(argcount + 1);
-        int i;
-        if (newarg == NULL)
-            return NULL;
-        Py_INCREF(self);
-        PyTuple_SET_ITEM(newarg, 0, self);
-        for (i = 0; i < argcount; i++) {
-            PyObject *v = PyTuple_GET_ITEM(arg, i);
-            Py_XINCREF(v);
-            PyTuple_SET_ITEM(newarg, i+1, v);
-        }
-        arg = newarg;
-    }
-    result = PyObject_Call((PyObject *)func, arg, kw);
-    Py_DECREF(arg);
-    return result;
+
+    func = PyMethod_GET_FUNCTION(method);
+
+    return _PyObject_Call_Prepend(func, self, args, kwargs);
 }
 
 static PyObject *
diff --git a/Objects/clinic/bytearrayobject.c.h b/Objects/clinic/bytearrayobject.c.h
index e87a221..a60be76 100644
--- a/Objects/clinic/bytearrayobject.c.h
+++ b/Objects/clinic/bytearrayobject.c.h
@@ -39,45 +39,38 @@ bytearray_copy(PyByteArrayObject *self, PyObject *Py_UNUSED(ignored))
 }
 
 PyDoc_STRVAR(bytearray_translate__doc__,
-"translate(table, [deletechars])\n"
+"translate($self, table, /, delete=b\'\')\n"
+"--\n"
+"\n"
 "Return a copy with each character mapped by the given translation table.\n"
 "\n"
 "  table\n"
 "    Translation table, which must be a bytes object of length 256.\n"
 "\n"
-"All characters occurring in the optional argument deletechars are removed.\n"
+"All characters occurring in the optional argument delete are removed.\n"
 "The remaining characters are mapped through the given translation table.");
 
 #define BYTEARRAY_TRANSLATE_METHODDEF    \
-    {"translate", (PyCFunction)bytearray_translate, METH_VARARGS, bytearray_translate__doc__},
+    {"translate", (PyCFunction)bytearray_translate, METH_VARARGS|METH_KEYWORDS, bytearray_translate__doc__},
 
 static PyObject *
 bytearray_translate_impl(PyByteArrayObject *self, PyObject *table,
-                         int group_right_1, PyObject *deletechars);
+                         PyObject *deletechars);
 
 static PyObject *
-bytearray_translate(PyByteArrayObject *self, PyObject *args)
+bytearray_translate(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"", "delete", NULL};
+    static _PyArg_Parser _parser = {"O|O:translate", _keywords, 0};
     PyObject *table;
-    int group_right_1 = 0;
     PyObject *deletechars = NULL;
 
-    switch (PyTuple_GET_SIZE(args)) {
-        case 1:
-            if (!PyArg_ParseTuple(args, "O:translate", &table))
-                goto exit;
-            break;
-        case 2:
-            if (!PyArg_ParseTuple(args, "OO:translate", &table, &deletechars))
-                goto exit;
-            group_right_1 = 1;
-            break;
-        default:
-            PyErr_SetString(PyExc_TypeError, "bytearray.translate requires 1 to 2 arguments");
-            goto exit;
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &table, &deletechars)) {
+        goto exit;
     }
-    return_value = bytearray_translate_impl(self, table, group_right_1, deletechars);
+    return_value = bytearray_translate_impl(self, table, deletechars);
 
 exit:
     return return_value;
@@ -108,17 +101,20 @@ bytearray_maketrans(void *null, PyObject *args)
     Py_buffer to = {NULL, NULL};
 
     if (!PyArg_ParseTuple(args, "y*y*:maketrans",
-        &frm, &to))
+        &frm, &to)) {
         goto exit;
+    }
     return_value = bytearray_maketrans_impl(&frm, &to);
 
 exit:
     /* Cleanup for frm */
-    if (frm.obj)
+    if (frm.obj) {
        PyBuffer_Release(&frm);
+    }
     /* Cleanup for to */
-    if (to.obj)
+    if (to.obj) {
        PyBuffer_Release(&to);
+    }
 
     return return_value;
 }
@@ -152,17 +148,20 @@ bytearray_replace(PyByteArrayObject *self, PyObject *args)
     Py_ssize_t count = -1;
 
     if (!PyArg_ParseTuple(args, "y*y*|n:replace",
-        &old, &new, &count))
+        &old, &new, &count)) {
         goto exit;
+    }
     return_value = bytearray_replace_impl(self, &old, &new, count);
 
 exit:
     /* Cleanup for old */
-    if (old.obj)
+    if (old.obj) {
        PyBuffer_Release(&old);
+    }
     /* Cleanup for new */
-    if (new.obj)
+    if (new.obj) {
        PyBuffer_Release(&new);
+    }
 
     return return_value;
 }
@@ -192,13 +191,15 @@ static PyObject *
 bytearray_split(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"sep", "maxsplit", NULL};
+    static const char * const _keywords[] = {"sep", "maxsplit", NULL};
+    static _PyArg_Parser _parser = {"|On:split", _keywords, 0};
     PyObject *sep = Py_None;
     Py_ssize_t maxsplit = -1;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:split", _keywords,
-        &sep, &maxsplit))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytearray_split_impl(self, sep, maxsplit);
 
 exit:
@@ -264,13 +265,15 @@ static PyObject *
 bytearray_rsplit(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"sep", "maxsplit", NULL};
+    static const char * const _keywords[] = {"sep", "maxsplit", NULL};
+    static _PyArg_Parser _parser = {"|On:rsplit", _keywords, 0};
     PyObject *sep = Py_None;
     Py_ssize_t maxsplit = -1;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:rsplit", _keywords,
-        &sep, &maxsplit))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytearray_rsplit_impl(self, sep, maxsplit);
 
 exit:
@@ -320,8 +323,9 @@ bytearray_insert(PyByteArrayObject *self, PyObject *args)
     int item;
 
     if (!PyArg_ParseTuple(args, "nO&:insert",
-        &index, _getbytevalue, &item))
+        &index, _getbytevalue, &item)) {
         goto exit;
+    }
     return_value = bytearray_insert_impl(self, index, item);
 
 exit:
@@ -349,8 +353,9 @@ bytearray_append(PyByteArrayObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     int item;
 
-    if (!PyArg_Parse(arg, "O&:append", _getbytevalue, &item))
+    if (!PyArg_Parse(arg, "O&:append", _getbytevalue, &item)) {
         goto exit;
+    }
     return_value = bytearray_append_impl(self, item);
 
 exit:
@@ -394,8 +399,9 @@ bytearray_pop(PyByteArrayObject *self, PyObject *args)
     Py_ssize_t index = -1;
 
     if (!PyArg_ParseTuple(args, "|n:pop",
-        &index))
+        &index)) {
         goto exit;
+    }
     return_value = bytearray_pop_impl(self, index);
 
 exit:
@@ -423,8 +429,9 @@ bytearray_remove(PyByteArrayObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     int value;
 
-    if (!PyArg_Parse(arg, "O&:remove", _getbytevalue, &value))
+    if (!PyArg_Parse(arg, "O&:remove", _getbytevalue, &value)) {
         goto exit;
+    }
     return_value = bytearray_remove_impl(self, value);
 
 exit:
@@ -453,8 +460,9 @@ bytearray_strip(PyByteArrayObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "strip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytearray_strip_impl(self, bytes);
 
 exit:
@@ -483,8 +491,9 @@ bytearray_lstrip(PyByteArrayObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "lstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytearray_lstrip_impl(self, bytes);
 
 exit:
@@ -513,8 +522,9 @@ bytearray_rstrip(PyByteArrayObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "rstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytearray_rstrip_impl(self, bytes);
 
 exit:
@@ -547,13 +557,15 @@ static PyObject *
 bytearray_decode(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"encoding", "errors", NULL};
+    static const char * const _keywords[] = {"encoding", "errors", NULL};
+    static _PyArg_Parser _parser = {"|ss:decode", _keywords, 0};
     const char *encoding = NULL;
     const char *errors = NULL;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", _keywords,
-        &encoding, &errors))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &encoding, &errors)) {
         goto exit;
+    }
     return_value = bytearray_decode_impl(self, encoding, errors);
 
 exit:
@@ -592,12 +604,14 @@ static PyObject *
 bytearray_splitlines(PyByteArrayObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"keepends", NULL};
+    static const char * const _keywords[] = {"keepends", NULL};
+    static _PyArg_Parser _parser = {"|i:splitlines", _keywords, 0};
     int keepends = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i:splitlines", _keywords,
-        &keepends))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &keepends)) {
         goto exit;
+    }
     return_value = bytearray_splitlines_impl(self, keepends);
 
 exit:
@@ -617,17 +631,18 @@ PyDoc_STRVAR(bytearray_fromhex__doc__,
     {"fromhex", (PyCFunction)bytearray_fromhex, METH_O|METH_CLASS, bytearray_fromhex__doc__},
 
 static PyObject *
-bytearray_fromhex_impl(PyObject*cls, PyObject *string);
+bytearray_fromhex_impl(PyTypeObject *type, PyObject *string);
 
 static PyObject *
-bytearray_fromhex(PyTypeObject *cls, PyObject *arg)
+bytearray_fromhex(PyTypeObject *type, PyObject *arg)
 {
     PyObject *return_value = NULL;
     PyObject *string;
 
-    if (!PyArg_Parse(arg, "U:fromhex", &string))
+    if (!PyArg_Parse(arg, "U:fromhex", &string)) {
         goto exit;
-    return_value = bytearray_fromhex_impl((PyObject*)cls, string);
+    }
+    return_value = bytearray_fromhex_impl(type, string);
 
 exit:
     return return_value;
@@ -670,8 +685,9 @@ bytearray_reduce_ex(PyByteArrayObject *self, PyObject *args)
     int proto = 0;
 
     if (!PyArg_ParseTuple(args, "|i:__reduce_ex__",
-        &proto))
+        &proto)) {
         goto exit;
+    }
     return_value = bytearray_reduce_ex_impl(self, proto);
 
 exit:
@@ -695,4 +711,4 @@ bytearray_sizeof(PyByteArrayObject *self, PyObject *Py_UNUSED(ignored))
 {
     return bytearray_sizeof_impl(self);
 }
-/*[clinic end generated code: output=966c15ff22c5e243 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=59a0c86b29ff06d1 input=a9049054013a1b77]*/
diff --git a/Objects/clinic/bytesobject.c.h b/Objects/clinic/bytesobject.c.h
index 5a1a5e9..f179ce6 100644
--- a/Objects/clinic/bytesobject.c.h
+++ b/Objects/clinic/bytesobject.c.h
@@ -20,19 +20,21 @@ PyDoc_STRVAR(bytes_split__doc__,
     {"split", (PyCFunction)bytes_split, METH_VARARGS|METH_KEYWORDS, bytes_split__doc__},
 
 static PyObject *
-bytes_split_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit);
+bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit);
 
 static PyObject *
-bytes_split(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_split(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"sep", "maxsplit", NULL};
+    static const char * const _keywords[] = {"sep", "maxsplit", NULL};
+    static _PyArg_Parser _parser = {"|On:split", _keywords, 0};
     PyObject *sep = Py_None;
     Py_ssize_t maxsplit = -1;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:split", _keywords,
-        &sep, &maxsplit))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytes_split_impl(self, sep, maxsplit);
 
 exit:
@@ -64,14 +66,16 @@ bytes_partition(PyBytesObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     Py_buffer sep = {NULL, NULL};
 
-    if (!PyArg_Parse(arg, "y*:partition", &sep))
+    if (!PyArg_Parse(arg, "y*:partition", &sep)) {
         goto exit;
+    }
     return_value = bytes_partition_impl(self, &sep);
 
 exit:
     /* Cleanup for sep */
-    if (sep.obj)
+    if (sep.obj) {
        PyBuffer_Release(&sep);
+    }
 
     return return_value;
 }
@@ -101,14 +105,16 @@ bytes_rpartition(PyBytesObject *self, PyObject *arg)
     PyObject *return_value = NULL;
     Py_buffer sep = {NULL, NULL};
 
-    if (!PyArg_Parse(arg, "y*:rpartition", &sep))
+    if (!PyArg_Parse(arg, "y*:rpartition", &sep)) {
         goto exit;
+    }
     return_value = bytes_rpartition_impl(self, &sep);
 
 exit:
     /* Cleanup for sep */
-    if (sep.obj)
+    if (sep.obj) {
        PyBuffer_Release(&sep);
+    }
 
     return return_value;
 }
@@ -133,19 +139,21 @@ PyDoc_STRVAR(bytes_rsplit__doc__,
     {"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS|METH_KEYWORDS, bytes_rsplit__doc__},
 
 static PyObject *
-bytes_rsplit_impl(PyBytesObject*self, PyObject *sep, Py_ssize_t maxsplit);
+bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit);
 
 static PyObject *
-bytes_rsplit(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_rsplit(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"sep", "maxsplit", NULL};
+    static const char * const _keywords[] = {"sep", "maxsplit", NULL};
+    static _PyArg_Parser _parser = {"|On:rsplit", _keywords, 0};
     PyObject *sep = Py_None;
     Py_ssize_t maxsplit = -1;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On:rsplit", _keywords,
-        &sep, &maxsplit))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &sep, &maxsplit)) {
         goto exit;
+    }
     return_value = bytes_rsplit_impl(self, sep, maxsplit);
 
 exit:
@@ -189,8 +197,9 @@ bytes_strip(PyBytesObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "strip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytes_strip_impl(self, bytes);
 
 exit:
@@ -219,8 +228,9 @@ bytes_lstrip(PyBytesObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "lstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytes_lstrip_impl(self, bytes);
 
 exit:
@@ -249,8 +259,9 @@ bytes_rstrip(PyBytesObject *self, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "rstrip",
         0, 1,
-        &bytes))
+        &bytes)) {
         goto exit;
+    }
     return_value = bytes_rstrip_impl(self, bytes);
 
 exit:
@@ -258,45 +269,38 @@ exit:
 }
 
 PyDoc_STRVAR(bytes_translate__doc__,
-"translate(table, [deletechars])\n"
+"translate($self, table, /, delete=b\'\')\n"
+"--\n"
+"\n"
 "Return a copy with each character mapped by the given translation table.\n"
 "\n"
 "  table\n"
 "    Translation table, which must be a bytes object of length 256.\n"
 "\n"
-"All characters occurring in the optional argument deletechars are removed.\n"
+"All characters occurring in the optional argument delete are removed.\n"
 "The remaining characters are mapped through the given translation table.");
 
 #define BYTES_TRANSLATE_METHODDEF    \
-    {"translate", (PyCFunction)bytes_translate, METH_VARARGS, bytes_translate__doc__},
+    {"translate", (PyCFunction)bytes_translate, METH_VARARGS|METH_KEYWORDS, bytes_translate__doc__},
 
 static PyObject *
-bytes_translate_impl(PyBytesObject *self, PyObject *table, int group_right_1,
+bytes_translate_impl(PyBytesObject *self, PyObject *table,
                      PyObject *deletechars);
 
 static PyObject *
-bytes_translate(PyBytesObject *self, PyObject *args)
+bytes_translate(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"", "delete", NULL};
+    static _PyArg_Parser _parser = {"O|O:translate", _keywords, 0};
     PyObject *table;
-    int group_right_1 = 0;
     PyObject *deletechars = NULL;
 
-    switch (PyTuple_GET_SIZE(args)) {
-        case 1:
-            if (!PyArg_ParseTuple(args, "O:translate", &table))
-                goto exit;
-            break;
-        case 2:
-            if (!PyArg_ParseTuple(args, "OO:translate", &table, &deletechars))
-                goto exit;
-            group_right_1 = 1;
-            break;
-        default:
-            PyErr_SetString(PyExc_TypeError, "bytes.translate requires 1 to 2 arguments");
-            goto exit;
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &table, &deletechars)) {
+        goto exit;
     }
-    return_value = bytes_translate_impl(self, table, group_right_1, deletechars);
+    return_value = bytes_translate_impl(self, table, deletechars);
 
 exit:
     return return_value;
@@ -327,17 +331,20 @@ bytes_maketrans(void *null, PyObject *args)
     Py_buffer to = {NULL, NULL};
 
     if (!PyArg_ParseTuple(args, "y*y*:maketrans",
-        &frm, &to))
+        &frm, &to)) {
         goto exit;
+    }
     return_value = bytes_maketrans_impl(&frm, &to);
 
 exit:
     /* Cleanup for frm */
-    if (frm.obj)
+    if (frm.obj) {
        PyBuffer_Release(&frm);
+    }
     /* Cleanup for to */
-    if (to.obj)
+    if (to.obj) {
        PyBuffer_Release(&to);
+    }
 
     return return_value;
 }
@@ -359,11 +366,11 @@ PyDoc_STRVAR(bytes_replace__doc__,
     {"replace", (PyCFunction)bytes_replace, METH_VARARGS, bytes_replace__doc__},
 
 static PyObject *
-bytes_replace_impl(PyBytesObject*self, Py_buffer *old, Py_buffer *new,
+bytes_replace_impl(PyBytesObject *self, Py_buffer *old, Py_buffer *new,
                    Py_ssize_t count);
 
 static PyObject *
-bytes_replace(PyBytesObject*self, PyObject *args)
+bytes_replace(PyBytesObject *self, PyObject *args)
 {
     PyObject *return_value = NULL;
     Py_buffer old = {NULL, NULL};
@@ -371,17 +378,20 @@ bytes_replace(PyBytesObject*self, PyObject *args)
     Py_ssize_t count = -1;
 
     if (!PyArg_ParseTuple(args, "y*y*|n:replace",
-        &old, &new, &count))
+        &old, &new, &count)) {
         goto exit;
+    }
     return_value = bytes_replace_impl(self, &old, &new, count);
 
 exit:
     /* Cleanup for old */
-    if (old.obj)
+    if (old.obj) {
        PyBuffer_Release(&old);
+    }
     /* Cleanup for new */
-    if (new.obj)
+    if (new.obj) {
        PyBuffer_Release(&new);
+    }
 
     return return_value;
 }
@@ -405,20 +415,22 @@ PyDoc_STRVAR(bytes_decode__doc__,
     {"decode", (PyCFunction)bytes_decode, METH_VARARGS|METH_KEYWORDS, bytes_decode__doc__},
 
 static PyObject *
-bytes_decode_impl(PyBytesObject*self, const char *encoding,
+bytes_decode_impl(PyBytesObject *self, const char *encoding,
                   const char *errors);
 
 static PyObject *
-bytes_decode(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_decode(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"encoding", "errors", NULL};
+    static const char * const _keywords[] = {"encoding", "errors", NULL};
+    static _PyArg_Parser _parser = {"|ss:decode", _keywords, 0};
     const char *encoding = NULL;
     const char *errors = NULL;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", _keywords,
-        &encoding, &errors))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &encoding, &errors)) {
         goto exit;
+    }
     return_value = bytes_decode_impl(self, encoding, errors);
 
 exit:
@@ -438,18 +450,20 @@ PyDoc_STRVAR(bytes_splitlines__doc__,
     {"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS|METH_KEYWORDS, bytes_splitlines__doc__},
 
 static PyObject *
-bytes_splitlines_impl(PyBytesObject*self, int keepends);
+bytes_splitlines_impl(PyBytesObject *self, int keepends);
 
 static PyObject *
-bytes_splitlines(PyBytesObject*self, PyObject *args, PyObject *kwargs)
+bytes_splitlines(PyBytesObject *self, PyObject *args, PyObject *kwargs)
 {
     PyObject *return_value = NULL;
-    static char *_keywords[] = {"keepends", NULL};
+    static const char * const _keywords[] = {"keepends", NULL};
+    static _PyArg_Parser _parser = {"|i:splitlines", _keywords, 0};
     int keepends = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i:splitlines", _keywords,
-        &keepends))
+    if (!_PyArg_ParseTupleAndKeywordsFast(args, kwargs, &_parser,
+        &keepends)) {
         goto exit;
+    }
     return_value = bytes_splitlines_impl(self, keepends);
 
 exit:
@@ -477,11 +491,12 @@ bytes_fromhex(PyTypeObject *type, PyObject *arg)
     PyObject *return_value = NULL;
     PyObject *string;
 
-    if (!PyArg_Parse(arg, "U:fromhex", &string))
+    if (!PyArg_Parse(arg, "U:fromhex", &string)) {
         goto exit;
+    }
     return_value = bytes_fromhex_impl(type, string);
 
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=bd0ce8f25d7e18f4 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=5618c05c24c1e617 input=a9049054013a1b77]*/
diff --git a/Objects/clinic/dictobject.c.h b/Objects/clinic/dictobject.c.h
index 5288b9a..d0cdfc3 100644
--- a/Objects/clinic/dictobject.c.h
+++ b/Objects/clinic/dictobject.c.h
@@ -23,8 +23,9 @@ dict_fromkeys(PyTypeObject *type, PyObject *args)
 
     if (!PyArg_UnpackTuple(args, "fromkeys",
         1, 2,
-        &iterable, &value))
+        &iterable, &value)) {
         goto exit;
+    }
     return_value = dict_fromkeys_impl(type, iterable, value);
 
 exit:
@@ -39,4 +40,4 @@ PyDoc_STRVAR(dict___contains____doc__,
 
 #define DICT___CONTAINS___METHODDEF    \
     {"__contains__", (PyCFunction)dict___contains__, METH_O|METH_COEXIST, dict___contains____doc__},
-/*[clinic end generated code: output=fe74d676332fdba6 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=926326109e3d9839 input=a9049054013a1b77]*/
diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h
index d42a700..891e90c 100644
--- a/Objects/clinic/unicodeobject.c.h
+++ b/Objects/clinic/unicodeobject.c.h
@@ -31,11 +31,12 @@ unicode_maketrans(void *null, PyObject *args)
     PyObject *z = NULL;
 
     if (!PyArg_ParseTuple(args, "O|UU:maketrans",
-        &x, &y, &z))
+        &x, &y, &z)) {
         goto exit;
+    }
     return_value = unicode_maketrans_impl(x, y, z);
 
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=94affdff5b2daff5 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=4a86dd108d92d104 input=a9049054013a1b77]*/
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 6c0e5bf..d514ec1 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -1,3 +1,5 @@
+#include <stdbool.h>
+
 #include "Python.h"
 #include "code.h"
 #include "structmember.h"
@@ -5,6 +7,12 @@
 #define NAME_CHARS \
     "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"
 
+/* Holder for co_extra information */
+typedef struct {
+    Py_ssize_t ce_size;
+    void **ce_extras;
+} _PyCodeObjectExtra;
+
 /* all_name_chars(s): true iff all chars in s are valid NAME_CHARS */
 
 static int
@@ -96,7 +104,7 @@ PyCode_New(int argcount, int kwonlyargcount,
         Py_ssize_t total_args = argcount + kwonlyargcount +
             ((flags & CO_VARARGS) != 0) + ((flags & CO_VARKEYWORDS) != 0);
         Py_ssize_t alloc_size = sizeof(unsigned char) * n_cellvars;
-        int used_cell2arg = 0;
+        bool used_cell2arg = false;
         cell2arg = PyMem_MALLOC(alloc_size);
         if (cell2arg == NULL)
             return NULL;
@@ -109,7 +117,7 @@ PyCode_New(int argcount, int kwonlyargcount,
                 PyObject *arg = PyTuple_GET_ITEM(varnames, j);
                 if (!PyUnicode_Compare(cell, arg)) {
                     cell2arg[i] = j;
-                    used_cell2arg = 1;
+                    used_cell2arg = true;
                     break;
                 }
             }
@@ -152,6 +160,7 @@ PyCode_New(int argcount, int kwonlyargcount,
     co->co_lnotab = lnotab;
     co->co_zombieframe = NULL;
     co->co_weakreflist = NULL;
+    co->co_extra = NULL;
     return co;
 }
 
@@ -361,6 +370,21 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw)
 static void
 code_dealloc(PyCodeObject *co)
 {
+    if (co->co_extra != NULL) {
+        PyThreadState *tstate = PyThreadState_Get();
+        _PyCodeObjectExtra *co_extra = co->co_extra;
+        /* Release each slot with the free function stored at the same index. */
+        for (Py_ssize_t i = 0; i < co_extra->ce_size; i++) {
+            freefunc free_extra = tstate->co_extra_freefuncs[i];
+            if (free_extra != NULL) {
+                free_extra(co_extra->ce_extras[i]);
+            }
+        }
+        /* ce_extras is allocated separately from the holder; free both. */
+        PyMem_FREE(co_extra->ce_extras);
+        PyMem_FREE(co_extra);
+    }
+
     Py_XDECREF(co->co_code);
     Py_XDECREF(co->co_consts);
     Py_XDECREF(co->co_names);
@@ -694,7 +718,8 @@ PyCode_Addr2Line(PyCodeObject *co, int addrq)
         addr += *p++;
         if (addr > addrq)
             break;
-        line += *p++;
+        line += (signed char)*p;
+        p++;
     }
     return line;
 }
@@ -729,17 +754,19 @@ _PyCode_CheckLineNumber(PyCodeObject* co, int lasti, PyAddrPair *bounds)
         if (addr + *p > lasti)
             break;
         addr += *p++;
-        if (*p)
+        if ((signed char)*p)
             bounds->ap_lower = addr;
-        line += *p++;
+        line += (signed char)*p;
+        p++;
         --size;
     }
 
     if (size > 0) {
         while (--size >= 0) {
             addr += *p++;
-            if (*p++)
+            if ((signed char)*p)
                 break;
+            p++;
         }
         bounds->ap_upper = addr;
     }
@@ -749,3 +776,93 @@ _PyCode_CheckLineNumber(PyCodeObject* co, int lasti, PyAddrPair *bounds)
 
     return line;
 }
+
+
+/* Fetch slot `index` of the code object's co_extra holder into *extra;
+   *extra is set to NULL when nothing has been stored in that slot.
+   Return 0 on success, -1 (via PyErr_BadInternalCall) on a bad argument. */
+int
+_PyCode_GetExtra(PyObject *code, Py_ssize_t index, void **extra)
+{
+    /* A negative index would read before the start of ce_extras. */
+    if (!PyCode_Check(code) || index < 0) {
+        PyErr_BadInternalCall();
+        return -1;
+    }
+
+    PyCodeObject *o = (PyCodeObject*) code;
+    _PyCodeObjectExtra *co_extra = (_PyCodeObjectExtra*) o->co_extra;
+
+    /* No holder yet, or the holder was sized before this slot existed. */
+    if (co_extra == NULL || co_extra->ce_size <= index) {
+        *extra = NULL;
+        return 0;
+    }
+
+    *extra = co_extra->ce_extras[index];
+    return 0;
+}
+
+
+/* Store `extra` in slot `index` of the code object's co_extra holder,
+   creating or growing the holder to co_extra_user_count slots as needed.
+   Return 0 on success, -1 with an exception set on failure. */
+int
+_PyCode_SetExtra(PyObject *code, Py_ssize_t index, void *extra)
+{
+    PyThreadState *tstate = PyThreadState_Get();
+
+    if (!PyCode_Check(code) || index < 0 ||
+            index >= tstate->co_extra_user_count) {
+        PyErr_BadInternalCall();
+        return -1;
+    }
+
+    PyCodeObject *o = (PyCodeObject*) code;
+    _PyCodeObjectExtra *co_extra = (_PyCodeObjectExtra *) o->co_extra;
+    Py_ssize_t new_size = tstate->co_extra_user_count;
+
+    if (co_extra == NULL) {
+        co_extra = PyMem_Malloc(sizeof(_PyCodeObjectExtra));
+        if (co_extra == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+
+        co_extra->ce_extras = PyMem_Malloc(new_size * sizeof(void*));
+        if (co_extra->ce_extras == NULL) {
+            PyMem_Free(co_extra);
+            PyErr_NoMemory();
+            return -1;
+        }
+
+        for (Py_ssize_t i = 0; i < new_size; i++) {
+            co_extra->ce_extras[i] = NULL;
+        }
+        co_extra->ce_size = new_size;
+
+        /* Attach the holder only once it is fully built, so code_dealloc()
+           never walks a half-initialized one. */
+        o->co_extra = co_extra;
+    }
+    else if (co_extra->ce_size <= index) {
+        /* Realloc into a temporary: on failure the old array is untouched
+           and must stay reachable through co_extra. */
+        void **grown = PyMem_Realloc(co_extra->ce_extras,
+                                     new_size * sizeof(void*));
+        if (grown == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+
+        /* NULL only the new tail, starting at the old size. */
+        for (Py_ssize_t i = co_extra->ce_size; i < new_size; i++) {
+            grown[i] = NULL;
+        }
+        co_extra->ce_extras = grown;
+        co_extra->ce_size = new_size;
+    }
+
+    co_extra->ce_extras[index] = extra;
+    return 0;
+}
diff --git a/Objects/descrobject.c b/Objects/descrobject.c
index da68e3b..076e741 100644
--- a/Objects/descrobject.c
+++ b/Objects/descrobject.c
@@ -22,7 +22,7 @@ descr_name(PyDescrObject *descr)
 }
 
 static PyObject *
-descr_repr(PyDescrObject *descr, char *format)
+descr_repr(PyDescrObject *descr, const char *format)
 {
     PyObject *name = NULL;
     if (descr->d_name != NULL && PyUnicode_Check(descr->d_name))
@@ -213,7 +213,7 @@ static PyObject *
 methoddescr_call(PyMethodDescrObject *descr, PyObject *args, PyObject *kwds)
 {
     Py_ssize_t argc;
-    PyObject *self, *func, *result;
+    PyObject *self, *func, *result, **stack;
 
     /* Make sure that the first argument is acceptable as 'self' */
     assert(PyTuple_Check(args));
@@ -242,13 +242,8 @@ methoddescr_call(PyMethodDescrObject *descr, PyObject *args, PyObject *kwds)
     func = PyCFunction_NewEx(descr->d_method, self, NULL);
     if (func == NULL)
         return NULL;
-    args = PyTuple_GetSlice(args, 1, argc);
-    if (args == NULL) {
-        Py_DECREF(func);
-        return NULL;
-    }
-    result = PyEval_CallObjectWithKeywords(func, args, kwds);
-    Py_DECREF(args);
+    stack = &PyTuple_GET_ITEM(args, 1);
+    result = _PyObject_FastCallDict(func, stack, argc - 1, kwds);
     Py_DECREF(func);
     return result;
 }
@@ -258,7 +253,7 @@ classmethoddescr_call(PyMethodDescrObject *descr, PyObject *args,
                       PyObject *kwds)
 {
     Py_ssize_t argc;
-    PyObject *self, *func, *result;
+    PyObject *self, *func, *result, **stack;
 
     /* Make sure that the first argument is acceptable as 'self' */
     assert(PyTuple_Check(args));
@@ -295,14 +290,9 @@ classmethoddescr_call(PyMethodDescrObject *descr, PyObject *args,
     func = PyCFunction_NewEx(descr->d_method, self, NULL);
     if (func == NULL)
         return NULL;
-    args = PyTuple_GetSlice(args, 1, argc);
-    if (args == NULL) {
-        Py_DECREF(func);
-        return NULL;
-    }
-    result = PyEval_CallObjectWithKeywords(func, args, kwds);
+    stack = &PyTuple_GET_ITEM(args, 1);
+    result = _PyObject_FastCallDict(func, stack, argc - 1, kwds);
     Py_DECREF(func);
-    Py_DECREF(args);
     return result;
 }
 
@@ -310,7 +300,7 @@ static PyObject *
 wrapperdescr_call(PyWrapperDescrObject *descr, PyObject *args, PyObject *kwds)
 {
     Py_ssize_t argc;
-    PyObject *self, *func, *result;
+    PyObject *self, *func, *result, **stack;
 
     /* Make sure that the first argument is acceptable as 'self' */
     assert(PyTuple_Check(args));
@@ -339,13 +329,9 @@ wrapperdescr_call(PyWrapperDescrObject *descr, PyObject *args, PyObject *kwds)
     func = PyWrapper_New((PyObject *)descr, self);
     if (func == NULL)
         return NULL;
-    args = PyTuple_GetSlice(args, 1, argc);
-    if (args == NULL) {
-        Py_DECREF(func);
-        return NULL;
-    }
-    result = PyEval_CallObjectWithKeywords(func, args, kwds);
-    Py_DECREF(args);
+
+    stack = &PyTuple_GET_ITEM(args, 1);
+    result = _PyObject_FastCallDict(func, stack, argc - 1, kwds);
     Py_DECREF(func);
     return result;
 }
@@ -1033,7 +1019,7 @@ wrapper_dealloc(wrapperobject *wp)
 static PyObject *
 wrapper_richcompare(PyObject *a, PyObject *b, int op)
 {
-    Py_intptr_t result;
+    intptr_t result;
     PyObject *v;
     PyWrapperDescrObject *a_descr, *b_descr;
 
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index e04ab2b..d993238 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -321,7 +321,7 @@ static PyDictKeysObject *new_keys_object(Py_ssize_t size)
 
     assert(size >= PyDict_MINSIZE_SPLIT);
     assert(IS_POWER_OF_2(size));
-    dk = PyMem_MALLOC(sizeof(PyDictKeysObject) +
+    dk = PyObject_MALLOC(sizeof(PyDictKeysObject) +
                       sizeof(PyDictKeyEntry) * (size-1));
     if (dk == NULL) {
         PyErr_NoMemory();
@@ -350,7 +350,7 @@ free_keys_object(PyDictKeysObject *keys)
         Py_XDECREF(entries[i].me_key);
         Py_XDECREF(entries[i].me_value);
     }
-    PyMem_FREE(keys);
+    PyObject_FREE(keys);
 }
 
 #define new_values(size) PyMem_NEW(PyObject *, size)
@@ -961,7 +961,7 @@ dictresize(PyDictObject *mp, Py_ssize_t minused)
             }
         }
         assert(oldkeys->dk_refcnt == 1);
-        DK_DEBUG_DECREF PyMem_FREE(oldkeys);
+        DK_DEBUG_DECREF PyObject_FREE(oldkeys);
     }
     return 0;
 }
@@ -1160,39 +1160,42 @@ _PyDict_GetItemIdWithError(PyObject *dp, struct _Py_Identifier *key)
     return PyDict_GetItemWithError(dp, kv);
 }
 
-/* Fast version of global value lookup.
+/* Fast version of global value lookup (LOAD_GLOBAL).
  * Lookup in globals, then builtins.
+ *
+ * Raise an exception and return NULL if an error occurred (e.g. computing the
+ * key hash failed, key comparison failed, ...). Return NULL without setting
+ * an exception if the key doesn't exist. Return the value if the key exists.
  */
 PyObject *
 _PyDict_LoadGlobal(PyDictObject *globals, PyDictObject *builtins, PyObject *key)
 {
-    PyObject *x;
-    if (PyUnicode_CheckExact(key)) {
-        PyObject **value_addr;
-        Py_hash_t hash = ((PyASCIIObject *)key)->hash;
-        if (hash != -1) {
-            PyDictKeyEntry *e;
-            e = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
-            if (e == NULL) {
-                return NULL;
-            }
-            x = *value_addr;
-            if (x != NULL)
-                return x;
-            e = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
-            if (e == NULL) {
-                return NULL;
-            }
-            x = *value_addr;
-            return x;
-        }
+    Py_hash_t hash;
+    PyDictKeyEntry *entry;
+    PyObject **value_addr;
+    PyObject *value;
+
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1)
+    {
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            return NULL;
     }
-    x = PyDict_GetItemWithError((PyObject *)globals, key);
-    if (x != NULL)
-        return x;
-    if (PyErr_Occurred())
+
+    /* namespace 1: globals */
+    entry = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
+    if (entry == NULL)
+        return NULL;
+    value = *value_addr;
+    if (value != NULL)
+        return value;
+
+    /* namespace 2: builtins */
+    entry = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
+    if (entry == NULL)
         return NULL;
-    return PyDict_GetItemWithError((PyObject *)builtins, key);
+    return *value_addr;
 }
 
 /* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
@@ -1917,7 +1920,7 @@ dict_fromkeys_impl(PyTypeObject *type, PyObject *iterable, PyObject *value)
 }
 
 static int
-dict_update_common(PyObject *self, PyObject *args, PyObject *kwds, char *methname)
+dict_update_common(PyObject *self, PyObject *args, PyObject *kwds, const char *methname)
 {
     PyObject *arg = NULL;
     int result = 0;
@@ -2516,24 +2519,26 @@ dict_popitem(PyDictObject *mp)
 static int
 dict_traverse(PyObject *op, visitproc visit, void *arg)
 {
-    Py_ssize_t i, n;
     PyDictObject *mp = (PyDictObject *)op;
-    if (mp->ma_keys->dk_lookup == lookdict) {
-        for (i = 0; i < DK_SIZE(mp->ma_keys); i++) {
-            if (mp->ma_keys->dk_entries[i].me_value != NULL) {
-                Py_VISIT(mp->ma_keys->dk_entries[i].me_value);
-                Py_VISIT(mp->ma_keys->dk_entries[i].me_key);
+    PyDictKeysObject *keys = mp->ma_keys;
+    PyDictKeyEntry *entries = &keys->dk_entries[0];
+    Py_ssize_t i, n = DK_SIZE(mp->ma_keys);
+    if (keys->dk_lookup == lookdict) {
+        for (i = 0; i < n; i++) {
+            if (entries[i].me_value != NULL) {
+                Py_VISIT(entries[i].me_value);
+                Py_VISIT(entries[i].me_key);
             }
         }
     } else {
         if (mp->ma_values != NULL) {
-            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+            for (i = 0; i < n; i++) {
                 Py_VISIT(mp->ma_values[i]);
             }
         }
         else {
-            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
-                Py_VISIT(mp->ma_keys->dk_entries[i].me_value);
+            for (i = 0; i < n; i++) {
+                Py_VISIT(entries[i].me_value);
             }
         }
     }
diff --git a/Objects/enumobject.c b/Objects/enumobject.c
index c458cfe..dae166d 100644
--- a/Objects/enumobject.c
+++ b/Objects/enumobject.c
@@ -250,6 +250,13 @@ reversed_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         return NULL;
 
     reversed_meth = _PyObject_LookupSpecial(seq, &PyId___reversed__);
+    if (reversed_meth == Py_None) {
+        Py_DECREF(reversed_meth);
+        PyErr_Format(PyExc_TypeError,
+                     "'%.200s' object is not reversible",
+                     Py_TYPE(seq)->tp_name);
+        return NULL;
+    }
     if (reversed_meth != NULL) {
         PyObject *res = PyObject_CallFunctionObjArgs(reversed_meth, NULL);
         Py_DECREF(reversed_meth);
@@ -259,8 +266,9 @@ reversed_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         return NULL;
 
     if (!PySequence_Check(seq)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "argument to reversed() must be a sequence");
+        PyErr_Format(PyExc_TypeError,
+                     "'%.200s' object is not reversible",
+                     Py_TYPE(seq)->tp_name);
         return NULL;
     }
 
diff --git a/Objects/exceptions.c b/Objects/exceptions.c
index 0749e90..6fb5eb7 100644
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@@ -59,15 +59,11 @@ BaseException_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 static int
 BaseException_init(PyBaseExceptionObject *self, PyObject *args, PyObject *kwds)
 {
-    PyObject *tmp;
-
     if (!_PyArg_NoKeywords(Py_TYPE(self)->tp_name, kwds))
         return -1;
 
-    tmp = self->args;
-    self->args = args;
-    Py_INCREF(self->args);
-    Py_XDECREF(tmp);
+    Py_INCREF(args);
+    Py_XSETREF(self->args, args);
 
     return 0;
 }
@@ -234,7 +230,7 @@ BaseException_set_tb(PyBaseExceptionObject *self, PyObject *tb)
         return -1;
     }
 
-    Py_XINCREF(tb);
+    Py_INCREF(tb);
     Py_XSETREF(self->traceback, tb);
     return 0;
 }
@@ -328,11 +324,10 @@ PyException_GetCause(PyObject *self) {
 
 /* Steals a reference to cause */
 void
-PyException_SetCause(PyObject *self, PyObject *cause) {
-    PyObject *old_cause = ((PyBaseExceptionObject *)self)->cause;
-    ((PyBaseExceptionObject *)self)->cause = cause;
+PyException_SetCause(PyObject *self, PyObject *cause)
+{
     ((PyBaseExceptionObject *)self)->suppress_context = 1;
-    Py_XDECREF(old_cause);
+    Py_XSETREF(((PyBaseExceptionObject *)self)->cause, cause);
 }
 
 PyObject *
@@ -344,10 +339,9 @@ PyException_GetContext(PyObject *self) {
 
 /* Steals a reference to context */
 void
-PyException_SetContext(PyObject *self, PyObject *context) {
-    PyObject *old_context = ((PyBaseExceptionObject *)self)->context;
-    ((PyBaseExceptionObject *)self)->context = context;
-    Py_XDECREF(old_context);
+PyException_SetContext(PyObject *self, PyObject *context)
+{
+    Py_XSETREF(((PyBaseExceptionObject *)self)->context, context);
 }
 
 
@@ -712,6 +706,13 @@ ComplexExtendsException(PyExc_Exception, ImportError,
                         "module.");
 
 /*
+ *    ModuleNotFoundError extends ImportError
+ */
+
+MiddlingExtendsException(PyExc_ImportError, ModuleNotFoundError, ImportError,
+                         "Module not found.");
+
+/*
  *    OSError extends Exception
  */
 
@@ -991,7 +992,7 @@ OSError_init(PyOSErrorObject *self, PyObject *args, PyObject *kwds)
     return 0;
 
 error:
-    Py_XDECREF(args);
+    Py_DECREF(args);
     return -1;
 }
 
@@ -1071,8 +1072,7 @@ OSError_str(PyOSErrorObject *self)
     }
     if (self->myerrno && self->strerror)
         return PyUnicode_FromFormat("[Errno %S] %S",
-                                    self->myerrno ? self->myerrno: Py_None,
-                                    self->strerror ? self->strerror: Py_None);
+                                    self->myerrno, self->strerror);
     return BaseException_str((PyBaseExceptionObject *)self);
 }
 
@@ -2476,6 +2476,7 @@ _PyExc_Init(PyObject *bltinmod)
     PRE_INIT(SystemExit)
     PRE_INIT(KeyboardInterrupt)
     PRE_INIT(ImportError)
+    PRE_INIT(ModuleNotFoundError)
     PRE_INIT(OSError)
     PRE_INIT(EOFError)
     PRE_INIT(RuntimeError)
@@ -2548,6 +2549,7 @@ _PyExc_Init(PyObject *bltinmod)
     POST_INIT(SystemExit)
     POST_INIT(KeyboardInterrupt)
     POST_INIT(ImportError)
+    POST_INIT(ModuleNotFoundError)
     POST_INIT(OSError)
     INIT_ALIAS(EnvironmentError, OSError)
     INIT_ALIAS(IOError, OSError)
@@ -2610,7 +2612,9 @@ _PyExc_Init(PyObject *bltinmod)
     ADD_ERRNO(BlockingIOError, EWOULDBLOCK);
     POST_INIT(BrokenPipeError);
     ADD_ERRNO(BrokenPipeError, EPIPE);
+#ifdef ESHUTDOWN
     ADD_ERRNO(BrokenPipeError, ESHUTDOWN);
+#endif
     POST_INIT(ChildProcessError);
     ADD_ERRNO(ChildProcessError, ECHILD);
     POST_INIT(ConnectionAbortedError);
diff --git a/Objects/fileobject.c b/Objects/fileobject.c
index 234d07e..f442418 100644
--- a/Objects/fileobject.c
+++ b/Objects/fileobject.c
@@ -127,7 +127,7 @@ PyFile_GetLine(PyObject *f, int n)
 int
 PyFile_WriteObject(PyObject *v, PyObject *f, int flags)
 {
-    PyObject *writer, *value, *args, *result;
+    PyObject *writer, *value, *result;
     _Py_IDENTIFIER(write);
 
     if (f == NULL) {
@@ -146,14 +146,7 @@ PyFile_WriteObject(PyObject *v, PyObject *f, int flags)
         Py_DECREF(writer);
         return -1;
     }
-    args = PyTuple_Pack(1, value);
-    if (args == NULL) {
-        Py_DECREF(value);
-        Py_DECREF(writer);
-        return -1;
-    }
-    result = PyEval_CallObject(writer, args);
-    Py_DECREF(args);
+    result = _PyObject_CallArg1(writer, value);
     Py_DECREF(value);
     Py_DECREF(writer);
     if (result == NULL)
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index d92bec3..0642b16 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -215,35 +215,49 @@ double
 PyFloat_AsDouble(PyObject *op)
 {
     PyNumberMethods *nb;
-    PyFloatObject *fo;
+    PyObject *res;
     double val;
 
-    if (op && PyFloat_Check(op))
-        return PyFloat_AS_DOUBLE((PyFloatObject*) op);
-
     if (op == NULL) {
         PyErr_BadArgument();
         return -1;
     }
 
-    if ((nb = Py_TYPE(op)->tp_as_number) == NULL || nb->nb_float == NULL) {
-        PyErr_SetString(PyExc_TypeError, "a float is required");
-        return -1;
+    if (PyFloat_Check(op)) {
+        return PyFloat_AS_DOUBLE(op);
     }
 
-    fo = (PyFloatObject*) (*nb->nb_float) (op);
-    if (fo == NULL)
-        return -1;
-    if (!PyFloat_Check(fo)) {
-        Py_DECREF(fo);
-        PyErr_SetString(PyExc_TypeError,
-                        "nb_float should return float object");
+    nb = Py_TYPE(op)->tp_as_number;
+    if (nb == NULL || nb->nb_float == NULL) {
+        PyErr_Format(PyExc_TypeError, "must be real number, not %.50s",
+                     op->ob_type->tp_name);
         return -1;
     }
 
-    val = PyFloat_AS_DOUBLE(fo);
-    Py_DECREF(fo);
+    res = (*nb->nb_float) (op);
+    if (res == NULL) {
+        return -1;
+    }
+    if (!PyFloat_CheckExact(res)) {
+        if (!PyFloat_Check(res)) {
+            PyErr_Format(PyExc_TypeError,
+                         "%.50s.__float__ returned non-float (type %.50s)",
+                         op->ob_type->tp_name, res->ob_type->tp_name);
+            Py_DECREF(res);
+            return -1;
+        }
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "%.50s.__float__ returned non-float (type %.50s).  "
+                "The ability to return an instance of a strict subclass of float "
+                "is deprecated, and may be removed in a future version of Python.",
+                op->ob_type->tp_name, res->ob_type->tp_name)) {
+            Py_DECREF(res);
+            return -1;
+        }
+    }
 
+    val = PyFloat_AS_DOUBLE(res);
+    Py_DECREF(res);
     return val;
 }
 
@@ -1195,7 +1209,7 @@ Return a hexadecimal representation of a floating-point number.\n\
 static PyObject *
 float_fromhex(PyObject *cls, PyObject *arg)
 {
-    PyObject *result_as_float, *result;
+    PyObject *result;
     double x;
     long exp, top_exp, lsb, key_digit;
     char *s, *coeff_start, *s_store, *coeff_end, *exp_start, *s_end;
@@ -1410,11 +1424,10 @@ float_fromhex(PyObject *cls, PyObject *arg)
         s++;
     if (s != s_end)
         goto parse_error;
-    result_as_float = Py_BuildValue("(d)", negate ? -x : x);
-    if (result_as_float == NULL)
-        return NULL;
-    result = PyObject_CallObject(cls, result_as_float);
-    Py_DECREF(result_as_float);
+    result = PyFloat_FromDouble(negate ? -x : x);
+    if (cls != (PyObject *)&PyFloat_Type && result != NULL) {
+        Py_SETREF(result, PyObject_CallFunctionObjArgs(cls, result, NULL));
+    }
     return result;
 
   overflow_error:
@@ -1451,29 +1464,23 @@ float_as_integer_ratio(PyObject *v, PyObject *unused)
     int exponent;
     int i;
 
-    PyObject *prev;
     PyObject *py_exponent = NULL;
     PyObject *numerator = NULL;
     PyObject *denominator = NULL;
     PyObject *result_pair = NULL;
     PyNumberMethods *long_methods = PyLong_Type.tp_as_number;
 
-#define INPLACE_UPDATE(obj, call) \
-    prev = obj; \
-    obj = call; \
-    Py_DECREF(prev); \
-
     CONVERT_TO_DOUBLE(v, self);
 
     if (Py_IS_INFINITY(self)) {
-      PyErr_SetString(PyExc_OverflowError,
-                      "Cannot pass infinity to float.as_integer_ratio.");
-      return NULL;
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot convert Infinity to integer ratio");
+        return NULL;
     }
     if (Py_IS_NAN(self)) {
-      PyErr_SetString(PyExc_ValueError,
-                      "Cannot pass NaN to float.as_integer_ratio.");
-      return NULL;
+        PyErr_SetString(PyExc_ValueError,
+                        "cannot convert NaN to integer ratio");
+        return NULL;
     }
 
     PyFPE_START_PROTECT("as_integer_ratio", goto error);
@@ -1489,29 +1496,31 @@ float_as_integer_ratio(PyObject *v, PyObject *unused)
        to be truncated by PyLong_FromDouble(). */
 
     numerator = PyLong_FromDouble(float_part);
-    if (numerator == NULL) goto error;
+    if (numerator == NULL)
+        goto error;
+    denominator = PyLong_FromLong(1);
+    if (denominator == NULL)
+        goto error;
+    py_exponent = PyLong_FromLong(Py_ABS(exponent));
+    if (py_exponent == NULL)
+        goto error;
 
     /* fold in 2**exponent */
-    denominator = PyLong_FromLong(1);
-    py_exponent = PyLong_FromLong(labs((long)exponent));
-    if (py_exponent == NULL) goto error;
-    INPLACE_UPDATE(py_exponent,
-                   long_methods->nb_lshift(denominator, py_exponent));
-    if (py_exponent == NULL) goto error;
     if (exponent > 0) {
-        INPLACE_UPDATE(numerator,
-                       long_methods->nb_multiply(numerator, py_exponent));
-        if (numerator == NULL) goto error;
+        Py_SETREF(numerator,
+                  long_methods->nb_lshift(numerator, py_exponent));
+        if (numerator == NULL)
+            goto error;
     }
     else {
-        Py_DECREF(denominator);
-        denominator = py_exponent;
-        py_exponent = NULL;
+        Py_SETREF(denominator,
+                  long_methods->nb_lshift(denominator, py_exponent));
+        if (denominator == NULL)
+            goto error;
     }
 
     result_pair = PyTuple_Pack(2, numerator, denominator);
 
-#undef INPLACE_UPDATE
 error:
     Py_XDECREF(py_exponent);
     Py_XDECREF(denominator);
@@ -1966,8 +1975,120 @@ _PyFloat_DebugMallocStats(FILE *out)
 
 
 /*----------------------------------------------------------------------------
- * _PyFloat_{Pack,Unpack}{4,8}.  See floatobject.h.
+ * _PyFloat_{Pack,Unpack}{2,4,8}.  See floatobject.h.
+ * To match the NPY_HALF_ROUND_TIES_TO_EVEN behavior in:
+ * https://github.com/numpy/numpy/blob/master/numpy/core/src/npymath/halffloat.c
+ * We use:
+ *       bits = (unsigned short)f;    Note the truncation
+ *       if ((f - bits > 0.5) || (f - bits == 0.5 && bits % 2)) {
+ *           bits++;
+ *       }
  */
+
+int
+_PyFloat_Pack2(double x, unsigned char *p, int le)
+{
+    unsigned char sign;
+    int e;
+    double f;
+    unsigned short bits;
+    int incr = 1;
+
+    if (x == 0.0) {
+        sign = (copysign(1.0, x) == -1.0);
+        e = 0;
+        bits = 0;
+    }
+    else if (Py_IS_INFINITY(x)) {
+        sign = (x < 0.0);
+        e = 0x1f;
+        bits = 0;
+    }
+    else if (Py_IS_NAN(x)) {
+        /* There are 2046 distinct half-precision NaNs (1022 signaling and
+           1024 quiet), but there are only two quiet NaNs that don't arise by
+           quieting a signaling NaN; we get those by setting the topmost bit
+           of the fraction field and clearing all other fraction bits. We
+           choose the one with the appropriate sign. */
+        sign = (copysign(1.0, x) == -1.0);
+        e = 0x1f;
+        bits = 512;
+    }
+    else {
+        sign = (x < 0.0);
+        if (sign) {
+            x = -x;
+        }
+
+        f = frexp(x, &e);
+        if (f < 0.5 || f >= 1.0) {
+            PyErr_SetString(PyExc_SystemError,
+                            "frexp() result out of range");
+            return -1;
+        }
+
+        /* Normalize f to be in the range [1.0, 2.0) */
+        f *= 2.0;
+        e--;
+
+        if (e >= 16) {
+            goto Overflow;
+        }
+        else if (e < -25) {
+            /* |x| < 2**-25. Underflow to zero. */
+            f = 0.0;
+            e = 0;
+        }
+        else if (e < -14) {
+            /* |x| < 2**-14. Gradual underflow */
+            f = ldexp(f, 14 + e);
+            e = 0;
+        }
+        else /* if (!(e == 0 && f == 0.0)) */ {
+            e += 15;
+            f -= 1.0; /* Get rid of leading 1 */
+        }
+
+        f *= 1024.0; /* 2**10 */
+        /* Round to even */
+        bits = (unsigned short)f; /* Note the truncation */
+        assert(bits < 1024);
+        assert(e < 31);
+        if ((f - bits > 0.5) || ((f - bits == 0.5) && (bits % 2 == 1))) {
+            ++bits;
+            if (bits == 1024) {
+                /* The carry propagated out of a string of 10 1 bits. */
+                bits = 0;
+                ++e;
+                if (e == 31)
+                    goto Overflow;
+            }
+        }
+    }
+
+    bits |= (e << 10) | (sign << 15);
+
+    /* Write out result. */
+    if (le) {
+        p += 1;
+        incr = -1;
+    }
+
+    /* First byte */
+    *p = (unsigned char)((bits >> 8) & 0xFF);
+    p += incr;
+
+    /* Second byte */
+    *p = (unsigned char)(bits & 0xFF);
+
+    return 0;
+
+  Overflow:
+    PyErr_SetString(PyExc_OverflowError,
+                    "float too large to pack with e format");
+    return -1;
+}
+
 int
 _PyFloat_Pack4(double x, unsigned char *p, int le)
 {
@@ -2203,6 +2324,76 @@ _PyFloat_Pack8(double x, unsigned char *p, int le)
 }
 
 double
+_PyFloat_Unpack2(const unsigned char *p, int le)
+{
+    unsigned char sign;
+    int e;
+    unsigned int f;
+    double x;
+    int incr = 1;
+
+    if (le) {
+        p += 1;
+        incr = -1;
+    }
+
+    /* First byte */
+    sign = (*p >> 7) & 1;
+    e = (*p & 0x7C) >> 2;
+    f = (*p & 0x03) << 8;
+    p += incr;
+
+    /* Second byte */
+    f |= *p;
+
+    if (e == 0x1f) {
+#ifdef PY_NO_SHORT_FLOAT_REPR
+        if (f == 0) {
+            /* Infinity */
+            return sign ? -Py_HUGE_VAL : Py_HUGE_VAL;
+        }
+        else {
+            /* NaN */
+#ifdef Py_NAN
+            return sign ? -Py_NAN : Py_NAN;
+#else
+            PyErr_SetString(
+                PyExc_ValueError,
+                "can't unpack IEEE 754 NaN "
+                "on platform that does not support NaNs");
+            return -1;
+#endif  /* #ifdef Py_NAN */
+        }
+#else
+        if (f == 0) {
+            /* Infinity */
+            return _Py_dg_infinity(sign);
+        }
+        else {
+            /* NaN */
+            return _Py_dg_stdnan(sign);
+        }
+#endif  /* #ifdef PY_NO_SHORT_FLOAT_REPR */
+    }
+
+    x = (double)f / 1024.0;
+
+    if (e == 0) {
+        e = -14;
+    }
+    else {
+        x += 1.0;
+        e -= 15;
+    }
+    x = ldexp(x, e);
+
+    if (sign)
+        x = -x;
+
+    return x;
+}
+
+double
 _PyFloat_Unpack4(const unsigned char *p, int le)
 {
     if (float_format == unknown_format) {
diff --git a/Objects/frameobject.c b/Objects/frameobject.c
index 9aadd61..b115614 100644
--- a/Objects/frameobject.c
+++ b/Objects/frameobject.c
@@ -137,7 +137,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
         new_lasti = -1;
         for (offset = 0; offset < lnotab_len; offset += 2) {
             addr += lnotab[offset];
-            line += lnotab[offset+1];
+            line += (signed char)lnotab[offset+1];
             if (line >= new_lineno) {
                 new_lasti = addr;
                 new_lineno = line;
@@ -189,7 +189,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
     memset(blockstack, '\0', sizeof(blockstack));
     memset(in_finally, '\0', sizeof(in_finally));
     blockstack_top = 0;
-    for (addr = 0; addr < code_len; addr++) {
+    for (addr = 0; addr < code_len; addr += 2) {
         unsigned char op = code[addr];
         switch (op) {
         case SETUP_LOOP:
@@ -251,10 +251,6 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
                 }
             }
         }
-
-        if (op >= HAVE_ARGUMENT) {
-            addr += 2;
-        }
     }
 
     /* Verify that the blockstack tracking code didn't get lost. */
@@ -277,7 +273,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
      * can tell whether the jump goes into any blocks without coming out
      * again - in that case we raise an exception below. */
     delta_iblock = 0;
-    for (addr = min_addr; addr < max_addr; addr++) {
+    for (addr = min_addr; addr < max_addr; addr += 2) {
         unsigned char op = code[addr];
         switch (op) {
         case SETUP_LOOP:
@@ -294,10 +290,6 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno)
         }
 
         min_delta_iblock = Py_MIN(min_delta_iblock, delta_iblock);
-
-        if (op >= HAVE_ARGUMENT) {
-            addr += 2;
-        }
     }
 
     /* Derive the absolute iblock values from the deltas. */
diff --git a/Objects/funcobject.c b/Objects/funcobject.c
index e6c327d..261c16d 100644
--- a/Objects/funcobject.c
+++ b/Objects/funcobject.c
@@ -249,7 +249,6 @@ func_get_code(PyFunctionObject *op)
 static int
 func_set_code(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
     Py_ssize_t nfree, nclosure;
 
     /* Not legal to del f.func_code or to set it to anything
@@ -270,10 +269,8 @@ func_set_code(PyFunctionObject *op, PyObject *value)
                      nclosure, nfree);
         return -1;
     }
-    tmp = op->func_code;
     Py_INCREF(value);
-    op->func_code = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->func_code, value);
     return 0;
 }
 
@@ -287,8 +284,6 @@ func_get_name(PyFunctionObject *op)
 static int
 func_set_name(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del f.func_name or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -296,10 +291,8 @@ func_set_name(PyFunctionObject *op, PyObject *value)
                         "__name__ must be set to a string object");
         return -1;
     }
-    tmp = op->func_name;
     Py_INCREF(value);
-    op->func_name = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->func_name, value);
     return 0;
 }
 
@@ -313,8 +306,6 @@ func_get_qualname(PyFunctionObject *op)
 static int
 func_set_qualname(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del f.__qualname__ or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -322,10 +313,8 @@ func_set_qualname(PyFunctionObject *op, PyObject *value)
                         "__qualname__ must be set to a string object");
         return -1;
     }
-    tmp = op->func_qualname;
     Py_INCREF(value);
-    op->func_qualname = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->func_qualname, value);
     return 0;
 }
 
@@ -343,8 +332,6 @@ func_get_defaults(PyFunctionObject *op)
 static int
 func_set_defaults(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Legal to del f.func_defaults.
      * Can only set func_defaults to NULL or a tuple. */
     if (value == Py_None)
@@ -354,10 +341,8 @@ func_set_defaults(PyFunctionObject *op, PyObject *value)
                         "__defaults__ must be set to a tuple object");
         return -1;
     }
-    tmp = op->func_defaults;
     Py_XINCREF(value);
-    op->func_defaults = value;
-    Py_XDECREF(tmp);
+    Py_XSETREF(op->func_defaults, value);
     return 0;
 }
 
@@ -375,8 +360,6 @@ func_get_kwdefaults(PyFunctionObject *op)
 static int
 func_set_kwdefaults(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     if (value == Py_None)
         value = NULL;
     /* Legal to del f.func_kwdefaults.
@@ -386,10 +369,8 @@ func_set_kwdefaults(PyFunctionObject *op, PyObject *value)
             "__kwdefaults__ must be set to a dict object");
         return -1;
     }
-    tmp = op->func_kwdefaults;
     Py_XINCREF(value);
-    op->func_kwdefaults = value;
-    Py_XDECREF(tmp);
+    Py_XSETREF(op->func_kwdefaults, value);
     return 0;
 }
 
@@ -408,8 +389,6 @@ func_get_annotations(PyFunctionObject *op)
 static int
 func_set_annotations(PyFunctionObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     if (value == Py_None)
         value = NULL;
     /* Legal to del f.func_annotations.
@@ -420,10 +399,8 @@ func_set_annotations(PyFunctionObject *op, PyObject *value)
             "__annotations__ must be set to a dict object");
         return -1;
     }
-    tmp = op->func_annotations;
     Py_XINCREF(value);
-    op->func_annotations = value;
-    Py_XDECREF(tmp);
+    Py_XSETREF(op->func_annotations, value);
     return 0;
 }
 
diff --git a/Objects/genobject.c b/Objects/genobject.c
index 9172e6a..562d41d 100644
--- a/Objects/genobject.c
+++ b/Objects/genobject.c
@@ -194,7 +194,7 @@ gen_send_ex(PyGenObject *gen, PyObject *arg, int exc, int closing)
             /* Pop the exception before issuing a warning. */
             PyErr_Fetch(&exc, &val, &tb);
 
-            if (PyErr_WarnFormat(PyExc_PendingDeprecationWarning, 1,
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                                  "generator '%.50S' raised StopIteration",
                                  gen->gi_qualname)) {
                 /* Warning was converted to an error. */
@@ -264,7 +264,7 @@ gen_close_iter(PyObject *yf)
                 PyErr_WriteUnraisable(yf);
             PyErr_Clear();
         } else {
-            retval = PyObject_CallFunction(meth, "");
+            retval = _PyObject_CallNoArg(meth);
             Py_DECREF(meth);
             if (retval == NULL)
                 return -1;
@@ -284,7 +284,7 @@ _PyGen_yf(PyGenObject *gen)
         PyObject *bytecode = f->f_code->co_code;
         unsigned char *code = (unsigned char *)PyBytes_AS_STRING(bytecode);
 
-        if (code[f->f_lasti + 1] != YIELD_FROM)
+        if (code[f->f_lasti + 2] != YIELD_FROM)
             return NULL;
         yf = f->f_stacktop[-1];
         Py_INCREF(yf);
@@ -383,7 +383,7 @@ gen_throw(PyGenObject *gen, PyObject *args)
             assert(ret == yf);
             Py_DECREF(ret);
             /* Termination repetition of YIELD_FROM */
-            gen->gi_frame->f_lasti++;
+            gen->gi_frame->f_lasti += 2;
             if (_PyGen_FetchStopIterationValue(&val) == 0) {
                 ret = gen_send_ex(gen, val, 0, 0);
                 Py_DECREF(val);
@@ -526,8 +526,6 @@ gen_get_name(PyGenObject *op)
 static int
 gen_set_name(PyGenObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del gen.gi_name or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -535,10 +533,8 @@ gen_set_name(PyGenObject *op, PyObject *value)
                         "__name__ must be set to a string object");
         return -1;
     }
-    tmp = op->gi_name;
     Py_INCREF(value);
-    op->gi_name = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->gi_name, value);
     return 0;
 }
 
@@ -552,8 +548,6 @@ gen_get_qualname(PyGenObject *op)
 static int
 gen_set_qualname(PyGenObject *op, PyObject *value)
 {
-    PyObject *tmp;
-
     /* Not legal to del gen.__qualname__ or to set it to anything
      * other than a string object. */
     if (value == NULL || !PyUnicode_Check(value)) {
@@ -561,10 +555,8 @@ gen_set_qualname(PyGenObject *op, PyObject *value)
                         "__qualname__ must be set to a string object");
         return -1;
     }
-    tmp = op->gi_qualname;
     Py_INCREF(value);
-    op->gi_qualname = value;
-    Py_DECREF(tmp);
+    Py_XSETREF(op->gi_qualname, value);
     return 0;
 }
 
diff --git a/Objects/iterobject.c b/Objects/iterobject.c
index ab29ff8..75b2fcb 100644
--- a/Objects/iterobject.c
+++ b/Objects/iterobject.c
@@ -208,30 +208,32 @@ calliter_traverse(calliterobject *it, visitproc visit, void *arg)
 static PyObject *
 calliter_iternext(calliterobject *it)
 {
-    if (it->it_callable != NULL) {
-        PyObject *args = PyTuple_New(0);
-        PyObject *result;
-        if (args == NULL)
-            return NULL;
-        result = PyObject_Call(it->it_callable, args, NULL);
-        Py_DECREF(args);
-        if (result != NULL) {
-            int ok;
-            ok = PyObject_RichCompareBool(it->it_sentinel, result, Py_EQ);
-            if (ok == 0)
-                return result; /* Common case, fast path */
-            Py_DECREF(result);
-            if (ok > 0) {
-                Py_CLEAR(it->it_callable);
-                Py_CLEAR(it->it_sentinel);
-            }
+    PyObject *result;
+
+    if (it->it_callable == NULL) {
+        return NULL;
+    }
+
+    result = _PyObject_CallNoArg(it->it_callable);
+    if (result != NULL) {
+        int ok;
+
+        ok = PyObject_RichCompareBool(it->it_sentinel, result, Py_EQ);
+        if (ok == 0) {
+            return result; /* Common case, fast path */
         }
-        else if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
-            PyErr_Clear();
+
+        Py_DECREF(result);
+        if (ok > 0) {
             Py_CLEAR(it->it_callable);
             Py_CLEAR(it->it_sentinel);
         }
     }
+    else if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+        PyErr_Clear();
+        Py_CLEAR(it->it_callable);
+        Py_CLEAR(it->it_sentinel);
+    }
     return NULL;
 }
 
diff --git a/Objects/listobject.c b/Objects/listobject.c
index 815a1b9..dcd7b5e 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -49,7 +49,7 @@ list_resize(PyListObject *self, Py_ssize_t newsize)
     new_allocated = (newsize >> 3) + (newsize < 9 ? 3 : 6);
 
     /* check for integer overflow */
-    if (new_allocated > PY_SIZE_MAX - newsize) {
+    if (new_allocated > SIZE_MAX - newsize) {
         PyErr_NoMemory();
         return -1;
     } else {
@@ -59,7 +59,7 @@ list_resize(PyListObject *self, Py_ssize_t newsize)
     if (newsize == 0)
         new_allocated = 0;
     items = self->ob_item;
-    if (new_allocated <= (PY_SIZE_MAX / sizeof(PyObject *)))
+    if (new_allocated <= (SIZE_MAX / sizeof(PyObject *)))
         PyMem_RESIZE(items, PyObject *, new_allocated);
     else
         items = NULL;
@@ -82,6 +82,16 @@ static size_t count_reuse = 0;
 static void
 show_alloc(void)
 {
+    PyObject *xoptions, *value;
+    _Py_IDENTIFIER(showalloccount);
+
+    xoptions = PySys_GetXOptions();
+    if (xoptions == NULL)
+        return;
+    value = _PyDict_GetItemId(xoptions, &PyId_showalloccount);
+    if (value != Py_True)
+        return;
+
     fprintf(stderr, "List allocations: %" PY_FORMAT_SIZE_T "d\n",
         count_alloc);
     fprintf(stderr, "List reuse through freelist: %" PY_FORMAT_SIZE_T
@@ -130,7 +140,6 @@ PyObject *
 PyList_New(Py_ssize_t size)
 {
     PyListObject *op;
-    size_t nbytes;
 #ifdef SHOW_ALLOC_COUNT
     static int initialized = 0;
     if (!initialized) {
@@ -143,11 +152,6 @@ PyList_New(Py_ssize_t size)
         PyErr_BadInternalCall();
         return NULL;
     }
-    /* Check for overflow without an actual overflow,
-     *  which can cause compiler to optimise out */
-    if ((size_t)size > PY_SIZE_MAX / sizeof(PyObject *))
-        return PyErr_NoMemory();
-    nbytes = size * sizeof(PyObject *);
     if (numfree) {
         numfree--;
         op = free_list[numfree];
@@ -166,12 +170,11 @@ PyList_New(Py_ssize_t size)
     if (size <= 0)
         op->ob_item = NULL;
     else {
-        op->ob_item = (PyObject **) PyMem_MALLOC(nbytes);
+        op->ob_item = (PyObject **) PyMem_Calloc(size, sizeof(PyObject *));
         if (op->ob_item == NULL) {
             Py_DECREF(op);
             return PyErr_NoMemory();
         }
-        memset(op->ob_item, 0, nbytes);
     }
     Py_SIZE(op) = size;
     op->allocated = size;
@@ -216,7 +219,6 @@ int
 PyList_SetItem(PyObject *op, Py_ssize_t i,
                PyObject *newitem)
 {
-    PyObject *olditem;
     PyObject **p;
     if (!PyList_Check(op)) {
         Py_XDECREF(newitem);
@@ -230,9 +232,7 @@ PyList_SetItem(PyObject *op, Py_ssize_t i,
         return -1;
     }
     p = ((PyListObject *)op) -> ob_item + i;
-    olditem = *p;
-    *p = newitem;
-    Py_XDECREF(olditem);
+    Py_XSETREF(*p, newitem);
     return 0;
 }
 
@@ -251,7 +251,7 @@ ins1(PyListObject *self, Py_ssize_t where, PyObject *v)
         return -1;
     }
 
-    if (list_resize(self, n+1) == -1)
+    if (list_resize(self, n+1) < 0)
         return -1;
 
     if (where < 0) {
@@ -291,7 +291,7 @@ app1(PyListObject *self, PyObject *v)
         return -1;
     }
 
-    if (list_resize(self, n+1) == -1)
+    if (list_resize(self, n+1) < 0)
         return -1;
 
     Py_INCREF(v);
@@ -481,9 +481,9 @@ list_concat(PyListObject *a, PyObject *bb)
         return NULL;
     }
 #define b ((PyListObject *)bb)
-    size = Py_SIZE(a) + Py_SIZE(b);
-    if (size < 0)
+    if (Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b))
         return PyErr_NoMemory();
+    size = Py_SIZE(a) + Py_SIZE(b);
     np = (PyListObject *) PyList_New(size);
     if (np == NULL) {
         return NULL;
@@ -714,7 +714,7 @@ list_inplace_repeat(PyListObject *self, Py_ssize_t n)
         return PyErr_NoMemory();
     }
 
-    if (list_resize(self, size*n) == -1)
+    if (list_resize(self, size*n) < 0)
         return NULL;
 
     p = size;
@@ -733,7 +733,6 @@ list_inplace_repeat(PyListObject *self, Py_ssize_t n)
 static int
 list_ass_item(PyListObject *a, Py_ssize_t i, PyObject *v)
 {
-    PyObject *old_value;
     if (i < 0 || i >= Py_SIZE(a)) {
         PyErr_SetString(PyExc_IndexError,
                         "list assignment index out of range");
@@ -742,9 +741,7 @@ list_ass_item(PyListObject *a, Py_ssize_t i, PyObject *v)
     if (v == NULL)
         return list_ass_slice(a, i, i+1, v);
     Py_INCREF(v);
-    old_value = a->ob_item[i];
-    a->ob_item[i] = v;
-    Py_DECREF(old_value);
+    Py_SETREF(a->ob_item[i], v);
     return 0;
 }
 
@@ -807,7 +804,7 @@ listextend(PyListObject *self, PyObject *b)
             Py_RETURN_NONE;
         }
         m = Py_SIZE(self);
-        if (list_resize(self, m + n) == -1) {
+        if (list_resize(self, m + n) < 0) {
             Py_DECREF(b);
             return NULL;
         }
@@ -835,23 +832,25 @@ listextend(PyListObject *self, PyObject *b)
 
     /* Guess a result list size. */
     n = PyObject_LengthHint(b, 8);
-    if (n == -1) {
+    if (n < 0) {
         Py_DECREF(it);
         return NULL;
     }
     m = Py_SIZE(self);
-    mn = m + n;
-    if (mn >= m) {
+    if (m > PY_SSIZE_T_MAX - n) {
+        /* m + n overflowed; on the chance that n lied, and there really
+         * is enough room, ignore it.  If n was telling the truth, we'll
+         * eventually run out of memory during the loop.
+         */
+    }
+    else {
+        mn = m + n;
         /* Make room. */
-        if (list_resize(self, mn) == -1)
+        if (list_resize(self, mn) < 0)
             goto error;
         /* Make the list sane again. */
         Py_SIZE(self) = m;
     }
-    /* Else m + n overflowed; on the chance that n lied, and there really
-     * is enough room, ignore it.  If n was telling the truth, we'll
-     * eventually run out of memory during the loop.
-     */
 
     /* Run iterator to exhaustion. */
     for (;;) {
@@ -2500,9 +2499,6 @@ list_ass_subscript(PyListObject* self, PyObject* item, PyObject* value)
                 step = -step;
             }
 
-            assert((size_t)slicelength <=
-                   PY_SIZE_MAX / sizeof(PyObject*));
-
             garbage = (PyObject**)
                 PyMem_MALLOC(slicelength*sizeof(PyObject*));
             if (!garbage) {
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 152a270..17d2797 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -486,7 +486,7 @@ sub-run, yet finding such very efficiently when they exist.
 I first learned about the galloping strategy in a related context; see:
 
     "Adaptive Set Intersections, Unions, and Differences" (2000)
-    Erik D. Demaine, Alejandro L�pez-Ortiz, J. Ian Munro
+    Erik D. Demaine, Alejandro López-Ortiz, J. Ian Munro
 
 and its followup(s).  An earlier paper called the same strategy
 "exponential search":
diff --git a/Objects/lnotab_notes.txt b/Objects/lnotab_notes.txt
index d247edd..5153757 100644
--- a/Objects/lnotab_notes.txt
+++ b/Objects/lnotab_notes.txt
@@ -12,42 +12,47 @@ pairs.  The details are important and delicate, best illustrated by example:
         0		    1
         6		    2
        50		    7
-      350                 307
-      361                 308
+      350                 207
+      361                 208
 
 Instead of storing these numbers literally, we compress the list by storing only
-the increments from one row to the next.  Conceptually, the stored list might
+the difference from one row to the next.  Conceptually, the stored list might
 look like:
 
-    0, 1,  6, 1,  44, 5,  300, 300,  11, 1
+    0, 1,  6, 1,  44, 5,  300, 200,  11, 1
 
-The above doesn't really work, but it's a start. Note that an unsigned byte
-can't hold negative values, or values larger than 255, and the above example
-contains two such values. So we make two tweaks:
+The above doesn't really work, but it's a start. An unsigned byte (byte code
+offset) can't hold negative values, or values larger than 255; a signed byte
+(line number) can't hold values larger than 127 or less than -128; and the
+above example contains two such values. So we make two tweaks:
 
- (a) there's a deep assumption that byte code offsets and their corresponding
- line #s both increase monotonically, and
- (b) if at least one column jumps by more than 255 from one row to the next,
- more than one pair is written to the table. In case #b, there's no way to know
- from looking at the table later how many were written.  That's the delicate
- part.  A user of co_lnotab desiring to find the source line number
- corresponding to a bytecode address A should do something like this
+ (a) there's a deep assumption that byte code offsets increase monotonically,
+ and
+ (b) if byte code offset jumps by more than 255 from one row to the next, or if
+ source code line number jumps by more than 127 or less than -128 from one row
+ to the next, more than one pair is written to the table. In case #b,
+ there's no way to know from looking at the table later how many were written.
+ That's the delicate part.  A user of co_lnotab desiring to find the source
+ line number corresponding to a bytecode address A should do something like
+ this:
 
     lineno = addr = 0
     for addr_incr, line_incr in co_lnotab:
         addr += addr_incr
         if addr > A:
             return lineno
+        if line_incr >= 0x80:
+            line_incr -= 0x100
         lineno += line_incr
 
 (In C, this is implemented by PyCode_Addr2Line().)  In order for this to work,
 when the addr field increments by more than 255, the line # increment in each
 pair generated must be 0 until the remaining addr increment is < 256.  So, in
 the example above, assemble_lnotab in compile.c should not (as was actually done
-until 2.2) expand 300, 300 to
+until 2.2) expand 300, 200 to
     255, 255, 45, 45,
 but to
-    255, 0, 45, 255, 0, 45.
+    255, 0, 45, 127, 0, 73.
 
 The above is sufficient to reconstruct line numbers for tracebacks, but not for
 line tracing.  Tracing is handled by PyCode_CheckLineNumber() in codeobject.c
@@ -90,16 +95,16 @@ which compiles to this:
               6 POP_JUMP_IF_FALSE       17
 
   3           9 LOAD_CONST               1 (1)
-             12 PRINT_ITEM          
+             12 PRINT_ITEM
 
-  4          13 BREAK_LOOP          
+  4          13 BREAK_LOOP
              14 JUMP_ABSOLUTE            3
-        >>   17 POP_BLOCK           
+        >>   17 POP_BLOCK
 
   6          18 LOAD_CONST               2 (2)
-             21 PRINT_ITEM          
+             21 PRINT_ITEM
         >>   22 LOAD_CONST               0 (None)
-             25 RETURN_VALUE        
+             25 RETURN_VALUE
 
 If 'a' is false, execution will jump to the POP_BLOCK instruction at offset 17
 and the co_lnotab will claim that execution has moved to line 4, which is wrong.
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 9b62d92..740b7f5 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -721,7 +721,7 @@ _PyLong_NumBits(PyObject *vv)
     assert(ndigits == 0 || v->ob_digit[ndigits - 1] != 0);
     if (ndigits > 0) {
         digit msd = v->ob_digit[ndigits - 1];
-        if ((size_t)(ndigits - 1) > PY_SIZE_MAX / (size_t)PyLong_SHIFT)
+        if ((size_t)(ndigits - 1) > SIZE_MAX / (size_t)PyLong_SHIFT)
             goto Overflow;
         result = (size_t)(ndigits - 1) * (size_t)PyLong_SHIFT;
         do {
@@ -989,16 +989,13 @@ PyObject *
 PyLong_FromVoidPtr(void *p)
 {
 #if SIZEOF_VOID_P <= SIZEOF_LONG
-    return PyLong_FromUnsignedLong((unsigned long)(Py_uintptr_t)p);
+    return PyLong_FromUnsignedLong((unsigned long)(uintptr_t)p);
 #else
 
-#ifndef HAVE_LONG_LONG
-#   error "PyLong_FromVoidPtr: sizeof(void*) > sizeof(long), but no long long"
-#endif
 #if SIZEOF_LONG_LONG < SIZEOF_VOID_P
-#   error "PyLong_FromVoidPtr: sizeof(PY_LONG_LONG) < sizeof(void*)"
+#   error "PyLong_FromVoidPtr: sizeof(long long) < sizeof(void*)"
 #endif
-    return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG)(Py_uintptr_t)p);
+    return PyLong_FromUnsignedLongLong((unsigned long long)(uintptr_t)p);
 #endif /* SIZEOF_VOID_P <= SIZEOF_LONG */
 
 }
@@ -1017,13 +1014,10 @@ PyLong_AsVoidPtr(PyObject *vv)
         x = PyLong_AsUnsignedLong(vv);
 #else
 
-#ifndef HAVE_LONG_LONG
-#   error "PyLong_AsVoidPtr: sizeof(void*) > sizeof(long), but no long long"
-#endif
 #if SIZEOF_LONG_LONG < SIZEOF_VOID_P
-#   error "PyLong_AsVoidPtr: sizeof(PY_LONG_LONG) < sizeof(void*)"
+#   error "PyLong_AsVoidPtr: sizeof(long long) < sizeof(void*)"
 #endif
-    PY_LONG_LONG x;
+    long long x;
 
     if (PyLong_Check(vv) && _PyLong_Sign(vv) < 0)
         x = PyLong_AsLongLong(vv);
@@ -1037,22 +1031,20 @@ PyLong_AsVoidPtr(PyObject *vv)
     return (void *)x;
 }
 
-#ifdef HAVE_LONG_LONG
-
-/* Initial PY_LONG_LONG support by Chris Herborth (chrish@qnx.com), later
+/* Initial long long support by Chris Herborth (chrish@qnx.com), later
  * rewritten to use the newer PyLong_{As,From}ByteArray API.
  */
 
-#define PY_ABS_LLONG_MIN (0-(unsigned PY_LONG_LONG)PY_LLONG_MIN)
+#define PY_ABS_LLONG_MIN (0-(unsigned long long)PY_LLONG_MIN)
 
-/* Create a new int object from a C PY_LONG_LONG int. */
+/* Create a new int object from a C long long int. */
 
 PyObject *
-PyLong_FromLongLong(PY_LONG_LONG ival)
+PyLong_FromLongLong(long long ival)
 {
     PyLongObject *v;
-    unsigned PY_LONG_LONG abs_ival;
-    unsigned PY_LONG_LONG t;  /* unsigned so >> doesn't propagate sign bit */
+    unsigned long long abs_ival;
+    unsigned long long t;  /* unsigned so >> doesn't propagate sign bit */
     int ndigits = 0;
     int negative = 0;
 
@@ -1060,11 +1052,11 @@ PyLong_FromLongLong(PY_LONG_LONG ival)
     if (ival < 0) {
         /* avoid signed overflow on negation;  see comments
            in PyLong_FromLong above. */
-        abs_ival = (unsigned PY_LONG_LONG)(-1-ival) + 1;
+        abs_ival = (unsigned long long)(-1-ival) + 1;
         negative = 1;
     }
     else {
-        abs_ival = (unsigned PY_LONG_LONG)ival;
+        abs_ival = (unsigned long long)ival;
     }
 
     /* Count the number of Python digits.
@@ -1089,19 +1081,19 @@ PyLong_FromLongLong(PY_LONG_LONG ival)
     return (PyObject *)v;
 }
 
-/* Create a new int object from a C unsigned PY_LONG_LONG int. */
+/* Create a new int object from a C unsigned long long int. */
 
 PyObject *
-PyLong_FromUnsignedLongLong(unsigned PY_LONG_LONG ival)
+PyLong_FromUnsignedLongLong(unsigned long long ival)
 {
     PyLongObject *v;
-    unsigned PY_LONG_LONG t;
+    unsigned long long t;
     int ndigits = 0;
 
     if (ival < PyLong_BASE)
         return PyLong_FromLong((long)ival);
     /* Count the number of Python digits. */
-    t = (unsigned PY_LONG_LONG)ival;
+    t = (unsigned long long)ival;
     while (t) {
         ++ndigits;
         t >>= PyLong_SHIFT;
@@ -1190,11 +1182,11 @@ PyLong_FromSize_t(size_t ival)
 /* Get a C long long int from an int object or any object that has an
    __int__ method.  Return -1 and set an error if overflow occurs. */
 
-PY_LONG_LONG
+long long
 PyLong_AsLongLong(PyObject *vv)
 {
     PyLongObject *v;
-    PY_LONG_LONG bytes;
+    long long bytes;
     int res;
     int do_decref = 0; /* if nb_int was called */
 
@@ -1232,30 +1224,30 @@ PyLong_AsLongLong(PyObject *vv)
         Py_DECREF(v);
     }
 
-    /* Plan 9 can't handle PY_LONG_LONG in ? : expressions */
+    /* Plan 9 can't handle long long in ? : expressions */
     if (res < 0)
-        return (PY_LONG_LONG)-1;
+        return (long long)-1;
     else
         return bytes;
 }
 
-/* Get a C unsigned PY_LONG_LONG int from an int object.
+/* Get a C unsigned long long int from an int object.
    Return -1 and set an error if overflow occurs. */
 
-unsigned PY_LONG_LONG
+unsigned long long
 PyLong_AsUnsignedLongLong(PyObject *vv)
 {
     PyLongObject *v;
-    unsigned PY_LONG_LONG bytes;
+    unsigned long long bytes;
     int res;
 
     if (vv == NULL) {
         PyErr_BadInternalCall();
-        return (unsigned PY_LONG_LONG)-1;
+        return (unsigned long long)-1;
     }
     if (!PyLong_Check(vv)) {
         PyErr_SetString(PyExc_TypeError, "an integer is required");
-        return (unsigned PY_LONG_LONG)-1;
+        return (unsigned long long)-1;
     }
 
     v = (PyLongObject*)vv;
@@ -1267,9 +1259,9 @@ PyLong_AsUnsignedLongLong(PyObject *vv)
     res = _PyLong_AsByteArray((PyLongObject *)vv, (unsigned char *)&bytes,
                               SIZEOF_LONG_LONG, PY_LITTLE_ENDIAN, 0);
 
-    /* Plan 9 can't handle PY_LONG_LONG in ? : expressions */
+    /* Plan 9 can't handle long long in ? : expressions */
     if (res < 0)
-        return (unsigned PY_LONG_LONG)res;
+        return (unsigned long long)res;
     else
         return bytes;
 }
@@ -1277,11 +1269,11 @@ PyLong_AsUnsignedLongLong(PyObject *vv)
 /* Get a C unsigned long int from an int object, ignoring the high bits.
    Returns -1 and sets an error condition if an error occurs. */
 
-static unsigned PY_LONG_LONG
+static unsigned long long
 _PyLong_AsUnsignedLongLongMask(PyObject *vv)
 {
     PyLongObject *v;
-    unsigned PY_LONG_LONG x;
+    unsigned long long x;
     Py_ssize_t i;
     int sign;
 
@@ -1307,11 +1299,11 @@ _PyLong_AsUnsignedLongLongMask(PyObject *vv)
     return x * sign;
 }
 
-unsigned PY_LONG_LONG
+unsigned long long
 PyLong_AsUnsignedLongLongMask(PyObject *op)
 {
     PyLongObject *lo;
-    unsigned PY_LONG_LONG val;
+    unsigned long long val;
 
     if (op == NULL) {
         PyErr_BadInternalCall();
@@ -1324,7 +1316,7 @@ PyLong_AsUnsignedLongLongMask(PyObject *op)
 
     lo = _PyLong_FromNbInt(op);
     if (lo == NULL)
-        return (unsigned PY_LONG_LONG)-1;
+        return (unsigned long long)-1;
 
     val = _PyLong_AsUnsignedLongLongMask((PyObject *)lo);
     Py_DECREF(lo);
@@ -1341,13 +1333,13 @@ PyLong_AsUnsignedLongLongMask(PyObject *op)
    In this case *overflow will be 0.
 */
 
-PY_LONG_LONG
+long long
 PyLong_AsLongLongAndOverflow(PyObject *vv, int *overflow)
 {
     /* This version by Tim Peters */
     PyLongObject *v;
-    unsigned PY_LONG_LONG x, prev;
-    PY_LONG_LONG res;
+    unsigned long long x, prev;
+    long long res;
     Py_ssize_t i;
     int sign;
     int do_decref = 0; /* if nb_int was called */
@@ -1399,8 +1391,8 @@ PyLong_AsLongLongAndOverflow(PyObject *vv, int *overflow)
         /* Haven't lost any bits, but casting to long requires extra
          * care (see comment above).
          */
-        if (x <= (unsigned PY_LONG_LONG)PY_LLONG_MAX) {
-            res = (PY_LONG_LONG)x * sign;
+        if (x <= (unsigned long long)PY_LLONG_MAX) {
+            res = (long long)x * sign;
         }
         else if (sign < 0 && x == PY_ABS_LLONG_MIN) {
             res = PY_LLONG_MIN;
@@ -1417,8 +1409,6 @@ PyLong_AsLongLongAndOverflow(PyObject *vv, int *overflow)
     return res;
 }
 
-#endif /* HAVE_LONG_LONG */
-
 #define CHECK_BINOP(v,w)                                \
     do {                                                \
         if (!PyLong_Check(v) || !PyLong_Check(w))       \
@@ -1582,13 +1572,16 @@ divrem1(PyLongObject *a, digit n, digit *prem)
 static int
 long_to_decimal_string_internal(PyObject *aa,
                                 PyObject **p_output,
-                                _PyUnicodeWriter *writer)
+                                _PyUnicodeWriter *writer,
+                                _PyBytesWriter *bytes_writer,
+                                char **bytes_str)
 {
     PyLongObject *scratch, *a;
-    PyObject *str;
+    PyObject *str = NULL;
     Py_ssize_t size, strlen, size_a, i, j;
     digit *pout, *pin, rem, tenpow;
     int negative;
+    int d;
     enum PyUnicode_Kind kind;
 
     a = (PyLongObject *)aa;
@@ -1606,15 +1599,17 @@ long_to_decimal_string_internal(PyObject *aa,
 
        But log2(a) < size_a * PyLong_SHIFT, and
        log2(_PyLong_DECIMAL_BASE) = log2(10) * _PyLong_DECIMAL_SHIFT
-                                  > 3 * _PyLong_DECIMAL_SHIFT
+                                  > 3.3 * _PyLong_DECIMAL_SHIFT
+
+         size_a * PyLong_SHIFT / (3.3 * _PyLong_DECIMAL_SHIFT) =
+             size_a + size_a / d < size_a + size_a / floor(d),
+       where d = (3.3 * _PyLong_DECIMAL_SHIFT) /
+                 (PyLong_SHIFT - 3.3 * _PyLong_DECIMAL_SHIFT)
     */
-    if (size_a > PY_SSIZE_T_MAX / PyLong_SHIFT) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "int too large to format");
-        return -1;
-    }
-    /* the expression size_a * PyLong_SHIFT is now safe from overflow */
-    size = 1 + size_a * PyLong_SHIFT / (3 * _PyLong_DECIMAL_SHIFT);
+    d = (33 * _PyLong_DECIMAL_SHIFT) /
+        (10 * PyLong_SHIFT - 33 * _PyLong_DECIMAL_SHIFT);
+    assert(size_a < PY_SSIZE_T_MAX/2);
+    size = 1 + size_a + size_a / d;
     scratch = _PyLong_New(size);
     if (scratch == NULL)
         return -1;
@@ -1662,7 +1657,13 @@ long_to_decimal_string_internal(PyObject *aa,
             return -1;
         }
         kind = writer->kind;
-        str = NULL;
+    }
+    else if (bytes_writer) {
+        *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, strlen);
+        if (*bytes_str == NULL) {
+            Py_DECREF(scratch);
+            return -1;
+        }
     }
     else {
         str = PyUnicode_New(strlen, '9');
@@ -1673,13 +1674,8 @@ long_to_decimal_string_internal(PyObject *aa,
         kind = PyUnicode_KIND(str);
     }
 
-#define WRITE_DIGITS(TYPE)                                            \
+#define WRITE_DIGITS(p)                                               \
     do {                                                              \
-        if (writer)                                                   \
-            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \
-        else                                                          \
-            p = (TYPE*)PyUnicode_DATA(str) + strlen;                  \
-                                                                      \
         /* pout[0] through pout[size-2] contribute exactly            \
            _PyLong_DECIMAL_SHIFT digits each */                       \
         for (i=0; i < size - 1; i++) {                                \
@@ -1699,6 +1695,16 @@ long_to_decimal_string_internal(PyObject *aa,
         /* and sign */                                                \
         if (negative)                                                 \
             *--p = '-';                                               \
+    } while (0)
+
+#define WRITE_UNICODE_DIGITS(TYPE)                                    \
+    do {                                                              \
+        if (writer)                                                   \
+            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \
+        else                                                          \
+            p = (TYPE*)PyUnicode_DATA(str) + strlen;                  \
+                                                                      \
+        WRITE_DIGITS(p);                                              \
                                                                       \
         /* check we've counted correctly */                           \
         if (writer)                                                   \
@@ -1708,25 +1714,34 @@ long_to_decimal_string_internal(PyObject *aa,
     } while (0)
 
     /* fill the string right-to-left */
-    if (kind == PyUnicode_1BYTE_KIND) {
+    if (bytes_writer) {
+        char *p = *bytes_str + strlen;
+        WRITE_DIGITS(p);
+        assert(p == *bytes_str);
+    }
+    else if (kind == PyUnicode_1BYTE_KIND) {
         Py_UCS1 *p;
-        WRITE_DIGITS(Py_UCS1);
+        WRITE_UNICODE_DIGITS(Py_UCS1);
     }
     else if (kind == PyUnicode_2BYTE_KIND) {
         Py_UCS2 *p;
-        WRITE_DIGITS(Py_UCS2);
+        WRITE_UNICODE_DIGITS(Py_UCS2);
     }
     else {
         Py_UCS4 *p;
         assert (kind == PyUnicode_4BYTE_KIND);
-        WRITE_DIGITS(Py_UCS4);
+        WRITE_UNICODE_DIGITS(Py_UCS4);
     }
 #undef WRITE_DIGITS
+#undef WRITE_UNICODE_DIGITS
 
     Py_DECREF(scratch);
     if (writer) {
         writer->pos += strlen;
     }
+    else if (bytes_writer) {
+        (*bytes_str) += strlen;
+    }
     else {
         assert(_PyUnicode_CheckConsistency(str, 1));
         *p_output = (PyObject *)str;
@@ -1738,7 +1753,7 @@ static PyObject *
 long_to_decimal_string(PyObject *aa)
 {
     PyObject *v;
-    if (long_to_decimal_string_internal(aa, &v, NULL) == -1)
+    if (long_to_decimal_string_internal(aa, &v, NULL, NULL, NULL) == -1)
         return NULL;
     return v;
 }
@@ -1750,10 +1765,11 @@ long_to_decimal_string(PyObject *aa)
 
 static int
 long_format_binary(PyObject *aa, int base, int alternate,
-                   PyObject **p_output, _PyUnicodeWriter *writer)
+                   PyObject **p_output, _PyUnicodeWriter *writer,
+                   _PyBytesWriter *bytes_writer, char **bytes_str)
 {
     PyLongObject *a = (PyLongObject *)aa;
-    PyObject *v;
+    PyObject *v = NULL;
     Py_ssize_t sz;
     Py_ssize_t size_a;
     enum PyUnicode_Kind kind;
@@ -1810,7 +1826,11 @@ long_format_binary(PyObject *aa, int base, int alternate,
         if (_PyUnicodeWriter_Prepare(writer, sz, 'x') == -1)
             return -1;
         kind = writer->kind;
-        v = NULL;
+    }
+    else if (bytes_writer) {
+        *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, sz);
+        if (*bytes_str == NULL)
+            return -1;
     }
     else {
         v = PyUnicode_New(sz, 'x');
@@ -1819,13 +1839,8 @@ long_format_binary(PyObject *aa, int base, int alternate,
         kind = PyUnicode_KIND(v);
     }
 
-#define WRITE_DIGITS(TYPE)                                              \
+#define WRITE_DIGITS(p)                                                 \
     do {                                                                \
-        if (writer)                                                     \
-            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \
-        else                                                            \
-            p = (TYPE*)PyUnicode_DATA(v) + sz;                          \
-                                                                        \
         if (size_a == 0) {                                              \
             *--p = '0';                                                 \
         }                                                               \
@@ -1860,30 +1875,50 @@ long_format_binary(PyObject *aa, int base, int alternate,
         }                                                               \
         if (negative)                                                   \
             *--p = '-';                                                 \
+    } while (0)
+
+#define WRITE_UNICODE_DIGITS(TYPE)                                      \
+    do {                                                                \
+        if (writer)                                                     \
+            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \
+        else                                                            \
+            p = (TYPE*)PyUnicode_DATA(v) + sz;                          \
+                                                                        \
+        WRITE_DIGITS(p);                                                \
+                                                                        \
         if (writer)                                                     \
             assert(p == ((TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos)); \
         else                                                            \
             assert(p == (TYPE*)PyUnicode_DATA(v));                      \
     } while (0)
 
-    if (kind == PyUnicode_1BYTE_KIND) {
+    if (bytes_writer) {
+        char *p = *bytes_str + sz;
+        WRITE_DIGITS(p);
+        assert(p == *bytes_str);
+    }
+    else if (kind == PyUnicode_1BYTE_KIND) {
         Py_UCS1 *p;
-        WRITE_DIGITS(Py_UCS1);
+        WRITE_UNICODE_DIGITS(Py_UCS1);
     }
     else if (kind == PyUnicode_2BYTE_KIND) {
         Py_UCS2 *p;
-        WRITE_DIGITS(Py_UCS2);
+        WRITE_UNICODE_DIGITS(Py_UCS2);
     }
     else {
         Py_UCS4 *p;
         assert (kind == PyUnicode_4BYTE_KIND);
-        WRITE_DIGITS(Py_UCS4);
+        WRITE_UNICODE_DIGITS(Py_UCS4);
     }
 #undef WRITE_DIGITS
+#undef WRITE_UNICODE_DIGITS
 
     if (writer) {
         writer->pos += sz;
     }
+    else if (bytes_writer) {
+        (*bytes_str) += sz;
+    }
     else {
         assert(_PyUnicode_CheckConsistency(v, 1));
         *p_output = v;
@@ -1897,9 +1932,9 @@ _PyLong_Format(PyObject *obj, int base)
     PyObject *str;
     int err;
     if (base == 10)
-        err = long_to_decimal_string_internal(obj, &str, NULL);
+        err = long_to_decimal_string_internal(obj, &str, NULL, NULL, NULL);
     else
-        err = long_format_binary(obj, base, 1, &str, NULL);
+        err = long_format_binary(obj, base, 1, &str, NULL, NULL, NULL);
     if (err == -1)
         return NULL;
     return str;
@@ -1911,9 +1946,31 @@ _PyLong_FormatWriter(_PyUnicodeWriter *writer,
                      int base, int alternate)
 {
     if (base == 10)
-        return long_to_decimal_string_internal(obj, NULL, writer);
+        return long_to_decimal_string_internal(obj, NULL, writer,
+                                               NULL, NULL);
     else
-        return long_format_binary(obj, base, alternate, NULL, writer);
+        return long_format_binary(obj, base, alternate, NULL, writer,
+                                  NULL, NULL);
+}
+
+char*
+_PyLong_FormatBytesWriter(_PyBytesWriter *writer, char *str,
+                          PyObject *obj,
+                          int base, int alternate)
+{   /* Format 'obj' in 'base' through the bytes writer, starting at 'str'. */
+    char *str2;  /* cursor passed by address; helpers reassign and advance it */
+    int res;     /* helper status; negative means failure */
+    str2 = str;
+    if (base == 10)  /* base 10 goes through the decimal conversion helper */
+        res = long_to_decimal_string_internal(obj, NULL, NULL,
+                                              writer, &str2);
+    else
+        res = long_format_binary(obj, base, alternate, NULL, NULL,
+                                 writer, &str2);
+    if (res < 0)
+        return NULL;  /* the helper reported an error */
+    assert(str2 != NULL);
+    return str2;  /* position just past the characters that were written */
 }
 
 /* Table of digit values for 8-bit string -> integer conversion.
@@ -2394,8 +2451,11 @@ long_divrem(PyLongObject *a, PyLongObject *b,
         *pdiv = (PyLongObject*)PyLong_FromLong(0);
         if (*pdiv == NULL)
             return -1;
-        Py_INCREF(a);
-        *prem = (PyLongObject *) a;
+        *prem = (PyLongObject *)long_long((PyObject *)a);
+        if (*prem == NULL) {
+            Py_CLEAR(*pdiv);
+            return -1;
+        }
         return 0;
     }
     if (size_b == 1) {
@@ -2705,6 +2765,13 @@ PyLong_AsDouble(PyObject *v)
         PyErr_SetString(PyExc_TypeError, "an integer is required");
         return -1.0;
     }
+    if (Py_ABS(Py_SIZE(v)) <= 1) {
+        /* Fast path; single digit long (31 bits) will cast safely
+           to double.  This improves performance of FP/long operations
+           by 20%.
+        */
+        return (double)MEDIUM_VALUE((PyLongObject *)v);
+    }
     x = _PyLong_Frexp((PyLongObject *)v, &exponent);
     if ((x == -1.0 && PyErr_Occurred()) || exponent > DBL_MAX_EXP) {
         PyErr_SetString(PyExc_OverflowError,
@@ -2929,9 +2996,7 @@ x_sub(PyLongObject *a, PyLongObject *b)
     }
     assert(borrow == 0);
     if (sign < 0) {
-        _PyLong_Negate(&z);
-        if (z == NULL)
-            return NULL;
+        Py_SIZE(z) = -Py_SIZE(z);
     }
     return long_normalize(z);
 }
@@ -2951,8 +3016,14 @@ long_add(PyLongObject *a, PyLongObject *b)
     if (Py_SIZE(a) < 0) {
         if (Py_SIZE(b) < 0) {
             z = x_add(a, b);
-            if (z != NULL && Py_SIZE(z) != 0)
+            if (z != NULL) {
+                /* x_add received at least one multiple-digit int,
+                   and thus z must be a multiple-digit int.
+                   That also means z is not an element of
+                   small_ints, so negating it in-place is safe. */
+                assert(Py_REFCNT(z) == 1);
                 Py_SIZE(z) = -(Py_SIZE(z));
+            }
         }
         else
             z = x_sub(b, a);
@@ -2983,8 +3054,10 @@ long_sub(PyLongObject *a, PyLongObject *b)
             z = x_sub(a, b);
         else
             z = x_add(a, b);
-        if (z != NULL && Py_SIZE(z) != 0)
+        if (z != NULL) {
+            assert(Py_SIZE(z) == 0 || Py_REFCNT(z) == 1);
             Py_SIZE(z) = -(Py_SIZE(z));
+        }
     }
     else {
         if (Py_SIZE(b) < 0)
@@ -3408,17 +3481,7 @@ long_mul(PyLongObject *a, PyLongObject *b)
     /* fast path for single-digit multiplication */
     if (Py_ABS(Py_SIZE(a)) <= 1 && Py_ABS(Py_SIZE(b)) <= 1) {
         stwodigits v = (stwodigits)(MEDIUM_VALUE(a)) * MEDIUM_VALUE(b);
-#ifdef HAVE_LONG_LONG
-        return PyLong_FromLongLong((PY_LONG_LONG)v);
-#else
-        /* if we don't have long long then we're almost certainly
-           using 15-bit digits, so v will fit in a long.  In the
-           unlikely event that we're using 30-bit digits on a platform
-           without long long, a large v will just cause us to fall
-           through to the general multiplication code below. */
-        if (v >= LONG_MIN && v <= LONG_MAX)
-            return PyLong_FromLong((long)v);
-#endif
+        return PyLong_FromLongLong((long long)v);
     }
 
     z = k_mul(a, b);
@@ -3431,6 +3494,52 @@ long_mul(PyLongObject *a, PyLongObject *b)
     return (PyObject *)z;
 }
 
+/* Fast path for a % b when 'a' and 'b' each have exactly one digit. */
+static PyObject *
+fast_mod(PyLongObject *a, PyLongObject *b)
+{
+    sdigit left = a->ob_digit[0];   /* the single digit of 'a' */
+    sdigit right = b->ob_digit[0];  /* the single digit of 'b' */
+    sdigit mod;
+
+    assert(Py_ABS(Py_SIZE(a)) == 1);
+    assert(Py_ABS(Py_SIZE(b)) == 1);
+
+    if (Py_SIZE(a) == Py_SIZE(b)) {
+        /* 'a' and 'b' have the same sign: plain remainder of the digits. */
+        mod = left % right;
+    }
+    else {
+        /* Exactly one is negative: same as (right - left % right) % right. */
+        mod = right - 1 - (left - 1) % right;  /* relies on left >= 1 */
+    }
+
+    return PyLong_FromLong(mod * (sdigit)Py_SIZE(b));  /* takes b's sign */
+}
+
+/* Fast path for a // b when 'a' and 'b' each have exactly one digit. */
+static PyObject *
+fast_floor_div(PyLongObject *a, PyLongObject *b)
+{
+    sdigit left = a->ob_digit[0];   /* the single digit of 'a' */
+    sdigit right = b->ob_digit[0];  /* the single digit of 'b' */
+    sdigit div;
+
+    assert(Py_ABS(Py_SIZE(a)) == 1);
+    assert(Py_ABS(Py_SIZE(b)) == 1);
+
+    if (Py_SIZE(a) == Py_SIZE(b)) {
+        /* 'a' and 'b' have the same sign: plain quotient of the digits. */
+        div = left / right;
+    }
+    else {
+        /* Exactly one is negative: the floor is -ceil(left / right). */
+        div = -1 - (left - 1) / right;  /* relies on left >= 1 */
+    }
+
+    return PyLong_FromLong(div);
+}
+
 /* The / and % operators are now defined in terms of divmod().
    The expression a mod b has the value a - b*floor(a/b).
    The long_divrem function gives the remainder after division of
@@ -3458,6 +3567,30 @@ l_divmod(PyLongObject *v, PyLongObject *w,
 {
     PyLongObject *div, *mod;
 
+    if (Py_ABS(Py_SIZE(v)) == 1 && Py_ABS(Py_SIZE(w)) == 1) {
+        /* Fast path for single-digit longs */
+        div = NULL;
+        if (pdiv != NULL) {
+            div = (PyLongObject *)fast_floor_div(v, w);
+            if (div == NULL) {
+                return -1;
+            }
+        }
+        if (pmod != NULL) {
+            mod = (PyLongObject *)fast_mod(v, w);
+            if (mod == NULL) {
+                Py_XDECREF(div);
+                return -1;
+            }
+            *pmod = mod;
+        }
+        if (pdiv != NULL) {
+            /* We only want to set `*pdiv` when `*pmod` is
+               set successfully. */
+            *pdiv = div;
+        }
+        return 0;
+    }
     if (long_divrem(v, w, &div, &mod) < 0)
         return -1;
     if ((Py_SIZE(mod) < 0 && Py_SIZE(w) > 0) ||
@@ -3502,6 +3635,11 @@ long_div(PyObject *a, PyObject *b)
     PyLongObject *div;
 
     CHECK_BINOP(a, b);
+
+    if (Py_ABS(Py_SIZE(a)) == 1 && Py_ABS(Py_SIZE(b)) == 1) {
+        return fast_floor_div((PyLongObject*)a, (PyLongObject*)b);
+    }
+
     if (l_divmod((PyLongObject*)a, (PyLongObject*)b, &div, NULL) < 0)
         div = NULL;
     return (PyObject *)div;
@@ -3741,9 +3879,9 @@ long_true_divide(PyObject *v, PyObject *w)
     /* Round by directly modifying the low digit of x. */
     mask = (digit)1 << (extra_bits - 1);
     low = x->ob_digit[0] | inexact;
-    if (low & mask && low & (3*mask-1))
+    if ((low & mask) && (low & (3U*mask-1U)))
         low += mask;
-    x->ob_digit[0] = low & ~(mask-1U);
+    x->ob_digit[0] = low & ~(2U*mask-1U);
 
     /* Convert x to a double dx; the conversion is exact. */
     dx = x->ob_digit[--x_size];
@@ -3777,6 +3915,10 @@ long_mod(PyObject *a, PyObject *b)
 
     CHECK_BINOP(a, b);
 
+    if (Py_ABS(Py_SIZE(a)) == 1 && Py_ABS(Py_SIZE(b)) == 1) {
+        return fast_mod((PyLongObject*)a, (PyLongObject*)b);
+    }
+
     if (l_divmod((PyLongObject*)a, (PyLongObject*)b, NULL, &mod) < 0)
         mod = NULL;
     return (PyObject *)mod;
@@ -4011,8 +4153,10 @@ long_invert(PyLongObject *v)
     Py_DECREF(w);
     if (x == NULL)
         return NULL;
-    Py_SIZE(x) = -(Py_SIZE(x));
-    return (PyObject *)maybe_small_long(x);
+    _PyLong_Negate(&x);
+    /* No need for maybe_small_long here, since any small
+       longs will have been caught in the Py_SIZE <= 1 fast path. */
+    return (PyObject *)x;
 }
 
 static PyObject *
@@ -4117,6 +4261,11 @@ long_lshift(PyObject *v, PyObject *w)
         PyErr_SetString(PyExc_ValueError, "negative shift count");
         return NULL;
     }
+
+    if (Py_SIZE(a) == 0) {
+        return PyLong_FromLong(0);
+    }
+
     /* wordshift, remshift = divmod(shiftby, PyLong_SHIFT) */
     wordshift = shiftby / PyLong_SHIFT;
     remshift  = shiftby - wordshift * PyLong_SHIFT;
@@ -4501,7 +4650,7 @@ simple:
     /* a fits into a long, so b must too */
     x = PyLong_AsLong((PyObject *)a);
     y = PyLong_AsLong((PyObject *)b);
-#elif defined(PY_LONG_LONG) && PY_LLONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
+#elif PY_LLONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
     x = PyLong_AsLongLong((PyObject *)a);
     y = PyLong_AsLongLong((PyObject *)b);
 #else
@@ -4520,7 +4669,7 @@ simple:
     }
 #if LONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
     return PyLong_FromLong(x);
-#elif defined(PY_LONG_LONG) && PY_LLONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
+#elif PY_LLONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
     return PyLong_FromLongLong(x);
 #else
 # error "_PyLong_GCD"
diff --git a/Objects/memoryobject.c b/Objects/memoryobject.c
index 10162cb..e53c854 100644
--- a/Objects/memoryobject.c
+++ b/Objects/memoryobject.c
@@ -1111,17 +1111,11 @@ get_native_fmtchar(char *result, const char *fmt)
     case 'h': case 'H': size = sizeof(short); break;
     case 'i': case 'I': size = sizeof(int); break;
     case 'l': case 'L': size = sizeof(long); break;
-    #ifdef HAVE_LONG_LONG
-    case 'q': case 'Q': size = sizeof(PY_LONG_LONG); break;
-    #endif
+    case 'q': case 'Q': size = sizeof(long long); break;
     case 'n': case 'N': size = sizeof(Py_ssize_t); break;
     case 'f': size = sizeof(float); break;
     case 'd': size = sizeof(double); break;
-    #ifdef HAVE_C99_BOOL
     case '?': size = sizeof(_Bool); break;
-    #else
-    case '?': size = sizeof(char); break;
-    #endif
     case 'P': size = sizeof(void *); break;
     }
 
@@ -1133,7 +1127,7 @@ get_native_fmtchar(char *result, const char *fmt)
     return -1;
 }
 
-Py_LOCAL_INLINE(char *)
+Py_LOCAL_INLINE(const char *)
 get_native_fmtstr(const char *fmt)
 {
     int at = 0;
@@ -1158,19 +1152,13 @@ get_native_fmtstr(const char *fmt)
     case 'I': RETURN("I");
     case 'l': RETURN("l");
     case 'L': RETURN("L");
-    #ifdef HAVE_LONG_LONG
     case 'q': RETURN("q");
     case 'Q': RETURN("Q");
-    #endif
     case 'n': RETURN("n");
     case 'N': RETURN("N");
     case 'f': RETURN("f");
     case 'd': RETURN("d");
-    #ifdef HAVE_C99_BOOL
-    case '?': RETURN("?");
-    #else
     case '?': RETURN("?");
-    #endif
     case 'P': RETURN("P");
     }
 
@@ -1221,7 +1209,7 @@ cast_to_1D(PyMemoryViewObject *mv, PyObject *format)
         goto out;
     }
 
-    view->format = get_native_fmtstr(PyBytes_AS_STRING(asciifmt));
+    view->format = (char *)get_native_fmtstr(PyBytes_AS_STRING(asciifmt));
     if (view->format == NULL) {
         /* NOT_REACHED: get_native_fmtchar() already validates the format. */
         PyErr_SetString(PyExc_RuntimeError,
@@ -1581,12 +1569,11 @@ pylong_as_lu(PyObject *item)
     return lu;
 }
 
-#ifdef HAVE_LONG_LONG
-static PY_LONG_LONG
+static long long
 pylong_as_lld(PyObject *item)
 {
     PyObject *tmp;
-    PY_LONG_LONG lld;
+    long long lld;
 
     tmp = PyNumber_Index(item);
     if (tmp == NULL)
@@ -1597,21 +1584,20 @@ pylong_as_lld(PyObject *item)
     return lld;
 }
 
-static unsigned PY_LONG_LONG
+static unsigned long long
 pylong_as_llu(PyObject *item)
 {
     PyObject *tmp;
-    unsigned PY_LONG_LONG llu;
+    unsigned long long llu;
 
     tmp = PyNumber_Index(item);
     if (tmp == NULL)
-        return (unsigned PY_LONG_LONG)-1;
+        return (unsigned long long)-1;
 
     llu = PyLong_AsUnsignedLongLong(tmp);
     Py_DECREF(tmp);
     return llu;
 }
-#endif
 
 static Py_ssize_t
 pylong_as_zd(PyObject *item)
@@ -1659,10 +1645,10 @@ pylong_as_zu(PyObject *item)
 Py_LOCAL_INLINE(PyObject *)
 unpack_single(const char *ptr, const char *fmt)
 {
-    unsigned PY_LONG_LONG llu;
+    unsigned long long llu;
     unsigned long lu;
     size_t zu;
-    PY_LONG_LONG lld;
+    long long lld;
     long ld;
     Py_ssize_t zd;
     double d;
@@ -1679,11 +1665,7 @@ unpack_single(const char *ptr, const char *fmt)
     case 'l': UNPACK_SINGLE(ld, ptr, long); goto convert_ld;
 
     /* boolean */
-    #ifdef HAVE_C99_BOOL
     case '?': UNPACK_SINGLE(ld, ptr, _Bool); goto convert_bool;
-    #else
-    case '?': UNPACK_SINGLE(ld, ptr, char); goto convert_bool;
-    #endif
 
     /* unsigned integers */
     case 'H': UNPACK_SINGLE(lu, ptr, unsigned short); goto convert_lu;
@@ -1691,10 +1673,8 @@ unpack_single(const char *ptr, const char *fmt)
     case 'L': UNPACK_SINGLE(lu, ptr, unsigned long); goto convert_lu;
 
     /* native 64-bit */
-    #ifdef HAVE_LONG_LONG
-    case 'q': UNPACK_SINGLE(lld, ptr, PY_LONG_LONG); goto convert_lld;
-    case 'Q': UNPACK_SINGLE(llu, ptr, unsigned PY_LONG_LONG); goto convert_llu;
-    #endif
+    case 'q': UNPACK_SINGLE(lld, ptr, long long); goto convert_lld;
+    case 'Q': UNPACK_SINGLE(llu, ptr, unsigned long long); goto convert_llu;
 
     /* ssize_t and size_t */
     case 'n': UNPACK_SINGLE(zd, ptr, Py_ssize_t); goto convert_zd;
@@ -1755,10 +1735,10 @@ err_format:
 static int
 pack_single(char *ptr, PyObject *item, const char *fmt)
 {
-    unsigned PY_LONG_LONG llu;
+    unsigned long long llu;
     unsigned long lu;
     size_t zu;
-    PY_LONG_LONG lld;
+    long long lld;
     long ld;
     Py_ssize_t zd;
     double d;
@@ -1806,20 +1786,18 @@ pack_single(char *ptr, PyObject *item, const char *fmt)
         break;
 
     /* native 64-bit */
-    #ifdef HAVE_LONG_LONG
     case 'q':
         lld = pylong_as_lld(item);
         if (lld == -1 && PyErr_Occurred())
             goto err_occurred;
-        PACK_SINGLE(ptr, lld, PY_LONG_LONG);
+        PACK_SINGLE(ptr, lld, long long);
         break;
     case 'Q':
         llu = pylong_as_llu(item);
-        if (llu == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred())
+        if (llu == (unsigned long long)-1 && PyErr_Occurred())
             goto err_occurred;
-        PACK_SINGLE(ptr, llu, unsigned PY_LONG_LONG);
+        PACK_SINGLE(ptr, llu, unsigned long long);
         break;
-    #endif
 
     /* ssize_t and size_t */
     case 'n':
@@ -1853,11 +1831,7 @@ pack_single(char *ptr, PyObject *item, const char *fmt)
         ld = PyObject_IsTrue(item);
         if (ld < 0)
             return -1; /* preserve original error */
-    #ifdef HAVE_C99_BOOL
         PACK_SINGLE(ptr, ld, _Bool);
-    #else
-        PACK_SINGLE(ptr, ld, char);
-    #endif
          break;
 
     /* bytes object */
@@ -2644,11 +2618,7 @@ unpack_cmp(const char *p, const char *q, char fmt,
     case 'l': CMP_SINGLE(p, q, long); return equal;
 
     /* boolean */
-    #ifdef HAVE_C99_BOOL
     case '?': CMP_SINGLE(p, q, _Bool); return equal;
-    #else
-    case '?': CMP_SINGLE(p, q, char); return equal;
-    #endif
 
     /* unsigned integers */
     case 'H': CMP_SINGLE(p, q, unsigned short); return equal;
@@ -2656,10 +2626,8 @@ unpack_cmp(const char *p, const char *q, char fmt,
     case 'L': CMP_SINGLE(p, q, unsigned long); return equal;
 
     /* native 64-bit */
-    #ifdef HAVE_LONG_LONG
-    case 'q': CMP_SINGLE(p, q, PY_LONG_LONG); return equal;
-    case 'Q': CMP_SINGLE(p, q, unsigned PY_LONG_LONG); return equal;
-    #endif
+    case 'q': CMP_SINGLE(p, q, long long); return equal;
+    case 'Q': CMP_SINGLE(p, q, unsigned long long); return equal;
 
     /* ssize_t and size_t */
     case 'n': CMP_SINGLE(p, q, Py_ssize_t); return equal;
diff --git a/Objects/methodobject.c b/Objects/methodobject.c
index 946357f..394f1f4 100644
--- a/Objects/methodobject.c
+++ b/Objects/methodobject.c
@@ -145,6 +145,104 @@ PyCFunction_Call(PyObject *func, PyObject *args, PyObject *kwds)
     return _Py_CheckFunctionResult(func, res, NULL);
 }
 
+/* Call the built-in function func_obj with the nargs positional arguments
+   found in the C array args and the optional keyword dict kwargs, picking
+   the C calling convention from the method's flags.  The result is passed
+   through _Py_CheckFunctionResult(); NULL is returned with an exception set
+   when the arguments do not fit the convention or the flags are unknown. */
+PyObject *
+_PyCFunction_FastCallDict(PyObject *func_obj, PyObject **args, Py_ssize_t nargs,
+                          PyObject *kwargs)
+{
+    PyCFunctionObject *func = (PyCFunctionObject*)func_obj;
+    PyCFunction meth;
+    PyObject *self;
+    PyObject *result;
+    int flags;
+
+    /* Check func before the PyCFunction_GET_* macros dereference it. */
+    assert(func != NULL);
+    assert(nargs >= 0);
+    assert(nargs == 0 || args != NULL);
+    assert(kwargs == NULL || PyDict_Check(kwargs));
+
+    /* _PyCFunction_FastCallDict() must not be called with an exception set,
+       because it may clear it (directly or indirectly) and so the
+       caller loses its exception */
+    assert(!PyErr_Occurred());
+
+    meth = PyCFunction_GET_FUNCTION(func);
+    self = PyCFunction_GET_SELF(func);
+    flags = PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST);
+
+    switch (flags) {
+    case METH_NOARGS:
+        if (kwargs != NULL && PyDict_Size(kwargs) != 0) {
+            PyErr_Format(PyExc_TypeError, "%.200s() takes no keyword arguments",
+                         func->m_ml->ml_name);
+            return NULL;
+        }
+        if (nargs != 0) {
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes no arguments (%zd given)",
+                func->m_ml->ml_name, nargs);
+            return NULL;
+        }
+
+        result = (*meth) (self, NULL);
+        break;
+
+    case METH_O:
+        if (kwargs != NULL && PyDict_Size(kwargs) != 0) {
+            PyErr_Format(PyExc_TypeError, "%.200s() takes no keyword arguments",
+                         func->m_ml->ml_name);
+            return NULL;
+        }
+        if (nargs != 1) {
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes exactly one argument (%zd given)",
+                func->m_ml->ml_name, nargs);
+            return NULL;
+        }
+
+        result = (*meth) (self, args[0]);
+        break;
+
+    case METH_VARARGS:
+    case METH_VARARGS | METH_KEYWORDS:
+    {
+        /* Slow-path: create a temporary tuple */
+        PyObject *tuple;
+
+        if (!(flags & METH_KEYWORDS) && kwargs != NULL && PyDict_Size(kwargs) != 0) {
+            PyErr_Format(PyExc_TypeError,
+                         "%.200s() takes no keyword arguments",
+                         func->m_ml->ml_name);
+            return NULL;
+        }
+        tuple = _PyStack_AsTuple(args, nargs);
+        if (tuple == NULL) {
+            return NULL;
+        }
+        if (flags & METH_KEYWORDS) {
+            result = (*(PyCFunctionWithKeywords)meth) (self, tuple, kwargs);
+        }
+        else {
+            result = (*meth) (self, tuple);
+        }
+        Py_DECREF(tuple);
+        break;
+    }
+
+    default:
+        PyErr_SetString(PyExc_SystemError,
+                        "Bad call flags in _PyCFunction_FastCallDict. "
+                        "METH_OLDARGS is no longer supported!");
+        return NULL;
+    }
+    return _Py_CheckFunctionResult(func_obj, result, NULL);
+}
+
 /* Methods (the standard built-in methods, that is) */
 
 static void
diff --git a/Objects/moduleobject.c b/Objects/moduleobject.c
index ac07642..d88b06a 100644
--- a/Objects/moduleobject.c
+++ b/Objects/moduleobject.c
@@ -450,8 +450,7 @@ PyModule_GetDict(PyObject *m)
         return NULL;
     }
     d = ((PyModuleObject *)m) -> md_dict;
-    if (d == NULL)
-        ((PyModuleObject *)m) -> md_dict = d = PyDict_New();
+    assert(d != NULL);
     return d;
 }
 
diff --git a/Objects/namespaceobject.c b/Objects/namespaceobject.c
index 3d27a95..0bb3063 100644
--- a/Objects/namespaceobject.c
+++ b/Objects/namespaceobject.c
@@ -1,4 +1,4 @@
-/* namespace object implementation */
+// namespace object implementation
 
 #include "Python.h"
 #include "structmember.h"
@@ -16,7 +16,7 @@ static PyMemberDef namespace_members[] = {
 };
 
 
-/* Methods */
+// Methods
 
 static PyObject *
 namespace_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
@@ -40,7 +40,7 @@ namespace_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 static int
 namespace_init(_PyNamespaceObject *ns, PyObject *args, PyObject *kwds)
 {
-    /* ignore args if it's NULL or empty */
+    // ignore args if it's NULL or empty
     if (args != NULL) {
         Py_ssize_t argcount = PyObject_Size(args);
         if (argcount < 0)
@@ -191,7 +191,7 @@ namespace_reduce(_PyNamespaceObject *ns)
 static PyMethodDef namespace_methods[] = {
     {"__reduce__", (PyCFunction)namespace_reduce, METH_NOARGS,
      namespace_reduce__doc__},
-    {NULL,         NULL}  /* sentinel */
+    {NULL,         NULL}  // sentinel
 };
 
 
diff --git a/Objects/object.c b/Objects/object.c
index 8024889..559794f 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -109,6 +109,15 @@ void
 dump_counts(FILE* f)
 {
     PyTypeObject *tp;
+    PyObject *xoptions, *value;
+    _Py_IDENTIFIER(showalloccount);
+
+    xoptions = PySys_GetXOptions();
+    if (xoptions == NULL)
+        return;
+    value = _PyDict_GetItemId(xoptions, &PyId_showalloccount);
+    if (value != Py_True)
+        return;
 
     for (tp = type_list; tp; tp = tp->tp_next)
         fprintf(f, "%s alloc'd: %" PY_FORMAT_SIZE_T "d, "
@@ -644,7 +653,7 @@ PyObject_Bytes(PyObject *v)
 /* Map rich comparison operators to their swapped version, e.g. LT <--> GT */
 int _Py_SwappedOp[] = {Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE};
 
-static char *opstrings[] = {"<", "<=", "==", "!=", ">", ">="};
+static const char * const opstrings[] = {"<", "<=", "==", "!=", ">", ">="};
 
 /* Perform a rich comparison, raising TypeError when the requested comparison
    operator is not supported. */
@@ -686,11 +695,10 @@ do_richcompare(PyObject *v, PyObject *w, int op)
         res = (v != w) ? Py_True : Py_False;
         break;
     default:
-        /* XXX Special-case None so it doesn't show as NoneType() */
         PyErr_Format(PyExc_TypeError,
-                     "unorderable types: %.100s() %s %.100s()",
-                     v->ob_type->tp_name,
+                     "'%s' not supported between instances of '%.100s' and '%.100s'",
                      opstrings[op],
+                     v->ob_type->tp_name,
                      w->ob_type->tp_name);
         return NULL;
     }
@@ -1041,8 +1049,7 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict)
                      name->ob_type->tp_name);
         return NULL;
     }
-    else
-        Py_INCREF(name);
+    Py_INCREF(name);
 
     if (tp->tp_dict == NULL) {
         if (PyType_Ready(tp) < 0)
@@ -1050,10 +1057,10 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict)
     }
 
     descr = _PyType_Lookup(tp, name);
-    Py_XINCREF(descr);
 
     f = NULL;
     if (descr != NULL) {
+        Py_INCREF(descr);
         f = descr->ob_type->tp_descr_get;
         if (f != NULL && PyDescr_IsData(descr)) {
             res = f(descr, obj, (PyObject *)obj->ob_type);
@@ -1073,8 +1080,9 @@ _PyObject_GenericGetAttrWithDict(PyObject *obj, PyObject *name, PyObject *dict)
                 if (tsize < 0)
                     tsize = -tsize;
                 size = _PyObject_VAR_SIZE(tp, tsize);
+                assert(size <= PY_SSIZE_T_MAX);
 
-                dictoffset += (long)size;
+                dictoffset += (Py_ssize_t)size;
                 assert(dictoffset > 0);
                 assert(dictoffset % SIZEOF_VOID_P == 0);
             }
@@ -1142,12 +1150,11 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name,
     Py_INCREF(name);
 
     descr = _PyType_Lookup(tp, name);
-    Py_XINCREF(descr);
 
-    f = NULL;
     if (descr != NULL) {
+        Py_INCREF(descr);
         f = descr->ob_type->tp_descr_set;
-        if (f != NULL && PyDescr_IsData(descr)) {
+        if (f != NULL) {
             res = f(descr, obj, value);
             goto done;
         }
@@ -1155,40 +1162,32 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name,
 
     if (dict == NULL) {
         dictptr = _PyObject_GetDictPtr(obj);
-        if (dictptr != NULL) {
-            res = _PyObjectDict_SetItem(Py_TYPE(obj), dictptr, name, value);
-            if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
-                PyErr_SetObject(PyExc_AttributeError, name);
+        if (dictptr == NULL) {
+            if (descr == NULL) {
+                PyErr_Format(PyExc_AttributeError,
+                             "'%.100s' object has no attribute '%U'",
+                             tp->tp_name, name);
+            }
+            else {
+                PyErr_Format(PyExc_AttributeError,
+                             "'%.50s' object attribute '%U' is read-only",
+                             tp->tp_name, name);
+            }
             goto done;
         }
+        res = _PyObjectDict_SetItem(tp, dictptr, name, value);
     }
-    if (dict != NULL) {
+    else {
         Py_INCREF(dict);
         if (value == NULL)
             res = PyDict_DelItem(dict, name);
         else
             res = PyDict_SetItem(dict, name, value);
         Py_DECREF(dict);
-        if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
-            PyErr_SetObject(PyExc_AttributeError, name);
-        goto done;
     }
+    if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
+        PyErr_SetObject(PyExc_AttributeError, name);
 
-    if (f != NULL) {
-        res = f(descr, obj, value);
-        goto done;
-    }
-
-    if (descr == NULL) {
-        PyErr_Format(PyExc_AttributeError,
-                     "'%.100s' object has no attribute '%U'",
-                     tp->tp_name, name);
-        goto done;
-    }
-
-    PyErr_Format(PyExc_AttributeError,
-                 "'%.50s' object attribute '%U' is read-only",
-                 tp->tp_name, name);
   done:
     Py_XDECREF(descr);
     Py_DECREF(name);
@@ -1204,7 +1203,7 @@ PyObject_GenericSetAttr(PyObject *obj, PyObject *name, PyObject *value)
 int
 PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
 {
-    PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
+    PyObject **dictptr = _PyObject_GetDictPtr(obj);
     if (dictptr == NULL) {
         PyErr_SetString(PyExc_AttributeError,
                         "This object has no __dict__");
@@ -1220,10 +1219,8 @@ PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
                      "not a '%.200s'", Py_TYPE(value)->tp_name);
         return -1;
     }
-    dict = *dictptr;
-    Py_XINCREF(value);
-    *dictptr = value;
-    Py_XDECREF(dict);
+    Py_INCREF(value);
+    Py_XSETREF(*dictptr, value);
     return 0;
 }
 
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 7cc889f..54d68b7 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1,17 +1,38 @@
 #include "Python.h"
 
+
+/* Defined in tracemalloc.c */
+extern void _PyMem_DumpTraceback(int fd, const void *ptr);
+
+
 /* Python's malloc wrappers (see pymem.h) */
 
-#ifdef PYMALLOC_DEBUG   /* WITH_PYMALLOC && PYMALLOC_DEBUG */
+/*
+ * Basic types
+ * I don't care if these are defined in <sys/types.h> or elsewhere. Axiom.
+ */
+#undef  uchar
+#define uchar   unsigned char   /* assuming == 8 bits  */
+
+#undef  uint
+#define uint    unsigned int    /* assuming >= 16 bits */
+
+#undef uptr
+#define uptr    uintptr_t
+
 /* Forward declaration */
+static void* _PyMem_DebugRawMalloc(void *ctx, size_t size);
+static void* _PyMem_DebugRawCalloc(void *ctx, size_t nelem, size_t elsize);
+static void* _PyMem_DebugRawRealloc(void *ctx, void *ptr, size_t size);
+static void _PyMem_DebugRawFree(void *ctx, void *p);
+
 static void* _PyMem_DebugMalloc(void *ctx, size_t size);
 static void* _PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize);
-static void _PyMem_DebugFree(void *ctx, void *p);
 static void* _PyMem_DebugRealloc(void *ctx, void *ptr, size_t size);
+static void _PyMem_DebugFree(void *ctx, void *p);
 
 static void _PyObject_DebugDumpAddress(const void *p);
 static void _PyMem_DebugCheckAddress(char api_id, const void *p);
-#endif
 
 #if defined(__has_feature)  /* Clang */
  #if __has_feature(address_sanitizer)  /* is ASAN enabled? */
@@ -145,9 +166,8 @@ _PyObject_ArenaFree(void *ctx, void *ptr, size_t size)
 #else
 #  define PYOBJ_FUNCS PYRAW_FUNCS
 #endif
-#define PYMEM_FUNCS PYRAW_FUNCS
+#define PYMEM_FUNCS PYOBJ_FUNCS
 
-#ifdef PYMALLOC_DEBUG
 typedef struct {
     /* We tag each block with an API ID in order to tag API violations */
     char api_id;
@@ -163,19 +183,21 @@ static struct {
     {'o', {NULL, PYOBJ_FUNCS}}
     };
 
-#define PYDBG_FUNCS _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree
-#endif
+#define PYRAWDBG_FUNCS \
+    _PyMem_DebugRawMalloc, _PyMem_DebugRawCalloc, _PyMem_DebugRawRealloc, _PyMem_DebugRawFree
+#define PYDBG_FUNCS \
+    _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree
 
 static PyMemAllocatorEx _PyMem_Raw = {
-#ifdef PYMALLOC_DEBUG
-    &_PyMem_Debug.raw, PYDBG_FUNCS
+#ifdef Py_DEBUG
+    &_PyMem_Debug.raw, PYRAWDBG_FUNCS
 #else
     NULL, PYRAW_FUNCS
 #endif
     };
 
 static PyMemAllocatorEx _PyMem = {
-#ifdef PYMALLOC_DEBUG
+#ifdef Py_DEBUG
     &_PyMem_Debug.mem, PYDBG_FUNCS
 #else
     NULL, PYMEM_FUNCS
@@ -183,16 +205,76 @@ static PyMemAllocatorEx _PyMem = {
     };
 
 static PyMemAllocatorEx _PyObject = {
-#ifdef PYMALLOC_DEBUG
+#ifdef Py_DEBUG
     &_PyMem_Debug.obj, PYDBG_FUNCS
 #else
     NULL, PYOBJ_FUNCS
 #endif
     };
 
+int
+_PyMem_SetupAllocators(const char *opt)
+{
+    if (opt == NULL || *opt == '\0') {
+        /* PYTHONMALLOC is empty or is not set or ignored (-E/-I command line
+           options): use default allocators */
+#ifdef Py_DEBUG
+#  ifdef WITH_PYMALLOC
+        opt = "pymalloc_debug";
+#  else
+        opt = "malloc_debug";
+#  endif
+#else
+   /* !Py_DEBUG */
+#  ifdef WITH_PYMALLOC
+        opt = "pymalloc";
+#  else
+        opt = "malloc";
+#  endif
+#endif
+    }
+    /* Install the allocators opt names; a name no branch matches gives -1. */
+    if (strcmp(opt, "debug") == 0) {
+        PyMem_SetupDebugHooks();  /* hooks only; allocators stay as they are */
+    }
+    else if (strcmp(opt, "malloc") == 0 || strcmp(opt, "malloc_debug") == 0)
+    {
+        PyMemAllocatorEx alloc = {NULL, PYRAW_FUNCS};
+        /* The same function set serves all three domains. */
+        PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
+
+        if (strcmp(opt, "malloc_debug") == 0)
+            PyMem_SetupDebugHooks();
+    }
+#ifdef WITH_PYMALLOC
+    else if (strcmp(opt, "pymalloc") == 0
+             || strcmp(opt, "pymalloc_debug") == 0)
+    {
+        PyMemAllocatorEx raw_alloc = {NULL, PYRAW_FUNCS};
+        PyMemAllocatorEx mem_alloc = {NULL, PYMEM_FUNCS};
+        PyMemAllocatorEx obj_alloc = {NULL, PYOBJ_FUNCS};
+
+        PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &raw_alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &mem_alloc);
+        PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &obj_alloc);
+
+        if (strcmp(opt, "pymalloc_debug") == 0)
+            PyMem_SetupDebugHooks();
+    }
+#endif
+    else {
+        /* unknown allocator */
+        return -1;
+    }
+    return 0;
+}
+
 #undef PYRAW_FUNCS
 #undef PYMEM_FUNCS
 #undef PYOBJ_FUNCS
+#undef PYRAWDBG_FUNCS
 #undef PYDBG_FUNCS
 
 static PyObjectArenaAllocator _PyObject_Arena = {NULL,
@@ -205,23 +287,46 @@ static PyObjectArenaAllocator _PyObject_Arena = {NULL,
 #endif
     };
 
+#ifdef WITH_PYMALLOC
+static int
+_PyMem_DebugEnabled(void)
+{   /* Nonzero iff the object domain's malloc is the debug wrapper. */
+    return _PyObject.malloc == _PyMem_DebugMalloc;
+}
+
+int
+_PyMem_PymallocEnabled(void)
+{
+    /* With debug hooks installed, test the allocator the hooks wrap. */
+    if (_PyMem_DebugEnabled()) {
+        return (_PyMem_Debug.obj.alloc.malloc == _PyObject_Malloc);
+    }
+    /* No hooks: the object domain holds its allocator directly. */
+    return (_PyObject.malloc == _PyObject_Malloc);
+}
+#endif
+
 void
 PyMem_SetupDebugHooks(void)
 {
-#ifdef PYMALLOC_DEBUG
     PyMemAllocatorEx alloc;
 
-    alloc.malloc = _PyMem_DebugMalloc;
-    alloc.calloc = _PyMem_DebugCalloc;
-    alloc.realloc = _PyMem_DebugRealloc;
-    alloc.free = _PyMem_DebugFree;
+    alloc.malloc = _PyMem_DebugRawMalloc;
+    alloc.calloc = _PyMem_DebugRawCalloc;
+    alloc.realloc = _PyMem_DebugRawRealloc;
+    alloc.free = _PyMem_DebugRawFree;
 
-    if (_PyMem_Raw.malloc != _PyMem_DebugMalloc) {
+    if (_PyMem_Raw.malloc != _PyMem_DebugRawMalloc) {
         alloc.ctx = &_PyMem_Debug.raw;
         PyMem_GetAllocator(PYMEM_DOMAIN_RAW, &_PyMem_Debug.raw.alloc);
         PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &alloc);
     }
 
+    alloc.malloc = _PyMem_DebugMalloc;
+    alloc.calloc = _PyMem_DebugCalloc;
+    alloc.realloc = _PyMem_DebugRealloc;
+    alloc.free = _PyMem_DebugFree;
+
     if (_PyMem.malloc != _PyMem_DebugMalloc) {
         alloc.ctx = &_PyMem_Debug.mem;
         PyMem_GetAllocator(PYMEM_DOMAIN_MEM, &_PyMem_Debug.mem.alloc);
@@ -233,7 +338,6 @@ PyMem_SetupDebugHooks(void)
         PyMem_GetAllocator(PYMEM_DOMAIN_OBJ, &_PyMem_Debug.obj.alloc);
         PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
     }
-#endif
 }
 
 void
@@ -264,7 +368,6 @@ PyMem_SetAllocator(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
     case PYMEM_DOMAIN_OBJ: _PyObject = *allocator; break;
     /* ignore unknown domain */
     }
-
 }
 
 void
@@ -642,22 +745,6 @@ static int running_on_valgrind = -1;
 #define SIMPLELOCK_LOCK(lock)   /* acquire released lock */
 #define SIMPLELOCK_UNLOCK(lock) /* release acquired lock */
 
-/*
- * Basic types
- * I don't care if these are defined in <sys/types.h> or elsewhere. Axiom.
- */
-#undef  uchar
-#define uchar   unsigned char   /* assuming == 8 bits  */
-
-#undef  uint
-#define uint    unsigned int    /* assuming >= 16 bits */
-
-#undef  ulong
-#define ulong   unsigned long   /* assuming >= 32 bits */
-
-#undef uptr
-#define uptr    Py_uintptr_t
-
 /* When you say memory, my mind reasons in terms of (pointers to) blocks */
 typedef uchar block;
 
@@ -949,11 +1036,15 @@ new_arena(void)
     struct arena_object* arenaobj;
     uint excess;        /* number of bytes above pool alignment */
     void *address;
+    static int debug_stats = -1;
 
-#ifdef PYMALLOC_DEBUG
-    if (Py_GETENV("PYTHONMALLOCSTATS"))
+    if (debug_stats == -1) {
+        char *opt = Py_GETENV("PYTHONMALLOCSTATS");
+        debug_stats = (opt != NULL && *opt != '\0');
+    }
+    if (debug_stats)
         _PyObject_DebugMallocStats(stderr);
-#endif
+
     if (unused_arena_objects == NULL) {
         uint i;
         uint numarenas;
@@ -966,7 +1057,7 @@ new_arena(void)
         if (numarenas <= maxarenas)
             return NULL;                /* overflow */
 #if SIZEOF_SIZE_T <= SIZEOF_INT
-        if (numarenas > PY_SIZE_MAX / sizeof(*arenas))
+        if (numarenas > SIZE_MAX / sizeof(*arenas))
             return NULL;                /* overflow */
 #endif
         nbytes = numarenas * sizeof(*arenas);
@@ -1709,7 +1800,7 @@ _Py_GetAllocatedBlocks(void)
 
 #endif /* WITH_PYMALLOC */
 
-#ifdef PYMALLOC_DEBUG
+
 /*==========================================================================*/
 /* A x-platform debugging allocator.  This doesn't manage memory directly,
  * it wraps a real allocator, adding extra debugging info to the memory blocks.
@@ -1767,31 +1858,6 @@ write_size_t(void *p, size_t n)
     }
 }
 
-#ifdef Py_DEBUG
-/* Is target in the list?  The list is traversed via the nextpool pointers.
- * The list may be NULL-terminated, or circular.  Return 1 if target is in
- * list, else 0.
- */
-static int
-pool_is_in_list(const poolp target, poolp list)
-{
-    poolp origlist = list;
-    assert(target != NULL);
-    if (list == NULL)
-        return 0;
-    do {
-        if (target == list)
-            return 1;
-        list = list->nextpool;
-    } while (list != NULL && list != origlist);
-    return 0;
-}
-
-#else
-#define pool_is_in_list(X, Y) 1
-
-#endif  /* Py_DEBUG */
-
 /* Let S = sizeof(size_t).  The debug malloc asks for 4*S extra bytes and
    fills them with useful stuff, here calling the underlying malloc's result p:
 
@@ -1819,7 +1885,7 @@ p[2*S+n+S: 2*S+n+2*S]
 */
 
 static void *
-_PyMem_DebugAlloc(int use_calloc, void *ctx, size_t nbytes)
+_PyMem_DebugRawAlloc(int use_calloc, void *ctx, size_t nbytes)
 {
     debug_alloc_api_t *api = (debug_alloc_api_t *)ctx;
     uchar *p;           /* base address of malloc'ed block */
@@ -1856,18 +1922,18 @@ _PyMem_DebugAlloc(int use_calloc, void *ctx, size_t nbytes)
 }
 
 static void *
-_PyMem_DebugMalloc(void *ctx, size_t nbytes)
+_PyMem_DebugRawMalloc(void *ctx, size_t nbytes)
 {
-    return _PyMem_DebugAlloc(0, ctx, nbytes);
+    return _PyMem_DebugRawAlloc(0, ctx, nbytes);
 }
 
 static void *
-_PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize)
+_PyMem_DebugRawCalloc(void *ctx, size_t nelem, size_t elsize)
 {
     size_t nbytes;
     assert(elsize == 0 || nelem <= PY_SSIZE_T_MAX / elsize);
     nbytes = nelem * elsize;
-    return _PyMem_DebugAlloc(1, ctx, nbytes);
+    return _PyMem_DebugRawAlloc(1, ctx, nbytes);
 }
 
 /* The debug free first checks the 2*SST bytes on each end for sanity (in
@@ -1876,7 +1942,7 @@ _PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize)
    Then calls the underlying free.
 */
 static void
-_PyMem_DebugFree(void *ctx, void *p)
+_PyMem_DebugRawFree(void *ctx, void *p)
 {
     debug_alloc_api_t *api = (debug_alloc_api_t *)ctx;
     uchar *q = (uchar *)p - 2*SST;  /* address returned from malloc */
@@ -1893,7 +1959,7 @@ _PyMem_DebugFree(void *ctx, void *p)
 }
 
 static void *
-_PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes)
+_PyMem_DebugRawRealloc(void *ctx, void *p, size_t nbytes)
 {
     debug_alloc_api_t *api = (debug_alloc_api_t *)ctx;
     uchar *q = (uchar *)p, *oldq;
@@ -1903,7 +1969,7 @@ _PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes)
     int i;
 
     if (p == NULL)
-        return _PyMem_DebugAlloc(0, ctx, nbytes);
+        return _PyMem_DebugRawAlloc(0, ctx, nbytes);
 
     _PyMem_DebugCheckAddress(api->api_id, p);
     bumpserialno();
@@ -1946,6 +2012,44 @@ _PyMem_DebugRealloc(void *ctx, void *p, size_t nbytes)
     return q;
 }
 
+static void
+_PyMem_DebugCheckGIL(void)
+{   /* Only checks in WITH_THREAD builds; otherwise the body is empty. */
+#ifdef WITH_THREAD
+    if (!PyGILState_Check())
+        Py_FatalError("Python memory allocator called "
+                      "without holding the GIL");
+#endif
+}
+
+static void *
+_PyMem_DebugMalloc(void *ctx, size_t nbytes)
+{
+    _PyMem_DebugCheckGIL();  /* GIL check first, then the raw debug malloc */
+    return _PyMem_DebugRawMalloc(ctx, nbytes);
+}
+
+static void *
+_PyMem_DebugCalloc(void *ctx, size_t nelem, size_t elsize)
+{
+    _PyMem_DebugCheckGIL();  /* GIL check first, then the raw debug calloc */
+    return _PyMem_DebugRawCalloc(ctx, nelem, elsize);
+}
+
+static void
+_PyMem_DebugFree(void *ctx, void *ptr)
+{
+    _PyMem_DebugCheckGIL();  /* GIL check first, then the raw debug free */
+    _PyMem_DebugRawFree(ctx, ptr);
+}
+
+static void *
+_PyMem_DebugRealloc(void *ctx, void *ptr, size_t nbytes)
+{
+    _PyMem_DebugCheckGIL();  /* GIL check first, then the raw debug realloc */
+    return _PyMem_DebugRawRealloc(ctx, ptr, nbytes);
+}
+
 /* Check the forbidden bytes on both ends of the memory allocated for p.
  * If anything is wrong, print info to stderr via _PyObject_DebugDumpAddress,
  * and call Py_FatalError to kill the program.
@@ -2104,9 +2208,12 @@ _PyObject_DebugDumpAddress(const void *p)
         }
         fputc('\n', stderr);
     }
+    fputc('\n', stderr);
+
+    fflush(stderr);
+    _PyMem_DumpTraceback(fileno(stderr), p);
 }
 
-#endif  /* PYMALLOC_DEBUG */
 
 static size_t
 printone(FILE *out, const char* msg, size_t value)
@@ -2158,8 +2265,30 @@ _PyDebugAllocatorStats(FILE *out,
     (void)printone(out, buf2, num_blocks * sizeof_block);
 }
 
+
 #ifdef WITH_PYMALLOC
 
+#ifdef Py_DEBUG
+/* Is target in the list?  The list is traversed via the nextpool pointers.
+ * The list may be NULL-terminated, or circular.  Return 1 if target is in
+ * list, else 0.
+ */
+static int
+pool_is_in_list(const poolp target, poolp list)
+{
+    poolp cur = list;
+    assert(target != NULL);
+    while (cur != NULL) {
+        if (cur == target)
+            return 1;
+        cur = cur->nextpool;
+        if (cur == list)
+            break;      /* back at the head: the list is circular */
+    }
+    return 0;
+}
+#endif
+
 /* Print summary info to "out" about the state of pymalloc's structures.
  * In Py_DEBUG mode, also perform some expensive internal consistency
  * checks.
@@ -2233,7 +2362,9 @@ _PyObject_DebugMallocStats(FILE *out)
 
             if (p->ref.count == 0) {
                 /* currently unused */
+#ifdef Py_DEBUG
                 assert(pool_is_in_list(p, arenas[i].freepools));
+#endif
                 continue;
             }
             ++numpools[sz];
@@ -2273,9 +2404,8 @@ _PyObject_DebugMallocStats(FILE *out)
         quantization += p * ((POOL_SIZE - POOL_OVERHEAD) % size);
     }
     fputc('\n', out);
-#ifdef PYMALLOC_DEBUG
-    (void)printone(out, "# times object malloc called", serialno);
-#endif
+    if (_PyMem_DebugEnabled())
+        (void)printone(out, "# times object malloc called", serialno);
     (void)printone(out, "# arenas allocated total", ntimes_arena_allocated);
     (void)printone(out, "# arenas reclaimed", ntimes_arena_allocated - narenas);
     (void)printone(out, "# arenas highwater mark", narenas_highwater);
@@ -2303,6 +2433,7 @@ _PyObject_DebugMallocStats(FILE *out)
 
 #endif /* #ifdef WITH_PYMALLOC */
 
+
 #ifdef Py_USING_MEMORY_DEBUGGER
 /* Make this function last so gcc won't inline it since the definition is
  * after the reference.
diff --git a/Objects/odictobject.c b/Objects/odictobject.c
index a6963d7..f056074 100644
--- a/Objects/odictobject.c
+++ b/Objects/odictobject.c
@@ -1424,14 +1424,13 @@ static PyMethodDef odict_methods[] = {
  * OrderedDict members
  */
 
-/* tp_members */
+/* tp_getset */
 
-static PyMemberDef odict_members[] = {
-    {"__dict__", T_OBJECT, offsetof(PyODictObject, od_inst_dict), READONLY},
-    {0}
+static PyGetSetDef odict_getset[] = {
+    {"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict},
+    {NULL}  /* sentinel */
+};
 
-
 /* ----------------------------------------------
  * OrderedDict type slot methods
  */
@@ -1463,7 +1462,7 @@ odict_dealloc(PyODictObject *self)
     ++tstate->trash_delete_nesting;
 
     Py_TRASHCAN_SAFE_END(self)
-};
+}
 
 /* tp_repr */
 
@@ -1540,7 +1539,7 @@ Done:
     Py_XDECREF(pieces);
     Py_ReprLeave((PyObject *)self);
     return result;
-};
+}
 
 /* tp_doc */
 
@@ -1612,7 +1611,7 @@ odict_richcompare(PyObject *v, PyObject *w, int op)
     } else {
         Py_RETURN_NOTIMPLEMENTED;
     }
-};
+}
 
 /* tp_iter */
 
@@ -1620,7 +1619,7 @@ static PyObject *
 odict_iter(PyODictObject *od)
 {
     return odictiter_new(od, _odict_ITER_KEYS);
-};
+}
 
 /* tp_init */
 
@@ -1646,27 +1645,19 @@ odict_init(PyObject *self, PyObject *args, PyObject *kwds)
         Py_DECREF(res);
         return 0;
     }
-};
+}
 
 /* tp_new */
 
 static PyObject *
 odict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
-    PyObject *dict;
     PyODictObject *od;
 
-    dict = PyDict_New();
-    if (dict == NULL)
-        return NULL;
-
     od = (PyODictObject *)PyDict_Type.tp_new(type, args, kwds);
-    if (od == NULL) {
-        Py_DECREF(dict);
+    if (od == NULL)
         return NULL;
-    }
 
-    od->od_inst_dict = dict;
     /* type constructor fills the memory with zeros (see
        PyType_GenericAlloc()), there is no need to set them to zero again */
     if (_odict_resize(od) < 0) {
@@ -1708,8 +1699,8 @@ PyTypeObject PyODict_Type = {
     (getiterfunc)odict_iter,                    /* tp_iter */
     0,                                          /* tp_iternext */
     odict_methods,                              /* tp_methods */
-    odict_members,                              /* tp_members */
-    0,                                          /* tp_getset */
+    0,                                          /* tp_members */
+    odict_getset,                               /* tp_getset */
     &PyDict_Type,                               /* tp_base */
     0,                                          /* tp_dict */
     0,                                          /* tp_descr_get */
@@ -1729,7 +1720,7 @@ PyTypeObject PyODict_Type = {
 PyObject *
 PyODict_New(void) {
     return odict_new(&PyODict_Type, NULL, NULL);
-};
+}
 
 static int
 _PyODict_SetItem_KnownHash(PyObject *od, PyObject *key, PyObject *value,
@@ -1747,7 +1738,7 @@ _PyODict_SetItem_KnownHash(PyObject *od, PyObject *key, PyObject *value,
         }
     }
     return res;
-};
+}
 
 int
 PyODict_SetItem(PyObject *od, PyObject *key, PyObject *value)
@@ -1756,7 +1747,7 @@ PyODict_SetItem(PyObject *od, PyObject *key, PyObject *value)
     if (hash == -1)
         return -1;
     return _PyODict_SetItem_KnownHash(od, key, value, hash);
-};
+}
 
 int
 PyODict_DelItem(PyObject *od, PyObject *key)
@@ -1769,7 +1760,22 @@ PyODict_DelItem(PyObject *od, PyObject *key)
     if (res < 0)
         return -1;
     return _PyDict_DelItem_KnownHash(od, key, hash);
-};
+}
+
+PyObject *
+_PyODict_KeysAsTuple(PyObject *od) {
+    /* Returns a new tuple holding od's keys in _odict_FOREACH order. */
+    _ODictNode *cur;
+    Py_ssize_t pos = 0;
+    PyObject *result = PyTuple_New(PyODict_Size(od));
+    if (result == NULL)
+        return NULL;
+    _odict_FOREACH((PyODictObject *)od, cur) {
+        Py_INCREF(_odictnode_KEY(cur));
+        PyTuple_SET_ITEM(result, pos++, _odictnode_KEY(cur));
+    }
+    return result;
+}
 
 
 /* -------------------------------------------
diff --git a/Objects/rangeobject.c b/Objects/rangeobject.c
index 0e9eb20..8e74132 100644
--- a/Objects/rangeobject.c
+++ b/Objects/rangeobject.c
@@ -6,7 +6,7 @@
 /* Support objects whose length is > PY_SSIZE_T_MAX.
 
    This could be sped up for small PyLongs if they fit in a Py_ssize_t.
-   This only matters on Win64.  Though we could use PY_LONG_LONG which
+   This only matters on Win64.  Though we could use long long which
    would presumably help perf.
 */
 
@@ -29,17 +29,10 @@ validate_step(PyObject *step)
         return PyLong_FromLong(1);
 
     step = PyNumber_Index(step);
-    if (step) {
-        Py_ssize_t istep = PyNumber_AsSsize_t(step, NULL);
-        if (istep == -1 && PyErr_Occurred()) {
-            /* Ignore OverflowError, we know the value isn't 0. */
-            PyErr_Clear();
-        }
-        else if (istep == 0) {
-            PyErr_SetString(PyExc_ValueError,
-                            "range() arg 3 must not be zero");
-            Py_CLEAR(step);
-        }
+    if (step && _PyLong_Sign(step) == 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "range() arg 3 must not be zero");
+        Py_CLEAR(step);
     }
 
     return step;
@@ -129,9 +122,9 @@ range_new(PyTypeObject *type, PyObject *args, PyObject *kw)
         return (PyObject *) obj;
 
     /* Failed to create object, release attributes */
-    Py_XDECREF(start);
-    Py_XDECREF(stop);
-    Py_XDECREF(step);
+    Py_DECREF(start);
+    Py_DECREF(stop);
+    Py_DECREF(step);
     return NULL;
 }
 
@@ -196,7 +189,7 @@ compute_range_length(PyObject *start, PyObject *stop, PyObject *step)
     /* if (lo >= hi), return length of 0. */
     cmp_result = PyObject_RichCompareBool(lo, hi, Py_GE);
     if (cmp_result != 0) {
-        Py_XDECREF(step);
+        Py_DECREF(step);
         if (cmp_result < 0)
             return NULL;
         return PyLong_FromLong(0);
@@ -225,9 +218,9 @@ compute_range_length(PyObject *start, PyObject *stop, PyObject *step)
     return result;
 
   Fail:
+    Py_DECREF(step);
     Py_XDECREF(tmp2);
     Py_XDECREF(diff);
-    Py_XDECREF(step);
     Py_XDECREF(tmp1);
     Py_XDECREF(one);
     return NULL;
diff --git a/Objects/setobject.c b/Objects/setobject.c
index 4ef692d..6dd403f 100644
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -26,7 +26,6 @@
 
 #include "Python.h"
 #include "structmember.h"
-#include "stringlib/eq.h"
 
 /* Object used as dummy key to fill deleted entries */
 static PyObject _dummy_struct;
@@ -48,19 +47,20 @@ static PyObject _dummy_struct;
 static setentry *
 set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
 {
-    setentry *table = so->table;
-    setentry *freeslot = NULL;
+    setentry *table;
     setentry *entry;
-    size_t perturb = hash;
+    size_t perturb;
     size_t mask = so->mask;
     size_t i = (size_t)hash & mask; /* Unsigned for defined overflow behavior */
     size_t j;
     int cmp;
 
-    entry = &table[i];
+    entry = &so->table[i];
     if (entry->key == NULL)
         return entry;
 
+    perturb = hash;
+
     while (1) {
         if (entry->hash == hash) {
             PyObject *startkey = entry->key;
@@ -70,8 +70,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                 return entry;
             if (PyUnicode_CheckExact(startkey)
                 && PyUnicode_CheckExact(key)
-                && unicode_eq(startkey, key))
+                && _PyUnicode_EQ(startkey, key))
                 return entry;
+            table = so->table;
             Py_INCREF(startkey);
             cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
             Py_DECREF(startkey);
@@ -83,14 +84,12 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                 return entry;
             mask = so->mask;                 /* help avoid a register spill */
         }
-        if (entry->hash == -1 && freeslot == NULL)
-            freeslot = entry;
 
         if (i + LINEAR_PROBES <= mask) {
             for (j = 0 ; j < LINEAR_PROBES ; j++) {
                 entry++;
-                if (entry->key == NULL)
-                    goto found_null;
+                if (entry->hash == 0 && entry->key == NULL)
+                    return entry;
                 if (entry->hash == hash) {
                     PyObject *startkey = entry->key;
                     assert(startkey != dummy);
@@ -98,8 +97,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                         return entry;
                     if (PyUnicode_CheckExact(startkey)
                         && PyUnicode_CheckExact(key)
-                        && unicode_eq(startkey, key))
+                        && _PyUnicode_EQ(startkey, key))
                         return entry;
+                    table = so->table;
                     Py_INCREF(startkey);
                     cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
                     Py_DECREF(startkey);
@@ -111,7 +111,104 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                         return entry;
                     mask = so->mask;
                 }
-                if (entry->hash == -1 && freeslot == NULL)
+            }
+        }
+
+        perturb >>= PERTURB_SHIFT;
+        i = (i * 5 + 1 + perturb) & mask;
+
+        entry = &so->table[i];
+        if (entry->key == NULL)
+            return entry;
+    }
+}
+
+static int set_table_resize(PySetObject *, Py_ssize_t);
+
+static int
+set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+    setentry *table;
+    setentry *freeslot;
+    setentry *entry;
+    size_t perturb;
+    size_t mask;
+    size_t i;                       /* Unsigned for defined overflow behavior */
+    size_t j;
+    int cmp;
+
+    /* Pre-increment is necessary to prevent arbitrary code in the rich
+       comparison from deallocating the key just before the insertion. */
+    Py_INCREF(key);
+
+  restart:
+
+    mask = so->mask;
+    i = (size_t)hash & mask;
+
+    entry = &so->table[i];
+    if (entry->key == NULL)
+        goto found_unused;
+
+    freeslot = NULL;
+    perturb = hash;
+
+    while (1) {
+        if (entry->hash == hash) {
+            PyObject *startkey = entry->key;
+            /* startkey cannot be a dummy because the dummy hash field is -1 */
+            assert(startkey != dummy);
+            if (startkey == key)
+                goto found_active;
+            if (PyUnicode_CheckExact(startkey)
+                && PyUnicode_CheckExact(key)
+                && _PyUnicode_EQ(startkey, key))
+                goto found_active;
+            table = so->table;
+            Py_INCREF(startkey);
+            cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+            Py_DECREF(startkey);
+            if (cmp > 0)                                          /* likely */
+                goto found_active;
+            if (cmp < 0)
+                goto comparison_error;
+            /* Continuing the search from the current entry only makes
+               sense if the table and entry are unchanged; otherwise,
+               we have to restart from the beginning */
+            if (table != so->table || entry->key != startkey)
+                goto restart;
+            mask = so->mask;                 /* help avoid a register spill */
+        }
+        else if (entry->hash == -1 && freeslot == NULL)
+            freeslot = entry;
+
+        if (i + LINEAR_PROBES <= mask) {
+            for (j = 0 ; j < LINEAR_PROBES ; j++) {
+                entry++;
+                if (entry->hash == 0 && entry->key == NULL)
+                    goto found_unused_or_dummy;
+                if (entry->hash == hash) {
+                    PyObject *startkey = entry->key;
+                    assert(startkey != dummy);
+                    if (startkey == key)
+                        goto found_active;
+                    if (PyUnicode_CheckExact(startkey)
+                        && PyUnicode_CheckExact(key)
+                        && _PyUnicode_EQ(startkey, key))
+                        goto found_active;
+                    table = so->table;
+                    Py_INCREF(startkey);
+                    cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+                    Py_DECREF(startkey);
+                    if (cmp > 0)
+                        goto found_active;
+                    if (cmp < 0)
+                        goto comparison_error;
+                    if (table != so->table || entry->key != startkey)
+                        goto restart;
+                    mask = so->mask;
+                }
+                else if (entry->hash == -1 && freeslot == NULL)
                     freeslot = entry;
             }
         }
@@ -119,29 +216,51 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
         perturb >>= PERTURB_SHIFT;
         i = (i * 5 + 1 + perturb) & mask;
 
-        entry = &table[i];
+        entry = &so->table[i];
         if (entry->key == NULL)
-            goto found_null;
+            goto found_unused_or_dummy;
     }
-  found_null:
-    return freeslot == NULL ? entry : freeslot;
+
+  found_unused_or_dummy:
+    if (freeslot == NULL)
+        goto found_unused;
+    so->used++;
+    freeslot->key = key;
+    freeslot->hash = hash;
+    return 0;
+
+  found_unused:
+    so->fill++;
+    so->used++;
+    entry->key = key;
+    entry->hash = hash;
+    if ((size_t)so->fill*3 < mask*2)
+        return 0;
+    return set_table_resize(so, so->used);
+
+  found_active:
+    Py_DECREF(key);
+    return 0;
+
+  comparison_error:
+    Py_DECREF(key);
+    return -1;
 }
 
 /*
 Internal routine used by set_table_resize() to insert an item which is
 known to be absent from the set.  This routine also assumes that
 the set contains no deleted entries.  Besides the performance benefit,
-using set_insert_clean() in set_table_resize() is dangerous (SF bug #1456209).
-Note that no refcounts are changed by this routine; if needed, the caller
-is responsible for incref'ing `key`.
+there is also a safety benefit since using set_add_entry() risks making
+a callback in the middle of a set_table_resize(), see issue 1456209.
+The caller is responsible for updating the key's reference count and
+the setobject's fill and used fields.
 */
 static void
-set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash)
+set_insert_clean(setentry *table, size_t mask, PyObject *key, Py_hash_t hash)
 {
-    setentry *table = so->table;
     setentry *entry;
     size_t perturb = hash;
-    size_t mask = (size_t)so->mask;
     size_t i = (size_t)hash & mask;
     size_t j;
 
@@ -162,45 +281,11 @@ set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash)
   found_null:
     entry->key = key;
     entry->hash = hash;
-    so->fill++;
-    so->used++;
 }
 
 /* ======== End logic for probing the hash table ========================== */
 /* ======================================================================== */
 
-
-/*
-Internal routine to insert a new key into the table.
-Used by the public insert routine.
-Eats a reference to key.
-*/
-static int
-set_insert_key(PySetObject *so, PyObject *key, Py_hash_t hash)
-{
-    setentry *entry;
-
-    entry = set_lookkey(so, key, hash);
-    if (entry == NULL)
-        return -1;
-    if (entry->key == NULL) {
-        /* UNUSED */
-        entry->key = key;
-        entry->hash = hash;
-        so->fill++;
-        so->used++;
-    } else if (entry->key == dummy) {
-        /* DUMMY */
-        entry->key = key;
-        entry->hash = hash;
-        so->used++;
-    } else {
-        /* ACTIVE */
-        Py_DECREF(key);
-    }
-    return 0;
-}
-
 /*
 Restructure the table by allocating a new table and reinserting all
 keys again.  When entries have been deleted, the new table may
@@ -213,10 +298,13 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     setentry *oldtable, *newtable, *entry;
     Py_ssize_t oldfill = so->fill;
     Py_ssize_t oldused = so->used;
+    Py_ssize_t oldmask = so->mask;
+    size_t newmask;
     int is_oldtable_malloced;
     setentry small_copy[PySet_MINSIZE];
 
     assert(minused >= 0);
+    minused = (minused > 50000) ? minused * 2 : minused * 4;
 
     /* Find the smallest table size > minused. */
     /* XXX speed-up with intrinsics */
@@ -264,25 +352,24 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     /* Make the set empty, using the new table. */
     assert(newtable != oldtable);
     memset(newtable, 0, sizeof(setentry) * newsize);
-    so->fill = 0;
-    so->used = 0;
+    so->fill = oldused;
+    so->used = oldused;
     so->mask = newsize - 1;
     so->table = newtable;
 
     /* Copy the data over; this is refcount-neutral for active entries;
        dummy entries aren't copied over, of course */
+    newmask = (size_t)so->mask;
     if (oldfill == oldused) {
-        for (entry = oldtable; oldused > 0; entry++) {
+        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
             if (entry->key != NULL) {
-                oldused--;
-                set_insert_clean(so, entry->key, entry->hash);
+                set_insert_clean(newtable, newmask, entry->key, entry->hash);
             }
         }
     } else {
-        for (entry = oldtable; oldused > 0; entry++) {
+        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
             if (entry->key != NULL && entry->key != dummy) {
-                oldused--;
-                set_insert_clean(so, entry->key, entry->hash);
+                set_insert_clean(newtable, newmask, entry->key, entry->hash);
             }
         }
     }
@@ -292,31 +379,42 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     return 0;
 }
 
-/* CAUTION: set_add_key/entry() must guarantee it won't resize the table */
+static int
+set_contains_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+    setentry *entry;
+
+    entry = set_lookkey(so, key, hash);
+    if (entry != NULL)
+        return entry->key != NULL;
+    return -1;
+}
+
+#define DISCARD_NOTFOUND 0
+#define DISCARD_FOUND 1
 
 static int
-set_add_entry(PySetObject *so, setentry *entry)
+set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
 {
-    Py_ssize_t n_used;
-    PyObject *key = entry->key;
-    Py_hash_t hash = entry->hash;
+    setentry *entry;
+    PyObject *old_key;
 
-    assert(so->fill <= so->mask);  /* at least one empty slot */
-    n_used = so->used;
-    Py_INCREF(key);
-    if (set_insert_key(so, key, hash)) {
-        Py_DECREF(key);
+    entry = set_lookkey(so, key, hash);
+    if (entry == NULL)
         return -1;
-    }
-    if (!(so->used > n_used && so->fill*3 >= (so->mask+1)*2))
-        return 0;
-    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+    if (entry->key == NULL)
+        return DISCARD_NOTFOUND;
+    old_key = entry->key;
+    entry->key = dummy;
+    entry->hash = -1;
+    so->used--;
+    Py_DECREF(old_key);
+    return DISCARD_FOUND;
 }
 
 static int
 set_add_key(PySetObject *so, PyObject *key)
 {
-    setentry entry;
     Py_hash_t hash;
 
     if (!PyUnicode_CheckExact(key) ||
@@ -325,50 +423,35 @@ set_add_key(PySetObject *so, PyObject *key)
         if (hash == -1)
             return -1;
     }
-    entry.key = key;
-    entry.hash = hash;
-    return set_add_entry(so, &entry);
+    return set_add_entry(so, key, hash);
 }
 
-#define DISCARD_NOTFOUND 0
-#define DISCARD_FOUND 1
-
 static int
-set_discard_entry(PySetObject *so, setentry *oldentry)
+set_contains_key(PySetObject *so, PyObject *key)
 {
-    setentry *entry;
-    PyObject *old_key;
+    Py_hash_t hash;
 
-    entry = set_lookkey(so, oldentry->key, oldentry->hash);
-    if (entry == NULL)
-        return -1;
-    if (entry->key == NULL  ||  entry->key == dummy)
-        return DISCARD_NOTFOUND;
-    old_key = entry->key;
-    entry->key = dummy;
-    entry->hash = -1;
-    so->used--;
-    Py_DECREF(old_key);
-    return DISCARD_FOUND;
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1) {
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            return -1;
+    }
+    return set_contains_entry(so, key, hash);
 }
 
 static int
 set_discard_key(PySetObject *so, PyObject *key)
 {
-    setentry entry;
     Py_hash_t hash;
 
-    assert (PyAnySet_Check(so));
-
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
         hash = PyObject_Hash(key);
         if (hash == -1)
             return -1;
     }
-    entry.key = key;
-    entry.hash = hash;
-    return set_discard_entry(so, &entry);
+    return set_discard_entry(so, key, hash);
 }
 
 static void
@@ -449,20 +532,22 @@ set_next(PySetObject *so, Py_ssize_t *pos_ptr, setentry **entry_ptr)
 {
     Py_ssize_t i;
     Py_ssize_t mask;
-    setentry *table;
+    setentry *entry;
 
     assert (PyAnySet_Check(so));
     i = *pos_ptr;
     assert(i >= 0);
-    table = so->table;
     mask = so->mask;
-    while (i <= mask && (table[i].key == NULL || table[i].key == dummy))
+    entry = &so->table[i];
+    while (i <= mask && (entry->key == NULL || entry->key == dummy)) {
         i++;
+        entry++;
+    }
     *pos_ptr = i+1;
     if (i > mask)
         return 0;
-    assert(table[i].key != NULL);
-    *entry_ptr = &table[i];
+    assert(entry != NULL);
+    *entry_ptr = entry;
     return 1;
 }
 
@@ -560,8 +645,8 @@ set_merge(PySetObject *so, PyObject *otherset)
      * incrementally resizing as we insert new keys.  Expect
      * that there will be no (or few) overlapping keys.
      */
-    if ((so->fill + other->used)*3 >= (so->mask+1)*2) {
-       if (set_table_resize(so, (so->used + other->used)*2) != 0)
+    if ((so->fill + other->used)*3 >= so->mask*2) {
+       if (set_table_resize(so, so->used + other->used) != 0)
            return -1;
     }
     so_entry = so->table;
@@ -586,11 +671,15 @@ set_merge(PySetObject *so, PyObject *otherset)
 
     /* If our table is empty, we can use set_insert_clean() */
     if (so->fill == 0) {
-        for (i = 0; i <= other->mask; i++, other_entry++) {
+        setentry *newtable = so->table;
+        size_t newmask = (size_t)so->mask;
+        so->fill = other->used;
+        so->used = other->used;
+        for (i = other->mask + 1; i > 0 ; i--, other_entry++) {
             key = other_entry->key;
             if (key != NULL && key != dummy) {
                 Py_INCREF(key);
-                set_insert_clean(so, key, other_entry->hash);
+                set_insert_clean(newtable, newmask, key, other_entry->hash);
             }
         }
         return 0;
@@ -601,46 +690,13 @@ set_merge(PySetObject *so, PyObject *otherset)
         other_entry = &other->table[i];
         key = other_entry->key;
         if (key != NULL && key != dummy) {
-            Py_INCREF(key);
-            if (set_insert_key(so, key, other_entry->hash)) {
-                Py_DECREF(key);
+            if (set_add_entry(so, key, other_entry->hash))
                 return -1;
-            }
         }
     }
     return 0;
 }
 
-static int
-set_contains_entry(PySetObject *so, setentry *entry)
-{
-    PyObject *key;
-    setentry *lu_entry;
-
-    lu_entry = set_lookkey(so, entry->key, entry->hash);
-    if (lu_entry == NULL)
-        return -1;
-    key = lu_entry->key;
-    return key != NULL && key != dummy;
-}
-
-static int
-set_contains_key(PySetObject *so, PyObject *key)
-{
-    setentry entry;
-    Py_hash_t hash;
-
-    if (!PyUnicode_CheckExact(key) ||
-        (hash = ((PyASCIIObject *) key)->hash) == -1) {
-        hash = PyObject_Hash(key);
-        if (hash == -1)
-            return -1;
-    }
-    entry.key = key;
-    entry.hash = hash;
-    return set_contains_entry(so, &entry);
-}
-
 static PyObject *
 set_pop(PySetObject *so)
 {
@@ -682,43 +738,64 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)
     return 0;
 }
 
-static Py_hash_t
-frozenset_hash(PyObject *self)
+/* Work to increase the bit dispersion for closely spaced hash values.
+   This is important because some use cases have many combinations of a
+   small number of elements with nearby hashes so that many distinct
+   combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
 {
-    /* Most of the constants in this hash algorithm are randomly choosen
-       large primes with "interesting bit patterns" and that passed
-       tests for good collision statistics on a variety of problematic
-       datasets such as:
+    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
 
-          ps = []
-          for r in range(21):
-              ps += itertools.combinations(range(20), r)
-          num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+/* Most of the constants in this hash algorithm are randomly chosen
+   large primes with "interesting bit patterns" and that passed tests
+   for good collision statistics on a variety of problematic datasets
+   including powersets and graph structures (such as David Eppstein's
+   graph recipes in Lib/test/test_set.py) */
 
-    */
+static Py_hash_t
+frozenset_hash(PyObject *self)
+{
     PySetObject *so = (PySetObject *)self;
-    Py_uhash_t h, hash = 1927868237UL;
+    Py_uhash_t hash = 0;
     setentry *entry;
-    Py_ssize_t pos = 0;
 
     if (so->hash != -1)
         return so->hash;
 
-    hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
-    while (set_next(so, &pos, &entry)) {
-        /* Work to increase the bit dispersion for closely spaced hash
-           values.  This is important because some use cases have many
-           combinations of a small number of elements with nearby
-           hashes so that many distinct combinations collapse to only
-           a handful of distinct hash values. */
-        h = entry->hash;
-        hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
-    }
-    /* Make the final result spread-out in a different pattern
-       than the algorithm for tuples or other python objects. */
+    /* Xor-in shuffled bits from every entry's hash field because xor is
+       commutative and a frozenset hash should be independent of order.
+
+       For speed, include null entries and dummy entries and then
+       subtract out their effect afterwards so that the final hash
+       depends only on active entries.  This allows the code to be
+       vectorized by the compiler and it saves the unpredictable
+       branches that would arise when trying to exclude null and dummy
+       entries on every iteration. */
+
+    for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+        hash ^= _shuffle_bits(entry->hash);
+
+    /* Remove the effect of an odd number of NULL entries */
+    if ((so->mask + 1 - so->fill) & 1)
+        hash ^= _shuffle_bits(0);
+
+    /* Remove the effect of an odd number of dummy entries */
+    if ((so->fill - so->used) & 1)
+        hash ^= _shuffle_bits(-1);
+
+    /* Factor in the number of active entries */
+    hash ^= ((Py_uhash_t)PySet_GET_SIZE(self) + 1) * 1927868237UL;
+
+    /* Disperse patterns arising in nested frozensets */
     hash = hash * 69069U + 907133923UL;
+
+    /* -1 is reserved as an error code */
     if (hash == (Py_uhash_t)-1)
         hash = 590923713UL;
+
     so->hash = hash;
     return hash;
 }
@@ -865,7 +942,7 @@ PyTypeObject PySetIter_Type = {
     PyObject_GenericGetAttr,                    /* tp_getattro */
     0,                                          /* tp_setattro */
     0,                                          /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,    /* tp_flags */
     0,                                          /* tp_doc */
     (traverseproc)setiter_traverse,             /* tp_traverse */
     0,                                          /* tp_clear */
@@ -910,18 +987,14 @@ set_update_internal(PySetObject *so, PyObject *other)
         * incrementally resizing as we insert new keys.  Expect
         * that there will be no (or few) overlapping keys.
         */
-        if (dictsize == -1)
+        if (dictsize < 0)
             return -1;
-        if ((so->fill + dictsize)*3 >= (so->mask+1)*2) {
-            if (set_table_resize(so, (so->used + dictsize)*2) != 0)
+        if ((so->fill + dictsize)*3 >= so->mask*2) {
+            if (set_table_resize(so, so->used + dictsize) != 0)
                 return -1;
         }
         while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
-            setentry an_entry;
-
-            an_entry.hash = hash;
-            an_entry.key = key;
-            if (set_add_entry(so, &an_entry))
+            if (set_add_entry(so, key, hash))
                 return -1;
         }
         return 0;
@@ -970,9 +1043,8 @@ PyDoc_STRVAR(update_doc,
 static PyObject *
 make_new_set(PyTypeObject *type, PyObject *iterable)
 {
-    PySetObject *so = NULL;
+    PySetObject *so;
 
-    /* create PySetObject structure */
     so = (PySetObject *)type->tp_alloc(type, 0);
     if (so == NULL)
         return NULL;
@@ -1015,7 +1087,8 @@ frozenset_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
     PyObject *iterable = NULL, *result;
 
-    if (type == &PyFrozenSet_Type && !_PyArg_NoKeywords("frozenset()", kwds))
+    if (kwds != NULL && type == &PyFrozenSet_Type
+        && !_PyArg_NoKeywords("frozenset()", kwds))
         return NULL;
 
     if (!PyArg_UnpackTuple(args, type->tp_name, 0, 1, &iterable))
@@ -1042,24 +1115,9 @@ frozenset_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
     return emptyfrozenset;
 }
 
-int
-PySet_ClearFreeList(void)
-{
-    return 0;
-}
-
-void
-PySet_Fini(void)
-{
-    Py_CLEAR(emptyfrozenset);
-}
-
 static PyObject *
 set_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
-    if (type == &PySet_Type && !_PyArg_NoKeywords("set()", kwds))
-        return NULL;
-
     return make_new_set(type, NULL);
 }
 
@@ -1201,6 +1259,8 @@ set_intersection(PySetObject *so, PyObject *other)
 {
     PySetObject *result;
     PyObject *key, *it, *tmp;
+    Py_hash_t hash;
+    int rv;
 
     if ((PyObject *)so == other)
         return set_copy(so);
@@ -1220,13 +1280,15 @@ set_intersection(PySetObject *so, PyObject *other)
         }
 
         while (set_next((PySetObject *)other, &pos, &entry)) {
-            int rv = set_contains_entry(so, entry);
-            if (rv == -1) {
+            key = entry->key;
+            hash = entry->hash;
+            rv = set_contains_entry(so, key, hash);
+            if (rv < 0) {
                 Py_DECREF(result);
                 return NULL;
             }
             if (rv) {
-                if (set_add_entry(result, entry)) {
+                if (set_add_entry(result, key, hash)) {
                     Py_DECREF(result);
                     return NULL;
                 }
@@ -1242,32 +1304,15 @@ set_intersection(PySetObject *so, PyObject *other)
     }
 
     while ((key = PyIter_Next(it)) != NULL) {
-        int rv;
-        setentry entry;
-        Py_hash_t hash = PyObject_Hash(key);
-
-        if (hash == -1) {
-            Py_DECREF(it);
-            Py_DECREF(result);
-            Py_DECREF(key);
-            return NULL;
-        }
-        entry.hash = hash;
-        entry.key = key;
-        rv = set_contains_entry(so, &entry);
-        if (rv == -1) {
-            Py_DECREF(it);
-            Py_DECREF(result);
-            Py_DECREF(key);
-            return NULL;
-        }
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            goto error;
+        rv = set_contains_entry(so, key, hash);
+        if (rv < 0)
+            goto error;
         if (rv) {
-            if (set_add_entry(result, &entry)) {
-                Py_DECREF(it);
-                Py_DECREF(result);
-                Py_DECREF(key);
-                return NULL;
-            }
+            if (set_add_entry(result, key, hash))
+                goto error;
         }
         Py_DECREF(key);
     }
@@ -1277,6 +1322,11 @@ set_intersection(PySetObject *so, PyObject *other)
         return NULL;
     }
     return (PyObject *)result;
+  error:
+    Py_DECREF(it);
+    Py_DECREF(result);
+    Py_DECREF(key);
+    return NULL;
 }
 
 static PyObject *
@@ -1363,6 +1413,7 @@ static PyObject *
 set_isdisjoint(PySetObject *so, PyObject *other)
 {
     PyObject *key, *it, *tmp;
+    int rv;
 
     if ((PyObject *)so == other) {
         if (PySet_GET_SIZE(so) == 0)
@@ -1381,8 +1432,8 @@ set_isdisjoint(PySetObject *so, PyObject *other)
             other = tmp;
         }
         while (set_next((PySetObject *)other, &pos, &entry)) {
-            int rv = set_contains_entry(so, entry);
-            if (rv == -1)
+            rv = set_contains_entry(so, entry->key, entry->hash);
+            if (rv < 0)
                 return NULL;
             if (rv)
                 Py_RETURN_FALSE;
@@ -1395,8 +1446,6 @@ set_isdisjoint(PySetObject *so, PyObject *other)
         return NULL;
 
     while ((key = PyIter_Next(it)) != NULL) {
-        int rv;
-        setentry entry;
         Py_hash_t hash = PyObject_Hash(key);
 
         if (hash == -1) {
@@ -1404,11 +1453,9 @@ set_isdisjoint(PySetObject *so, PyObject *other)
             Py_DECREF(it);
             return NULL;
         }
-        entry.hash = hash;
-        entry.key = key;
-        rv = set_contains_entry(so, &entry);
+        rv = set_contains_entry(so, key, hash);
         Py_DECREF(key);
-        if (rv == -1) {
+        if (rv < 0) {
             Py_DECREF(it);
             return NULL;
         }
@@ -1437,7 +1484,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
         Py_ssize_t pos = 0;
 
         while (set_next((PySetObject *)other, &pos, &entry))
-            if (set_discard_entry(so, entry) == -1)
+            if (set_discard_entry(so, entry->key, entry->hash) < 0)
                 return -1;
     } else {
         PyObject *key, *it;
@@ -1446,7 +1493,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
             return -1;
 
         while ((key = PyIter_Next(it)) != NULL) {
-            if (set_discard_key(so, key) == -1) {
+            if (set_discard_key(so, key) < 0) {
                 Py_DECREF(it);
                 Py_DECREF(key);
                 return -1;
@@ -1457,10 +1504,10 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
         if (PyErr_Occurred())
             return -1;
     }
-    /* If more than 1/5 are dummies, then resize them away. */
-    if ((so->fill - so->used) * 5 < so->mask)
+    /* If more than 1/4th are dummies, then resize them away. */
+    if ((size_t)(so->fill - so->used) <= (size_t)so->mask / 4)
         return 0;
-    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+    return set_table_resize(so, so->used);
 }
 
 static PyObject *
@@ -1487,7 +1534,7 @@ set_copy_and_difference(PySetObject *so, PyObject *other)
     result = set_copy(so);
     if (result == NULL)
         return NULL;
-    if (set_difference_update_internal((PySetObject *) result, other) != -1)
+    if (set_difference_update_internal((PySetObject *) result, other) == 0)
         return result;
     Py_DECREF(result);
     return NULL;
@@ -1497,8 +1544,11 @@ static PyObject *
 set_difference(PySetObject *so, PyObject *other)
 {
     PyObject *result;
+    PyObject *key;
+    Py_hash_t hash;
     setentry *entry;
     Py_ssize_t pos = 0;
+    int rv;
 
     if (!PyAnySet_Check(other)  && !PyDict_CheckExact(other)) {
         return set_copy_and_difference(so, other);
@@ -1516,17 +1566,15 @@ set_difference(PySetObject *so, PyObject *other)
 
     if (PyDict_CheckExact(other)) {
         while (set_next(so, &pos, &entry)) {
-            setentry entrycopy;
-            int rv;
-            entrycopy.hash = entry->hash;
-            entrycopy.key = entry->key;
-            rv = _PyDict_Contains(other, entry->key, entry->hash);
+            key = entry->key;
+            hash = entry->hash;
+            rv = _PyDict_Contains(other, key, hash);
             if (rv < 0) {
                 Py_DECREF(result);
                 return NULL;
             }
             if (!rv) {
-                if (set_add_entry((PySetObject *)result, &entrycopy)) {
+                if (set_add_entry((PySetObject *)result, key, hash)) {
                     Py_DECREF(result);
                     return NULL;
                 }
@@ -1537,13 +1585,15 @@ set_difference(PySetObject *so, PyObject *other)
 
     /* Iterate over so, checking for common elements in other. */
     while (set_next(so, &pos, &entry)) {
-        int rv = set_contains_entry((PySetObject *)other, entry);
-        if (rv == -1) {
+        key = entry->key;
+        hash = entry->hash;
+        rv = set_contains_entry((PySetObject *)other, key, hash);
+        if (rv < 0) {
             Py_DECREF(result);
             return NULL;
         }
         if (!rv) {
-            if (set_add_entry((PySetObject *)result, entry)) {
+            if (set_add_entry((PySetObject *)result, key, hash)) {
                 Py_DECREF(result);
                 return NULL;
             }
@@ -1605,29 +1655,24 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
     PySetObject *otherset;
     PyObject *key;
     Py_ssize_t pos = 0;
+    Py_hash_t hash;
     setentry *entry;
+    int rv;
 
     if ((PyObject *)so == other)
         return set_clear(so);
 
     if (PyDict_CheckExact(other)) {
         PyObject *value;
-        int rv;
-        Py_hash_t hash;
         while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
-            setentry an_entry;
-
             Py_INCREF(key);
-            an_entry.hash = hash;
-            an_entry.key = key;
-
-            rv = set_discard_entry(so, &an_entry);
-            if (rv == -1) {
+            rv = set_discard_entry(so, key, hash);
+            if (rv < 0) {
                 Py_DECREF(key);
                 return NULL;
             }
             if (rv == DISCARD_NOTFOUND) {
-                if (set_add_entry(so, &an_entry)) {
+                if (set_add_entry(so, key, hash)) {
                     Py_DECREF(key);
                     return NULL;
                 }
@@ -1647,13 +1692,15 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
     }
 
     while (set_next(otherset, &pos, &entry)) {
-        int rv = set_discard_entry(so, entry);
-        if (rv == -1) {
+        key = entry->key;
+        hash = entry->hash;
+        rv = set_discard_entry(so, key, hash);
+        if (rv < 0) {
             Py_DECREF(otherset);
             return NULL;
         }
         if (rv == DISCARD_NOTFOUND) {
-            if (set_add_entry(so, entry)) {
+            if (set_add_entry(so, key, hash)) {
                 Py_DECREF(otherset);
                 return NULL;
             }
@@ -1715,6 +1762,7 @@ set_issubset(PySetObject *so, PyObject *other)
 {
     setentry *entry;
     Py_ssize_t pos = 0;
+    int rv;
 
     if (!PyAnySet_Check(other)) {
         PyObject *tmp, *result;
@@ -1729,8 +1777,8 @@ set_issubset(PySetObject *so, PyObject *other)
         Py_RETURN_FALSE;
 
     while (set_next(so, &pos, &entry)) {
-        int rv = set_contains_entry((PySetObject *)other, entry);
-        if (rv == -1)
+        rv = set_contains_entry((PySetObject *)other, entry->key, entry->hash);
+        if (rv < 0)
             return NULL;
         if (!rv)
             Py_RETURN_FALSE;
@@ -1821,7 +1869,7 @@ set_contains(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_contains_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return -1;
         PyErr_Clear();
@@ -1840,7 +1888,7 @@ set_direct_contains(PySetObject *so, PyObject *key)
     long result;
 
     result = set_contains(so, key);
-    if (result == -1)
+    if (result < 0)
         return NULL;
     return PyBool_FromLong(result);
 }
@@ -1854,7 +1902,7 @@ set_remove(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_discard_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return NULL;
         PyErr_Clear();
@@ -1863,7 +1911,7 @@ set_remove(PySetObject *so, PyObject *key)
             return NULL;
         rv = set_discard_key(so, tmpkey);
         Py_DECREF(tmpkey);
-        if (rv == -1)
+        if (rv < 0)
             return NULL;
     }
 
@@ -1886,7 +1934,7 @@ set_discard(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_discard_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return NULL;
         PyErr_Clear();
@@ -1895,7 +1943,7 @@ set_discard(PySetObject *so, PyObject *key)
             return NULL;
         rv = set_discard_key(so, tmpkey);
         Py_DECREF(tmpkey);
-        if (rv == -1)
+        if (rv < 0)
             return NULL;
     }
     Py_RETURN_NONE;
@@ -1949,13 +1997,12 @@ set_init(PySetObject *self, PyObject *args, PyObject *kwds)
 {
     PyObject *iterable = NULL;
 
-    if (!PyAnySet_Check(self))
-        return -1;
-    if (PySet_Check(self) && !_PyArg_NoKeywords("set()", kwds))
+    if (kwds != NULL && !_PyArg_NoKeywords("set()", kwds))
         return -1;
     if (!PyArg_UnpackTuple(args, Py_TYPE(self)->tp_name, 0, 1, &iterable))
         return -1;
-    set_clear_internal(self);
+    if (self->fill)
+        set_clear_internal(self);
     self->hash = -1;
     if (iterable == NULL)
         return 0;
@@ -2122,7 +2169,7 @@ static PyMethodDef frozenset_methods[] = {
      copy_doc},
     {"difference",      (PyCFunction)set_difference_multi,      METH_VARARGS,
      difference_doc},
-    {"intersection",(PyCFunction)set_intersection_multi,        METH_VARARGS,
+    {"intersection",    (PyCFunction)set_intersection_multi,    METH_VARARGS,
      intersection_doc},
     {"isdisjoint",      (PyCFunction)set_isdisjoint,    METH_O,
      isdisjoint_doc},
@@ -2193,7 +2240,7 @@ PyTypeObject PyFrozenSet_Type = {
     (traverseproc)set_traverse,         /* tp_traverse */
     (inquiry)set_clear_internal,        /* tp_clear */
     (richcmpfunc)set_richcompare,       /* tp_richcompare */
-    offsetof(PySetObject, weakreflist),         /* tp_weaklistoffset */
+    offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
     (getiterfunc)set_iter,              /* tp_iter */
     0,                                  /* tp_iternext */
     frozenset_methods,                  /* tp_methods */
@@ -2277,6 +2324,18 @@ PySet_Add(PyObject *anyset, PyObject *key)
 }
 
 int
+PySet_ClearFreeList(void)
+{
+    return 0;
+}
+
+void
+PySet_Fini(void)
+{
+    Py_CLEAR(emptyfrozenset);
+}
+
+int
 _PySet_NextEntry(PyObject *set, Py_ssize_t *pos, PyObject **key, Py_hash_t *hash)
 {
     setentry *entry;
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 0fc6b58..a9d0a34 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -1,6 +1,8 @@
 /* stringlib: codec implementations */
 
-#if STRINGLIB_IS_UNICODE
+#if !STRINGLIB_IS_UNICODE
+# error "codecs.h is specific to Unicode"
+#endif
 
 /* Mask to quickly check whether a C 'long' contains a
    non-ASCII, UTF8-encoded char. */
@@ -263,50 +265,34 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
 
     Py_ssize_t i;                /* index into s of next input byte */
-    PyObject *result;            /* result string object */
     char *p;                     /* next free byte in output buffer */
-    Py_ssize_t nallocated;      /* number of result bytes allocated */
-    Py_ssize_t nneeded;            /* number of result bytes needed */
 #if STRINGLIB_SIZEOF_CHAR > 1
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
     PyObject *rep = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 #endif
 #if STRINGLIB_SIZEOF_CHAR == 1
     const Py_ssize_t max_char_size = 2;
-    char stackbuf[MAX_SHORT_UNICHARS * 2];
 #elif STRINGLIB_SIZEOF_CHAR == 2
     const Py_ssize_t max_char_size = 3;
-    char stackbuf[MAX_SHORT_UNICHARS * 3];
 #else /*  STRINGLIB_SIZEOF_CHAR == 4 */
     const Py_ssize_t max_char_size = 4;
-    char stackbuf[MAX_SHORT_UNICHARS * 4];
 #endif
+    _PyBytesWriter writer;
 
     assert(size >= 0);
+    _PyBytesWriter_Init(&writer);
 
-    if (size <= MAX_SHORT_UNICHARS) {
-        /* Write into the stack buffer; nallocated can't overflow.
-         * At the end, we'll allocate exactly as much heap space as it
-         * turns out we need.
-         */
-        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
-        result = NULL;   /* will allocate after we're done */
-        p = stackbuf;
-    }
-    else {
-        if (size > PY_SSIZE_T_MAX / max_char_size) {
-            /* integer overflow */
-            return PyErr_NoMemory();
-        }
-        /* Overallocate on the heap, and give the excess back at the end. */
-        nallocated = size * max_char_size;
-        result = PyBytes_FromStringAndSize(NULL, nallocated);
-        if (result == NULL)
-            return NULL;
-        p = PyBytes_AS_STRING(result);
+    if (size > PY_SSIZE_T_MAX / max_char_size) {
+        /* integer overflow */
+        return PyErr_NoMemory();
     }
 
+    p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
+    if (p == NULL)
+        return NULL;
+
     for (i = 0; i < size;) {
         Py_UCS4 ch = data[i++];
 
@@ -326,72 +312,119 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
         }
 #if STRINGLIB_SIZEOF_CHAR > 1
         else if (Py_UNICODE_IS_SURROGATE(ch)) {
-            Py_ssize_t newpos;
-            Py_ssize_t repsize, k, startpos;
+            Py_ssize_t startpos, endpos, newpos;
+            Py_ssize_t k;
+            if (error_handler == _Py_ERROR_UNKNOWN) {
+                error_handler = get_error_handler(errors);
+            }
+
             startpos = i-1;
-            rep = unicode_encode_call_errorhandler(
-                  errors, &errorHandler, "utf-8", "surrogates not allowed",
-                  unicode, &exc, startpos, startpos+1, &newpos);
-            if (!rep)
-                goto error;
-
-            if (PyBytes_Check(rep))
-                repsize = PyBytes_GET_SIZE(rep);
-            else
-                repsize = PyUnicode_GET_LENGTH(rep);
+            endpos = startpos+1;
+
+            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
+                endpos++;
+
+            /* Only overallocate the buffer if it's not the last write */
+            writer.overallocate = (endpos < size);
+
+            switch (error_handler)
+            {
+            case _Py_ERROR_REPLACE:
+                memset(p, '?', endpos - startpos);
+                p += (endpos - startpos);
+                /* fall through to the ignore handler */
+            case _Py_ERROR_IGNORE:
+                i += (endpos - startpos - 1);
+                break;
 
-            if (repsize > max_char_size) {
-                Py_ssize_t offset;
+            case _Py_ERROR_SURROGATEPASS:
+                for (k=startpos; k<endpos; k++) {
+                    ch = data[k];
+                    *p++ = (char)(0xe0 | (ch >> 12));
+                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+                    *p++ = (char)(0x80 | (ch & 0x3f));
+                }
+                i += (endpos - startpos - 1);
+                break;
 
-                if (result == NULL)
-                    offset = p - stackbuf;
-                else
-                    offset = p - PyBytes_AS_STRING(result);
+            case _Py_ERROR_BACKSLASHREPLACE:
+                /* subtract preallocated bytes */
+                writer.min_size -= max_char_size * (endpos - startpos);
+                p = backslashreplace(&writer, p,
+                                     unicode, startpos, endpos);
+                if (p == NULL)
+                    goto error;
+                i += (endpos - startpos - 1);
+                break;
 
-                if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
-                    /* integer overflow */
-                    PyErr_NoMemory();
+            case _Py_ERROR_XMLCHARREFREPLACE:
+                /* subtract preallocated bytes */
+                writer.min_size -= max_char_size * (endpos - startpos);
+                p = xmlcharrefreplace(&writer, p,
+                                      unicode, startpos, endpos);
+                if (p == NULL)
                     goto error;
+                i += (endpos - startpos - 1);
+                break;
+
+            case _Py_ERROR_SURROGATEESCAPE:
+                for (k=startpos; k<endpos; k++) {
+                    ch = data[k];
+                    if (!(0xDC80 <= ch && ch <= 0xDCFF))
+                        break;
+                    *p++ = (char)(ch & 0xff);
                 }
-                nallocated += repsize - max_char_size;
-                if (result != NULL) {
-                    if (_PyBytes_Resize(&result, nallocated) < 0)
-                        goto error;
-                } else {
-                    result = PyBytes_FromStringAndSize(NULL, nallocated);
-                    if (result == NULL)
-                        goto error;
-                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
+                if (k >= endpos) {
+                    i += (endpos - startpos - 1);
+                    break;
                 }
-                p = PyBytes_AS_STRING(result) + offset;
-            }
+                startpos = k;
+                assert(startpos < endpos);
+                /* fall through to the default handler */
+            default:
+                rep = unicode_encode_call_errorhandler(
+                      errors, &error_handler_obj, "utf-8", "surrogates not allowed",
+                      unicode, &exc, startpos, endpos, &newpos);
+                if (!rep)
+                    goto error;
 
-            if (PyBytes_Check(rep)) {
-                char *prep = PyBytes_AS_STRING(rep);
-                for(k = repsize; k > 0; k--)
-                    *p++ = *prep++;
-            } else /* rep is unicode */ {
-                enum PyUnicode_Kind repkind;
-                void *repdata;
+                /* subtract preallocated bytes */
+                writer.min_size -= max_char_size;
 
-                if (PyUnicode_READY(rep) < 0)
-                    goto error;
-                repkind = PyUnicode_KIND(rep);
-                repdata = PyUnicode_DATA(rep);
+                if (PyBytes_Check(rep)) {
+                    p = _PyBytesWriter_WriteBytes(&writer, p,
+                                                  PyBytes_AS_STRING(rep),
+                                                  PyBytes_GET_SIZE(rep));
+                }
+                else {
+                    /* rep is unicode */
+                    if (PyUnicode_READY(rep) < 0)
+                        goto error;
 
-                for(k=0; k<repsize; k++) {
-                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
-                    if (0x80 <= c) {
+                    if (!PyUnicode_IS_ASCII(rep)) {
                         raise_encode_exception(&exc, "utf-8",
                                                unicode,
                                                i-1, i,
                                                "surrogates not allowed");
                         goto error;
                     }
-                    *p++ = (char)c;
+
+                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+                    p = _PyBytesWriter_WriteBytes(&writer, p,
+                                                  PyUnicode_DATA(rep),
+                                                  PyUnicode_GET_LENGTH(rep));
                 }
+
+                if (p == NULL)
+                    goto error;
+                Py_CLEAR(rep);
+
+                i = newpos;
             }
-            Py_CLEAR(rep);
+
+            /* If overallocation was disabled, ensure that it was the last
+               write. Otherwise, we missed an optimization */
+            assert(writer.overallocate || i == size);
         }
         else
 #if STRINGLIB_SIZEOF_CHAR > 2
@@ -416,31 +449,18 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
     }
 
-    if (result == NULL) {
-        /* This was stack allocated. */
-        nneeded = p - stackbuf;
-        assert(nneeded <= nallocated);
-        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
-    }
-    else {
-        /* Cut back to size actually needed. */
-        nneeded = p - PyBytes_AS_STRING(result);
-        assert(nneeded <= nallocated);
-        _PyBytes_Resize(&result, nneeded);
-    }
-
 #if STRINGLIB_SIZEOF_CHAR > 1
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
 #endif
-    return result;
+    return _PyBytesWriter_Finish(&writer, p);
 
 #if STRINGLIB_SIZEOF_CHAR > 1
  error:
     Py_XDECREF(rep);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
-    Py_XDECREF(result);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 #endif
 
@@ -806,5 +826,3 @@ STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
 #undef SWAB4
 
 #endif
-
-#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h
index 739cf3d..f054625 100644
--- a/Objects/stringlib/ctype.h
+++ b/Objects/stringlib/ctype.h
@@ -1,5 +1,6 @@
-/* NOTE: this API is -ONLY- for use with single byte character strings. */
-/* Do not use it with Unicode. */
+#if STRINGLIB_IS_UNICODE
+# error "ctype.h only compatible with byte-wise strings"
+#endif
 
 #include "bytes_methods.h"
 
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index cda68e7..98165ad 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -32,52 +32,98 @@
 #define STRINGLIB_BLOOM(mask, ch)     \
     ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
 
-
 Py_LOCAL_INLINE(Py_ssize_t)
-STRINGLIB(fastsearch_memchr_1char)(const STRINGLIB_CHAR* s, Py_ssize_t n,
-                                   STRINGLIB_CHAR ch, unsigned char needle,
-                                   int mode)
+STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
 {
-    if (mode == FAST_SEARCH) {
-        const STRINGLIB_CHAR *ptr = s;
-        const STRINGLIB_CHAR *e = s + n;
-        while (ptr < e) {
-            void *candidate = memchr((const void *) ptr, needle, (e - ptr) * sizeof(STRINGLIB_CHAR));
-            if (candidate == NULL)
-                return -1;
-            ptr = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
-            if (sizeof(STRINGLIB_CHAR) == 1 || *ptr == ch)
-                return (ptr - s);
-            /* False positive */
-            ptr++;
-        }
+    const STRINGLIB_CHAR *p, *e;
+
+    p = s;
+    e = s + n;
+    if (n > 10) {
+#if STRINGLIB_SIZEOF_CHAR == 1
+        p = memchr(s, ch, n);
+        if (p != NULL)
+            return (p - s);
         return -1;
+#else
+        /* use memchr if we can choose a needle without too many likely
+           false positives */
+        unsigned char needle = ch & 0xff;
+        /* If looking for a multiple of 256, we'd have too
+           many false positives looking for the '\0' byte in UCS2
+           and UCS4 representations. */
+        if (needle != 0) {
+            while (p < e) {
+                void *candidate = memchr(p, needle,
+                                         (e - p) * sizeof(STRINGLIB_CHAR));
+                if (candidate == NULL)
+                    return -1;
+                p = (const STRINGLIB_CHAR *)
+                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
+                if (*p == ch)
+                    return (p - s);
+                /* False positive */
+                p++;
+            }
+            return -1;
+        }
+#endif
     }
+    while (p < e) {
+        if (*p == ch)
+            return (p - s);
+        p++;
+    }
+    return -1;
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
+{
+    const STRINGLIB_CHAR *p;
 #ifdef HAVE_MEMRCHR
     /* memrchr() is a GNU extension, available since glibc 2.1.91.
        it doesn't seem as optimized as memchr(), but is still quite
-       faster than our hand-written loop in FASTSEARCH below */
-    else if (mode == FAST_RSEARCH) {
-        while (n > 0) {
-            const STRINGLIB_CHAR *found;
-            void *candidate = memrchr((const void *) s, needle, n * sizeof(STRINGLIB_CHAR));
-            if (candidate == NULL)
-                return -1;
-            found = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
-            n = found - s;
-            if (sizeof(STRINGLIB_CHAR) == 1 || *found == ch)
-                return n;
-            /* False positive */
-        }
+       faster than our hand-written loop below */
+
+    if (n > 10) {
+#if STRINGLIB_SIZEOF_CHAR == 1
+        p = memrchr(s, ch, n);
+        if (p != NULL)
+            return (p - s);
         return -1;
-    }
+#else
+        /* use memrchr if we can choose a needle without too many likely
+           false positives */
+        unsigned char needle = ch & 0xff;
+        /* If looking for a multiple of 256, we'd have too
+           many false positives looking for the '\0' byte in UCS2
+           and UCS4 representations. */
+        if (needle != 0) {
+            while (n > 0) {
+                void *candidate = memrchr(s, needle,
+                                          n * sizeof(STRINGLIB_CHAR));
+                if (candidate == NULL)
+                    return -1;
+                p = (const STRINGLIB_CHAR *)
+                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
+                n = p - s;
+                if (*p == ch)
+                    return n;
+                /* False positive */
+            }
+            return -1;
+        }
 #endif
-    else {
-        assert(0); /* Should never get here */
-        return 0;
     }
-
-#undef DO_MEMCHR
+#endif  /* HAVE_MEMRCHR */
+    p = s + n;
+    while (p > s) {
+        p--;
+        if (*p == ch)
+            return (p - s);
+    }
+    return -1;
 }
 
 Py_LOCAL_INLINE(Py_ssize_t)
@@ -99,25 +145,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
         if (m <= 0)
             return -1;
         /* use special case for 1-character strings */
-        if (n > 10 && (mode == FAST_SEARCH
-#ifdef HAVE_MEMRCHR
-                    || mode == FAST_RSEARCH
-#endif
-                    )) {
-            /* use memchr if we can choose a needle without two many likely
-               false positives */
-            unsigned char needle;
-            needle = p[0] & 0xff;
-#if STRINGLIB_SIZEOF_CHAR > 1
-            /* If looking for a multiple of 256, we'd have too
-               many false positives looking for the '\0' byte in UCS2
-               and UCS4 representations. */
-            if (needle != 0)
-#endif
-                return STRINGLIB(fastsearch_memchr_1char)
-                       (s, n, p[0], needle, mode);
-        }
-        if (mode == FAST_COUNT) {
+        if (mode == FAST_SEARCH)
+            return STRINGLIB(find_char)(s, n, p[0]);
+        else if (mode == FAST_RSEARCH)
+            return STRINGLIB(rfind_char)(s, n, p[0]);
+        else {  /* FAST_COUNT */
             for (i = 0; i < n; i++)
                 if (s[i] == p[0]) {
                     count++;
@@ -125,14 +157,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
                         return maxcount;
                 }
             return count;
-        } else if (mode == FAST_SEARCH) {
-            for (i = 0; i < n; i++)
-                if (s[i] == p[0])
-                    return i;
-        } else {    /* FAST_RSEARCH */
-            for (i = n - 1; i > -1; i--)
-                if (s[i] == p[0])
-                    return i;
         }
         return -1;
     }
diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h
index 14815f6..509b929 100644
--- a/Objects/stringlib/find.h
+++ b/Objects/stringlib/find.h
@@ -117,85 +117,3 @@ STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args,
 }
 
 #undef FORMAT_BUFFER_SIZE
-
-#if STRINGLIB_IS_UNICODE
-
-/*
-Wraps stringlib_parse_args_finds() and additionally ensures that the
-first argument is a unicode object.
-
-Note that we receive a pointer to the pointer of the substring object,
-so when we create that object in this function we don't DECREF it,
-because it continues living in the caller functions (those functions,
-after finishing using the substring, must DECREF it).
-*/
-
-Py_LOCAL_INLINE(int)
-STRINGLIB(parse_args_finds_unicode)(const char * function_name, PyObject *args,
-                                   PyObject **substring,
-                                   Py_ssize_t *start, Py_ssize_t *end)
-{
-    PyObject *tmp_substring;
-
-    if(STRINGLIB(parse_args_finds)(function_name, args, &tmp_substring,
-                                  start, end)) {
-        tmp_substring = PyUnicode_FromObject(tmp_substring);
-        if (!tmp_substring)
-            return 0;
-        *substring = tmp_substring;
-        return 1;
-    }
-    return 0;
-}
-
-#else /* !STRINGLIB_IS_UNICODE */
-
-/*
-Wraps stringlib_parse_args_finds() and additionally checks whether the
-first argument is an integer in range(0, 256).
-
-If this is the case, writes the integer value to the byte parameter
-and sets subobj to NULL. Otherwise, sets the first argument to subobj
-and doesn't touch byte. The other parameters are similar to those of
-stringlib_parse_args_finds().
-*/
-
-Py_LOCAL_INLINE(int)
-STRINGLIB(parse_args_finds_byte)(const char *function_name, PyObject *args,
-                                 PyObject **subobj, char *byte,
-                                 Py_ssize_t *start, Py_ssize_t *end)
-{
-    PyObject *tmp_subobj;
-    Py_ssize_t ival;
-    PyObject *err;
-
-    if(!STRINGLIB(parse_args_finds)(function_name, args, &tmp_subobj,
-                                    start, end))
-        return 0;
-
-    if (!PyNumber_Check(tmp_subobj)) {
-        *subobj = tmp_subobj;
-        return 1;
-    }
-
-    ival = PyNumber_AsSsize_t(tmp_subobj, PyExc_OverflowError);
-    if (ival == -1) {
-        err = PyErr_Occurred();
-        if (err && !PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) {
-            PyErr_Clear();
-            *subobj = tmp_subobj;
-            return 1;
-        }
-    }
-
-    if (ival < 0 || ival > 255) {
-        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)");
-        return 0;
-    }
-
-    *subobj = NULL;
-    *byte = (char)ival;
-    return 1;
-}
-
-#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/stringlib/find_max_char.h b/Objects/stringlib/find_max_char.h
index eb3fe88..8ccbc30 100644
--- a/Objects/stringlib/find_max_char.h
+++ b/Objects/stringlib/find_max_char.h
@@ -1,6 +1,8 @@
 /* Finding the optimal width of unicode characters in a buffer */
 
-#if STRINGLIB_IS_UNICODE
+#if !STRINGLIB_IS_UNICODE
+# error "find_max_char.h is specific to Unicode"
+#endif
 
 /* Mask to quickly check whether a C 'long' contains a
    non-ASCII, UTF8-encoded char. */
@@ -129,5 +131,4 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
 #undef MAX_CHAR_UCS4
 
 #endif /* STRINGLIB_SIZEOF_CHAR == 1 */
-#endif /* STRINGLIB_IS_UNICODE */
 
diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h
index cbf81be..90f966d 100644
--- a/Objects/stringlib/join.h
+++ b/Objects/stringlib/join.h
@@ -1,6 +1,6 @@
 /* stringlib: bytes joining implementation */
 
-#if STRINGLIB_SIZEOF_CHAR != 1
+#if STRINGLIB_IS_UNICODE
 #error join.h only compatible with byte-wise strings
 #endif
 
diff --git a/Objects/stringlib/localeutil.h b/Objects/stringlib/localeutil.h
index 6e2f073..df501ed 100644
--- a/Objects/stringlib/localeutil.h
+++ b/Objects/stringlib/localeutil.h
@@ -2,8 +2,8 @@
 
 #include <locale.h>
 
-#ifndef STRINGLIB_IS_UNICODE
-#   error "localeutil is specific to Unicode"
+#if !STRINGLIB_IS_UNICODE
+#   error "localeutil.h is specific to Unicode"
 #endif
 
 typedef struct {
diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h
index b559b53..625507d 100644
--- a/Objects/stringlib/transmogrify.h
+++ b/Objects/stringlib/transmogrify.h
@@ -1,14 +1,21 @@
-/* NOTE: this API is -ONLY- for use with single byte character strings. */
-/* Do not use it with Unicode. */
+#if STRINGLIB_IS_UNICODE
+# error "transmogrify.h only compatible with byte-wise strings"
+#endif
 
 /* the more complicated methods.  parts of these should be pulled out into the
    shared code in bytes_methods.c to cut down on duplicate code bloat.  */
 
-PyDoc_STRVAR(expandtabs__doc__,
-"B.expandtabs(tabsize=8) -> copy of B\n\
-\n\
-Return a copy of B where all tab characters are expanded using spaces.\n\
-If tabsize is not given, a tab size of 8 characters is assumed.");
+Py_LOCAL_INLINE(PyObject *)
+return_self(PyObject *self)
+{
+#if !STRINGLIB_MUTABLE
+    if (STRINGLIB_CHECK_EXACT(self)) {
+        Py_INCREF(self);
+        return self;
+    }
+#endif
+    return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
+}
 
 static PyObject*
 stringlib_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
@@ -93,39 +100,25 @@ pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
     if (right < 0)
         right = 0;
 
-    if (left == 0 && right == 0 && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject *)self;
-#endif /* STRINGLIB_MUTABLE */
+    if (left == 0 && right == 0) {
+        return return_self(self);
     }
 
-    u = STRINGLIB_NEW(NULL,
-				   left + STRINGLIB_LEN(self) + right);
+    u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right);
     if (u) {
         if (left)
             memset(STRINGLIB_STR(u), fill, left);
         Py_MEMCPY(STRINGLIB_STR(u) + left,
-	       STRINGLIB_STR(self),
-	       STRINGLIB_LEN(self));
+               STRINGLIB_STR(self),
+               STRINGLIB_LEN(self));
         if (right)
             memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self),
-		   fill, right);
+                   fill, right);
     }
 
     return u;
 }
 
-PyDoc_STRVAR(ljust__doc__,
-"B.ljust(width[, fillchar]) -> copy of B\n"
-"\n"
-"Return B left justified in a string of length width. Padding is\n"
-"done using the specified fill character (default is a space).");
-
 static PyObject *
 stringlib_ljust(PyObject *self, PyObject *args)
 {
@@ -135,27 +128,14 @@ stringlib_ljust(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))
         return NULL;
 
-    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject*) self;
-#endif
+    if (STRINGLIB_LEN(self) >= width) {
+        return return_self(self);
     }
 
     return pad(self, 0, width - STRINGLIB_LEN(self), fillchar);
 }
 
 
-PyDoc_STRVAR(rjust__doc__,
-"B.rjust(width[, fillchar]) -> copy of B\n"
-"\n"
-"Return B right justified in a string of length width. Padding is\n"
-"done using the specified fill character (default is a space)");
-
 static PyObject *
 stringlib_rjust(PyObject *self, PyObject *args)
 {
@@ -165,27 +145,14 @@ stringlib_rjust(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))
         return NULL;
 
-    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject*) self;
-#endif
+    if (STRINGLIB_LEN(self) >= width) {
+        return return_self(self);
     }
 
     return pad(self, width - STRINGLIB_LEN(self), 0, fillchar);
 }
 
 
-PyDoc_STRVAR(center__doc__,
-"B.center(width[, fillchar]) -> copy of B\n"
-"\n"
-"Return B centered in a string of length width.  Padding is\n"
-"done using the specified fill character (default is a space).");
-
 static PyObject *
 stringlib_center(PyObject *self, PyObject *args)
 {
@@ -196,15 +163,8 @@ stringlib_center(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))
         return NULL;
 
-    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-        /* We're defined as returning a copy;  If the object is mutable
-         * that means we must make an identical copy. */
-        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-        Py_INCREF(self);
-        return (PyObject*) self;
-#endif
+    if (STRINGLIB_LEN(self) >= width) {
+        return return_self(self);
     }
 
     marg = width - STRINGLIB_LEN(self);
@@ -213,12 +173,6 @@ stringlib_center(PyObject *self, PyObject *args)
     return pad(self, left, marg - left, fillchar);
 }
 
-PyDoc_STRVAR(zfill__doc__,
-"B.zfill(width) -> copy of B\n"
-"\n"
-"Pad a numeric string B with zeros on the left, to fill a field\n"
-"of the specified width.  B is never truncated.");
-
 static PyObject *
 stringlib_zfill(PyObject *self, PyObject *args)
 {
@@ -231,21 +185,7 @@ stringlib_zfill(PyObject *self, PyObject *args)
         return NULL;
 
     if (STRINGLIB_LEN(self) >= width) {
-        if (STRINGLIB_CHECK_EXACT(self)) {
-#if STRINGLIB_MUTABLE
-            /* We're defined as returning a copy;  If the object is mutable
-             * that means we must make an identical copy. */
-            return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
-#else
-            Py_INCREF(self);
-            return (PyObject*) self;
-#endif
-        }
-        else
-            return STRINGLIB_NEW(
-                STRINGLIB_STR(self),
-                STRINGLIB_LEN(self)
-            );
+        return return_self(self);
     }
 
     fill = width - STRINGLIB_LEN(self);
@@ -262,5 +202,500 @@ stringlib_zfill(PyObject *self, PyObject *args)
         p[fill] = '0';
     }
 
-    return (PyObject*) s;
+    return s;
+}
+
+
+/* find and count characters and substrings */
+
+#define findchar(target, target_len, c)                         \
+  ((char *)memchr((const void *)(target), c, target_len))
+
+
+Py_LOCAL_INLINE(Py_ssize_t)
+countchar(const char *target, Py_ssize_t target_len, char c,
+          Py_ssize_t maxcount)
+{
+    Py_ssize_t count = 0;
+    const char *start = target;
+    const char *end = target + target_len;
+
+    while ((start = findchar(start, end - start, c)) != NULL) {
+        count++;
+        if (count >= maxcount)
+            break;
+        start += 1;
+    }
+    return count;
+}
+
+
+/* Algorithms for different cases of string replacement */
+
+/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_interleave(PyObject *self,
+                             const char *to_s, Py_ssize_t to_len,
+                             Py_ssize_t maxcount)
+{
+    const char *self_s;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count, i;
+    PyObject *result;
+
+    self_len = STRINGLIB_LEN(self);
+
+    /* 1 at the end plus 1 after every character;
+       count = min(maxcount, self_len + 1) */
+    if (maxcount <= self_len) {
+        count = maxcount;
+    }
+    else {
+        /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
+        count = self_len + 1;
+    }
+
+    /* Check for overflow */
+    /*   result_len = count * to_len + self_len; */
+    assert(count > 0);
+    if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "replace bytes are too long");
+        return NULL;
+    }
+    result_len = count * to_len + self_len;
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+
+    self_s = STRINGLIB_STR(self);
+    result_s = STRINGLIB_STR(result);
+
+    if (to_len > 1) {
+        /* Lay the first one down (guaranteed this will occur) */
+        Py_MEMCPY(result_s, to_s, to_len);
+        result_s += to_len;
+        count -= 1;
+
+        for (i = 0; i < count; i++) {
+            *result_s++ = *self_s++;
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+        }
+    }
+    else {
+        result_s[0] = to_s[0];
+        result_s += to_len;
+        count -= 1;
+        for (i = 0; i < count; i++) {
+            *result_s++ = *self_s++;
+            result_s[0] = to_s[0];
+            result_s += to_len;
+        }
+    }
+
+    /* Copy the rest of the original string */
+    Py_MEMCPY(result_s, self_s, self_len - i);
+
+    return result;
 }
+
+/* Special case for deleting a single character */
+/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_delete_single_character(PyObject *self,
+                                          char from_c, Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count;
+    PyObject *result;
+
+    self_len = STRINGLIB_LEN(self);
+    self_s = STRINGLIB_STR(self);
+
+    count = countchar(self_s, self_len, from_c, maxcount);
+    if (count == 0) {
+        return return_self(self);
+    }
+
+    result_len = self_len - count;  /* from_len == 1 */
+    assert(result_len>=0);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        next = findchar(start, end - start, from_c);
+        if (next == NULL)
+            break;
+        Py_MEMCPY(result_s, start, next - start);
+        result_s += (next - start);
+        start = next + 1;
+    }
+    Py_MEMCPY(result_s, start, end - start);
+
+    return result;
+}
+
+/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
+
+Py_LOCAL(PyObject *)
+stringlib_replace_delete_substring(PyObject *self,
+                                   const char *from_s, Py_ssize_t from_len,
+                                   Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count, offset;
+    PyObject *result;
+
+    self_len = STRINGLIB_LEN(self);
+    self_s = STRINGLIB_STR(self);
+
+    count = stringlib_count(self_s, self_len,
+                            from_s, from_len,
+                            maxcount);
+
+    if (count == 0) {
+        /* no matches */
+        return return_self(self);
+    }
+
+    result_len = self_len - (count * from_len);
+    assert (result_len>=0);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        offset = stringlib_find(start, end - start,
+                                from_s, from_len,
+                                0);
+        if (offset == -1)
+            break;
+        next = start + offset;
+
+        Py_MEMCPY(result_s, start, next - start);
+
+        result_s += (next - start);
+        start = next + from_len;
+    }
+    Py_MEMCPY(result_s, start, end - start);
+    return result;
+}
+
+/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_single_character_in_place(PyObject *self,
+                                            char from_c, char to_c,
+                                            Py_ssize_t maxcount)
+{
+    const char *self_s, *end;
+    char *result_s, *start, *next;
+    Py_ssize_t self_len;
+    PyObject *result;
+
+    /* The result string will be the same size */
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    next = findchar(self_s, self_len, from_c);
+
+    if (next == NULL) {
+        /* No matches; return the original bytes */
+        return return_self(self);
+    }
+
+    /* Need to make a new bytes */
+    result = STRINGLIB_NEW(NULL, self_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+    Py_MEMCPY(result_s, self_s, self_len);
+
+    /* change everything in-place, starting with this one */
+    start =  result_s + (next - self_s);
+    *start = to_c;
+    start++;
+    end = result_s + self_len;
+
+    while (--maxcount > 0) {
+        next = findchar(start, end - start, from_c);
+        if (next == NULL)
+            break;
+        *next = to_c;
+        start = next + 1;
+    }
+
+    return result;
+}
+
+/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_substring_in_place(PyObject *self,
+                                     const char *from_s, Py_ssize_t from_len,
+                                     const char *to_s, Py_ssize_t to_len,
+                                     Py_ssize_t maxcount)
+{
+    const char *self_s, *end;
+    char *result_s, *start;
+    Py_ssize_t self_len, offset;
+    PyObject *result;
+
+    /* The result bytes will be the same size */
+
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    offset = stringlib_find(self_s, self_len,
+                            from_s, from_len,
+                            0);
+    if (offset == -1) {
+        /* No matches; return the original bytes */
+        return return_self(self);
+    }
+
+    /* Need to make a new bytes */
+    result = STRINGLIB_NEW(NULL, self_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+    Py_MEMCPY(result_s, self_s, self_len);
+
+    /* change everything in-place, starting with this one */
+    start =  result_s + offset;
+    Py_MEMCPY(start, to_s, from_len);
+    start += from_len;
+    end = result_s + self_len;
+
+    while ( --maxcount > 0) {
+        offset = stringlib_find(start, end - start,
+                                from_s, from_len,
+                                0);
+        if (offset == -1)
+            break;
+        Py_MEMCPY(start + offset, to_s, from_len);
+        start += offset + from_len;
+    }
+
+    return result;
+}
+
+/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_single_character(PyObject *self,
+                                   char from_c,
+                                   const char *to_s, Py_ssize_t to_len,
+                                   Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count;
+    PyObject *result;
+
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    count = countchar(self_s, self_len, from_c, maxcount);
+    if (count == 0) {
+        /* no matches, return unchanged */
+        return return_self(self);
+    }
+
+    /* use the difference between current and new, hence the "-1" */
+    /*   result_len = self_len + count * (to_len-1)  */
+    assert(count > 0);
+    if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
+        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
+        return NULL;
+    }
+    result_len = self_len + count * (to_len - 1);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        next = findchar(start, end - start, from_c);
+        if (next == NULL)
+            break;
+
+        if (next == start) {
+            /* replace with the 'to' */
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start += 1;
+        } else {
+            /* copy the unchanged old then the 'to' */
+            Py_MEMCPY(result_s, start, next - start);
+            result_s += (next - start);
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start = next + 1;
+        }
+    }
+    /* Copy the rest of the original bytes after the last replacement */
+    Py_MEMCPY(result_s, start, end - start);
+
+    return result;
+}
+
+/* len(self)>=1, len(from)>=2, len(to)>=1, len(from)!=len(to), maxcount>=1 */
+Py_LOCAL(PyObject *)
+stringlib_replace_substring(PyObject *self,
+                            const char *from_s, Py_ssize_t from_len,
+                            const char *to_s, Py_ssize_t to_len,
+                            Py_ssize_t maxcount)
+{
+    const char *self_s, *start, *next, *end;
+    char *result_s;
+    Py_ssize_t self_len, result_len;
+    Py_ssize_t count, offset;
+    PyObject *result;
+
+    self_s = STRINGLIB_STR(self);
+    self_len = STRINGLIB_LEN(self);
+
+    count = stringlib_count(self_s, self_len,
+                            from_s, from_len,
+                            maxcount);
+
+    if (count == 0) {
+        /* no matches, return unchanged */
+        return return_self(self);
+    }
+
+    /* Check for overflow */
+    /*    result_len = self_len + count * (to_len-from_len) */
+    assert(count > 0);
+    if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
+        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
+        return NULL;
+    }
+    result_len = self_len + count * (to_len - from_len);
+
+    result = STRINGLIB_NEW(NULL, result_len);
+    if (result == NULL) {
+        return NULL;
+    }
+    result_s = STRINGLIB_STR(result);
+
+    start = self_s;
+    end = self_s + self_len;
+    while (count-- > 0) {
+        offset = stringlib_find(start, end - start,
+                                from_s, from_len,
+                                0);
+        if (offset == -1)
+            break;
+        next = start + offset;
+        if (next == start) {
+            /* replace with the 'to' */
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start += from_len;
+        } else {
+            /* copy the unchanged old then the 'to' */
+            Py_MEMCPY(result_s, start, next - start);
+            result_s += (next - start);
+            Py_MEMCPY(result_s, to_s, to_len);
+            result_s += to_len;
+            start = next + from_len;
+        }
+    }
+    /* Copy the rest of the original bytes after the last replacement */
+    Py_MEMCPY(result_s, start, end - start);
+
+    return result;
+}
+
+
+/* Replace 'from' with 'to' at most maxcount times (maxcount < 0: no limit). */
+Py_LOCAL(PyObject *)
+stringlib_replace(PyObject *self,
+                  const char *from_s, Py_ssize_t from_len,
+                  const char *to_s, Py_ssize_t to_len,
+                  Py_ssize_t maxcount)
+{
+    if (maxcount < 0) {
+        maxcount = PY_SSIZE_T_MAX;
+    } else if (maxcount == 0) {
+        /* nothing to do; return the original bytes */
+        return return_self(self);
+    }
+
+    /* Handle zero-length special cases */
+    if (from_len == 0) {
+        if (to_len == 0) {
+            /* nothing to do; return the original bytes */
+            return return_self(self);
+        }
+        /* insert the 'to' bytes everywhere.    */
+        /*    >>> b"Python".replace(b"", b".")  */
+        /*    b'.P.y.t.h.o.n.'                  */
+        return stringlib_replace_interleave(self, to_s, to_len, maxcount);
+    }
+
+    /* Except for b"".replace(b"", b"A") == b"A" (with any non-zero maxcount) */
+    /* there is no way for an empty self to produce a non-empty result. */
+    /* Special case so the remaining code always gets a non-empty bytes */
+    if (STRINGLIB_LEN(self) == 0) {
+        return return_self(self);
+    }
+
+    if (to_len == 0) {
+        /* delete all occurrences of 'from' bytes */
+        if (from_len == 1) {
+            return stringlib_replace_delete_single_character(
+                self, from_s[0], maxcount);
+        } else {
+            return stringlib_replace_delete_substring(
+                self, from_s, from_len, maxcount);
+        }
+    }
+
+    /* Handle special case where both bytes have the same length */
+    if (from_len == to_len) {
+        if (from_len == 1) {
+            return stringlib_replace_single_character_in_place(
+                self, from_s[0], to_s[0], maxcount);
+        } else {
+            return stringlib_replace_substring_in_place(
+                self, from_s, from_len, to_s, to_len, maxcount);
+        }
+    }
+
+    /* Otherwise use the more generic algorithms */
+    if (from_len == 1) {
+        return stringlib_replace_single_character(
+            self, from_s[0], to_s, to_len, maxcount);
+    } else {
+        /* len('from')>=2, len('to')>=1 */
+        return stringlib_replace_substring(
+            self, from_s, from_len, to_s, to_len, maxcount);
+    }
+}
+
+#undef findchar
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
index be09b5f..14fa28e 100644
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -67,7 +67,7 @@ SubString_new_object(SubString *str)
     return PyUnicode_Substring(str->str, str->start, str->end);
 }
 
-/* return a new string.  if str->str is NULL, return None */
+/* return a new string.  if str->str is NULL, return a new empty string */
 Py_LOCAL_INLINE(PyObject *)
 SubString_new_object_or_empty(SubString *str)
 {
diff --git a/Objects/structseq.c b/Objects/structseq.c
index 664344b..e315cba 100644
--- a/Objects/structseq.c
+++ b/Objects/structseq.c
@@ -4,9 +4,9 @@
 #include "Python.h"
 #include "structmember.h"
 
-static char visible_length_key[] = "n_sequence_fields";
-static char real_length_key[] = "n_fields";
-static char unnamed_fields_key[] = "n_unnamed_fields";
+static const char visible_length_key[] = "n_sequence_fields";
+static const char real_length_key[] = "n_fields";
+static const char unnamed_fields_key[] = "n_unnamed_fields";
 
 /* Fields with this name have only a field index, not a field name.
    They are only allowed for indices < n_visible_fields. */
@@ -16,14 +16,14 @@ _Py_IDENTIFIER(n_fields);
 _Py_IDENTIFIER(n_unnamed_fields);
 
 #define VISIBLE_SIZE(op) Py_SIZE(op)
-#define VISIBLE_SIZE_TP(tp) PyLong_AsLong( \
+#define VISIBLE_SIZE_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_sequence_fields))
 
-#define REAL_SIZE_TP(tp) PyLong_AsLong( \
+#define REAL_SIZE_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_fields))
 #define REAL_SIZE(op) REAL_SIZE_TP(Py_TYPE(op))
 
-#define UNNAMED_FIELDS_TP(tp) PyLong_AsLong( \
+#define UNNAMED_FIELDS_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_unnamed_fields))
 #define UNNAMED_FIELDS(op) UNNAMED_FIELDS_TP(Py_TYPE(op))
 
@@ -164,7 +164,8 @@ structseq_repr(PyStructSequence *obj)
 #define TYPE_MAXSIZE 100
 
     PyTypeObject *typ = Py_TYPE(obj);
-    int i, removelast = 0;
+    Py_ssize_t i;
+    int removelast = 0;
     Py_ssize_t len;
     char buf[REPR_BUFFER_SIZE];
     char *endofbuf, *pbuf = buf;
@@ -236,8 +237,7 @@ structseq_reduce(PyStructSequence* self)
     PyObject* tup = NULL;
     PyObject* dict = NULL;
     PyObject* result;
-    Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields;
-    int i;
+    Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields, i;
 
     n_fields = REAL_SIZE(self);
     n_visible_fields = VISIBLE_SIZE(self);
@@ -325,7 +325,7 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
 {
     PyObject *dict;
     PyMemberDef* members;
-    int n_members, n_unnamed_members, i, k;
+    Py_ssize_t n_members, n_unnamed_members, i, k;
     PyObject *v;
 
 #ifdef Py_TRACE_REFS
@@ -373,9 +373,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
     Py_INCREF(type);
 
     dict = type->tp_dict;
-#define SET_DICT_FROM_INT(key, value)                           \
+#define SET_DICT_FROM_SIZE(key, value)                          \
     do {                                                        \
-        v = PyLong_FromLong((long) value);                      \
+        v = PyLong_FromSsize_t(value);                          \
         if (v == NULL)                                          \
             return -1;                                          \
         if (PyDict_SetItemString(dict, key, v) < 0) {           \
@@ -385,9 +385,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
         Py_DECREF(v);                                           \
     } while (0)
 
-    SET_DICT_FROM_INT(visible_length_key, desc->n_in_sequence);
-    SET_DICT_FROM_INT(real_length_key, n_members);
-    SET_DICT_FROM_INT(unnamed_fields_key, n_unnamed_members);
+    SET_DICT_FROM_SIZE(visible_length_key, desc->n_in_sequence);
+    SET_DICT_FROM_SIZE(real_length_key, n_members);
+    SET_DICT_FROM_SIZE(unnamed_fields_key, n_unnamed_members);
 
     return 0;
 }
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
index 7920fec..c0ff499 100644
--- a/Objects/tupleobject.c
+++ b/Objects/tupleobject.c
@@ -36,6 +36,16 @@ static Py_ssize_t count_tracked = 0;
 static void
 show_track(void)
 {
+    PyObject *xoptions, *value;
+    _Py_IDENTIFIER(showalloccount);
+
+    xoptions = PySys_GetXOptions();
+    if (xoptions == NULL)
+        return;
+    value = _PyDict_GetItemId(xoptions, &PyId_showalloccount);
+    if (value != Py_True)
+        return;
+
     fprintf(stderr, "Tuples created: %" PY_FORMAT_SIZE_T "d\n",
         count_tracked + count_untracked);
     fprintf(stderr, "Tuples tracked by the GC: %" PY_FORMAT_SIZE_T
@@ -149,7 +159,6 @@ PyTuple_GetItem(PyObject *op, Py_ssize_t i)
 int
 PyTuple_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem)
 {
-    PyObject *olditem;
     PyObject **p;
     if (!PyTuple_Check(op) || op->ob_refcnt != 1) {
         Py_XDECREF(newitem);
@@ -163,9 +172,7 @@ PyTuple_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem)
         return -1;
     }
     p = ((PyTupleObject *)op) -> ob_item + i;
-    olditem = *p;
-    *p = newitem;
-    Py_XDECREF(olditem);
+    Py_XSETREF(*p, newitem);
     return 0;
 }
 
@@ -446,9 +453,9 @@ tupleconcat(PyTupleObject *a, PyObject *bb)
         return NULL;
     }
 #define b ((PyTupleObject *)bb)
-    size = Py_SIZE(a) + Py_SIZE(b);
-    if (size < 0)
+    if (Py_SIZE(a) > PY_SSIZE_T_MAX - Py_SIZE(b))
         return PyErr_NoMemory();
+    size = Py_SIZE(a) + Py_SIZE(b);
     np = (PyTupleObject *) PyTuple_New(size);
     if (np == NULL) {
         return NULL;
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index ccde3de..209d4fa 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -48,15 +48,18 @@ static size_t method_cache_collisions = 0;
 _Py_IDENTIFIER(__abstractmethods__);
 _Py_IDENTIFIER(__class__);
 _Py_IDENTIFIER(__delitem__);
+_Py_IDENTIFIER(__definition_order__);
 _Py_IDENTIFIER(__dict__);
 _Py_IDENTIFIER(__doc__);
 _Py_IDENTIFIER(__getattribute__);
 _Py_IDENTIFIER(__getitem__);
 _Py_IDENTIFIER(__hash__);
+_Py_IDENTIFIER(__init_subclass__);
 _Py_IDENTIFIER(__len__);
 _Py_IDENTIFIER(__module__);
 _Py_IDENTIFIER(__name__);
 _Py_IDENTIFIER(__new__);
+_Py_IDENTIFIER(__set_name__);
 _Py_IDENTIFIER(__setitem__);
 _Py_IDENTIFIER(builtins);
 
@@ -460,7 +463,7 @@ type_module(PyTypeObject *type, void *context)
             PyErr_Format(PyExc_AttributeError, "__module__");
             return 0;
         }
-        Py_XINCREF(mod);
+        Py_INCREF(mod);
         return mod;
     }
     else {
@@ -487,6 +490,23 @@ type_set_module(PyTypeObject *type, PyObject *value, void *context)
 }
 
 static PyObject *
+type_deforder(PyTypeObject *type, void *context)
+{
+    if (type->tp_deforder == NULL)
+        Py_RETURN_NONE;
+    Py_INCREF(type->tp_deforder);
+    return type->tp_deforder;
+}
+
+static int
+type_set_deforder(PyTypeObject *type, PyObject *value, void *context)
+{
+    Py_XINCREF(value);
+    Py_XSETREF(type->tp_deforder, value);
+    return 0;
+}
+
+static PyObject *
 type_abstractmethods(PyTypeObject *type, void *context)
 {
     PyObject *mod = NULL;
@@ -500,7 +520,7 @@ type_abstractmethods(PyTypeObject *type, void *context)
             PyErr_SetObject(PyExc_AttributeError, message);
         return NULL;
     }
-    Py_XINCREF(mod);
+    Py_INCREF(mod);
     return mod;
 }
 
@@ -548,7 +568,7 @@ type_get_bases(PyTypeObject *type, void *context)
 static PyTypeObject *best_base(PyObject *);
 static int mro_internal(PyTypeObject *, PyObject **);
 Py_LOCAL_INLINE(int) type_is_subtype_base_chain(PyTypeObject *, PyTypeObject *);
-static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, char *);
+static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, const char *);
 static int add_subclass(PyTypeObject*, PyTypeObject*);
 static int add_all_subclasses(PyTypeObject *type, PyObject *bases);
 static void remove_subclass(PyTypeObject *, PyTypeObject *);
@@ -832,6 +852,8 @@ static PyGetSetDef type_getsets[] = {
     {"__qualname__", (getter)type_qualname, (setter)type_set_qualname, NULL},
     {"__bases__", (getter)type_get_bases, (setter)type_set_bases, NULL},
     {"__module__", (getter)type_module, (setter)type_set_module, NULL},
+    {"__definition_order__", (getter)type_deforder,
+     (setter)type_set_deforder, NULL},
     {"__abstractmethods__", (getter)type_abstractmethods,
      (setter)type_set_abstractmethods, NULL},
     {"__dict__",  (getter)type_dict,  NULL, NULL},
@@ -888,25 +910,33 @@ type_call(PyTypeObject *type, PyObject *args, PyObject *kwds)
 #endif
 
     obj = type->tp_new(type, args, kwds);
-    if (obj != NULL) {
-        /* Ugly exception: when the call was type(something),
-           don't call tp_init on the result. */
-        if (type == &PyType_Type &&
-            PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
-            (kwds == NULL ||
-             (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
-            return obj;
-        /* If the returned object is not an instance of type,
-           it won't be initialized. */
-        if (!PyType_IsSubtype(Py_TYPE(obj), type))
-            return obj;
-        type = Py_TYPE(obj);
-        if (type->tp_init != NULL) {
-            int res = type->tp_init(obj, args, kwds);
-            if (res < 0) {
-                Py_DECREF(obj);
-                obj = NULL;
-            }
+    obj = _Py_CheckFunctionResult((PyObject*)type, obj, NULL);
+    if (obj == NULL)
+        return NULL;
+
+    /* Ugly exception: when the call was type(something),
+       don't call tp_init on the result. */
+    if (type == &PyType_Type &&
+        PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
+        (kwds == NULL ||
+         (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
+        return obj;
+
+    /* If the returned object is not an instance of type,
+       it won't be initialized. */
+    if (!PyType_IsSubtype(Py_TYPE(obj), type))
+        return obj;
+
+    type = Py_TYPE(obj);
+    if (type->tp_init != NULL) {
+        int res = type->tp_init(obj, args, kwds);
+        if (res < 0) {
+            assert(PyErr_Occurred());
+            Py_DECREF(obj);
+            obj = NULL;
+        }
+        else {
+            assert(!PyErr_Occurred());
         }
     }
     return obj;
@@ -1411,36 +1441,38 @@ _PyObject_LookupSpecial(PyObject *self, _Py_Identifier *attrid)
    as lookup_method to cache the interned name string object. */
 
 static PyObject *
-call_method(PyObject *o, _Py_Identifier *nameid, char *format, ...)
+call_method(PyObject *o, _Py_Identifier *nameid, const char *format, ...)
 {
     va_list va;
-    PyObject *args, *func = 0, *retval;
-    va_start(va, format);
+    PyObject *func = NULL, *retval;
 
     func = lookup_maybe(o, nameid);
     if (func == NULL) {
-        va_end(va);
         if (!PyErr_Occurred())
             PyErr_SetObject(PyExc_AttributeError, nameid->object);
         return NULL;
     }
 
-    if (format && *format)
+    if (format && *format) {
+        PyObject *args;
+
+        va_start(va, format);
         args = Py_VaBuildValue(format, va);
-    else
-        args = PyTuple_New(0);
+        va_end(va);
 
-    va_end(va);
+        if (args == NULL) {
+            Py_DECREF(func);
+            return NULL;
+        }
+        assert(PyTuple_Check(args));
 
-    if (args == NULL) {
-        Py_DECREF(func);
-        return NULL;
+        retval = PyObject_Call(func, args, NULL);
+        Py_DECREF(args);
+    }
+    else {
+        retval = _PyObject_CallNoArg(func);
     }
 
-    assert(PyTuple_Check(args));
-    retval = PyObject_Call(func, args, NULL);
-
-    Py_DECREF(args);
     Py_DECREF(func);
 
     return retval;
@@ -1449,36 +1481,38 @@ call_method(PyObject *o, _Py_Identifier *nameid, char *format, ...)
 /* Clone of call_method() that returns NotImplemented when the lookup fails. */
 
 static PyObject *
-call_maybe(PyObject *o, _Py_Identifier *nameid, char *format, ...)
+call_maybe(PyObject *o, _Py_Identifier *nameid, const char *format, ...)
 {
     va_list va;
-    PyObject *args, *func = 0, *retval;
-    va_start(va, format);
+    PyObject *func = NULL, *retval;
 
     func = lookup_maybe(o, nameid);
     if (func == NULL) {
-        va_end(va);
         if (!PyErr_Occurred())
             Py_RETURN_NOTIMPLEMENTED;
         return NULL;
     }
 
-    if (format && *format)
+    if (format && *format) {
+        PyObject *args;
+
+        va_start(va, format);
         args = Py_VaBuildValue(format, va);
-    else
-        args = PyTuple_New(0);
+        va_end(va);
 
-    va_end(va);
+        if (args == NULL) {
+            Py_DECREF(func);
+            return NULL;
+        }
+        assert(PyTuple_Check(args));
 
-    if (args == NULL) {
-        Py_DECREF(func);
-        return NULL;
+        retval = PyObject_Call(func, args, NULL);
+        Py_DECREF(args);
+    }
+    else {
+        retval = _PyObject_CallNoArg(func);
     }
 
-    assert(PyTuple_Check(args));
-    retval = PyObject_Call(func, args, NULL);
-
-    Py_DECREF(args);
     Py_DECREF(func);
 
     return retval;
@@ -1530,7 +1564,6 @@ class_name(PyObject *cls)
     PyObject *name = _PyObject_GetAttrId(cls, &PyId___name__);
     if (name == NULL) {
         PyErr_Clear();
-        Py_XDECREF(name);
         name = PyObject_Repr(cls);
     }
     if (name == NULL)
@@ -2024,6 +2057,8 @@ static void object_dealloc(PyObject *);
 static int object_init(PyObject *, PyObject *, PyObject *);
 static int update_slot(PyTypeObject *, PyObject *);
 static void fixup_slot_dispatchers(PyTypeObject *);
+static int set_names(PyTypeObject *);
+static int init_subclass(PyTypeObject *, PyObject *);
 
 /*
  * Helpers for  __dict__ descriptor.  We don't want to expose the dicts
@@ -2088,7 +2123,7 @@ subtype_dict(PyObject *obj, void *context)
 static int
 subtype_setdict(PyObject *obj, PyObject *value, void *context)
 {
-    PyObject *dict, **dictptr;
+    PyObject **dictptr;
     PyTypeObject *base;
 
     base = get_builtin_base_with_dict(Py_TYPE(obj));
@@ -2119,10 +2154,8 @@ subtype_setdict(PyObject *obj, PyObject *value, void *context)
                      "not a '%.200s'", Py_TYPE(value)->tp_name);
         return -1;
     }
-    dict = *dictptr;
     Py_XINCREF(value);
-    *dictptr = value;
-    Py_XDECREF(dict);
+    Py_XSETREF(*dictptr, value);
     return 0;
 }
 
@@ -2201,7 +2234,8 @@ type_init(PyObject *cls, PyObject *args, PyObject *kwds)
     assert(args != NULL && PyTuple_Check(args));
     assert(kwds == NULL || PyDict_Check(kwds));
 
-    if (kwds != NULL && PyDict_Check(kwds) && PyDict_Size(kwds) != 0) {
+    if (kwds != NULL && PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
+        PyDict_Check(kwds) && PyDict_Size(kwds) != 0) {
         PyErr_SetString(PyExc_TypeError,
                         "type.__init__() takes no keyword arguments");
         return -1;
@@ -2268,7 +2302,6 @@ static PyObject *
 type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
 {
     PyObject *name, *bases = NULL, *orig_dict, *dict = NULL;
-    static char *kwlist[] = {"name", "bases", "dict", 0};
     PyObject *qualname, *slots = NULL, *tmp, *newslots;
     PyTypeObject *type = NULL, *base, *tmptype, *winner;
     PyHeapTypeObject *et;
@@ -2282,11 +2315,13 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
     assert(kwds == NULL || PyDict_Check(kwds));
 
     /* Special case: type(x) should return x->ob_type */
-    {
+    /* We only want type itself to accept the one-argument form (#27157)
+       Note: We don't call PyType_CheckExact as that also allows subclasses */
+    if (metatype == &PyType_Type) {
         const Py_ssize_t nargs = PyTuple_GET_SIZE(args);
         const Py_ssize_t nkwds = kwds == NULL ? 0 : PyDict_Size(kwds);
 
-        if (PyType_CheckExact(metatype) && nargs == 1 && nkwds == 0) {
+        if (nargs == 1 && nkwds == 0) {
             PyObject *x = PyTuple_GET_ITEM(args, 0);
             Py_INCREF(Py_TYPE(x));
             return (PyObject *) Py_TYPE(x);
@@ -2295,7 +2330,7 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         /* SF bug 475327 -- if that didn't trigger, we need 3
            arguments. but PyArg_ParseTupleAndKeywords below may give
            a msg saying type() needs exactly 3. */
-        if (nargs + nkwds != 3) {
+        if (nargs != 3) {
             PyErr_SetString(PyExc_TypeError,
                             "type() takes 1 or 3 arguments");
             return NULL;
@@ -2303,10 +2338,8 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
     }
 
     /* Check arguments: (name, bases, dict) */
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "UO!O!:type", kwlist,
-                                     &name,
-                                     &PyTuple_Type, &bases,
-                                     &PyDict_Type, &orig_dict))
+    if (!PyArg_ParseTuple(args, "UO!O!:type.__new__", &name, &PyTuple_Type,
+                          &bases, &PyDict_Type, &orig_dict))
         return NULL;
 
     /* Determine the proper metatype to deal with this: */
@@ -2338,6 +2371,7 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         goto error;
     }
 
+    /* Copy the definition namespace into a new dict. */
     dict = PyDict_Copy(orig_dict);
     if (dict == NULL)
         goto error;
@@ -2546,6 +2580,48 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
     if (qualname != NULL && PyDict_DelItem(dict, PyId___qualname__.object) < 0)
         goto error;
 
+    /* Set tp_deforder to the extracted definition order, if any. */
+    type->tp_deforder = _PyDict_GetItemId(dict, &PyId___definition_order__);
+    if (type->tp_deforder != NULL) {
+        Py_INCREF(type->tp_deforder);   /* own a reference before the key is deleted below */
+
+        // Due to subclass lookup, __definition_order__ can't be in __dict__.
+        if (_PyDict_DelItemId(dict, &PyId___definition_order__) != 0) {
+            goto error;
+        }
+
+        if (type->tp_deforder != Py_None) {
+            Py_ssize_t numnames;
+
+            if (!PyTuple_Check(type->tp_deforder)) {
+                PyErr_SetString(PyExc_TypeError,
+                                "__definition_order__ must be a tuple or None");
+                goto error;
+            }
+
+            // Make sure they are identifiers.
+            numnames = PyTuple_Size(type->tp_deforder);
+            for (i = 0; i < numnames; i++) {
+                PyObject *name = PyTuple_GET_ITEM(type->tp_deforder, i);
+                if (name == NULL) {
+                    goto error;
+                }
+                if (!PyUnicode_Check(name) || !PyUnicode_IsIdentifier(name)) {
+                    PyErr_Format(PyExc_TypeError,
+                                 "__definition_order__ must "
+                                 "contain only identifiers, got %R",
+                                 name);   /* %R: name is a PyObject *, not a C string */
+                    goto error;
+                }
+            }
+        }
+    }
+    else if (PyODict_Check(orig_dict)) {   /* no explicit order: use the ordered namespace's keys */
+        type->tp_deforder = _PyODict_KeysAsTuple(orig_dict);
+        if (type->tp_deforder == NULL)
+            goto error;
+    }
+
     /* Set tp_doc to a copy of dict['__doc__'], if the latter is there
        and is a string.  The __doc__ accessor will first look for tp_doc;
        if that fails, it will still look into __dict__.
@@ -2586,6 +2662,20 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         Py_DECREF(tmp);
     }
 
+    /* Special-case __init_subclass__: if it's a plain function,
+       make it a classmethod */
+    tmp = _PyDict_GetItemId(dict, &PyId___init_subclass__);
+    if (tmp != NULL && PyFunction_Check(tmp)) {
+        tmp = PyClassMethod_New(tmp);
+        if (tmp == NULL)
+            goto error;
+        if (_PyDict_SetItemId(dict, &PyId___init_subclass__, tmp) < 0) {
+            Py_DECREF(tmp);
+            goto error;
+        }
+        Py_DECREF(tmp);
+    }
+
     /* Add descriptors for custom slots from __slots__, or for __dict__ */
     mp = PyHeapType_GET_MEMBERS(et);
     slotoffset = base->tp_basicsize;
@@ -2666,6 +2756,12 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         et->ht_cached_keys = _PyDict_NewKeysForClass();
     }
 
+    if (set_names(type) < 0)
+        goto error;
+
+    if (init_subclass(type, kwds) < 0)
+        goto error;
+
     Py_DECREF(dict);
     return (PyObject *)type;
 
@@ -2677,7 +2773,7 @@ error:
     return NULL;
 }
 
-static short slotoffsets[] = {
+static const short slotoffsets[] = {
     -1, /* invalid slot */
 #include "typeslots.inc"
 };
@@ -3040,6 +3136,7 @@ type_dealloc(PyTypeObject *type)
     Py_XDECREF(type->tp_mro);
     Py_XDECREF(type->tp_cache);
     Py_XDECREF(type->tp_subclasses);
+    Py_XDECREF(type->tp_deforder);
     /* A type's tp_doc is heap allocated, unlike the tp_doc slots
      * of most other objects.  It's okay to cast it to char *.
      */
@@ -3082,7 +3179,7 @@ type_subclasses(PyTypeObject *type, PyObject *args_ignored)
 static PyObject *
 type_prepare(PyObject *self, PyObject *args, PyObject *kwds)
 {
-    return PyDict_New();
+    return PyODict_New();
 }
 
 /*
@@ -3596,7 +3693,7 @@ same_slots_added(PyTypeObject *a, PyTypeObject *b)
 }
 
 static int
-compatible_for_assignment(PyTypeObject* oldto, PyTypeObject* newto, char* attr)
+compatible_for_assignment(PyTypeObject* oldto, PyTypeObject* newto, const char* attr)
 {
     PyTypeObject *newbase, *oldbase;
 
@@ -3872,6 +3969,24 @@ _PyObject_GetState(PyObject *obj, int required)
         }
 
         assert(slotnames == Py_None || PyList_Check(slotnames));
+        if (required) {
+            Py_ssize_t basicsize = PyBaseObject_Type.tp_basicsize;
+            if (obj->ob_type->tp_dictoffset)
+                basicsize += sizeof(PyObject *);
+            if (obj->ob_type->tp_weaklistoffset)
+                basicsize += sizeof(PyObject *);
+            if (slotnames != Py_None)
+                basicsize += sizeof(PyObject *) * Py_SIZE(slotnames);
+            if (obj->ob_type->tp_basicsize > basicsize) {
+                Py_DECREF(slotnames);
+                Py_DECREF(state);
+                PyErr_Format(PyExc_TypeError,
+                             "can't pickle %.200s objects",
+                             Py_TYPE(obj)->tp_name);
+                return NULL;
+            }
+        }
+
         if (slotnames != Py_None && Py_SIZE(slotnames) > 0) {
             PyObject *slots;
             Py_ssize_t slotnames_size, i;
@@ -3926,7 +4041,7 @@ _PyObject_GetState(PyObject *obj, int required)
             }
 
             /* If we found some slot attributes, pack them in a tuple along
-               the orginal attribute dictionary. */
+               the original attribute dictionary. */
             if (PyDict_Size(slots) > 0) {
                 PyObject *state2;
 
@@ -4095,7 +4210,7 @@ _PyObject_GetItemsIter(PyObject *obj, PyObject **listitems,
 }
 
 static PyObject *
-reduce_newobj(PyObject *obj, int proto)
+reduce_newobj(PyObject *obj)
 {
     PyObject *args = NULL, *kwargs = NULL;
     PyObject *copyreg;
@@ -4148,7 +4263,7 @@ reduce_newobj(PyObject *obj, int proto)
         }
         Py_XDECREF(args);
     }
-    else if (proto >= 4) {
+    else {
         _Py_IDENTIFIER(__newobj_ex__);
 
         newobj = _PyObject_GetAttrId(copyreg, &PyId___newobj_ex__);
@@ -4166,16 +4281,6 @@ reduce_newobj(PyObject *obj, int proto)
             return NULL;
         }
     }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "must use protocol 4 or greater to copy this "
-                        "object; since __getnewargs_ex__ returned "
-                        "keyword arguments.");
-        Py_DECREF(args);
-        Py_DECREF(kwargs);
-        Py_DECREF(copyreg);
-        return NULL;
-    }
 
     state = _PyObject_GetState(obj,
                 !hasargs && !PyList_Check(obj) && !PyDict_Check(obj));
@@ -4221,7 +4326,7 @@ _common_reduce(PyObject *self, int proto)
     PyObject *copyreg, *res;
 
     if (proto >= 2)
-        return reduce_newobj(self, proto);
+        return reduce_newobj(self);
 
     copyreg = import_copyreg();
     if (!copyreg)
@@ -4303,6 +4408,18 @@ PyDoc_STRVAR(object_subclasshook_doc,
 "NotImplemented, the normal algorithm is used.  Otherwise, it\n"
 "overrides the normal algorithm (and the outcome is cached).\n");
 
+static PyObject *   /* implementation of the "__init_subclass__" entry in object_methods */
+object_init_subclass(PyObject *cls, PyObject *arg)
+{
+    Py_RETURN_NONE;   /* cls and arg are unused: this implementation does nothing */
+}
+
+PyDoc_STRVAR(object_init_subclass_doc,
+"This method is called when a class is subclassed.\n"
+"\n"
+"The default implementation does nothing. It may be\n"
+"overridden to extend subclasses.\n");
+
 /*
    from PEP 3101, this code implements:
 
@@ -4407,6 +4524,8 @@ static PyMethodDef object_methods[] = {
      PyDoc_STR("helper for pickle")},
     {"__subclasshook__", object_subclasshook, METH_CLASS | METH_VARARGS,
      object_subclasshook_doc},
+    {"__init_subclass__", object_init_subclass, METH_CLASS | METH_NOARGS,
+     object_init_subclass_doc},
     {"__format__", object_format, METH_VARARGS,
      PyDoc_STR("default object formatter")},
     {"__sizeof__", object_sizeof, METH_NOARGS,
@@ -5352,7 +5471,7 @@ wrap_delitem(PyObject *self, PyObject *args, void *wrapped)
 /* Helper to check for object.__setattr__ or __delattr__ applied to a type.
    This is called the Carlo Verre hack after its discoverer. */
 static int
-hackcheck(PyObject *self, setattrofunc func, char *what)
+hackcheck(PyObject *self, setattrofunc func, const char *what)
 {
     PyTypeObject *type = Py_TYPE(self);
     while (type && type->tp_flags & Py_TPFLAGS_HEAPTYPE)
@@ -5639,7 +5758,7 @@ static PyObject * \
 FUNCNAME(PyObject *self) \
 { \
     _Py_static_string(id, OPSTR); \
-    return call_method(self, &id, "()"); \
+    return call_method(self, &id, NULL); \
 }
 
 #define SLOT1(FUNCNAME, OPSTR, ARG1TYPE, ARGCODES) \
@@ -5732,7 +5851,7 @@ FUNCNAME(PyObject *self, ARG1TYPE arg1, ARG2TYPE arg2) \
 static Py_ssize_t
 slot_sq_length(PyObject *self)
 {
-    PyObject *res = call_method(self, &PyId___len__, "()");
+    PyObject *res = call_method(self, &PyId___len__, NULL);
     Py_ssize_t len;
 
     if (res == NULL)
@@ -5753,38 +5872,39 @@ slot_sq_length(PyObject *self)
 static PyObject *
 slot_sq_item(PyObject *self, Py_ssize_t i)
 {
-    PyObject *func, *args = NULL, *ival = NULL, *retval = NULL;
+    PyObject *func, *ival = NULL, *retval = NULL;
     descrgetfunc f;
 
     func = _PyType_LookupId(Py_TYPE(self), &PyId___getitem__);
-    if (func != NULL) {
-        if ((f = Py_TYPE(func)->tp_descr_get) == NULL)
-            Py_INCREF(func);
-        else {
-            func = f(func, self, (PyObject *)(Py_TYPE(self)));
-            if (func == NULL) {
-                return NULL;
-            }
-        }
-        ival = PyLong_FromSsize_t(i);
-        if (ival != NULL) {
-            args = PyTuple_New(1);
-            if (args != NULL) {
-                PyTuple_SET_ITEM(args, 0, ival);
-                retval = PyObject_Call(func, args, NULL);
-                Py_XDECREF(args);
-                Py_XDECREF(func);
-                return retval;
-            }
-        }
-    }
-    else {
+    if (func == NULL) {
         PyObject *getitem_str = _PyUnicode_FromId(&PyId___getitem__);
         PyErr_SetObject(PyExc_AttributeError, getitem_str);
+        return NULL;
+    }
+
+    f = Py_TYPE(func)->tp_descr_get;
+    if (f == NULL) {
+        Py_INCREF(func);
+    }
+    else {
+        func = f(func, self, (PyObject *)(Py_TYPE(self)));
+        if (func == NULL) {
+            return NULL;
+        }
     }
-    Py_XDECREF(args);
-    Py_XDECREF(ival);
-    Py_XDECREF(func);
+
+    ival = PyLong_FromSsize_t(i);
+    if (ival == NULL) {
+        goto error;
+    }
+
+    retval = _PyObject_CallArg1(func, ival);
+    Py_DECREF(func);
+    Py_DECREF(ival);
+    return retval;
+
+error:
+    Py_DECREF(func);
     return NULL;
 }
 
@@ -5806,19 +5926,20 @@ slot_sq_ass_item(PyObject *self, Py_ssize_t index, PyObject *value)
 static int
 slot_sq_contains(PyObject *self, PyObject *value)
 {
-    PyObject *func, *res, *args;
+    PyObject *func, *res;
     int result = -1;
     _Py_IDENTIFIER(__contains__);
 
     func = lookup_maybe(self, &PyId___contains__);
+    if (func == Py_None) {
+        Py_DECREF(func);
+        PyErr_Format(PyExc_TypeError,
+                     "'%.200s' object is not a container",
+                     Py_TYPE(self)->tp_name);
+        return -1;
+    }
     if (func != NULL) {
-        args = PyTuple_Pack(1, value);
-        if (args == NULL)
-            res = NULL;
-        else {
-            res = PyObject_Call(func, args, NULL);
-            Py_DECREF(args);
-        }
+        res = _PyObject_CallArg1(func, value);
         Py_DECREF(func);
         if (res != NULL) {
             result = PyObject_IsTrue(res);
@@ -5889,44 +6010,54 @@ SLOT0(slot_nb_absolute, "__abs__")
 static int
 slot_nb_bool(PyObject *self)
 {
-    PyObject *func, *args;
-    int result = -1;
+    PyObject *func, *value;
+    int result;
     int using_len = 0;
     _Py_IDENTIFIER(__bool__);
 
     func = lookup_maybe(self, &PyId___bool__);
     if (func == NULL) {
-        if (PyErr_Occurred())
+        if (PyErr_Occurred()) {
             return -1;
+        }
+
         func = lookup_maybe(self, &PyId___len__);
-        if (func == NULL)
-            return PyErr_Occurred() ? -1 : 1;
-        using_len = 1;
-    }
-    args = PyTuple_New(0);
-    if (args != NULL) {
-        PyObject *temp = PyObject_Call(func, args, NULL);
-        Py_DECREF(args);
-        if (temp != NULL) {
-            if (using_len) {
-                /* enforced by slot_nb_len */
-                result = PyObject_IsTrue(temp);
-            }
-            else if (PyBool_Check(temp)) {
-                result = PyObject_IsTrue(temp);
-            }
-            else {
-                PyErr_Format(PyExc_TypeError,
-                             "__bool__ should return "
-                             "bool, returned %s",
-                             Py_TYPE(temp)->tp_name);
-                result = -1;
+        if (func == NULL) {
+            if (PyErr_Occurred()) {
+                return -1;
             }
-            Py_DECREF(temp);
+            return 1;
         }
+        using_len = 1;
+    }
+
+    value = _PyObject_CallNoArg(func);
+    if (value == NULL) {
+        goto error;
+    }
+
+    if (using_len) {
+        /* bool type enforced by slot_nb_len */
+        result = PyObject_IsTrue(value);
+    }
+    else if (PyBool_Check(value)) {
+        result = PyObject_IsTrue(value);
+    }
+    else {
+        PyErr_Format(PyExc_TypeError,
+                     "__bool__ should return "
+                     "bool, returned %s",
+                     Py_TYPE(value)->tp_name);
+        result = -1;
     }
+
+    Py_DECREF(value);
     Py_DECREF(func);
     return result;
+
+error:
+    Py_DECREF(func);
+    return -1;
 }
 
 
@@ -5934,7 +6065,7 @@ static PyObject *
 slot_nb_index(PyObject *self)
 {
     _Py_IDENTIFIER(__index__);
-    return call_method(self, &PyId___index__, "()");
+    return call_method(self, &PyId___index__, NULL);
 }
 
 
@@ -6171,20 +6302,14 @@ static _Py_Identifier name_op[] = {
 static PyObject *
 slot_tp_richcompare(PyObject *self, PyObject *other, int op)
 {
-    PyObject *func, *args, *res;
+    PyObject *func, *res;
 
     func = lookup_method(self, &name_op[op]);
     if (func == NULL) {
         PyErr_Clear();
         Py_RETURN_NOTIMPLEMENTED;
     }
-    args = PyTuple_Pack(1, other);
-    if (args == NULL)
-        res = NULL;
-    else {
-        res = PyObject_Call(func, args, NULL);
-        Py_DECREF(args);
-    }
+    res = _PyObject_CallArg1(func, other);
     Py_DECREF(func);
     return res;
 }
@@ -6196,16 +6321,20 @@ slot_tp_iter(PyObject *self)
     _Py_IDENTIFIER(__iter__);
 
     func = lookup_method(self, &PyId___iter__);
+    if (func == Py_None) {
+        Py_DECREF(func);
+        PyErr_Format(PyExc_TypeError,
+                     "'%.200s' object is not iterable",
+                     Py_TYPE(self)->tp_name);
+        return NULL;
+    }
+
     if (func != NULL) {
-        PyObject *args;
-        args = res = PyTuple_New(0);
-        if (args != NULL) {
-            res = PyObject_Call(func, args, NULL);
-            Py_DECREF(args);
-        }
+        res = _PyObject_CallNoArg(func);
         Py_DECREF(func);
         return res;
     }
+
     PyErr_Clear();
     func = lookup_method(self, &PyId___getitem__);
     if (func == NULL) {
@@ -6222,7 +6351,7 @@ static PyObject *
 slot_tp_iternext(PyObject *self)
 {
     _Py_IDENTIFIER(__next__);
-    return call_method(self, &PyId___next__, "()");
+    return call_method(self, &PyId___next__, NULL);
 }
 
 static PyObject *
@@ -6291,29 +6420,16 @@ slot_tp_init(PyObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 slot_tp_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
-    PyObject *func;
-    PyObject *newargs, *x;
-    Py_ssize_t i, n;
+    PyObject *func, *result;
 
     func = _PyObject_GetAttrId((PyObject *)type, &PyId___new__);
-    if (func == NULL)
-        return NULL;
-    assert(PyTuple_Check(args));
-    n = PyTuple_GET_SIZE(args);
-    newargs = PyTuple_New(n+1);
-    if (newargs == NULL)
+    if (func == NULL) {
         return NULL;
-    Py_INCREF(type);
-    PyTuple_SET_ITEM(newargs, 0, (PyObject *)type);
-    for (i = 0; i < n; i++) {
-        x = PyTuple_GET_ITEM(args, i);
-        Py_INCREF(x);
-        PyTuple_SET_ITEM(newargs, i+1, x);
     }
-    x = PyObject_Call(func, newargs, kwds);
-    Py_DECREF(newargs);
+
+    result = _PyObject_Call_Prepend(func, (PyObject *)type, args, kwds);
     Py_DECREF(func);
-    return x;
+    return result;
 }
 
 static void
@@ -6916,6 +7032,78 @@ update_all_slots(PyTypeObject* type)
     }
 }
 
+/* Call __set_name__ on all descriptors in a newly generated type.
+
+   Returns 0 on success, -1 on failure.
+
+   The loop walks a snapshot of tp_dict rather than tp_dict itself:
+   __set_name__ runs arbitrary code that may add or remove class
+   attributes, and PyDict_Next() must not be used on a dict that is
+   being mutated.  The snapshot also keeps key and value alive for the
+   duration of each call. */
+static int
+set_names(PyTypeObject *type)
+{
+    PyObject *names_to_set, *set_name_str, *key, *value, *tmp;
+    Py_ssize_t i = 0;
+
+    /* Look the name up once and fail early instead of passing NULL on. */
+    set_name_str = _PyUnicode_FromId(&PyId___set_name__);
+    if (set_name_str == NULL)
+        return -1;
+
+    names_to_set = PyDict_Copy(type->tp_dict);
+    if (names_to_set == NULL)
+        return -1;
+
+    while (PyDict_Next(names_to_set, &i, &key, &value)) {
+        if (PyObject_HasAttr(value, set_name_str)) {
+            tmp = PyObject_CallMethodObjArgs(value, set_name_str,
+                                             type, key, NULL);
+            if (tmp == NULL) {
+                Py_DECREF(names_to_set);
+                return -1;
+            }
+            Py_DECREF(tmp);
+        }
+    }
+
+    Py_DECREF(names_to_set);
+    return 0;
+}
+
+/* Call __init_subclass__ on the parent of a newly generated type.
+
+   Looks the method up through super(type, type) and calls it with no
+   positional arguments and the keyword arguments in kwds.
+   Returns 0 on success, -1 on failure. */
+static int
+init_subclass(PyTypeObject *type, PyObject *kwds)
+{
+    PyObject *super, *func, *result;
+    PyObject *args[2] = {(PyObject *)type, (PyObject *)type};
+
+    super = _PyObject_FastCall((PyObject *)&PySuper_Type, args, 2);
+    if (super == NULL) {
+        return -1;
+    }
+
+    func = _PyObject_GetAttrId(super, &PyId___init_subclass__);
+    Py_DECREF(super);
+    if (func == NULL) {
+        return -1;
+    }
+
+    result = _PyObject_FastCallDict(func, NULL, 0, kwds);
+    Py_DECREF(func);
+    if (result == NULL) {
+        return -1;
+    }
+
+    Py_DECREF(result);
+    return 0;
+}
+
 /* recurse_down_subclasses() and update_subclasses() are mutually
    recursive functions to call a callback for all subclasses,
    but refraining from recursing into subclasses that define 'name'. */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index af04564..3553aaf 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include "Python.h"
 #include "ucnhash.h"
 #include "bytes_methods.h"
+#include "stringlib/eq.h"
 
 #ifdef MS_WINDOWS
 #include <windows.h>
@@ -162,6 +163,14 @@ extern "C" {
             *_to++ = (to_type) *_iter++;                \
     } while (0)
 
+#ifdef MS_WINDOWS
+   /* On Windows, overallocating by 50% is the best factor */
+#  define OVERALLOCATE_FACTOR 2
+#else
+   /* On Linux, overallocating by 25% is the best factor */
+#  define OVERALLOCATE_FACTOR 4
+#endif
+
 /* This dictionary holds all interned unicode strings.  Note that references
    to strings in this dictionary are *not* counted in the string's ob_refcnt.
    When the interned string reaches a refcnt of 0 the string deallocation
@@ -195,7 +204,7 @@ static PyObject *unicode_empty = NULL;
     } while (0)
 
 /* Forward declaration */
-Py_LOCAL_INLINE(int)
+static inline int
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
 
 /* List of static strings. */
@@ -263,7 +272,7 @@ raise_encode_exception(PyObject **exceptionObject,
                        const char *reason);
 
 /* Same for linebreaks */
-static unsigned char ascii_linebreak[] = {
+static const unsigned char ascii_linebreak[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x000A, * LINE FEED */
 /*         0x000B, * LINE TABULATION */
@@ -292,6 +301,45 @@ static unsigned char ascii_linebreak[] = {
 
 #include "clinic/unicodeobject.c.h"
 
+typedef enum {   /* error handler kinds, as classified by get_error_handler() */
+    _Py_ERROR_UNKNOWN=0,            /* never returned by get_error_handler() */
+    _Py_ERROR_STRICT,               /* "strict", or errors == NULL */
+    _Py_ERROR_SURROGATEESCAPE,      /* "surrogateescape" */
+    _Py_ERROR_REPLACE,              /* "replace" */
+    _Py_ERROR_IGNORE,               /* "ignore" */
+    _Py_ERROR_BACKSLASHREPLACE,     /* "backslashreplace" */
+    _Py_ERROR_SURROGATEPASS,        /* "surrogatepass" */
+    _Py_ERROR_XMLCHARREFREPLACE,    /* "xmlcharrefreplace" */
+    _Py_ERROR_OTHER                 /* any other handler name */
+} _Py_error_handler;
+/* Map the handler name in errors to its _Py_error_handler value (exact, case-sensitive match). */
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    if (errors == NULL || strcmp(errors, "strict") == 0) {   /* NULL is treated as "strict" */
+        return _Py_ERROR_STRICT;
+    }
+    if (strcmp(errors, "surrogateescape") == 0) {
+        return _Py_ERROR_SURROGATEESCAPE;
+    }
+    if (strcmp(errors, "replace") == 0) {
+        return _Py_ERROR_REPLACE;
+    }
+    if (strcmp(errors, "ignore") == 0) {
+        return _Py_ERROR_IGNORE;
+    }
+    if (strcmp(errors, "backslashreplace") == 0) {
+        return _Py_ERROR_BACKSLASHREPLACE;
+    }
+    if (strcmp(errors, "surrogatepass") == 0) {
+        return _Py_ERROR_SURROGATEPASS;
+    }
+    if (strcmp(errors, "xmlcharrefreplace") == 0) {
+        return _Py_ERROR_XMLCHARREFREPLACE;
+    }
+    return _Py_ERROR_OTHER;   /* name not recognized above */
+}
+
 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
    This function is kept for backward compatibility with the old API. */
 Py_UNICODE
@@ -521,6 +569,129 @@ unicode_result_unchanged(PyObject *unicode)
         return _PyUnicode_Copy(unicode);
 }
 
+/* Implementation of the "backslashreplace" error handler for 8-bit encodings
+   (ASCII, Latin1, UTF-8, etc.): escapes unicode[collstart:collend] into str. */
+static char*
+backslashreplace(_PyBytesWriter *writer, char *str,
+                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size, guarding the running total against overflow */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x100)
+            incr = 2+2;   /* \xNN */
+        else if (ch < 0x10000)
+            incr = 2+4;   /* \uNNNN */
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+8;   /* \UNNNNNNNN */
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);   /* presumably makes room for size bytes at str -- TODO confirm */
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement; the last two hex digits are shared by all three forms */
+    for (i = collstart; i < collend; ++i) {
+        ch = PyUnicode_READ(kind, data, i);
+        *str++ = '\\';
+        if (ch >= 0x00010000) {
+            *str++ = 'U';
+            *str++ = Py_hexdigits[(ch>>28)&0xf];
+            *str++ = Py_hexdigits[(ch>>24)&0xf];
+            *str++ = Py_hexdigits[(ch>>20)&0xf];
+            *str++ = Py_hexdigits[(ch>>16)&0xf];
+            *str++ = Py_hexdigits[(ch>>12)&0xf];
+            *str++ = Py_hexdigits[(ch>>8)&0xf];
+        }
+        else if (ch >= 0x100) {
+            *str++ = 'u';
+            *str++ = Py_hexdigits[(ch>>12)&0xf];
+            *str++ = Py_hexdigits[(ch>>8)&0xf];
+        }
+        else
+            *str++ = 'x';
+        *str++ = Py_hexdigits[(ch>>4)&0xf];
+        *str++ = Py_hexdigits[ch&0xf];
+    }
+    return str;   /* position just past the last escape written */
+}
+
+/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings
+   (ASCII, Latin1, UTF-8, etc.): writes "&#N;" per char of unicode[collstart:collend]. */
+static char*
+xmlcharrefreplace(_PyBytesWriter *writer, char *str,
+                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size: "&#" (2) + decimal digits + ";" (1) per char */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 10)
+            incr = 2+1+1;
+        else if (ch < 100)
+            incr = 2+2+1;
+        else if (ch < 1000)
+            incr = 2+3+1;
+        else if (ch < 10000)
+            incr = 2+4+1;
+        else if (ch < 100000)
+            incr = 2+5+1;
+        else if (ch < 1000000)
+            incr = 2+6+1;
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+7+1;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {   /* size += incr would overflow */
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);   /* presumably makes room for size bytes at str -- TODO confirm */
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement; sprintf() returns the number of characters written */
+    for (i = collstart; i < collend; ++i) {
+        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
+    }
+    return str;   /* position just past the last reference written */
+}
+
 /* --- Bloom Filters ----------------------------------------------------- */
 
 /* stuff to implement simple "bloom filters" for Unicode characters.
@@ -549,7 +720,7 @@ static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 
-Py_LOCAL_INLINE(BLOOM_MASK)
+static inline BLOOM_MASK
 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
 {
 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
@@ -587,6 +758,18 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
 #undef BLOOM_UPDATE
 }
 
+static int   /* -1 with TypeError set if obj is not a str; else the result of PyUnicode_READY(obj) */
+ensure_unicode(PyObject *obj)
+{
+    if (!PyUnicode_Check(obj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "must be str, not %.100s",
+                     Py_TYPE(obj)->tp_name);
+        return -1;
+    }
+    return PyUnicode_READY(obj);   /* the READY result is passed straight back to the caller */
+}
+
 /* Compilation of templated routines */
 
 #include "stringlib/asciilib.h"
@@ -643,31 +826,31 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
 static PyObject *
 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
 
-Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
-                                     Py_ssize_t size, Py_UCS4 ch,
-                                     int direction)
+static inline Py_ssize_t
+findchar(const void *s, int kind,
+         Py_ssize_t size, Py_UCS4 ch,
+         int direction)
 {
-    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
-
     switch (kind) {
     case PyUnicode_1BYTE_KIND:
-        {
-            Py_UCS1 ch1 = (Py_UCS1) ch;
-            if (ch1 == ch)
-                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
-            else
-                return -1;
-        }
+        if ((Py_UCS1) ch != ch)
+            return -1;
+        if (direction > 0)
+            return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
+        else
+            return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
     case PyUnicode_2BYTE_KIND:
-        {
-            Py_UCS2 ch2 = (Py_UCS2) ch;
-            if (ch2 == ch)
-                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
-            else
-                return -1;
-        }
+        if ((Py_UCS2) ch != ch)
+            return -1;
+        if (direction > 0)
+            return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
+        else
+            return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
     case PyUnicode_4BYTE_KIND:
-        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
+        if (direction > 0)
+            return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
+        else
+            return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
     default:
         assert(0);
         return -1;
@@ -1956,7 +2139,7 @@ kind_maxchar_limit(unsigned int kind)
     }
 }
 
-Py_LOCAL_INLINE(Py_UCS4)
+static inline Py_UCS4
 align_maxchar(Py_UCS4 maxchar)
 {
     if (maxchar <= 127)
@@ -2454,13 +2637,11 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
             longflag = 1;
             ++f;
         }
-#ifdef HAVE_LONG_LONG
         else if (f[1] == 'l' &&
                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
             longlongflag = 1;
             f += 2;
         }
-#endif
     }
     /* handle the size_t flag. */
     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
@@ -2498,11 +2679,9 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
             if (longflag)
                 len = sprintf(buffer, "%lu",
                         va_arg(*vargs, unsigned long));
-#ifdef HAVE_LONG_LONG
             else if (longlongflag)
                 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
-                        va_arg(*vargs, unsigned PY_LONG_LONG));
-#endif
+                        va_arg(*vargs, unsigned long long));
             else if (size_tflag)
                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                         va_arg(*vargs, size_t));
@@ -2517,11 +2696,9 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
             if (longflag)
                 len = sprintf(buffer, "%li",
                         va_arg(*vargs, long));
-#ifdef HAVE_LONG_LONG
             else if (longlongflag)
                 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
-                        va_arg(*vargs, PY_LONG_LONG));
-#endif
+                        va_arg(*vargs, long long));
             else if (size_tflag)
                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                         va_arg(*vargs, Py_ssize_t));
@@ -2903,7 +3080,7 @@ PyUnicode_FromEncodedObject(PyObject *obj,
     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
         PyErr_Format(PyExc_TypeError,
-                     "coercing to str: need a bytes-like object, %.80s found",
+                     "decoding to str: need a bytes-like object, %.80s found",
                      Py_TYPE(obj)->tp_name);
         return NULL;
     }
@@ -2918,9 +3095,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
     return v;
 }
 
-/* Convert encoding to lower case and replace '_' with '-' in order to
-   catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
-   1 on success. */
+/* Normalize an encoding name: C implementation of
+   encodings.normalize_encoding(). Return 1 on success, or 0 on error (encoding
+   is longer than lower_len-1). */
 int
 _Py_normalize_encoding(const char *encoding,
                        char *lower,
@@ -2929,30 +3106,39 @@ _Py_normalize_encoding(const char *encoding,
     const char *e;
     char *l;
     char *l_end;
+    int punct;
+
+    assert(encoding != NULL);
 
-    if (encoding == NULL) {
-        /* 6 == strlen("utf-8") + 1 */
-        if (lower_len < 6)
-            return 0;
-        strcpy(lower, "utf-8");
-        return 1;
-    }
     e = encoding;
     l = lower;
     l_end = &lower[lower_len - 1];
-    while (*e) {
-        if (l == l_end)
-            return 0;
-        if (Py_ISUPPER(*e)) {
-            *l++ = Py_TOLOWER(*e++);
+    punct = 0;
+    while (1) {
+        char c = *e;
+        if (c == 0) {
+            break;
         }
-        else if (*e == '_') {
-            *l++ = '-';
-            e++;
+
+        if (Py_ISALNUM(c) || c == '.') {
+            if (punct && l != lower) {
+                if (l == l_end) {
+                    return 0;
+                }
+                *l++ = '_';
+            }
+            punct = 0;
+
+            if (l == l_end) {
+                return 0;
+            }
+            *l++ = Py_TOLOWER(c);
         }
         else {
-            *l++ = *e++;
+            punct = 1;
         }
+
+        e++;
     }
     *l = '\0';
     return 1;
@@ -2966,28 +3152,51 @@ PyUnicode_Decode(const char *s,
 {
     PyObject *buffer = NULL, *unicode;
     Py_buffer info;
-    char lower[11];  /* Enough for any encoding shortcut */
+    char buflower[11];   /* sizeof("iso_8859_1") == 11, longest shortcut */
+
+    if (encoding == NULL) {
+        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
+    }
 
     /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
-        if ((strcmp(lower, "utf-8") == 0) ||
-            (strcmp(lower, "utf8") == 0))
-            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
-        else if ((strcmp(lower, "latin-1") == 0) ||
-                 (strcmp(lower, "latin1") == 0) ||
-                 (strcmp(lower, "iso-8859-1") == 0) ||
-                 (strcmp(lower, "iso8859-1") == 0))
-            return PyUnicode_DecodeLatin1(s, size, errors);
-#ifdef HAVE_MBCS
-        else if (strcmp(lower, "mbcs") == 0)
-            return PyUnicode_DecodeMBCS(s, size, errors);
-#endif
-        else if (strcmp(lower, "ascii") == 0)
-            return PyUnicode_DecodeASCII(s, size, errors);
-        else if (strcmp(lower, "utf-16") == 0)
-            return PyUnicode_DecodeUTF16(s, size, errors, 0);
-        else if (strcmp(lower, "utf-32") == 0)
-            return PyUnicode_DecodeUTF32(s, size, errors, 0);
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+        char *lower = buflower;
+
+        /* Fast paths */
+        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
+            lower += 3;
+            if (*lower == '_') {
+                /* Match "utf8" and "utf_8" */
+                lower++;
+            }
+
+            if (lower[0] == '8' && lower[1] == 0) {
+                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
+            }
+            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
+                return PyUnicode_DecodeUTF16(s, size, errors, 0);
+            }
+            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
+                return PyUnicode_DecodeUTF32(s, size, errors, 0);
+            }
+        }
+        else {
+            if (strcmp(lower, "ascii") == 0
+                || strcmp(lower, "us_ascii") == 0) {
+                return PyUnicode_DecodeASCII(s, size, errors);
+            }
+    #ifdef HAVE_MBCS
+            else if (strcmp(lower, "mbcs") == 0) {
+                return PyUnicode_DecodeMBCS(s, size, errors);
+            }
+    #endif
+            else if (strcmp(lower, "latin1") == 0
+                     || strcmp(lower, "latin_1") == 0
+                     || strcmp(lower, "iso_8859_1") == 0
+                     || strcmp(lower, "iso8859_1") == 0) {
+                return PyUnicode_DecodeLatin1(s, size, errors);
+            }
+        }
     }
 
     /* Decode via the codec registry */
@@ -3167,24 +3376,22 @@ wcstombs_errorpos(const wchar_t *wstr)
 static int
 locale_error_handler(const char *errors, int *surrogateescape)
 {
-    if (errors == NULL) {
-        *surrogateescape = 0;
-        return 0;
-    }
-
-    if (strcmp(errors, "strict") == 0) {
+    _Py_error_handler error_handler = get_error_handler(errors);
+    switch (error_handler)
+    {
+    case _Py_ERROR_STRICT:
         *surrogateescape = 0;
         return 0;
-    }
-    if (strcmp(errors, "surrogateescape") == 0) {
+    case _Py_ERROR_SURROGATEESCAPE:
         *surrogateescape = 1;
         return 0;
+    default:
+        PyErr_Format(PyExc_ValueError,
+                     "only 'strict' and 'surrogateescape' error handlers "
+                     "are supported, not '%s'",
+                     errors);
+        return -1;
     }
-    PyErr_Format(PyExc_ValueError,
-                 "only 'strict' and 'surrogateescape' error handlers "
-                 "are supported, not '%s'",
-                 errors);
-    return -1;
 }
 
 PyObject *
@@ -3332,34 +3539,56 @@ PyUnicode_AsEncodedString(PyObject *unicode,
                           const char *errors)
 {
     PyObject *v;
-    char lower[11];  /* Enough for any encoding shortcut */
+    char buflower[11];   /* sizeof("iso_8859_1") == 11, longest shortcut */
 
     if (!PyUnicode_Check(unicode)) {
         PyErr_BadArgument();
         return NULL;
     }
 
+    if (encoding == NULL) {
+        return _PyUnicode_AsUTF8String(unicode, errors);
+    }
+
     /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
-        if ((strcmp(lower, "utf-8") == 0) ||
-            (strcmp(lower, "utf8") == 0))
-        {
-            if (errors == NULL || strcmp(errors, "strict") == 0)
-                return _PyUnicode_AsUTF8String(unicode, NULL);
-            else
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+        char *lower = buflower;
+
+        /* Fast paths */
+        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
+            lower += 3;
+            if (*lower == '_') {
+                /* Match "utf8" and "utf_8" */
+                lower++;
+            }
+
+            if (lower[0] == '8' && lower[1] == 0) {
                 return _PyUnicode_AsUTF8String(unicode, errors);
+            }
+            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
+                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
+            }
+            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
+                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
+            }
         }
-        else if ((strcmp(lower, "latin-1") == 0) ||
-                 (strcmp(lower, "latin1") == 0) ||
-                 (strcmp(lower, "iso-8859-1") == 0) ||
-                 (strcmp(lower, "iso8859-1") == 0))
-            return _PyUnicode_AsLatin1String(unicode, errors);
+        else {
+            if (strcmp(lower, "ascii") == 0
+                || strcmp(lower, "us_ascii") == 0) {
+                return _PyUnicode_AsASCIIString(unicode, errors);
+            }
 #ifdef HAVE_MBCS
-        else if (strcmp(lower, "mbcs") == 0)
-            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
+            else if (strcmp(lower, "mbcs") == 0) {
+                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
+            }
 #endif
-        else if (strcmp(lower, "ascii") == 0)
-            return _PyUnicode_AsASCIIString(unicode, errors);
+            else if (strcmp(lower, "latin1") == 0 ||
+                     strcmp(lower, "latin_1") == 0 ||
+                     strcmp(lower, "iso_8859_1") == 0 ||
+                     strcmp(lower, "iso8859_1") == 0) {
+                return _PyUnicode_AsLatin1String(unicode, errors);
+            }
+        }
     }
 
     /* Encode via the codec registry */
@@ -3614,6 +3843,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
 int
 PyUnicode_FSConverter(PyObject* arg, void* addr)
 {
+    PyObject *path = NULL;
     PyObject *output = NULL;
     Py_ssize_t size;
     void *data;
@@ -3622,24 +3852,22 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
         *(PyObject**)addr = NULL;
         return 1;
     }
-    if (PyBytes_Check(arg)) {
-        output = arg;
-        Py_INCREF(output);
+    path = PyOS_FSPath(arg);
+    if (path == NULL) {
+        return 0;
     }
-    else {
-        arg = PyUnicode_FromObject(arg);
-        if (!arg)
-            return 0;
-        output = PyUnicode_EncodeFSDefault(arg);
-        Py_DECREF(arg);
-        if (!output)
-            return 0;
-        if (!PyBytes_Check(output)) {
-            Py_DECREF(output);
-            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
+    if (PyBytes_Check(path)) {
+        output = path;
+    }
+    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
+        output = PyUnicode_EncodeFSDefault(path);
+        Py_DECREF(path);
+        if (!output) {
             return 0;
         }
+        assert(PyBytes_Check(output));
     }
+
     size = PyBytes_GET_SIZE(output);
     data = PyBytes_AS_STRING(output);
     if ((size_t)size != strlen(data)) {
@@ -3655,36 +3883,60 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
 int
 PyUnicode_FSDecoder(PyObject* arg, void* addr)
 {
+    int is_buffer = 0;
+    PyObject *path = NULL;
     PyObject *output = NULL;
     if (arg == NULL) {
         Py_DECREF(*(PyObject**)addr);
         return 1;
     }
-    if (PyUnicode_Check(arg)) {
-        if (PyUnicode_READY(arg) == -1)
+
+    is_buffer = PyObject_CheckBuffer(arg);
+    if (!is_buffer) {
+        path = PyOS_FSPath(arg);
+        if (path == NULL) {
             return 0;
-        output = arg;
-        Py_INCREF(output);
+        }
     }
-    else if (PyObject_CheckBuffer(arg)) {
-        arg = PyBytes_FromObject(arg);
-        if (!arg)
+    else {
+        path = arg;
+        Py_INCREF(arg);
+    }
+
+    if (PyUnicode_Check(path)) {
+        if (PyUnicode_READY(path) == -1) {
+            Py_DECREF(path);
             return 0;
-        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
-                                                  PyBytes_GET_SIZE(arg));
-        Py_DECREF(arg);
-        if (!output)
+        }
+        output = path;
+    }
+    else if (PyBytes_Check(path) || is_buffer) {
+        PyObject *path_bytes = NULL;
+
+        if (!PyBytes_Check(path) &&
+            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+            "path should be string, bytes, or os.PathLike, not %.200s",
+            Py_TYPE(arg)->tp_name)) {
+                Py_DECREF(path);
+            return 0;
+        }
+        path_bytes = PyBytes_FromObject(path);
+        Py_DECREF(path);
+        if (!path_bytes) {
             return 0;
-        if (!PyUnicode_Check(output)) {
-            Py_DECREF(output);
-            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
+        }
+        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
+                                                  PyBytes_GET_SIZE(path_bytes));
+        Py_DECREF(path_bytes);
+        if (!output) {
             return 0;
         }
     }
     else {
         PyErr_Format(PyExc_TypeError,
-                     "path should be string or bytes, not %.200s",
+                     "path should be string, bytes, or os.PathLike, not %.200s",
                      Py_TYPE(arg)->tp_name);
+        Py_DECREF(path);
         return 0;
     }
     if (PyUnicode_READY(output) == -1) {
@@ -3716,7 +3968,7 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
 
     if (PyUnicode_UTF8(unicode) == NULL) {
         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
-        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
+        bytes = _PyUnicode_AsUTF8String(unicode, NULL);
         if (bytes == NULL)
             return NULL;
         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
@@ -3982,7 +4234,7 @@ unicode_decode_call_errorhandler_wchar(
     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
     PyObject **output, Py_ssize_t *outpos)
 {
-    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
 
     PyObject *restuple = NULL;
     PyObject *repunicode = NULL;
@@ -4090,7 +4342,7 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
 {
-    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
 
     PyObject *restuple = NULL;
     PyObject *repunicode = NULL;
@@ -4696,8 +4948,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     const char *errmsg = "";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0) {
         if (consumed)
@@ -4722,6 +4975,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     while (s < end) {
         Py_UCS4 ch;
         int kind = writer.kind;
+
         if (kind == PyUnicode_1BYTE_KIND) {
             if (PyUnicode_IS_ASCII(writer.buffer))
                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4760,24 +5014,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
             continue;
         }
 
-        if (unicode_decode_call_errorhandler_writer(
-                errors, &errorHandler,
-                "utf-8", errmsg,
-                &starts, &end, &startinpos, &endinpos, &exc, &s,
-                &writer))
-            goto onError;
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler) {
+        case _Py_ERROR_IGNORE:
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_REPLACE:
+            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+                goto onError;
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_SURROGATEESCAPE:
+        {
+            Py_ssize_t i;
+
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            for (i=startinpos; i<endinpos; i++) {
+                ch = (Py_UCS4)(unsigned char)(starts[i]);
+                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+                                ch + 0xdc00);
+                writer.pos++;
+            }
+            s += (endinpos - startinpos);
+            break;
+        }
+
+        default:
+            if (unicode_decode_call_errorhandler_writer(
+                    errors, &error_handler_obj,
+                    "utf-8", errmsg,
+                    &starts, &end, &startinpos, &endinpos, &exc, &s,
+                    &writer))
+                goto onError;
+        }
     }
 
 End:
     if (consumed)
         *consumed = s - starts;
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
 onError:
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     _PyUnicodeWriter_Dealloc(&writer);
     return NULL;
@@ -5069,7 +5355,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     const void *data;
     Py_ssize_t len;
     PyObject *v;
-    PY_UINT32_T *out;
+    uint32_t *out;
 #if PY_LITTLE_ENDIAN
     int native_ordering = byteorder <= 0;
 #else
@@ -5100,7 +5386,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
 
     /* output buffer is 4-bytes aligned */
     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
-    out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
+    out = (uint32_t *)PyBytes_AS_STRING(v);
     if (byteorder == 0)
         *out++ = 0xFEFF;
     if (len == 0)
@@ -5166,7 +5452,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
 
         /* four bytes are reserved for each surrogate */
         if (moreunits > 1) {
-            Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
+            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
             Py_ssize_t morebytes = 4 * (moreunits - 1);
             if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
                 /* integer overflow */
@@ -5175,7 +5461,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
             }
             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
                 goto error;
-            out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
+            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
         }
 
         if (PyBytes_Check(rep)) {
@@ -5430,36 +5716,45 @@ _PyUnicode_EncodeUTF16(PyObject *str,
     if (kind == PyUnicode_4BYTE_KIND) {
         const Py_UCS4 *in = (const Py_UCS4 *)data;
         const Py_UCS4 *end = in + len;
-        while (in < end)
-            if (*in++ >= 0x10000)
+        while (in < end) {
+            if (*in++ >= 0x10000) {
                 pairs++;
+            }
+        }
     }
-    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
+    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
         return PyErr_NoMemory();
+    }
     nsize = len + pairs + (byteorder == 0);
     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
-    if (v == NULL)
+    if (v == NULL) {
         return NULL;
+    }
 
     /* output buffer is 2-bytes aligned */
     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
     out = (unsigned short *)PyBytes_AS_STRING(v);
-    if (byteorder == 0)
+    if (byteorder == 0) {
         *out++ = 0xFEFF;
-    if (len == 0)
+    }
+    if (len == 0) {
         goto done;
+    }
 
     if (kind == PyUnicode_1BYTE_KIND) {
         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
         goto done;
     }
 
-    if (byteorder < 0)
+    if (byteorder < 0) {
         encoding = "utf-16-le";
-    else if (byteorder > 0)
+    }
+    else if (byteorder > 0) {
         encoding = "utf-16-be";
-    else
+    }
+    else {
         encoding = "utf-16";
+    }
 
     pos = 0;
     while (pos < len) {
@@ -5575,61 +5870,6 @@ PyUnicode_AsUTF16String(PyObject *unicode)
 
 /* --- Unicode Escape Codec ----------------------------------------------- */
 
-/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
-   if all the escapes in the string make it still a valid ASCII string.
-   Returns -1 if any escapes were found which cause the string to
-   pop out of ASCII range.  Otherwise returns the length of the
-   required buffer to hold the string.
-   */
-static Py_ssize_t
-length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
-{
-    const unsigned char *p = (const unsigned char *)s;
-    const unsigned char *end = p + size;
-    Py_ssize_t length = 0;
-
-    if (size < 0)
-        return -1;
-
-    for (; p < end; ++p) {
-        if (*p > 127) {
-            /* Non-ASCII */
-            return -1;
-        }
-        else if (*p != '\\') {
-            /* Normal character */
-            ++length;
-        }
-        else {
-            /* Backslash-escape, check next char */
-            ++p;
-            /* Escape sequence reaches till end of string or
-               non-ASCII follow-up. */
-            if (p >= end || *p > 127)
-                return -1;
-            switch (*p) {
-            case '\n':
-                /* backslash + \n result in zero characters */
-                break;
-            case '\\': case '\'': case '\"':
-            case 'b': case 'f': case 't':
-            case 'n': case 'r': case 'v': case 'a':
-                ++length;
-                break;
-            case '0': case '1': case '2': case '3':
-            case '4': case '5': case '6': case '7':
-            case 'x': case 'u': case 'U': case 'N':
-                /* these do not guarantee ASCII characters */
-                return -1;
-            default:
-                /* count the backslash + the other character */
-                length += 2;
-            }
-        }
-    }
-    return length;
-}
-
 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
 
 PyObject *
@@ -5638,218 +5878,212 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
                               const char *errors)
 {
     const char *starts = s;
-    Py_ssize_t startinpos;
-    Py_ssize_t endinpos;
     _PyUnicodeWriter writer;
     const char *end;
-    char* message;
-    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
-    Py_ssize_t len;
 
-    len = length_of_escaped_ascii_string(s, size);
-    if (len == 0)
+    if (size == 0) {
         _Py_RETURN_UNICODE_EMPTY();
-
-    /* After length_of_escaped_ascii_string() there are two alternatives,
-       either the string is pure ASCII with named escapes like \n, etc.
-       and we determined it's exact size (common case)
-       or it contains \x, \u, ... escape sequences.  then we create a
-       legacy wchar string and resize it at the end of this function. */
-    _PyUnicodeWriter_Init(&writer);
-    if (len > 0) {
-        writer.min_length = len;
     }
-    else {
-        /* Escaped strings will always be longer than the resulting
-           Unicode string, so we start with size here and then reduce the
-           length after conversion to the true value.
-           (but if the error callback returns a long replacement string
-           we'll have to allocate more space) */
-        writer.min_length = size;
+    /* Escaped strings will always be longer than the resulting
+       Unicode string, so we start with size here and then reduce the
+       length after conversion to the true value.
+       (but if the error callback returns a long replacement string
+       we'll have to allocate more space) */
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = size;
+    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
+        goto onError;
     }
 
-    if (size == 0)
-        return _PyUnicodeWriter_Finish(&writer);
     end = s + size;
-
     while (s < end) {
-        unsigned char c;
-        Py_UCS4 x;
-        int digits;
+        unsigned char c = (unsigned char) *s++;
+        Py_UCS4 ch;
+        int count;
+        Py_ssize_t startinpos;
+        Py_ssize_t endinpos;
+        const char *message;
+
+#define WRITE_ASCII_CHAR(ch)                                                  \
+            do {                                                              \
+                assert(ch <= 127);                                            \
+                assert(writer.pos < writer.size);                             \
+                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
+            } while(0)
+
+#define WRITE_CHAR(ch)                                                        \
+            do {                                                              \
+                if (ch <= writer.maxchar) {                                   \
+                    assert(writer.pos < writer.size);                         \
+                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
+                }                                                             \
+                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
+                    goto onError;                                             \
+                }                                                             \
+            } while(0)
 
         /* Non-escape characters are interpreted as Unicode ordinals */
-        if (*s != '\\') {
-            x = (unsigned char)*s;
-            s++;
-            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
-                goto onError;
+        if (c != '\\') {
+            WRITE_CHAR(c);
             continue;
         }
 
-        startinpos = s-starts;
+        startinpos = s - starts - 1;
         /* \ - Escapes */
-        s++;
-        c = *s++;
-        if (s > end)
-            c = '\0'; /* Invalid after \ */
+        if (s >= end) {
+            message = "\\ at end of string";
+            goto error;
+        }
+        c = (unsigned char) *s++;
 
+        assert(writer.pos < writer.size);
         switch (c) {
 
             /* \x escapes */
-#define WRITECHAR(ch)                                                      \
-            do {                                                           \
-                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
-                    goto onError;                                          \
-            } while(0)
-
-        case '\n': break;
-        case '\\': WRITECHAR('\\'); break;
-        case '\'': WRITECHAR('\''); break;
-        case '\"': WRITECHAR('\"'); break;
-        case 'b': WRITECHAR('\b'); break;
+        case '\n': continue;
+        case '\\': WRITE_ASCII_CHAR('\\'); continue;
+        case '\'': WRITE_ASCII_CHAR('\''); continue;
+        case '\"': WRITE_ASCII_CHAR('\"'); continue;
+        case 'b': WRITE_ASCII_CHAR('\b'); continue;
         /* FF */
-        case 'f': WRITECHAR('\014'); break;
-        case 't': WRITECHAR('\t'); break;
-        case 'n': WRITECHAR('\n'); break;
-        case 'r': WRITECHAR('\r'); break;
+        case 'f': WRITE_ASCII_CHAR('\014'); continue;
+        case 't': WRITE_ASCII_CHAR('\t'); continue;
+        case 'n': WRITE_ASCII_CHAR('\n'); continue;
+        case 'r': WRITE_ASCII_CHAR('\r'); continue;
         /* VT */
-        case 'v': WRITECHAR('\013'); break;
+        case 'v': WRITE_ASCII_CHAR('\013'); continue;
         /* BEL, not classic C */
-        case 'a': WRITECHAR('\007'); break;
+        case 'a': WRITE_ASCII_CHAR('\007'); continue;
 
             /* \OOO (octal) escapes */
         case '0': case '1': case '2': case '3':
         case '4': case '5': case '6': case '7':
-            x = s[-1] - '0';
+            ch = c - '0';
             if (s < end && '0' <= *s && *s <= '7') {
-                x = (x<<3) + *s++ - '0';
-                if (s < end && '0' <= *s && *s <= '7')
-                    x = (x<<3) + *s++ - '0';
+                ch = (ch<<3) + *s++ - '0';
+                if (s < end && '0' <= *s && *s <= '7') {
+                    ch = (ch<<3) + *s++ - '0';
+                }
             }
-            WRITECHAR(x);
-            break;
+            WRITE_CHAR(ch);
+            continue;
 
             /* hex escapes */
             /* \xXX */
         case 'x':
-            digits = 2;
+            count = 2;
             message = "truncated \\xXX escape";
             goto hexescape;
 
             /* \uXXXX */
         case 'u':
-            digits = 4;
+            count = 4;
             message = "truncated \\uXXXX escape";
             goto hexescape;
 
             /* \UXXXXXXXX */
         case 'U':
-            digits = 8;
+            count = 8;
             message = "truncated \\UXXXXXXXX escape";
         hexescape:
-            chr = 0;
-            if (end - s < digits) {
-                /* count only hex digits */
-                for (; s < end; ++s) {
-                    c = (unsigned char)*s;
-                    if (!Py_ISXDIGIT(c))
-                        goto error;
+            for (ch = 0; count && s < end; ++s, --count) {
+                c = (unsigned char)*s;
+                ch <<= 4;
+                if (c >= '0' && c <= '9') {
+                    ch += c - '0';
+                }
+                else if (c >= 'a' && c <= 'f') {
+                    ch += c - ('a' - 10);
+                }
+                else if (c >= 'A' && c <= 'F') {
+                    ch += c - ('A' - 10);
+                }
+                else {
+                    break;
                 }
-                goto error;
             }
-            for (; digits--; ++s) {
-                c = (unsigned char)*s;
-                if (!Py_ISXDIGIT(c))
-                    goto error;
-                chr = (chr<<4) & ~0xF;
-                if (c >= '0' && c <= '9')
-                    chr += c - '0';
-                else if (c >= 'a' && c <= 'f')
-                    chr += 10 + c - 'a';
-                else
-                    chr += 10 + c - 'A';
+            if (count) {
+                goto error;
             }
-            if (chr == 0xffffffff && PyErr_Occurred())
-                /* _decoding_error will have already written into the
-                   target buffer. */
-                break;
-        store:
-            /* when we get here, chr is a 32-bit unicode character */
-            message = "illegal Unicode character";
-            if (chr > MAX_UNICODE)
+
+            /* when we get here, ch is a 32-bit unicode character */
+            if (ch > MAX_UNICODE) {
+                message = "illegal Unicode character";
                 goto error;
-            WRITECHAR(chr);
-            break;
+            }
+
+            WRITE_CHAR(ch);
+            continue;
 
             /* \N{name} */
         case 'N':
-            message = "malformed \\N character escape";
             if (ucnhash_CAPI == NULL) {
                 /* load the unicode data module */
                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                 PyUnicodeData_CAPSULE_NAME, 1);
-                if (ucnhash_CAPI == NULL)
-                    goto ucnhashError;
+                if (ucnhash_CAPI == NULL) {
+                    PyErr_SetString(
+                        PyExc_UnicodeError,
+                        "\\N escapes not supported (can't load unicodedata module)"
+                        );
+                    goto onError;
+                }
             }
+
+            message = "malformed \\N character escape";
-            if (*s == '{') {
+            if (s < end && *s == '{') {   /* "\N" may end the input */
-                const char *start = s+1;
+                const char *start = ++s;
+                size_t namelen;
                 /* look for the closing brace */
-                while (*s != '}' && s < end)
+                while (s < end && *s != '}')
                     s++;
-                if (s > start && s < end && *s == '}') {
+                namelen = s - start;
+                if (namelen && s < end) {
                     /* found a name.  look it up in the unicode database */
-                    message = "unknown Unicode character name";
                     s++;
-                    if (s - start - 1 <= INT_MAX &&
-                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
-                                              &chr, 0))
-                        goto store;
+                    ch = 0xffffffff; /* in case 'getcode' messes up */
+                    if (namelen <= INT_MAX &&
+                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
+                                              &ch, 0)) {
+                        assert(ch <= MAX_UNICODE);
+                        WRITE_CHAR(ch);
+                        continue;
+                    }
+                    message = "unknown Unicode character name";
                 }
             }
             goto error;
 
         default:
-            if (s > end) {
-                message = "\\ at end of string";
-                s--;
-                goto error;
-            }
-            else {
-                WRITECHAR('\\');
-                WRITECHAR((unsigned char)s[-1]);
-            }
-            break;
+            WRITE_ASCII_CHAR('\\');
+            WRITE_CHAR(c);
+            continue;
         }
-        continue;
 
       error:
         endinpos = s-starts;
+        writer.min_length = end - s + writer.pos;
         if (unicode_decode_call_errorhandler_writer(
                 errors, &errorHandler,
                 "unicodeescape", message,
                 &starts, &end, &startinpos, &endinpos, &exc, &s,
-                &writer))
+                &writer)) {
             goto onError;
-        continue;
+        }
+        if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
+            goto onError;
+        }
+
+#undef WRITE_ASCII_CHAR
+#undef WRITE_CHAR
     }
-#undef WRITECHAR
 
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
-  ucnhashError:
-    PyErr_SetString(
-        PyExc_UnicodeError,
-        "\\N escapes not supported (can't load unicodedata module)"
-        );
-    _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
-    Py_XDECREF(exc);
-    return NULL;
-
   onError:
     _PyUnicodeWriter_Dealloc(&writer);
     Py_XDECREF(errorHandler);
@@ -5870,9 +6104,9 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
     Py_ssize_t i, len;
     PyObject *repr;
     char *p;
-    int kind;
+    enum PyUnicode_Kind kind;
     void *data;
-    Py_ssize_t expandsize = 0;
+    Py_ssize_t expandsize;
 
     /* Initial allocation is based on the longest-possible character
        escape.
@@ -5886,60 +6120,71 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
         PyErr_BadArgument();
         return NULL;
     }
-    if (PyUnicode_READY(unicode) == -1)
+    if (PyUnicode_READY(unicode) == -1) {
         return NULL;
-    len = PyUnicode_GET_LENGTH(unicode);
-    kind = PyUnicode_KIND(unicode);
-    data = PyUnicode_DATA(unicode);
-    switch (kind) {
-    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
-    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
-    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
     }
 
-    if (len == 0)
+    len = PyUnicode_GET_LENGTH(unicode);
+    if (len == 0) {
         return PyBytes_FromStringAndSize(NULL, 0);
+    }
 
-    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
+       bytes, and 1 byte characters 4. */
+    expandsize = kind * 2 + 2;
+    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
         return PyErr_NoMemory();
-
-    repr = PyBytes_FromStringAndSize(NULL,
-                                     2
-                                     + expandsize*len
-                                     + 1);
-    if (repr == NULL)
+    }
+    repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
+    if (repr == NULL) {
         return NULL;
+    }
 
     p = PyBytes_AS_STRING(repr);
-
     for (i = 0; i < len; i++) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
 
-        /* Escape backslashes */
-        if (ch == '\\') {
-            *p++ = '\\';
-            *p++ = (char) ch;
-            continue;
-        }
+        /* U+0000-U+00ff range */
+        if (ch < 0x100) {
+            if (ch >= ' ' && ch < 127) {
+                if (ch != '\\') {
+                    /* Copy printable US ASCII as-is */
+                    *p++ = (char) ch;
+                }
+                /* Escape backslashes */
+                else {
+                    *p++ = '\\';
+                    *p++ = '\\';
+                }
+            }
 
-        /* Map 21-bit characters to '\U00xxxxxx' */
-        else if (ch >= 0x10000) {
-            assert(ch <= MAX_UNICODE);
-            *p++ = '\\';
-            *p++ = 'U';
-            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
-            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
-            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
-            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
-            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
-            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
-            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
-            *p++ = Py_hexdigits[ch & 0x0000000F];
-            continue;
-        }
+            /* Map special whitespace to '\t', '\n', '\r' */
+            else if (ch == '\t') {
+                *p++ = '\\';
+                *p++ = 't';
+            }
+            else if (ch == '\n') {
+                *p++ = '\\';
+                *p++ = 'n';
+            }
+            else if (ch == '\r') {
+                *p++ = '\\';
+                *p++ = 'r';
+            }
 
-        /* Map 16-bit characters to '\uxxxx' */
-        if (ch >= 256) {
+            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
+            else {
+                *p++ = '\\';
+                *p++ = 'x';
+                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
+                *p++ = Py_hexdigits[ch & 0x000F];
+            }
+        }
+        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
+        else if (ch < 0x10000) {
+            /* U+0100-U+ffff */
             *p++ = '\\';
             *p++ = 'u';
             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
@@ -5947,37 +6192,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
             *p++ = Py_hexdigits[ch & 0x000F];
         }
+        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
+        else {
 
-        /* Map special whitespace to '\t', \n', '\r' */
-        else if (ch == '\t') {
-            *p++ = '\\';
-            *p++ = 't';
-        }
-        else if (ch == '\n') {
-            *p++ = '\\';
-            *p++ = 'n';
-        }
-        else if (ch == '\r') {
-            *p++ = '\\';
-            *p++ = 'r';
-        }
-
-        /* Map non-printable US ASCII to '\xhh' */
-        else if (ch < ' ' || ch >= 0x7F) {
+            /* Make sure that the first two digits are zero */
+            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
             *p++ = '\\';
-            *p++ = 'x';
-            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
-            *p++ = Py_hexdigits[ch & 0x000F];
+            *p++ = 'U';
+            *p++ = '0';
+            *p++ = '0';
+            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
+            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
+            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
+            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
+            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
+            *p++ = Py_hexdigits[ch & 0x0000000F];
         }
-
-        /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
     }
 
     assert(p - PyBytes_AS_STRING(repr) > 0);
-    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
+    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
         return NULL;
+    }
     return repr;
 }
 
@@ -5987,8 +6223,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
 {
     PyObject *result;
     PyObject *tmp = PyUnicode_FromUnicode(s, size);
-    if (tmp == NULL)
+    if (tmp == NULL) {
         return NULL;
+    }
+
     result = PyUnicode_AsUnicodeEscapeString(tmp);
     Py_DECREF(tmp);
     return result;
@@ -6002,95 +6240,107 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                  const char *errors)
 {
     const char *starts = s;
-    Py_ssize_t startinpos;
-    Py_ssize_t endinpos;
     _PyUnicodeWriter writer;
     const char *end;
-    const char *bs;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 
-    if (size == 0)
+    if (size == 0) {
         _Py_RETURN_UNICODE_EMPTY();
+    }
 
     /* Escaped strings will always be longer than the resulting
        Unicode string, so we start with size here and then reduce the
        length after conversion to the true value. (But decoding error
        handler might have to resize the string) */
     _PyUnicodeWriter_Init(&writer);
-    writer.min_length = size;
+     writer.min_length = size;
+    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
+        goto onError;
+    }
 
     end = s + size;
     while (s < end) {
-        unsigned char c;
-        Py_UCS4 x;
-        int i;
+        unsigned char c = (unsigned char) *s++;
+        Py_UCS4 ch;
         int count;
+        Py_ssize_t startinpos;
+        Py_ssize_t endinpos;
+        const char *message;
+
+#define WRITE_CHAR(ch)                                                        \
+            do {                                                              \
+                if (ch <= writer.maxchar) {                                   \
+                    assert(writer.pos < writer.size);                         \
+                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
+                }                                                             \
+                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
+                    goto onError;                                             \
+                }                                                             \
+            } while(0)
 
         /* Non-escape characters are interpreted as Unicode ordinals */
-        if (*s != '\\') {
-            x = (unsigned char)*s++;
-            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
-                goto onError;
+        if (c != '\\' || s >= end) {
+            WRITE_CHAR(c);
             continue;
         }
-        startinpos = s-starts;
 
-        /* \u-escapes are only interpreted iff the number of leading
-           backslashes if odd */
-        bs = s;
-        for (;s < end;) {
-            if (*s != '\\')
-                break;
-            x = (unsigned char)*s++;
-            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
-                goto onError;
+        c = (unsigned char) *s++;
+        if (c == 'u') {
+            count = 4;
+            message = "truncated \\uXXXX escape";
         }
-        if (((s - bs) & 1) == 0 ||
-            s >= end ||
-            (*s != 'u' && *s != 'U')) {
+        else if (c == 'U') {
+            count = 8;
+            message = "truncated \\UXXXXXXXX escape";
+        }
+        else {
+            assert(writer.pos < writer.size);
+            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
+            WRITE_CHAR(c);
             continue;
         }
-        writer.pos--;
-        count = *s=='u' ? 4 : 8;
-        s++;
+        startinpos = s - starts - 2;
 
-        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
-        for (x = 0, i = 0; i < count; ++i, ++s) {
+        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
+        for (ch = 0; count && s < end; ++s, --count) {
             c = (unsigned char)*s;
-            if (!Py_ISXDIGIT(c)) {
-                endinpos = s-starts;
-                if (unicode_decode_call_errorhandler_writer(
-                        errors, &errorHandler,
-                        "rawunicodeescape", "truncated \\uXXXX",
-                        &starts, &end, &startinpos, &endinpos, &exc, &s,
-                        &writer))
-                    goto onError;
-                goto nextByte;
+            ch <<= 4;
+            if (c >= '0' && c <= '9') {
+                ch += c - '0';
+            }
+            else if (c >= 'a' && c <= 'f') {
+                ch += c - ('a' - 10);
+            }
+            else if (c >= 'A' && c <= 'F') {
+                ch += c - ('A' - 10);
+            }
+            else {
+                break;
             }
-            x = (x<<4) & ~0xF;
-            if (c >= '0' && c <= '9')
-                x += c - '0';
-            else if (c >= 'a' && c <= 'f')
-                x += 10 + c - 'a';
-            else
-                x += 10 + c - 'A';
         }
-        if (x <= MAX_UNICODE) {
-            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
-                goto onError;
+        if (!count) {
+            if (ch <= MAX_UNICODE) {
+                WRITE_CHAR(ch);
+                continue;
+            }
+            message = "\\Uxxxxxxxx out of range";
         }
-        else {
-            endinpos = s-starts;
-            if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
-                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
-                    &starts, &end, &startinpos, &endinpos, &exc, &s,
-                    &writer))
-                goto onError;
+
+        endinpos = s-starts;
+        writer.min_length = end - s + writer.pos;
+        if (unicode_decode_call_errorhandler_writer(
+                errors, &errorHandler,
+                "rawunicodeescape", message,
+                &starts, &end, &startinpos, &endinpos, &exc, &s,
+                &writer)) {
+            goto onError;
         }
-      nextByte:
-        ;
+        if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
+            goto onError;
+        }
+
+#undef WRITE_CHAR
     }
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
@@ -6101,6 +6351,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
     return NULL;
+
 }
 
 
@@ -6109,7 +6360,6 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
 {
     PyObject *repr;
     char *p;
-    char *q;
     Py_ssize_t expandsize, pos;
     int kind;
     void *data;
@@ -6119,58 +6369,68 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
         PyErr_BadArgument();
         return NULL;
     }
-    if (PyUnicode_READY(unicode) == -1)
+    if (PyUnicode_READY(unicode) == -1) {
         return NULL;
+    }
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
     len = PyUnicode_GET_LENGTH(unicode);
+    if (kind == PyUnicode_1BYTE_KIND) {
+        return PyBytes_FromStringAndSize(data, len);
+    }
+
     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
        bytes, and 1 byte characters 4. */
     expandsize = kind * 2 + 2;
 
-    if (len > PY_SSIZE_T_MAX / expandsize)
+    if (len > PY_SSIZE_T_MAX / expandsize) {
         return PyErr_NoMemory();
-
+    }
     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
-    if (repr == NULL)
+    if (repr == NULL) {
         return NULL;
-    if (len == 0)
+    }
+    if (len == 0) {
         return repr;
+    }
 
-    p = q = PyBytes_AS_STRING(repr);
+    p = PyBytes_AS_STRING(repr);
     for (pos = 0; pos < len; pos++) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
-        /* Map 32-bit characters to '\Uxxxxxxxx' */
-        if (ch >= 0x10000) {
-            assert(ch <= MAX_UNICODE);
+
+        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
+        if (ch < 0x100) {
+            *p++ = (char) ch;
+        }
+        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
+        else if (ch < 0x10000) {
             *p++ = '\\';
-            *p++ = 'U';
-            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
-            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
-            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
-            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
+            *p++ = 'u';
             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
             *p++ = Py_hexdigits[ch & 15];
         }
-        /* Map 16-bit characters to '\uxxxx' */
-        else if (ch >= 256) {
+        /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
+        else {
+            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
             *p++ = '\\';
-            *p++ = 'u';
+            *p++ = 'U';
+            *p++ = '0';
+            *p++ = '0';
+            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
+            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
             *p++ = Py_hexdigits[ch & 15];
         }
-        /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
     }
 
-    assert(p > q);
-    if (_PyBytes_Resize(&repr, p - q) < 0)
+    assert(p > PyBytes_AS_STRING(repr));
+    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
         return NULL;
+    }
     return repr;
 }
 
@@ -6348,7 +6608,7 @@ unicode_encode_call_errorhandler(const char *errors,
                                  Py_ssize_t startpos, Py_ssize_t endpos,
                                  Py_ssize_t *newpos)
 {
-    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
+    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
     Py_ssize_t len;
     PyObject *restuple;
     PyObject *resunicode;
@@ -6402,25 +6662,22 @@ unicode_encode_call_errorhandler(const char *errors,
 static PyObject *
 unicode_encode_ucs1(PyObject *unicode,
                     const char *errors,
-                    unsigned int limit)
+                    const Py_UCS4 limit)
 {
     /* input state */
     Py_ssize_t pos=0, size;
     int kind;
     void *data;
-    /* output object */
-    PyObject *res;
     /* pointer into the output */
     char *str;
-    /* current output position */
-    Py_ssize_t ressize;
     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+    PyObject *rep = NULL;
+    /* output object */
+    _PyBytesWriter writer;
 
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
@@ -6431,186 +6688,157 @@ unicode_encode_ucs1(PyObject *unicode,
        replacements, if we need more, we'll resize */
     if (size == 0)
         return PyBytes_FromStringAndSize(NULL, 0);
-    res = PyBytes_FromStringAndSize(NULL, size);
-    if (res == NULL)
+
+    _PyBytesWriter_Init(&writer);
+    str = _PyBytesWriter_Alloc(&writer, size);
+    if (str == NULL)
         return NULL;
-    str = PyBytes_AS_STRING(res);
-    ressize = size;
 
     while (pos < size) {
-        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
+        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
 
         /* can we encode this? */
-        if (c<limit) {
+        if (ch < limit) {
             /* no overflow check, because we know that the space is enough */
-            *str++ = (char)c;
+            *str++ = (char)ch;
             ++pos;
         }
         else {
-            Py_ssize_t requiredsize;
-            PyObject *repunicode;
-            Py_ssize_t repsize, newpos, respos, i;
+            Py_ssize_t newpos, i;
             /* startpos for collecting unencodable chars */
             Py_ssize_t collstart = pos;
-            Py_ssize_t collend = pos;
+            Py_ssize_t collend = collstart + 1;
             /* find all unecodable characters */
+
             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                 ++collend;
+
+            /* Only overallocate the buffer if it's not the last write */
+            writer.overallocate = (collend < size);
+
             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
-            if (known_errorHandler==-1) {
-                if ((errors==NULL) || (!strcmp(errors, "strict")))
-                    known_errorHandler = 1;
-                else if (!strcmp(errors, "replace"))
-                    known_errorHandler = 2;
-                else if (!strcmp(errors, "ignore"))
-                    known_errorHandler = 3;
-                else if (!strcmp(errors, "xmlcharrefreplace"))
-                    known_errorHandler = 4;
-                else
-                    known_errorHandler = 0;
-            }
-            switch (known_errorHandler) {
-            case 1: /* strict */
+            if (error_handler == _Py_ERROR_UNKNOWN)
+                error_handler = get_error_handler(errors);
+
+            switch (error_handler) {
+            case _Py_ERROR_STRICT:
                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                 goto onError;
-            case 2: /* replace */
-                while (collstart++ < collend)
-                    *str++ = '?'; /* fall through */
-            case 3: /* ignore */
+
+            case _Py_ERROR_REPLACE:
+                memset(str, '?', collend - collstart);
+                str += (collend - collstart);
+                /* fall through ignore error handler */
+            case _Py_ERROR_IGNORE:
+                pos = collend;
+                break;
+
+            case _Py_ERROR_BACKSLASHREPLACE:
+                /* subtract preallocated bytes */
+                writer.min_size -= (collend - collstart);
+                str = backslashreplace(&writer, str,
+                                       unicode, collstart, collend);
+                if (str == NULL)
+                    goto onError;
                 pos = collend;
                 break;
-            case 4: /* xmlcharrefreplace */
-                respos = str - PyBytes_AS_STRING(res);
-                requiredsize = respos;
-                /* determine replacement size */
+
+            case _Py_ERROR_XMLCHARREFREPLACE:
+                /* subtract preallocated bytes */
+                writer.min_size -= (collend - collstart);
+                str = xmlcharrefreplace(&writer, str,
+                                        unicode, collstart, collend);
+                if (str == NULL)
+                    goto onError;
+                pos = collend;
+                break;
+
+            case _Py_ERROR_SURROGATEESCAPE:
                 for (i = collstart; i < collend; ++i) {
-                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-                    Py_ssize_t incr;
-                    if (ch < 10)
-                        incr = 2+1+1;
-                    else if (ch < 100)
-                        incr = 2+2+1;
-                    else if (ch < 1000)
-                        incr = 2+3+1;
-                    else if (ch < 10000)
-                        incr = 2+4+1;
-                    else if (ch < 100000)
-                        incr = 2+5+1;
-                    else if (ch < 1000000)
-                        incr = 2+6+1;
-                    else {
-                        assert(ch <= MAX_UNICODE);
-                        incr = 2+7+1;
+                    ch = PyUnicode_READ(kind, data, i);
+                    if (ch < 0xdc80 || 0xdcff < ch) {
+                        /* Not a UTF-8b surrogate */
+                        break;
                     }
-                    if (requiredsize > PY_SSIZE_T_MAX - incr)
-                        goto overflow;
-                    requiredsize += incr;
-                }
-                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
-                    goto overflow;
-                requiredsize += size - collend;
-                if (requiredsize > ressize) {
-                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
-                        requiredsize = 2*ressize;
-                    if (_PyBytes_Resize(&res, requiredsize))
-                        goto onError;
-                    str = PyBytes_AS_STRING(res) + respos;
-                    ressize = requiredsize;
+                    *str++ = (char)(ch - 0xdc00);
+                    ++pos;
                 }
-                /* generate replacement */
-                for (i = collstart; i < collend; ++i) {
-                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
-                }
-                pos = collend;
-                break;
+                if (i >= collend)
+                    break;
+                collstart = pos;
+                assert(collstart != collend);
+                /* fallback to general error handling */
+
             default:
-                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
-                                                              encoding, reason, unicode, &exc,
-                                                              collstart, collend, &newpos);
-                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
-                                           PyUnicode_READY(repunicode) == -1))
+                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
+                                                       encoding, reason, unicode, &exc,
+                                                       collstart, collend, &newpos);
+                if (rep == NULL)
                     goto onError;
-                if (PyBytes_Check(repunicode)) {
+
+                /* subtract preallocated bytes */
+                writer.min_size -= 1;
+
+                if (PyBytes_Check(rep)) {
                     /* Directly copy bytes result to output. */
-                    repsize = PyBytes_Size(repunicode);
-                    if (repsize > 1) {
-                        /* Make room for all additional bytes. */
-                        respos = str - PyBytes_AS_STRING(res);
-                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
-                            Py_DECREF(repunicode);
-                            goto overflow;
-                        }
-                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
-                            Py_DECREF(repunicode);
-                            goto onError;
-                        }
-                        str = PyBytes_AS_STRING(res) + respos;
-                        ressize += repsize-1;
-                    }
-                    memcpy(str, PyBytes_AsString(repunicode), repsize);
-                    str += repsize;
-                    pos = newpos;
-                    Py_DECREF(repunicode);
-                    break;
-                }
-                /* need more space? (at least enough for what we
-                   have+the replacement+the rest of the string, so
-                   we won't have to check space for encodable characters) */
-                respos = str - PyBytes_AS_STRING(res);
-                repsize = PyUnicode_GET_LENGTH(repunicode);
-                requiredsize = respos;
-                if (requiredsize > PY_SSIZE_T_MAX - repsize)
-                    goto overflow;
-                requiredsize += repsize;
-                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
-                    goto overflow;
-                requiredsize += size - collend;
-                if (requiredsize > ressize) {
-                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
-                        requiredsize = 2*ressize;
-                    if (_PyBytes_Resize(&res, requiredsize)) {
-                        Py_DECREF(repunicode);
+                    str = _PyBytesWriter_WriteBytes(&writer, str,
+                                                    PyBytes_AS_STRING(rep),
+                                                    PyBytes_GET_SIZE(rep));
+                    if (str == NULL)
                         goto onError;
-                    }
-                    str = PyBytes_AS_STRING(res) + respos;
-                    ressize = requiredsize;
                 }
-                /* check if there is anything unencodable in the replacement
-                   and copy it to the output */
-                for (i = 0; repsize-->0; ++i, ++str) {
-                    c = PyUnicode_READ_CHAR(repunicode, i);
-                    if (c >= limit) {
-                        raise_encode_exception(&exc, encoding, unicode,
-                                               pos, pos+1, reason);
-                        Py_DECREF(repunicode);
+                else {
+                    assert(PyUnicode_Check(rep));
+
+                    if (PyUnicode_READY(rep) < 0)
                         goto onError;
+
+                    if (PyUnicode_IS_ASCII(rep)) {
+                        /* Fast path: all characters are smaller than limit */
+                        assert(limit >= 128);
+                        assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+                        str = _PyBytesWriter_WriteBytes(&writer, str,
+                                                        PyUnicode_DATA(rep),
+                                                        PyUnicode_GET_LENGTH(rep));
+                    }
+                    else {
+                        Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
+
+                        str = _PyBytesWriter_Prepare(&writer, str, repsize);
+                        if (str == NULL)
+                            goto onError;
+
+                        /* check if there is anything unencodable in the
+                           replacement and copy it to the output */
+                        for (i = 0; repsize-->0; ++i, ++str) {
+                            ch = PyUnicode_READ_CHAR(rep, i);
+                            if (ch >= limit) {
+                                raise_encode_exception(&exc, encoding, unicode,
+                                                       pos, pos+1, reason);
+                                goto onError;
+                            }
+                            *str = (char)ch;
+                        }
                     }
-                    *str = (char)c;
                 }
                 pos = newpos;
-                Py_DECREF(repunicode);
+                Py_CLEAR(rep);
             }
+
+            /* If overallocation was disabled, ensure that it was the last
+               write. Otherwise, we missed an optimization */
+            assert(writer.overallocate || pos == size);
         }
     }
-    /* Resize if we allocated to much */
-    size = str - PyBytes_AS_STRING(res);
-    if (size < ressize) { /* If this falls res will be NULL */
-        assert(size >= 0);
-        if (_PyBytes_Resize(&res, size) < 0)
-            goto onError;
-    }
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
-    return res;
-
-  overflow:
-    PyErr_SetString(PyExc_OverflowError,
-                    "encoded result is too long for a Python string");
+    return _PyBytesWriter_Finish(&writer, str);
 
   onError:
-    Py_XDECREF(res);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(rep);
+    _PyBytesWriter_Dealloc(&writer);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }
@@ -6670,8 +6898,9 @@ PyUnicode_DecodeASCII(const char *s,
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6700,12 +6929,42 @@ PyUnicode_DecodeASCII(const char *s,
             PyUnicode_WRITE(kind, data, writer.pos, c);
             writer.pos++;
             ++s;
+            continue;
         }
-        else {
+
+        /* byte outside range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we may switch to UCS2 at the first write */
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            kind = writer.kind;
+            data = writer.data;
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                     "ascii", "ordinal not in range(128)",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
                     &writer))
@@ -6714,13 +6973,13 @@ PyUnicode_DecodeASCII(const char *s,
             data = writer.data;
         }
     }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
   onError:
     _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }
@@ -6775,7 +7034,7 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #  define WC_ERR_INVALID_CHARS 0x0080
 #endif
 
-static char*
+static const char*
 code_page_name(UINT code_page, PyObject **obj)
 {
     *obj = NULL;
@@ -6883,7 +7142,7 @@ decode_code_page_errors(UINT code_page,
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *encoding_obj = NULL;
-    char *encoding;
+    const char *encoding;
     DWORD err;
     int ret = -1;
 
@@ -7119,7 +7378,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
     BOOL usedDefaultChar = FALSE;
     BOOL *pusedDefaultChar = &usedDefaultChar;
     int outsize;
-    PyObject *exc = NULL;
     wchar_t *p;
     Py_ssize_t size;
     const DWORD flags = encode_code_page_flags(code_page, NULL);
@@ -7228,7 +7486,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *encoding_obj = NULL;
-    char *encoding;
+    const char *encoding;
     Py_ssize_t newpos, newoutsize;
     PyObject *rep;
     int ret = -1;
@@ -8086,7 +8344,7 @@ static int
 charmap_encoding_error(
     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
     PyObject **exceptionObject,
-    int *known_errorHandler, PyObject **errorHandler, const char *errors,
+    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
     PyObject **res, Py_ssize_t *respos)
 {
     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
@@ -8133,23 +8391,15 @@ charmap_encoding_error(
     }
     /* cache callback name lookup
      * (if not done yet, i.e. it's the first error) */
-    if (*known_errorHandler==-1) {
-        if ((errors==NULL) || (!strcmp(errors, "strict")))
-            *known_errorHandler = 1;
-        else if (!strcmp(errors, "replace"))
-            *known_errorHandler = 2;
-        else if (!strcmp(errors, "ignore"))
-            *known_errorHandler = 3;
-        else if (!strcmp(errors, "xmlcharrefreplace"))
-            *known_errorHandler = 4;
-        else
-            *known_errorHandler = 0;
-    }
-    switch (*known_errorHandler) {
-    case 1: /* strict */
+    if (*error_handler == _Py_ERROR_UNKNOWN)
+        *error_handler = get_error_handler(errors);
+
+    switch (*error_handler) {
+    case _Py_ERROR_STRICT:
         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
         return -1;
-    case 2: /* replace */
+
+    case _Py_ERROR_REPLACE:
         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
             x = charmapencode_output('?', mapping, res, respos);
             if (x==enc_EXCEPTION) {
@@ -8161,10 +8411,11 @@ charmap_encoding_error(
             }
         }
         /* fall through */
-    case 3: /* ignore */
+    case _Py_ERROR_IGNORE:
         *inpos = collendpos;
         break;
-    case 4: /* xmlcharrefreplace */
+
+    case _Py_ERROR_XMLCHARREFREPLACE:
         /* generate replacement (temporarily (mis)uses p) */
         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
             char buffer[2+29+1+1];
@@ -8182,8 +8433,9 @@ charmap_encoding_error(
         }
         *inpos = collendpos;
         break;
+
     default:
-        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
+        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                       encoding, reason, unicode, exceptionObject,
                                                       collstartpos, collendpos, &newpos);
         if (repunicode == NULL)
@@ -8246,12 +8498,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
     Py_ssize_t size;
     /* current output position */
     Py_ssize_t respos = 0;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
-     * 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
     void *data;
     int kind;
 
@@ -8282,7 +8531,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
         if (x==enc_FAILED) { /* unencodable character */
             if (charmap_encoding_error(unicode, &inpos, mapping,
                                        &exc,
-                                       &known_errorHandler, &errorHandler, errors,
+                                       &error_handler, &error_handler_obj, errors,
                                        &res, &respos)) {
                 goto onError;
             }
@@ -8298,13 +8547,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
             goto onError;
 
     Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     return res;
 
   onError:
     Py_XDECREF(res);
     Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     return NULL;
 }
 
@@ -8371,7 +8620,7 @@ unicode_translate_call_errorhandler(const char *errors,
                                     Py_ssize_t startpos, Py_ssize_t endpos,
                                     Py_ssize_t *newpos)
 {
-    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
+    static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
 
     Py_ssize_t i_newpos;
     PyObject *restuple;
@@ -8628,7 +8877,7 @@ exit:
     return res;
 }
 
-PyObject *
+static PyObject *
 _PyUnicode_TranslateCharmap(PyObject *input,
                             PyObject *mapping,
                             const char *errors)
@@ -8657,10 +8906,8 @@ _PyUnicode_TranslateCharmap(PyObject *input,
     kind = PyUnicode_KIND(input);
     size = PyUnicode_GET_LENGTH(input);
 
-    if (size == 0) {
-        Py_INCREF(input);
-        return input;
-    }
+    if (size == 0)
+        return PyUnicode_FromObject(input);
 
     /* allocate enough for a simple 1:1 translation without
        replacements, if we need more, we'll resize */
@@ -8771,14 +9018,9 @@ PyUnicode_Translate(PyObject *str,
                     PyObject *mapping,
                     const char *errors)
 {
-    PyObject *result;
-
-    str = PyUnicode_FromObject(str);
-    if (str == NULL)
+    if (ensure_unicode(str) < 0)
         return NULL;
-    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
-    Py_DECREF(str);
-    return result;
+    return _PyUnicode_TranslateCharmap(str, mapping, errors);
 }
 
 static Py_UCS4
@@ -8960,9 +9202,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
     }
 
 static Py_ssize_t
-any_find_slice(int direction, PyObject* s1, PyObject* s2,
+any_find_slice(PyObject* s1, PyObject* s2,
                Py_ssize_t start,
-               Py_ssize_t end)
+               Py_ssize_t end,
+               int direction)
 {
     int kind1, kind2;
     void *buf1, *buf2;
@@ -9131,54 +9374,35 @@ PyUnicode_Count(PyObject *str,
                 Py_ssize_t end)
 {
     Py_ssize_t result;
-    PyObject* str_obj;
-    PyObject* sub_obj;
     int kind1, kind2;
     void *buf1 = NULL, *buf2 = NULL;
     Py_ssize_t len1, len2;
 
-    str_obj = PyUnicode_FromObject(str);
-    if (!str_obj)
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
         return -1;
-    sub_obj = PyUnicode_FromObject(substr);
-    if (!sub_obj) {
-        Py_DECREF(str_obj);
-        return -1;
-    }
-    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
-        Py_DECREF(sub_obj);
-        Py_DECREF(str_obj);
-        return -1;
-    }
 
-    kind1 = PyUnicode_KIND(str_obj);
-    kind2 = PyUnicode_KIND(sub_obj);
-    if (kind1 < kind2) {
-        Py_DECREF(sub_obj);
-        Py_DECREF(str_obj);
+    kind1 = PyUnicode_KIND(str);
+    kind2 = PyUnicode_KIND(substr);
+    if (kind1 < kind2)
         return 0;
-    }
 
-    len1 = PyUnicode_GET_LENGTH(str_obj);
-    len2 = PyUnicode_GET_LENGTH(sub_obj);
+    len1 = PyUnicode_GET_LENGTH(str);
+    len2 = PyUnicode_GET_LENGTH(substr);
     ADJUST_INDICES(start, end, len1);
-    if (end - start < len2) {
-        Py_DECREF(sub_obj);
-        Py_DECREF(str_obj);
+    if (end - start < len2)
         return 0;
-    }
 
-    buf1 = PyUnicode_DATA(str_obj);
-    buf2 = PyUnicode_DATA(sub_obj);
+    buf1 = PyUnicode_DATA(str);
+    buf2 = PyUnicode_DATA(substr);
     if (kind2 != kind1) {
-        buf2 = _PyUnicode_AsKind(sub_obj, kind1);
+        buf2 = _PyUnicode_AsKind(substr, kind1);
         if (!buf2)
             goto onError;
     }
 
     switch (kind1) {
     case PyUnicode_1BYTE_KIND:
-        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
+        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
             result = asciilib_count(
                 ((Py_UCS1*)buf1) + start, end - start,
                 buf2, len2, PY_SSIZE_T_MAX
@@ -9205,16 +9429,11 @@ PyUnicode_Count(PyObject *str,
         assert(0); result = 0;
     }
 
-    Py_DECREF(sub_obj);
-    Py_DECREF(str_obj);
-
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
     return result;
   onError:
-    Py_DECREF(sub_obj);
-    Py_DECREF(str_obj);
     if (kind2 != kind1 && buf2)
         PyMem_Free(buf2);
     return -1;
@@ -9222,35 +9441,15 @@ PyUnicode_Count(PyObject *str,
 
 Py_ssize_t
 PyUnicode_Find(PyObject *str,
-               PyObject *sub,
+               PyObject *substr,
                Py_ssize_t start,
                Py_ssize_t end,
                int direction)
 {
-    Py_ssize_t result;
-
-    str = PyUnicode_FromObject(str);
-    if (!str)
-        return -2;
-    sub = PyUnicode_FromObject(sub);
-    if (!sub) {
-        Py_DECREF(str);
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
         return -2;
-    }
-    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
-        Py_DECREF(sub);
-        Py_DECREF(str);
-        return -2;
-    }
 
-    result = any_find_slice(direction,
-        str, sub, start, end
-        );
-
-    Py_DECREF(str);
-    Py_DECREF(sub);
-
-    return result;
+    return any_find_slice(str, substr, start, end, direction);
 }
 
 Py_ssize_t
@@ -9353,22 +9552,10 @@ PyUnicode_Tailmatch(PyObject *str,
                     Py_ssize_t end,
                     int direction)
 {
-    Py_ssize_t result;
-
-    str = PyUnicode_FromObject(str);
-    if (str == NULL)
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
         return -1;
-    substr = PyUnicode_FromObject(substr);
-    if (substr == NULL) {
-        Py_DECREF(str);
-        return -1;
-    }
 
-    result = tailmatch(str, substr,
-                       start, end, direction);
-    Py_DECREF(str);
-    Py_DECREF(substr);
-    return result;
+    return tailmatch(str, substr, start, end, direction);
 }
 
 /* Apply fixfct filter to the Unicode object self and return a
@@ -9670,20 +9857,10 @@ case_operation(PyObject *self,
 PyObject *
 PyUnicode_Join(PyObject *separator, PyObject *seq)
 {
-    PyObject *sep = NULL;
-    Py_ssize_t seplen;
-    PyObject *res = NULL; /* the result */
-    PyObject *fseq;          /* PySequence_Fast(seq) */
-    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
+    PyObject *res;
+    PyObject *fseq;
+    Py_ssize_t seqlen;
     PyObject **items;
-    PyObject *item;
-    Py_ssize_t sz, i, res_offset;
-    Py_UCS4 maxchar;
-    Py_UCS4 item_maxchar;
-    int use_memcpy;
-    unsigned char *res_data = NULL, *sep_data = NULL;
-    PyObject *last_obj;
-    unsigned int kind = 0;
 
     fseq = PySequence_Fast(seq, "can only join an iterable");
     if (fseq == NULL) {
@@ -9694,21 +9871,39 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
      * so we are sure that fseq won't be mutated.
      */
 
+    items = PySequence_Fast_ITEMS(fseq);
     seqlen = PySequence_Fast_GET_SIZE(fseq);
+    res = _PyUnicode_JoinArray(separator, items, seqlen);
+    Py_DECREF(fseq);
+    return res;
+}
+
+PyObject *
+_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
+{
+    PyObject *res = NULL; /* the result */
+    PyObject *sep = NULL;
+    Py_ssize_t seplen;
+    PyObject *item;
+    Py_ssize_t sz, i, res_offset;
+    Py_UCS4 maxchar;
+    Py_UCS4 item_maxchar;
+    int use_memcpy;
+    unsigned char *res_data = NULL, *sep_data = NULL;
+    PyObject *last_obj;
+    unsigned int kind = 0;
+
     /* If empty sequence, return u"". */
     if (seqlen == 0) {
-        Py_DECREF(fseq);
         _Py_RETURN_UNICODE_EMPTY();
     }
 
     /* If singleton sequence with an exact Unicode, return that. */
     last_obj = NULL;
-    items = PySequence_Fast_ITEMS(fseq);
     if (seqlen == 1) {
         if (PyUnicode_CheckExact(items[0])) {
             res = items[0];
             Py_INCREF(res);
-            Py_DECREF(fseq);
             return res;
         }
         seplen = 0;
@@ -9843,13 +10038,11 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
         assert(res_offset == PyUnicode_GET_LENGTH(res));
     }
 
-    Py_DECREF(fseq);
     Py_XDECREF(sep);
     assert(_PyUnicode_CheckConsistency(res, 1));
     return res;
 
   onError:
-    Py_DECREF(fseq);
     Py_XDECREF(sep);
     Py_XDECREF(res);
     return NULL;
@@ -9974,13 +10167,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
 {
     PyObject *list;
 
-    string = PyUnicode_FromObject(string);
-    if (string == NULL)
-        return NULL;
-    if (PyUnicode_READY(string) == -1) {
-        Py_DECREF(string);
+    if (ensure_unicode(string) < 0)
         return NULL;
-    }
 
     switch (PyUnicode_KIND(string)) {
     case PyUnicode_1BYTE_KIND:
@@ -10007,7 +10195,6 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
         assert(0);
         list = 0;
     }
-    Py_DECREF(string);
     return list;
 }
 
@@ -10568,28 +10755,27 @@ unicode_casefold(PyObject *self)
 }
 
 
-/* Argument converter.  Coerces to a single unicode character */
+/* Argument converter. Accepts a single Unicode character. */
 
 static int
 convert_uc(PyObject *obj, void *addr)
 {
     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
-    PyObject *uniobj;
 
-    uniobj = PyUnicode_FromObject(obj);
-    if (uniobj == NULL) {
-        PyErr_SetString(PyExc_TypeError,
-                        "The fill character cannot be converted to Unicode");
+    if (!PyUnicode_Check(obj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "The fill character must be a unicode character, "
+                     "not %.100s", Py_TYPE(obj)->tp_name);
         return 0;
     }
-    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
+    if (PyUnicode_READY(obj) < 0)
+        return 0;
+    if (PyUnicode_GET_LENGTH(obj) != 1) {
         PyErr_SetString(PyExc_TypeError,
                         "The fill character must be exactly one character long");
-        Py_DECREF(uniobj);
         return 0;
     }
-    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
-    Py_DECREF(uniobj);
+    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
     return 1;
 }
 
@@ -10905,59 +11091,49 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
 }
 
 int
-PyUnicode_Contains(PyObject *container, PyObject *element)
+_PyUnicode_EQ(PyObject *aa, PyObject *bb)
+{
+    return unicode_eq(aa, bb);
+}
+
+int
+PyUnicode_Contains(PyObject *str, PyObject *substr)
 {
-    PyObject *str, *sub;
     int kind1, kind2;
     void *buf1, *buf2;
     Py_ssize_t len1, len2;
     int result;
 
-    /* Coerce the two arguments */
-    sub = PyUnicode_FromObject(element);
-    if (!sub) {
+    if (!PyUnicode_Check(substr)) {
         PyErr_Format(PyExc_TypeError,
-                     "'in <string>' requires string as left operand, not %s",
-                     element->ob_type->tp_name);
+                     "'in <string>' requires string as left operand, not %.100s",
+                     Py_TYPE(substr)->tp_name);
         return -1;
     }
-
-    str = PyUnicode_FromObject(container);
-    if (!str) {
-        Py_DECREF(sub);
+    if (PyUnicode_READY(substr) == -1)
+        return -1;
+    if (ensure_unicode(str) < 0)
         return -1;
-    }
 
     kind1 = PyUnicode_KIND(str);
-    kind2 = PyUnicode_KIND(sub);
-    if (kind1 < kind2) {
-        Py_DECREF(sub);
-        Py_DECREF(str);
+    kind2 = PyUnicode_KIND(substr);
+    if (kind1 < kind2)
         return 0;
-    }
     len1 = PyUnicode_GET_LENGTH(str);
-    len2 = PyUnicode_GET_LENGTH(sub);
-    if (len1 < len2) {
-        Py_DECREF(sub);
-        Py_DECREF(str);
+    len2 = PyUnicode_GET_LENGTH(substr);
+    if (len1 < len2)
         return 0;
-    }
     buf1 = PyUnicode_DATA(str);
-    buf2 = PyUnicode_DATA(sub);
+    buf2 = PyUnicode_DATA(substr);
     if (len2 == 1) {
         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
-        Py_DECREF(sub);
-        Py_DECREF(str);
         return result;
     }
     if (kind2 != kind1) {
-        buf2 = _PyUnicode_AsKind(sub, kind1);
-        if (!buf2) {
-            Py_DECREF(sub);
-            Py_DECREF(str);
+        buf2 = _PyUnicode_AsKind(substr, kind1);
+        if (!buf2)
             return -1;
-        }
     }
 
     switch (kind1) {
@@ -10975,9 +11151,6 @@ PyUnicode_Contains(PyObject *container, PyObject *element)
         assert(0);
     }
 
-    Py_DECREF(str);
-    Py_DECREF(sub);
-
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
@@ -10989,56 +11162,40 @@ PyUnicode_Contains(PyObject *container, PyObject *element)
 PyObject *
 PyUnicode_Concat(PyObject *left, PyObject *right)
 {
-    PyObject *u = NULL, *v = NULL, *w;
+    PyObject *result;
     Py_UCS4 maxchar, maxchar2;
-    Py_ssize_t u_len, v_len, new_len;
+    Py_ssize_t left_len, right_len, new_len;
 
-    /* Coerce the two arguments */
-    u = PyUnicode_FromObject(left);
-    if (u == NULL)
-        goto onError;
-    v = PyUnicode_FromObject(right);
-    if (v == NULL)
-        goto onError;
+    if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
+        return NULL;
 
     /* Shortcuts */
-    if (v == unicode_empty) {
-        Py_DECREF(v);
-        return u;
-    }
-    if (u == unicode_empty) {
-        Py_DECREF(u);
-        return v;
-    }
+    if (left == unicode_empty)
+        return PyUnicode_FromObject(right);
+    if (right == unicode_empty)
+        return PyUnicode_FromObject(left);
 
-    u_len = PyUnicode_GET_LENGTH(u);
-    v_len = PyUnicode_GET_LENGTH(v);
-    if (u_len > PY_SSIZE_T_MAX - v_len) {
+    left_len = PyUnicode_GET_LENGTH(left);
+    right_len = PyUnicode_GET_LENGTH(right);
+    if (left_len > PY_SSIZE_T_MAX - right_len) {
         PyErr_SetString(PyExc_OverflowError,
                         "strings are too large to concat");
-        goto onError;
+        return NULL;
     }
-    new_len = u_len + v_len;
+    new_len = left_len + right_len;
 
-    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
-    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
+    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
+    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
     maxchar = Py_MAX(maxchar, maxchar2);
 
     /* Concat the two Unicode strings */
-    w = PyUnicode_New(new_len, maxchar);
-    if (w == NULL)
-        goto onError;
-    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
-    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
-    Py_DECREF(u);
-    Py_DECREF(v);
-    assert(_PyUnicode_CheckConsistency(w, 1));
-    return w;
-
-  onError:
-    Py_XDECREF(u);
-    Py_XDECREF(v);
-    return NULL;
+    result = PyUnicode_New(new_len, maxchar);
+    if (result == NULL)
+        return NULL;
+    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
+    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
+    assert(_PyUnicode_CheckConsistency(result, 1));
+    return result;
 }
 
 void
@@ -11129,6 +11286,25 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
     Py_XDECREF(right);
 }
 
+/*
+Wraps stringlib_parse_args_finds() and additionally ensures that the
+parsed substring argument is a unicode object.
+*/
+
+static inline int
+parse_args_finds_unicode(const char * function_name, PyObject *args,
+                         PyObject **substring,
+                         Py_ssize_t *start, Py_ssize_t *end)
+{
+    if(stringlib_parse_args_finds(function_name, args, substring,
+                                  start, end)) {
+        if (ensure_unicode(*substring) < 0)
+            return 0;
+        return 1;
+    }
+    return 0;
+}
+
 PyDoc_STRVAR(count__doc__,
              "S.count(sub[, start[, end]]) -> int\n\
 \n\
@@ -11147,31 +11323,26 @@ unicode_count(PyObject *self, PyObject *args)
     void *buf1, *buf2;
     Py_ssize_t len1, len2, iresult;
 
-    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
         return NULL;
 
     kind1 = PyUnicode_KIND(self);
     kind2 = PyUnicode_KIND(substring);
-    if (kind1 < kind2) {
-        Py_DECREF(substring);
+    if (kind1 < kind2)
         return PyLong_FromLong(0);
-    }
+
     len1 = PyUnicode_GET_LENGTH(self);
     len2 = PyUnicode_GET_LENGTH(substring);
     ADJUST_INDICES(start, end, len1);
-    if (end - start < len2) {
-        Py_DECREF(substring);
+    if (end - start < len2)
         return PyLong_FromLong(0);
-    }
+
     buf1 = PyUnicode_DATA(self);
     buf2 = PyUnicode_DATA(substring);
     if (kind2 != kind1) {
         buf2 = _PyUnicode_AsKind(substring, kind1);
-        if (!buf2) {
-            Py_DECREF(substring);
+        if (!buf2)
             return NULL;
-        }
     }
     switch (kind1) {
     case PyUnicode_1BYTE_KIND:
@@ -11201,8 +11372,6 @@ unicode_count(PyObject *self, PyObject *args)
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
-    Py_DECREF(substring);
-
     return result;
 }
 
@@ -11336,22 +11505,13 @@ unicode_find(PyObject *self, PyObject *args)
     Py_ssize_t end = 0;
     Py_ssize_t result;
 
-    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
-
-    result = any_find_slice(1, self, substring, start, end);
 
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, 1);
 
     if (result == -2)
         return NULL;
@@ -11424,22 +11584,13 @@ unicode_index(PyObject *self, PyObject *args)
     Py_ssize_t start = 0;
     Py_ssize_t end = 0;
 
-    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
-
-    result = any_find_slice(1, self, substring, start, end);
 
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, 1);
 
     if (result == -2)
         return NULL;
@@ -11953,7 +12104,7 @@ unicode_lower(PyObject *self)
 #define BOTHSTRIP 2
 
 /* Arrays indexed by above */
-static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
+static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
 
 #define STRIPNAME(i) (stripformat[i]+3)
 
@@ -12248,40 +12399,15 @@ unicode_repeat(PyObject *str, Py_ssize_t len)
 }
 
 PyObject *
-PyUnicode_Replace(PyObject *obj,
-                  PyObject *subobj,
-                  PyObject *replobj,
+PyUnicode_Replace(PyObject *str,
+                  PyObject *substr,
+                  PyObject *replstr,
                   Py_ssize_t maxcount)
 {
-    PyObject *self;
-    PyObject *str1;
-    PyObject *str2;
-    PyObject *result;
-
-    self = PyUnicode_FromObject(obj);
-    if (self == NULL)
-        return NULL;
-    str1 = PyUnicode_FromObject(subobj);
-    if (str1 == NULL) {
-        Py_DECREF(self);
-        return NULL;
-    }
-    str2 = PyUnicode_FromObject(replobj);
-    if (str2 == NULL) {
-        Py_DECREF(self);
-        Py_DECREF(str1);
+    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
+            ensure_unicode(replstr) < 0)
         return NULL;
-    }
-    if (PyUnicode_READY(self) == -1 ||
-        PyUnicode_READY(str1) == -1 ||
-        PyUnicode_READY(str2) == -1)
-        result = NULL;
-    else
-        result = replace(self, str1, str2, maxcount);
-    Py_DECREF(self);
-    Py_DECREF(str1);
-    Py_DECREF(str2);
-    return result;
+    return replace(str, substr, replstr, maxcount);
 }
 
 PyDoc_STRVAR(replace__doc__,
@@ -12297,28 +12423,12 @@ unicode_replace(PyObject *self, PyObject *args)
     PyObject *str1;
     PyObject *str2;
     Py_ssize_t maxcount = -1;
-    PyObject *result;
 
-    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
+    if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
         return NULL;
     if (PyUnicode_READY(self) == -1)
         return NULL;
-    str1 = PyUnicode_FromObject(str1);
-    if (str1 == NULL)
-        return NULL;
-    str2 = PyUnicode_FromObject(str2);
-    if (str2 == NULL) {
-        Py_DECREF(str1);
-        return NULL;
-    }
-    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
-        result = NULL;
-    else
-        result = replace(self, str1, str2, maxcount);
-
-    Py_DECREF(str1);
-    Py_DECREF(str2);
-    return result;
+    return replace(self, str1, str2, maxcount);
 }
 
 static PyObject *
@@ -12503,22 +12613,13 @@ unicode_rfind(PyObject *self, PyObject *args)
     Py_ssize_t end = 0;
     Py_ssize_t result;
 
-    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
-
-    result = any_find_slice(-1, self, substring, start, end);
 
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, -1);
 
     if (result == -2)
         return NULL;
@@ -12540,22 +12641,13 @@ unicode_rindex(PyObject *self, PyObject *args)
     Py_ssize_t end = 0;
     Py_ssize_t result;
 
-    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
-                                            &start, &end))
+    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
         return NULL;
 
-    if (PyUnicode_READY(self) == -1) {
-        Py_DECREF(substring);
-        return NULL;
-    }
-    if (PyUnicode_READY(substring) == -1) {
-        Py_DECREF(substring);
+    if (PyUnicode_READY(self) == -1)
         return NULL;
-    }
-
-    result = any_find_slice(-1, self, substring, start, end);
 
-    Py_DECREF(substring);
+    result = any_find_slice(self, substring, start, end, -1);
 
     if (result == -2)
         return NULL;
@@ -12595,24 +12687,10 @@ unicode_rjust(PyObject *self, PyObject *args)
 PyObject *
 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
 {
-    PyObject *result;
-
-    s = PyUnicode_FromObject(s);
-    if (s == NULL)
+    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
         return NULL;
-    if (sep != NULL) {
-        sep = PyUnicode_FromObject(sep);
-        if (sep == NULL) {
-            Py_DECREF(s);
-            return NULL;
-        }
-    }
-
-    result = split(s, sep, maxsplit);
 
-    Py_DECREF(s);
-    Py_XDECREF(sep);
-    return result;
+    return split(s, sep, maxsplit);
 }
 
 PyDoc_STRVAR(split__doc__,
@@ -12637,35 +12715,26 @@ unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
 
     if (substring == Py_None)
         return split(self, NULL, maxcount);
-    else if (PyUnicode_Check(substring))
+
+    if (PyUnicode_Check(substring))
         return split(self, substring, maxcount);
-    else
-        return PyUnicode_Split(self, substring, maxcount);
+
+    PyErr_Format(PyExc_TypeError,
+                 "must be str or None, not %.100s",
+                 Py_TYPE(substring)->tp_name);
+    return NULL;
 }
 
 PyObject *
-PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
+PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
 {
-    PyObject* str_obj;
-    PyObject* sep_obj;
     PyObject* out;
     int kind1, kind2;
     void *buf1, *buf2;
     Py_ssize_t len1, len2;
 
-    str_obj = PyUnicode_FromObject(str_in);
-    if (!str_obj)
+    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
         return NULL;
-    sep_obj = PyUnicode_FromObject(sep_in);
-    if (!sep_obj) {
-        Py_DECREF(str_obj);
-        return NULL;
-    }
-    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
-        Py_DECREF(sep_obj);
-        Py_DECREF(str_obj);
-        return NULL;
-    }
 
     kind1 = PyUnicode_KIND(str_obj);
     kind2 = PyUnicode_KIND(sep_obj);
@@ -12679,8 +12748,6 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
             Py_DECREF(unicode_empty);
         }
-        Py_DECREF(sep_obj);
-        Py_DECREF(str_obj);
         return out;
     }
     buf1 = PyUnicode_DATA(str_obj);
@@ -12688,7 +12755,7 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
     if (kind2 != kind1) {
         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
         if (!buf2)
-            goto onError;
+            return NULL;
     }
 
     switch (kind1) {
@@ -12709,39 +12776,23 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
         out = 0;
     }
 
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
     return out;
-  onError:
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
-    if (kind2 != kind1 && buf2)
-        PyMem_Free(buf2);
-    return NULL;
 }
 
 
 PyObject *
-PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
+PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
 {
-    PyObject* str_obj;
-    PyObject* sep_obj;
     PyObject* out;
     int kind1, kind2;
     void *buf1, *buf2;
     Py_ssize_t len1, len2;
 
-    str_obj = PyUnicode_FromObject(str_in);
-    if (!str_obj)
-        return NULL;
-    sep_obj = PyUnicode_FromObject(sep_in);
-    if (!sep_obj) {
-        Py_DECREF(str_obj);
+    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
         return NULL;
-    }
 
     kind1 = PyUnicode_KIND(str_obj);
     kind2 = PyUnicode_KIND(sep_obj);
@@ -12755,8 +12806,6 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
             Py_DECREF(unicode_empty);
         }
-        Py_DECREF(sep_obj);
-        Py_DECREF(str_obj);
         return out;
     }
     buf1 = PyUnicode_DATA(str_obj);
@@ -12764,7 +12813,7 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
     if (kind2 != kind1) {
         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
         if (!buf2)
-            goto onError;
+            return NULL;
     }
 
     switch (kind1) {
@@ -12785,18 +12834,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
         out = 0;
     }
 
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
     if (kind2 != kind1)
         PyMem_Free(buf2);
 
     return out;
-  onError:
-    Py_DECREF(sep_obj);
-    Py_DECREF(str_obj);
-    if (kind2 != kind1 && buf2)
-        PyMem_Free(buf2);
-    return NULL;
 }
 
 PyDoc_STRVAR(partition__doc__,
@@ -12828,24 +12869,10 @@ unicode_rpartition(PyObject *self, PyObject *separator)
 PyObject *
 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
 {
-    PyObject *result;
-
-    s = PyUnicode_FromObject(s);
-    if (s == NULL)
+    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
         return NULL;
-    if (sep != NULL) {
-        sep = PyUnicode_FromObject(sep);
-        if (sep == NULL) {
-            Py_DECREF(s);
-            return NULL;
-        }
-    }
-
-    result = rsplit(s, sep, maxsplit);
 
-    Py_DECREF(s);
-    Py_XDECREF(sep);
-    return result;
+    return rsplit(s, sep, maxsplit);
 }
 
 PyDoc_STRVAR(rsplit__doc__,
@@ -12870,10 +12897,14 @@ unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
 
     if (substring == Py_None)
         return rsplit(self, NULL, maxcount);
-    else if (PyUnicode_Check(substring))
+
+    if (PyUnicode_Check(substring))
         return rsplit(self, substring, maxcount);
-    else
-        return PyUnicode_RSplit(self, substring, maxcount);
+
+    PyErr_Format(PyExc_TypeError,
+                 "must be str or None, not %.100s",
+                 Py_TYPE(substring)->tp_name);
+    return NULL;
 }
 
 PyDoc_STRVAR(splitlines__doc__,
@@ -13154,11 +13185,15 @@ unicode_startswith(PyObject *self,
     if (PyTuple_Check(subobj)) {
         Py_ssize_t i;
         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
-            if (substring == NULL)
+            substring = PyTuple_GET_ITEM(subobj, i);
+            if (!PyUnicode_Check(substring)) {
+                PyErr_Format(PyExc_TypeError,
+                             "tuple for startswith must only contain str, "
+                             "not %.100s",
+                             Py_TYPE(substring)->tp_name);
                 return NULL;
+            }
             result = tailmatch(self, substring, start, end, -1);
-            Py_DECREF(substring);
             if (result == -1)
                 return NULL;
             if (result) {
@@ -13168,15 +13203,13 @@ unicode_startswith(PyObject *self,
         /* nothing matched */
         Py_RETURN_FALSE;
     }
-    substring = PyUnicode_FromObject(subobj);
-    if (substring == NULL) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
-                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
+    if (!PyUnicode_Check(subobj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "startswith first arg must be str or "
+                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
         return NULL;
     }
-    result = tailmatch(self, substring, start, end, -1);
-    Py_DECREF(substring);
+    result = tailmatch(self, subobj, start, end, -1);
     if (result == -1)
         return NULL;
     return PyBool_FromLong(result);
@@ -13206,12 +13239,15 @@ unicode_endswith(PyObject *self,
     if (PyTuple_Check(subobj)) {
         Py_ssize_t i;
         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
-            substring = PyUnicode_FromObject(
-                PyTuple_GET_ITEM(subobj, i));
-            if (substring == NULL)
+            substring = PyTuple_GET_ITEM(subobj, i);
+            if (!PyUnicode_Check(substring)) {
+                PyErr_Format(PyExc_TypeError,
+                             "tuple for endswith must only contain str, "
+                             "not %.100s",
+                             Py_TYPE(substring)->tp_name);
                 return NULL;
+            }
             result = tailmatch(self, substring, start, end, +1);
-            Py_DECREF(substring);
             if (result == -1)
                 return NULL;
             if (result) {
@@ -13220,61 +13256,67 @@ unicode_endswith(PyObject *self,
         }
         Py_RETURN_FALSE;
     }
-    substring = PyUnicode_FromObject(subobj);
-    if (substring == NULL) {
-        if (PyErr_ExceptionMatches(PyExc_TypeError))
-            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
-                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
+    if (!PyUnicode_Check(subobj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "endswith first arg must be str or "
+                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
         return NULL;
     }
-    result = tailmatch(self, substring, start, end, +1);
-    Py_DECREF(substring);
+    result = tailmatch(self, subobj, start, end, +1);
     if (result == -1)
         return NULL;
     return PyBool_FromLong(result);
 }
 
-Py_LOCAL_INLINE(void)
+static inline void
 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
 {
-    if (!writer->readonly)
+    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+    writer->data = PyUnicode_DATA(writer->buffer);
+
+    if (!writer->readonly) {
+        writer->kind = PyUnicode_KIND(writer->buffer);
         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+    }
     else {
+        /* use a value smaller than PyUnicode_1BYTE_KIND() so
+           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+        writer->kind = PyUnicode_WCHAR_KIND;
+        assert(writer->kind <= PyUnicode_1BYTE_KIND);
+
         /* Copy-on-write mode: set buffer size to 0 so
          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
          * next write. */
         writer->size = 0;
     }
-    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
-    writer->data = PyUnicode_DATA(writer->buffer);
-    writer->kind = PyUnicode_KIND(writer->buffer);
 }
 
 void
 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
 {
     memset(writer, 0, sizeof(*writer));
-#ifdef Py_DEBUG
-    writer->kind = 5;    /* invalid kind */
-#endif
+
+    /* ASCII is the bare minimum */
     writer->min_char = 127;
+
+    /* use a value smaller than PyUnicode_1BYTE_KIND() so
+       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+    writer->kind = PyUnicode_WCHAR_KIND;
+    assert(writer->kind <= PyUnicode_1BYTE_KIND);
 }
 
 int
 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                  Py_ssize_t length, Py_UCS4 maxchar)
 {
-#ifdef MS_WINDOWS
-   /* On Windows, overallocate by 50% is the best factor */
-#  define OVERALLOCATE_FACTOR 2
-#else
-   /* On Linux, overallocate by 25% is the best factor */
-#  define OVERALLOCATE_FACTOR 4
-#endif
     Py_ssize_t newlen;
     PyObject *newbuffer;
 
-    assert(length > 0);
+    assert(maxchar <= MAX_UNICODE);
+
+    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
+    assert((maxchar > writer->maxchar && length >= 0)
+           || length > 0);
 
     if (length > PY_SSIZE_T_MAX - writer->pos) {
         PyErr_NoMemory();
@@ -13340,9 +13382,32 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
 #undef OVERALLOCATE_FACTOR
 }
 
-Py_LOCAL_INLINE(int)
+int
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+                                     enum PyUnicode_Kind kind)
+{
+    Py_UCS4 maxchar;
+
+    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
+    assert(writer->kind < kind);
+
+    switch (kind)
+    {
+    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
+    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
+    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+    default:
+        assert(0 && "invalid kind");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
+}
+
+static inline int
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
 {
+    assert(ch <= MAX_UNICODE);
     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
         return -1;
     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
@@ -13510,17 +13575,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
         return str;
     }
-    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
-        PyObject *newbuffer;
-        newbuffer = resize_compact(writer->buffer, writer->pos);
-        if (newbuffer == NULL) {
-            Py_CLEAR(writer->buffer);
-            return NULL;
+    if (writer->pos == 0) {
+        Py_CLEAR(writer->buffer);
+
+        /* Get the empty Unicode string singleton ('') */
+        _Py_INCREF_UNICODE_EMPTY();
+        str  = unicode_empty;
+    }
+    else {
+        str = writer->buffer;
+        writer->buffer = NULL;
+
+        if (PyUnicode_GET_LENGTH(str) != writer->pos) {
+            PyObject *str2;
+            str2 = resize_compact(str, writer->pos);
+            if (str2 == NULL)
+                return NULL;
+            str = str2;
         }
-        writer->buffer = newbuffer;
     }
-    str = writer->buffer;
-    writer->buffer = NULL;
+
     assert(_PyUnicode_CheckConsistency(str, 1));
     return unicode_result_ready(str);
 }
@@ -14661,13 +14735,10 @@ PyUnicode_Format(PyObject *format, PyObject *args)
         return NULL;
     }
 
-    ctx.fmtstr = PyUnicode_FromObject(format);
-    if (ctx.fmtstr == NULL)
+    if (ensure_unicode(format) < 0)
         return NULL;
-    if (PyUnicode_READY(ctx.fmtstr) == -1) {
-        Py_DECREF(ctx.fmtstr);
-        return NULL;
-    }
+
+    ctx.fmtstr = format;
     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
@@ -14727,11 +14798,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)
     if (ctx.args_owned) {
         Py_DECREF(ctx.args);
     }
-    Py_DECREF(ctx.fmtstr);
     return _PyUnicodeWriter_Finish(&ctx.writer);
 
   onError:
-    Py_DECREF(ctx.fmtstr);
     _PyUnicodeWriter_Dealloc(&ctx.writer);
     if (ctx.args_owned) {
         Py_DECREF(ctx.args);
@@ -15009,26 +15078,18 @@ PyUnicode_InternInPlace(PyObject **p)
             return;
         }
     }
-    /* It might be that the GetItem call fails even
-       though the key is present in the dictionary,
-       namely when this happens during a stack overflow. */
     Py_ALLOW_RECURSION
-    t = PyDict_GetItem(interned, s);
+    t = PyDict_SetDefault(interned, s, s);
     Py_END_ALLOW_RECURSION
-
-    if (t) {
-        Py_INCREF(t);
-        Py_SETREF(*p, t);
+    if (t == NULL) {
+        PyErr_Clear();
         return;
     }
-
-    PyThreadState_GET()->recursion_critical = 1;
-    if (PyDict_SetItem(interned, s, s) < 0) {
-        PyErr_Clear();
-        PyThreadState_GET()->recursion_critical = 0;
+    if (t != s) {
+        Py_INCREF(t);
+        Py_SETREF(*p, t);
         return;
     }
-    PyThreadState_GET()->recursion_critical = 0;
     /* The two references in interned are not counted by refcnt.
        The deallocator will take care of this */
     Py_REFCNT(s) -= 2;
diff --git a/Objects/weakrefobject.c b/Objects/weakrefobject.c
index 7e6f364..ab6b235 100644
--- a/Objects/weakrefobject.c
+++ b/Objects/weakrefobject.c
@@ -265,7 +265,7 @@ insert_head(PyWeakReference *newref, PyWeakReference **list)
 }
 
 static int
-parse_weakref_init_args(char *funcname, PyObject *args, PyObject *kwargs,
+parse_weakref_init_args(const char *funcname, PyObject *args, PyObject *kwargs,
                         PyObject **obp, PyObject **callbackp)
 {
     return PyArg_UnpackTuple(args, funcname, 1, 2, obp, callbackp);
@@ -453,7 +453,7 @@ proxy_checkref(PyWeakReference *proxy)
     method(PyObject *proxy) { \
             _Py_IDENTIFIER(special); \
             UNWRAP(proxy); \
-                return _PyObject_CallMethodId(proxy, &PyId_##special, ""); \
+                return _PyObject_CallMethodId(proxy, &PyId_##special, NULL); \
         }