-rw-r--r--  Objects/listobject.c  188
-rw-r--r--  Objects/listsort.txt   19
2 files changed, 141 insertions, 66 deletions
diff --git a/Objects/listobject.c b/Objects/listobject.c
index fc20a9b..470ad8e 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1628,6 +1628,15 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
/* Avoid malloc for small temp arrays. */
#define MERGESTATE_TEMP_SIZE 256
+/* The largest value of minrun. This must be a power of 2, and >= 1, so that
+ * the merge_compute_minrun() algorithm is guaranteed to return a result no
+ * larger than this.
+ */
+#define MAX_MINRUN 64
+#if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
+#error "MAX_MINRUN must be a power of 2, and >= 1"
+#endif
+
/* One MergeState exists on the stack per invocation of mergesort. It's just
* a convenient way to pass state around among the helper functions.
*/
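
The #error guard above relies on a classic bit trick: for x >= 1, x & (x - 1)
clears x's lowest set bit, so the expression is 0 exactly when x has a single
bit set, i.e. is a power of 2. A minimal standalone demo (illustrative only;
the names are not CPython's):

    #include <stdio.h>

    /* 1 iff x is a power of 2; x & (x - 1) clears the lowest set bit. */
    static int is_power_of_2(unsigned x)
    {
        return x >= 1 && (x & (x - 1)) == 0;
    }

    int main(void)
    {
        for (unsigned x = 1; x <= 64; ++x)
            if (is_power_of_2(x))
                printf("%u ", x);   /* prints 1 2 4 8 16 32 64 */
        printf("\n");
        return 0;
    }
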
@@ -1685,68 +1694,133 @@ struct s_MergeState {
int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
};
-/* binarysort is the best method for sorting small arrays: it does
- few compares, but can do data movement quadratic in the number of
- elements.
- [lo.keys, hi) is a contiguous slice of a list of keys, and is sorted via
- binary insertion. This sort is stable.
- On entry, must have lo.keys <= start <= hi, and that
- [lo.keys, start) is already sorted (pass start == lo.keys if you don't
- know!).
- If islt() complains return -1, else 0.
+/* binarysort is the best method for sorting small arrays: it does few
+ compares, but can do data movement quadratic in the number of elements.
+   ss->keys is viewed as an array of n keys, a[:n]. a[:ok] is already sorted.
+ Pass ok = 0 (or 1) if you don't know.
+ It's sorted in-place, by a stable binary insertion sort. If ss->values
+   isn't NULL, it's permuted in lockstep with ss->keys.
+ On entry, must have n >= 1, and 0 <= ok <= n <= MAX_MINRUN.
+ Return -1 if comparison raises an exception, else 0.
Even in case of error, the output slice will be some permutation of
the input (nothing is lost or duplicated).
*/
static int
-binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start)
+binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
{
- Py_ssize_t k;
- PyObject **l, **p, **r;
+ Py_ssize_t k; /* for IFLT macro expansion */
+ PyObject ** const a = ss->keys;
+ PyObject ** const v = ss->values;
+ const bool has_values = v != NULL;
PyObject *pivot;
-
- assert(lo.keys <= start && start <= hi);
- /* assert [lo.keys, start) is sorted */
- if (lo.keys == start)
- ++start;
- for (; start < hi; ++start) {
- /* set l to where *start belongs */
- l = lo.keys;
- r = start;
- pivot = *r;
- /* Invariants:
- * pivot >= all in [lo.keys, l).
- * pivot < all in [r, start).
- * These are vacuously true at the start.
+ Py_ssize_t M;
+
+ assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
+ /* assert a[:ok] is sorted */
+ if (! ok)
+ ++ok;
+ /* Regular insertion sort has average- and worst-case O(n**2) cost
+ for both # of comparisons and number of bytes moved. But its branches
+ are highly predictable, and it loves sorted input (n-1 compares and no
+ data movement). This is significant in cases like sortperf.py's %sort,
+ where an out-of-order element near the start of a run is moved into
+ place slowly but then the remaining elements up to length minrun are
+ generally at worst one slot away from their correct position (so only
+   need 1 or 2 compares to resolve). If comparisons are very fast (such
+ as for a list of Python floats), the simple inner loop leaves it
+ very competitive with binary insertion, despite that it does
+ significantly more compares overall on random data.
+
+ Binary insertion sort has worst, average, and best case O(n log n)
+ cost for # of comparisons, but worst and average case O(n**2) cost
+   for data movement. The more expensive the comparisons, the more
+   important the comparison advantage. But its branches are less
+   predictable the more "randomish" the data, and that's so significant
+   that its worst case in real life is random input rather than
+   reverse-ordered (which does about twice as much data movement as
+   random input does).
+
+ Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN
+ of 64 is so small that the key and value pointers all fit in a corner
+ of L1 cache, and moving things around in that is very fast. */
+#if 0 // ordinary insertion sort.
+ PyObject * vpivot = NULL;
+ for (; ok < n; ++ok) {
+ pivot = a[ok];
+ if (has_values)
+ vpivot = v[ok];
+ for (M = ok - 1; M >= 0; --M) {
+ k = ISLT(pivot, a[M]);
+ if (k < 0) {
+ a[M + 1] = pivot;
+ if (has_values)
+ v[M + 1] = vpivot;
+ goto fail;
+ }
+ else if (k) {
+ a[M + 1] = a[M];
+ if (has_values)
+ v[M + 1] = v[M];
+ }
+ else
+ break;
+ }
+ a[M + 1] = pivot;
+ if (has_values)
+ v[M + 1] = vpivot;
+ }
+#else // binary insertion sort
+ Py_ssize_t L, R;
+ for (; ok < n; ++ok) {
+ /* set L to where a[ok] belongs */
+ L = 0;
+ R = ok;
+ pivot = a[ok];
+ /* Slice invariants. vacuously true at the start:
+ * all a[0:L] <= pivot
+ * all a[L:R] unknown
+ * all a[R:ok] > pivot
*/
- assert(l < r);
+ assert(L < R);
do {
- p = l + ((r - l) >> 1);
- IFLT(pivot, *p)
- r = p;
+ /* don't do silly ;-) things to prevent overflow when finding
+ the midpoint; L and R are very far from filling a Py_ssize_t */
+ M = (L + R) >> 1;
+#if 1 // straightforward, but highly unpredictable branch on random data
+ IFLT(pivot, a[M])
+ R = M;
else
- l = p+1;
- } while (l < r);
- assert(l == r);
- /* The invariants still hold, so pivot >= all in [lo.keys, l) and
- pivot < all in [l, start), so pivot belongs at l. Note
- that if there are elements equal to pivot, l points to the
- first slot after them -- that's why this sort is stable.
- Slide over to make room.
- Caution: using memmove is much slower under MSVC 5;
- we're not usually moving many slots. */
- for (p = start; p > l; --p)
- *p = *(p-1);
- *l = pivot;
- if (lo.values != NULL) {
- Py_ssize_t offset = lo.values - lo.keys;
- p = start + offset;
- pivot = *p;
- l += offset;
- for ( ; p > l; --p)
- *p = *(p-1);
- *l = pivot;
+ L = M + 1;
+#else
+ /* Try to get compiler to generate conditional move instructions
+ instead. Works fine, but leaving it disabled for now because
+ it's not yielding consistently faster sorts. Needs more
+ investigation. More computation in the inner loop adds its own
+ costs, which can be significant when compares are fast. */
+ k = ISLT(pivot, a[M]);
+ if (k < 0)
+ goto fail;
+ Py_ssize_t Mp1 = M + 1;
+ R = k ? M : R;
+ L = k ? L : Mp1;
+#endif
+ } while (L < R);
+ assert(L == R);
+ /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
+ at index L. Slide a[L:ok] to the right a slot to make room for it.
+ Caution: using memmove is much slower under MSVC 5; we're not
+ usually moving many slots. Years later: under Visual Studio 2022,
+ memmove seems just slightly slower than doing it "by hand". */
+ for (M = ok; M > L; --M)
+ a[M] = a[M - 1];
+ a[L] = pivot;
+ if (has_values) {
+ pivot = v[ok];
+ for (M = ok; M > L; --M)
+ v[M] = v[M - 1];
+ v[L] = pivot;
}
}
+#endif // pick binary or regular insertion sort
return 0;
fail:
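
Stripped of the IFLT error handling and the parallel values array, the binary
insertion path above reduces to the following sketch on plain ints (a
hypothetical standalone version with illustrative names, not the CPython
code). Note how equality sends L past M, so equal elements keep their
original order, which is exactly the stability argument in the comments:

    #include <stddef.h>

    /* Sort a[:n] in place, given that a[:ok] is already sorted; stable. */
    static void binary_insertion_sort(int *a, size_t n, size_t ok)
    {
        if (ok == 0)
            ok = 1;
        for (; ok < n; ++ok) {
            int pivot = a[ok];
            size_t L = 0, R = ok;
            /* invariant: all of a[:L] <= pivot, all of a[R:ok] > pivot */
            while (L < R) {
                size_t M = (L + R) >> 1;
                if (pivot < a[M])
                    R = M;        /* pivot belongs somewhere left of M */
                else
                    L = M + 1;    /* ties go right: stability */
            }
            /* slide a[L:ok] right one slot, then drop pivot in at L */
            for (size_t M = ok; M > L; --M)
                a[M] = a[M - 1];
            a[L] = pivot;
        }
    }
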
@@ -2559,10 +2633,10 @@ merge_force_collapse(MergeState *ms)
/* Compute a good value for the minimum run length; natural runs shorter
* than this are boosted artificially via binary insertion.
*
- * If n < 64, return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return 32.
- * Else return an int k, 32 <= k <= 64, such that n/k is close to, but
- * strictly less than, an exact power of 2.
+ * If n < MAX_MINRUN, return n (it's too small to bother with fancy stuff).
+ * Else if n is an exact power of 2, return MAX_MINRUN / 2.
+ * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
+ * close to, but strictly less than, an exact power of 2.
*
* See listsort.txt for more info.
*/
@@ -2572,7 +2646,7 @@ merge_compute_minrun(Py_ssize_t n)
Py_ssize_t r = 0; /* becomes 1 if any 1 bits are shifted off */
assert(n >= 0);
- while (n >= 64) {
+ while (n >= MAX_MINRUN) {
r |= n & 1;
n >>= 1;
}
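
With MAX_MINRUN = 64, the loop keeps the six high-order bits of n and ORs any
shifted-off 1 bit into r. A hypothetical trace of the same logic (the demo
names are not CPython's), covering the three cases described above:

    #include <assert.h>

    /* Same loop as merge_compute_minrun, with MAX_MINRUN fixed at 64. */
    static long minrun_demo(long n)
    {
        long r = 0;               /* becomes 1 if any 1 bits are shifted off */
        while (n >= 64) {
            r |= n & 1;
            n >>= 1;
        }
        return n + r;
    }

    int main(void)
    {
        assert(minrun_demo(63) == 63);    /* n < MAX_MINRUN: return n */
        assert(minrun_demo(2048) == 32);  /* exact power of 2: MAX_MINRUN / 2 */
        assert(minrun_demo(2112) == 33);  /* 2112/33 == 64, an exact power of 2 */
        assert(minrun_demo(2113) == 34);  /* 2113/34 is just under 64 */
        return 0;
    }
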
@@ -2956,7 +3030,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
if (n < minrun) {
const Py_ssize_t force = nremaining <= minrun ?
nremaining : minrun;
- if (binarysort(&ms, lo, lo.keys + force, lo.keys + n) < 0)
+ if (binarysort(&ms, &lo, force, n) < 0)
goto fail;
n = force;
}
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 4f84e2c..f387d9c 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -270,9 +270,9 @@ result. This has two primary good effects:
Computing minrun
----------------
-If N < 64, minrun is N. IOW, binary insertion sort is used for the whole
-array then; it's hard to beat that given the overheads of trying something
-fancier (see note BINSORT).
+If N < MAX_MINRUN, minrun is N. IOW, binary insertion sort is used for the
+whole array then; it's hard to beat that given the overheads of trying
+something fancier (see note BINSORT).
When N is a power of 2, testing on random data showed that minrun values of
16, 32, 64 and 128 worked about equally well. At 256 the data-movement cost
@@ -310,12 +310,13 @@ place, and r < minrun is small compared to N), or q a little larger than a
power of 2 regardless of r (then we've got a case similar to "2112", again
leaving too little work for the last merge to do).
-Instead we pick a minrun in range(32, 65) such that N/minrun is exactly a
-power of 2, or if that isn't possible, is close to, but strictly less than,
-a power of 2. This is easier to do than it may sound: take the first 6
-bits of N, and add 1 if any of the remaining bits are set. In fact, that
-rule covers every case in this section, including small N and exact powers
-of 2; merge_compute_minrun() is a deceptively simple function.
+Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that
+N/minrun is exactly a power of 2, or if that isn't possible, is close to, but
+strictly less than, a power of 2. This is easier to do than it may sound:
+take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining
+bits are set. In fact, that rule covers every case in this section, including
+small N and exact powers of 2; merge_compute_minrun() is a deceptively simple
+function.
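+
+For example, with MAX_MINRUN = 64 we keep the first 6 bits of N.  N = 2112
+is binary 100001000000; its first 6 bits are 100001 = 33, and the remaining
+bits are all zero, so minrun is 33 and N/minrun is exactly 64 = 2**6.  Bump
+N to 2113 and a shifted-off bit is set, so minrun becomes 34, with 2113/34 a
+little under 64.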
The Merge Pattern