-rw-r--r--  Objects/listobject.c  188
-rw-r--r--  Objects/listsort.txt   19
2 files changed, 141 insertions, 66 deletions
diff --git a/Objects/listobject.c b/Objects/listobject.c
index fc20a9b..470ad8e 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1628,6 +1628,15 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
/* Avoid malloc for small temp arrays. */
#define MERGESTATE_TEMP_SIZE 256
+/* The largest value of minrun. This must be a power of 2, and >= 1, so that
+ * the merge_compute_minrun() algorithm is guaranteed to return a result no
+ * larger than this.
+ */
+#define MAX_MINRUN 64
+#if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
+#error "MAX_MINRUN must be a power of 2, and >= 1"
+#endif
+
/* One MergeState exists on the stack per invocation of mergesort. It's just
* a convenient way to pass state around among the helper functions.
*/
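
The #error guard above relies on a classic bit trick: for x >= 1, x & (x - 1)
clears x's lowest set bit, so the expression is 0 exactly when x has a single
bit set, i.e. is a power of 2. A minimal standalone demo (illustrative only;
the names are not CPython's):

    #include <stdio.h>

    /* 1 iff x is a power of 2; x & (x - 1) clears the lowest set bit. */
    static int is_power_of_2(unsigned x)
    {
        return x >= 1 && (x & (x - 1)) == 0;
    }

    int main(void)
    {
        for (unsigned x = 1; x <= 64; ++x)
            if (is_power_of_2(x))
                printf("%u ", x);   /* prints 1 2 4 8 16 32 64 */
        printf("\n");
        return 0;
    }
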
@@ -1685,68 +1694,133 @@ struct s_MergeState {
int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
};
-/* binarysort is the best method for sorting small arrays: it does
- few compares, but can do data movement quadratic in the number of
- elements.
- [lo.keys, hi) is a contiguous slice of a list of keys, and is sorted via
- binary insertion. This sort is stable.
- On entry, must have lo.keys <= start <= hi, and that
- [lo.keys, start) is already sorted (pass start == lo.keys if you don't
- know!).
- If islt() complains return -1, else 0.
+/* binarysort is the best method for sorting small arrays: it does few
+ compares, but can do data movement quadratic in the number of elements.
+   ss->keys is viewed as an array of n keys, a[:n]. a[:ok] is already sorted.
+ Pass ok = 0 (or 1) if you don't know.
+ It's sorted in-place, by a stable binary insertion sort. If ss->values
+   isn't NULL, it's permuted in lockstep with ss->keys.
+ On entry, must have n >= 1, and 0 <= ok <= n <= MAX_MINRUN.
+ Return -1 if comparison raises an exception, else 0.
Even in case of error, the output slice will be some permutation of
the input (nothing is lost or duplicated).
*/
static int
-binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start)
+binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
{
- Py_ssize_t k;
- PyObject **l, **p, **r;
+ Py_ssize_t k; /* for IFLT macro expansion */
+ PyObject ** const a = ss->keys;
+ PyObject ** const v = ss->values;
+ const bool has_values = v != NULL;
PyObject *pivot;
-
- assert(lo.keys <= start && start <= hi);
- /* assert [lo.keys, start) is sorted */
- if (lo.keys == start)
- ++start;
- for (; start < hi; ++start) {
- /* set l to where *start belongs */
- l = lo.keys;
- r = start;
- pivot = *r;
- /* Invariants:
- * pivot >= all in [lo.keys, l).
- * pivot < all in [r, start).
- * These are vacuously true at the start.
+ Py_ssize_t M;
+
+ assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
+ /* assert a[:ok] is sorted */
+ if (! ok)
+ ++ok;
+ /* Regular insertion sort has average- and worst-case O(n**2) cost
+ for both # of comparisons and number of bytes moved. But its branches
+ are highly predictable, and it loves sorted input (n-1 compares and no
+ data movement). This is significant in cases like sortperf.py's %sort,
+ where an out-of-order element near the start of a run is moved into
+ place slowly but then the remaining elements up to length minrun are
+ generally at worst one slot away from their correct position (so only
+   need 1 or 2 compares to resolve). If comparisons are very fast (such
+ as for a list of Python floats), the simple inner loop leaves it
+ very competitive with binary insertion, despite that it does
+ significantly more compares overall on random data.
+
+ Binary insertion sort has worst, average, and best case O(n log n)
+ cost for # of comparisons, but worst and average case O(n**2) cost
+   for data movement. The more expensive the comparisons, the more
+   important the comparison advantage. But its branches are less
+   predictable the more "randomish" the data, and that's so significant
+   that its worst case in real life is random input rather than
+   reverse-ordered (which does about twice as much data movement as
+   random input does).
+
+ Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN
+ of 64 is so small that the key and value pointers all fit in a corner
+ of L1 cache, and moving things around in that is very fast. */
+#if 0 // ordinary insertion sort.
+ PyObject * vpivot = NULL;
+ for (; ok < n; ++ok) {
+ pivot = a[ok];
+ if (has_values)
+ vpivot = v[ok];
+ for (M = ok - 1; M >= 0; --M) {
+ k = ISLT(pivot, a[M]);
+ if (k < 0) {
+ a[M + 1] = pivot;
+ if (has_values)
+ v[M + 1] = vpivot;
+ goto fail;
+ }
+ else if (k) {
+ a[M + 1] = a[M];
+ if (has_values)
+ v[M + 1] = v[M];
+ }
+ else
+ break;
+ }
+ a[M + 1] = pivot;
+ if (has_values)
+ v[M + 1] = vpivot;
+ }
+#else // binary insertion sort
+ Py_ssize_t L, R;
+ for (; ok < n; ++ok) {
+ /* set L to where a[ok] belongs */
+ L = 0;
+ R = ok;
+ pivot = a[ok];
+ /* Slice invariants. vacuously true at the start:
+ * all a[0:L] <= pivot
+ * all a[L:R] unknown
+ * all a[R:ok] > pivot
*/
- assert(l < r);
+ assert(L < R);
do {
- p = l + ((r - l) >> 1);
- IFLT(pivot, *p)
- r = p;
+ /* don't do silly ;-) things to prevent overflow when finding
+ the midpoint; L and R are very far from filling a Py_ssize_t */
+ M = (L + R) >> 1;
+#if 1 // straightforward, but highly unpredictable branch on random data
+ IFLT(pivot, a[M])
+ R = M;
else
- l = p+1;
- } while (l < r);
- assert(l == r);
- /* The invariants still hold, so pivot >= all in [lo.keys, l) and
- pivot < all in [l, start), so pivot belongs at l. Note
- that if there are elements equal to pivot, l points to the
- first slot after them -- that's why this sort is stable.
- Slide over to make room.
- Caution: using memmove is much slower under MSVC 5;
- we're not usually moving many slots. */
- for (p = start; p > l; --p)
- *p = *(p-1);
- *l = pivot;
- if (lo.values != NULL) {
- Py_ssize_t offset = lo.values - lo.keys;
- p = start + offset;
- pivot = *p;
- l += offset;
- for ( ; p > l; --p)
- *p = *(p-1);
- *l = pivot;
+ L = M + 1;
+#else
+ /* Try to get compiler to generate conditional move instructions
+ instead. Works fine, but leaving it disabled for now because
+ it's not yielding consistently faster sorts. Needs more
+ investigation. More computation in the inner loop adds its own
+ costs, which can be significant when compares are fast. */
+ k = ISLT(pivot, a[M]);
+ if (k < 0)
+ goto fail;
+ Py_ssize_t Mp1 = M + 1;
+ R = k ? M : R;
+ L = k ? L : Mp1;
+#endif
+ } while (L < R);
+ assert(L == R);
+ /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
+ at index L. Slide a[L:ok] to the right a slot to make room for it.
+ Caution: using memmove is much slower under MSVC 5; we're not
+ usually moving many slots. Years later: under Visual Studio 2022,
+ memmove seems just slightly slower than doing it "by hand". */
+ for (M = ok; M > L; --M)
+ a[M] = a[M - 1];
+ a[L] = pivot;
+ if (has_values) {
+ pivot = v[ok];
+ for (M = ok; M > L; --M)
+ v[M] = v[M - 1];
+ v[L] = pivot;
}
}
+#endif // pick binary or regular insertion sort
return 0;
fail:
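
Stripped of the IFLT error handling and the parallel values array, the binary
insertion path above reduces to the following sketch on plain ints (a
hypothetical standalone version with illustrative names, not the CPython
code). Note how equality sends L past M, so equal elements keep their
original order, which is exactly the stability argument in the comments:

    #include <stddef.h>

    /* Sort a[:n] in place, given that a[:ok] is already sorted; stable. */
    static void binary_insertion_sort(int *a, size_t n, size_t ok)
    {
        if (ok == 0)
            ok = 1;
        for (; ok < n; ++ok) {
            int pivot = a[ok];
            size_t L = 0, R = ok;
            /* invariant: all of a[:L] <= pivot, all of a[R:ok] > pivot */
            while (L < R) {
                size_t M = (L + R) >> 1;
                if (pivot < a[M])
                    R = M;        /* pivot belongs somewhere left of M */
                else
                    L = M + 1;    /* ties go right: stability */
            }
            /* slide a[L:ok] right one slot, then drop pivot in at L */
            for (size_t M = ok; M > L; --M)
                a[M] = a[M - 1];
            a[L] = pivot;
        }
    }
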
@@ -2559,10 +2633,10 @@ merge_force_collapse(MergeState *ms)
/* Compute a good value for the minimum run length; natural runs shorter
* than this are boosted artificially via binary insertion.
*
- * If n < 64, return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return 32.
- * Else return an int k, 32 <= k <= 64, such that n/k is close to, but
- * strictly less than, an exact power of 2.
+ * If n < MAX_MINRUN, return n (it's too small to bother with fancy stuff).
+ * Else if n is an exact power of 2, return MAX_MINRUN / 2.
+ * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
+ * close to, but strictly less than, an exact power of 2.
*
* See listsort.txt for more info.
*/
@@ -2572,7 +2646,7 @@ merge_compute_minrun(Py_ssize_t n)
Py_ssize_t r = 0; /* becomes 1 if any 1 bits are shifted off */
assert(n >= 0);
- while (n >= 64) {
+ while (n >= MAX_MINRUN) {
r |= n & 1;
n >>= 1;
}
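
With MAX_MINRUN = 64, the loop keeps the six high-order bits of n and ORs any
shifted-off 1 bit into r. A hypothetical trace of the same logic (the demo
names are not CPython's), covering the three cases described above:

    #include <assert.h>

    /* Same loop as merge_compute_minrun, with MAX_MINRUN fixed at 64. */
    static long minrun_demo(long n)
    {
        long r = 0;               /* becomes 1 if any 1 bits are shifted off */
        while (n >= 64) {
            r |= n & 1;
            n >>= 1;
        }
        return n + r;
    }

    int main(void)
    {
        assert(minrun_demo(63) == 63);    /* n < MAX_MINRUN: return n */
        assert(minrun_demo(2048) == 32);  /* exact power of 2: MAX_MINRUN / 2 */
        assert(minrun_demo(2112) == 33);  /* 2112/33 == 64, an exact power of 2 */
        assert(minrun_demo(2113) == 34);  /* 2113/34 is just under 64 */
        return 0;
    }
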
@@ -2956,7 +3030,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
if (n < minrun) {
const Py_ssize_t force = nremaining <= minrun ?
nremaining : minrun;
- if (binarysort(&ms, lo, lo.keys + force, lo.keys + n) < 0)
+ if (binarysort(&ms, &lo, force, n) < 0)
goto fail;
n = force;
}
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 4f84e2c..f387d9c 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -270,9 +270,9 @@ result. This has two primary good effects:
Computing minrun
----------------
-If N < 64, minrun is N. IOW, binary insertion sort is used for the whole
-array then; it's hard to beat that given the overheads of trying something
-fancier (see note BINSORT).
+If N < MAX_MINRUN, minrun is N. IOW, binary insertion sort is used for the
+whole array then; it's hard to beat that given the overheads of trying
+something fancier (see note BINSORT).
When N is a power of 2, testing on random data showed that minrun values of
16, 32, 64 and 128 worked about equally well. At 256 the data-movement cost
@@ -310,12 +310,13 @@ place, and r < minrun is small compared to N), or q a little larger than a
power of 2 regardless of r (then we've got a case similar to "2112", again
leaving too little work for the last merge to do).
-Instead we pick a minrun in range(32, 65) such that N/minrun is exactly a
-power of 2, or if that isn't possible, is close to, but strictly less than,
-a power of 2. This is easier to do than it may sound: take the first 6
-bits of N, and add 1 if any of the remaining bits are set. In fact, that
-rule covers every case in this section, including small N and exact powers
-of 2; merge_compute_minrun() is a deceptively simple function.
+Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that
+N/minrun is exactly a power of 2, or if that isn't possible, is close to, but
+strictly less than, a power of 2. This is easier to do than it may sound:
+take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining
+bits are set. In fact, that rule covers every case in this section, including
+small N and exact powers of 2; merge_compute_minrun() is a deceptively simple
+function.
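+
+For example, with MAX_MINRUN = 64 we keep the first 6 bits of N.  N = 2112
+is binary 100001000000; its first 6 bits are 100001 = 33, and the remaining
+bits are all zero, so minrun is 33 and N/minrun is exactly 64 = 2**6.  Bump
+N to 2113 and a shifted-off bit is set, so minrun becomes 34, with 2113/34 a
+little under 64.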
The Merge Pattern