author     Tim Peters <tim.peters@gmail.com>        2021-09-06 17:54:41 (GMT)
committer  GitHub <noreply@github.com>              2021-09-06 17:54:41 (GMT)
commit     5cb4c672d855033592f0e05162f887def236c00a (patch)
tree       554ee49ff5f83295c93d945ac4ed60f8e3be1c1e /Objects
parent     19871fce3b74fc3f37e334a999e00d0ef65a8f1e (diff)
bpo-34561: Switch to Munro & Wild "powersort" merge strategy. (#28108)
For list.sort(), replace our ad hoc merge ordering strategy with the principled, elegant, and provably near-optimal one from Munro and Wild's "powersort".
Diffstat (limited to 'Objects')
-rw-r--r--    Objects/listobject.c    112
-rw-r--r--    Objects/listsort.txt    157
2 files changed, 177 insertions, 92 deletions
diff --git a/Objects/listobject.c b/Objects/listobject.c
index 898cbc2..565c11e 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1139,12 +1139,11 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
if (k)
/* The maximum number of entries in a MergeState's pending-runs stack.
- * This is enough to sort arrays of size up to about
- * 32 * phi ** MAX_MERGE_PENDING
- * where phi ~= 1.618. 85 is ridiculously large enough, good for an array
- * with 2**64 elements.
+ * For a list with n elements, this needs at most floor(log2(n)) + 1 entries
+ * even if we didn't force runs to a minimal length. So the number of bits
+ * in a Py_ssize_t is plenty large enough for all cases.
*/
-#define MAX_MERGE_PENDING 85
+#define MAX_MERGE_PENDING (SIZEOF_SIZE_T * 8)
/* When we get into galloping mode, we stay there until both runs win less
* often than MIN_GALLOP consecutive times. See listsort.txt for more info.
@@ -1159,7 +1158,8 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
*/
struct s_slice {
sortslice base;
- Py_ssize_t len;
+ Py_ssize_t len; /* length of run */
+ int power; /* node "level" for powersort merge strategy */
};
typedef struct s_MergeState MergeState;
@@ -1170,6 +1170,9 @@ struct s_MergeState {
*/
Py_ssize_t min_gallop;
+ Py_ssize_t listlen; /* len(input_list) - read only */
+ PyObject **basekeys; /* base address of keys array - read only */
+
/* 'a' is temp storage to help with merges. It contains room for
* alloced entries.
*/
@@ -1513,7 +1516,8 @@ fail:
/* Conceptually a MergeState's constructor. */
static void
-merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc)
+merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc,
+ sortslice *lo)
{
assert(ms != NULL);
if (has_keyfunc) {
@@ -1538,6 +1542,8 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc)
ms->a.keys = ms->temparray;
ms->n = 0;
ms->min_gallop = MIN_GALLOP;
+ ms->listlen = list_size;
+ ms->basekeys = lo->keys;
}
/* Free all the temp memory owned by the MergeState. This must be called
@@ -1920,37 +1926,74 @@ merge_at(MergeState *ms, Py_ssize_t i)
return merge_hi(ms, ssa, na, ssb, nb);
}
-/* Examine the stack of runs waiting to be merged, merging adjacent runs
- * until the stack invariants are re-established:
- *
- * 1. len[-3] > len[-2] + len[-1]
- * 2. len[-2] > len[-1]
+/* Two adjacent runs begin at index s1. The first run has length n1, and
+ * the second run (starting at index s1+n1) has length n2. The list has total
+ * length n.
+ * Compute the "power" of the first run. See listsort.txt for details.
+ */
+static int
+powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n)
+{
+ int result = 0;
+ assert(s1 >= 0);
+ assert(n1 > 0 && n2 > 0);
+ assert(s1 + n1 + n2 <= n);
+ /* midpoints a and b:
+ * a = s1 + n1/2
+ * b = s1 + n1 + n2/2 = a + (n1 + n2)/2
+ *
+ * Those may not be integers, though, because of the "/2". So we work with
+ * 2*a and 2*b instead, which are necessarily integers. It makes no
+ * difference to the outcome, since the bits in the expansion of (2*i)/n
+ * are merely shifted one position from those of i/n.
+ */
+ Py_ssize_t a = 2 * s1 + n1; /* 2*a */
+ Py_ssize_t b = a + n1 + n2; /* 2*b */
+ /* Emulate a/n and b/n one bit at a time, until bits differ. */
+ for (;;) {
+ ++result;
+ if (a >= n) { /* both quotient bits are 1 */
+ assert(b >= a);
+ a -= n;
+ b -= n;
+ }
+ else if (b >= n) { /* a/n bit is 0, b/n bit is 1 */
+ break;
+ } /* else both quotient bits are 0 */
+ assert(a < b && b < n);
+ a <<= 1;
+ b <<= 1;
+ }
+ return result;
+}
+
+/* The next run has been identified, of length n2.
+ * If there's already a run on the stack, apply the "powersort" merge strategy:
+ * compute the topmost run's "power" (depth in a conceptual binary merge tree)
+ * and merge adjacent runs on the stack with greater power. See listsort.txt
+ * for more info.
*
- * See listsort.txt for more info.
+ * It's the caller's responsibility to push the new run on the stack when this
+ * returns.
*
* Returns 0 on success, -1 on error.
*/
static int
-merge_collapse(MergeState *ms)
+found_new_run(MergeState *ms, Py_ssize_t n2)
{
- struct s_slice *p = ms->pending;
-
assert(ms);
- while (ms->n > 1) {
- Py_ssize_t n = ms->n - 2;
- if ((n > 0 && p[n-1].len <= p[n].len + p[n+1].len) ||
- (n > 1 && p[n-2].len <= p[n-1].len + p[n].len)) {
- if (p[n-1].len < p[n+1].len)
- --n;
- if (merge_at(ms, n) < 0)
+ if (ms->n) {
+ assert(ms->n > 0);
+ struct s_slice *p = ms->pending;
+ Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; /* start index */
+ Py_ssize_t n1 = p[ms->n - 1].len;
+ int power = powerloop(s1, n1, n2, ms->listlen);
+ while (ms->n > 1 && p[ms->n - 2].power > power) {
+ if (merge_at(ms, ms->n - 2) < 0)
return -1;
}
- else if (p[n].len <= p[n+1].len) {
- if (merge_at(ms, n) < 0)
- return -1;
- }
- else
- break;
+ assert(ms->n < 2 || p[ms->n - 2].power < power);
+ p[ms->n - 1].power = power;
}
return 0;
}
@@ -2357,7 +2400,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
}
/* End of pre-sort check: ms is now set properly! */
- merge_init(&ms, saved_ob_size, keys != NULL);
+ merge_init(&ms, saved_ob_size, keys != NULL, &lo);
nremaining = saved_ob_size;
if (nremaining < 2)
@@ -2393,13 +2436,16 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
goto fail;
n = force;
}
- /* Push run onto pending-runs stack, and maybe merge. */
+ /* Maybe merge pending runs. */
+ assert(ms.n == 0 || ms.pending[ms.n - 1].base.keys +
+ ms.pending[ms.n - 1].len == lo.keys);
+ if (found_new_run(&ms, n) < 0)
+ goto fail;
+ /* Push new run on stack. */
assert(ms.n < MAX_MERGE_PENDING);
ms.pending[ms.n].base = lo;
ms.pending[ms.n].len = n;
++ms.n;
- if (merge_collapse(&ms) < 0)
- goto fail;
/* Advance to find next run. */
sortslice_advance(&lo, n);
nremaining -= n;
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 174777a..306e5e4 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -318,65 +318,104 @@ merging must be done as (A+B)+C or A+(B+C) instead.
So merging is always done on two consecutive runs at a time, and in-place,
although this may require some temp memory (more on that later).
-When a run is identified, its base address and length are pushed on a stack
-in the MergeState struct. merge_collapse() is then called to potentially
-merge runs on that stack. We would like to delay merging as long as possible
-in order to exploit patterns that may come up later, but we like even more to
-do merging as soon as possible to exploit that the run just found is still
-high in the memory hierarchy. We also can't delay merging "too long" because
-it consumes memory to remember the runs that are still unmerged, and the
-stack has a fixed size.
-
-What turned out to be a good compromise maintains two invariants on the
-stack entries, where A, B and C are the lengths of the three rightmost not-yet
-merged slices:
-
-1. A > B+C
-2. B > C
-
-Note that, by induction, #2 implies the lengths of pending runs form a
-decreasing sequence. #1 implies that, reading the lengths right to left,
-the pending-run lengths grow at least as fast as the Fibonacci numbers.
-Therefore the stack can never grow larger than about log_base_phi(N) entries,
-where phi = (1+sqrt(5))/2 ~= 1.618. Thus a small # of stack slots suffice
-for very large arrays.
-
-If A <= B+C, the smaller of A and C is merged with B (ties favor C, for the
-freshness-in-cache reason), and the new run replaces the A,B or B,C entries;
-e.g., if the last 3 entries are
-
- A:30 B:20 C:10
-
-then B is merged with C, leaving
-
- A:30 BC:30
-
-on the stack. Or if they were
-
- A:500 B:400 C:1000
-
-then A is merged with B, leaving
-
- AB:900 C:1000
-
-on the stack.
-
-In both examples, the stack configuration after the merge still violates
-invariant #2, and merge_collapse() goes on to continue merging runs until
-both invariants are satisfied. As an extreme case, suppose we didn't do the
-minrun gimmick, and natural runs were of lengths 128, 64, 32, 16, 8, 4, 2,
-and 2. Nothing would get merged until the final 2 was seen, and that would
-trigger 7 perfectly balanced merges.
-
-The thrust of these rules when they trigger merging is to balance the run
-lengths as closely as possible, while keeping a low bound on the number of
-runs we have to remember. This is maximally effective for random data,
-where all runs are likely to be of (artificially forced) length minrun, and
-then we get a sequence of perfectly balanced merges (with, perhaps, some
-oddballs at the end).
-
-OTOH, one reason this sort is so good for partly ordered data has to do
-with wildly unbalanced run lengths.
+When a run is identified, its length is passed to found_new_run() to
+potentially merge runs on a stack of pending runs. We would like to delay
+merging as long as possible in order to exploit patterns that may come up
+later, but we like even more to do merging as soon as possible to exploit
+that the run just found is still high in the memory hierarchy. We also can't
+delay merging "too long" because it consumes memory to remember the runs that
+are still unmerged, and the stack has a fixed size.
+
+The original version of this code used the first thing I made up that didn't
+obviously suck ;-) It was loosely based on invariants involving the Fibonacci
+sequence.
+
+It worked OK, but it was hard to reason about, and was subtle enough that the
+intended invariants weren't actually preserved. Researchers discovered that
+when trying to complete a computer-generated correctness proof. That was
+easily-enough repaired, but the discovery spurred quite a bit of academic
+interest in truly good ways to manage incremental merging on the fly.
+
+At least a dozen different approaches were developed, some provably having
+near-optimal worst case behavior with respect to the entropy of the
+distribution of run lengths. Some details can be found in bpo-34561.
+
+The code now uses the "powersort" merge strategy from:
+
+ "Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods
+ That Optimally Adapt to Existing Runs"
+ J. Ian Munro and Sebastian Wild
+
+The code is pretty simple, but the justification is quite involved, as it's
+based on fast approximations to optimal binary search trees, which are
+substantial topics on their own.
+
+Here we'll just cover some pragmatic details:
+
+The `powerloop()` function computes a run's "power". Say two adjacent runs
+begin at index s1. The first run has length n1, and the second run (starting
+at index s1+n1, called "s2" below) has length n2. The list has total length n.
+The "power" of the first run is a small integer, the depth of the node
+connecting the two runs in an ideal binary merge tree, where power 1 is the
+root node, and the power increases by 1 for each level deeper in the tree.
+
+The power is the least integer L such that the "midpoint interval" contains
+a rational number of the form J/2**L. The midpoint interval is the semi-
+closed interval:
+
+ ((s1 + n1/2)/n, (s2 + n2/2)/n]
+
+Yes, that's brain-busting at first ;-) Concretely, if (s1 + n1/2)/n and
+(s2 + n2/2)/n are computed to infinite precision in binary, the power L is
+the first position at which the 2**-L bit differs between the expansions.
+Since the left end of the interval is less than the right end, the first
+differing bit must be a 0 bit in the left quotient and a 1 bit in the right
+quotient.
+
+`powerloop()` emulates these divisions, 1 bit at a time, using comparisons,
+subtractions, and shifts in a loop.
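For concreteness, here's a small worked example (made up for illustration,
not taken from the paper). Say n = 16, the topmost pending run starts at
s1 = 8 with length n1 = 2, and the newly found run has length n2 = 4. Then

    a = s1 + n1/2 = 9           a/n =  9/16 = 0.1001 (binary)
    b = s1 + n1 + n2/2 = 12     b/n = 12/16 = 0.1100 (binary)

The 2**-1 bits agree (both are 1) and the 2**-2 bits differ, so the power is
2; equivalently, the midpoint interval (9/16, 12/16] contains 3/4 = 3/2**2
but no multiple of 1/2.  powerloop() reaches the same answer working with
2*a = 18 and 2*b = 24:

    pass 1: result = 1; a = 18 >= 16, so both leading bits are 1;
            subtract n from both (a = 2, b = 8), then shift (a = 4, b = 16)
    pass 2: result = 2; a = 4 < 16 but b = 16 >= 16, so the bits differ,
            and 2 is returned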
+
+You'll notice the paper uses an O(1) method instead, but that relies on two
+things we don't have:
+
+- An O(1) "count leading zeroes" primitive. We can find such a thing as a C
+ extension on most platforms, but not all, and there's no uniform spelling
+ on the platforms that support it.
+
+- Integer division on an integer type twice as wide as needed to hold the
+  list length. But the list length is a Py_ssize_t for us, and that's
+  typically the widest native signed integer type the platform supports.
+
+But since runs in our algorithm are almost never very short, the once-per-run
+overhead of `powerloop()` seems lost in the noise.
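Just for illustration, here's a rough sketch of what such an O(1) version
could look like on a compiler that happens to supply both missing pieces.
This is not what listobject.c does; the function name is made up, and it
assumes GCC/Clang's non-standard unsigned __int128 type and the
__builtin_clzll intrinsic:

    #include <stdint.h>

    /* Hypothetical O(1) "power" computation -- a sketch only, NOT CPython's
     * powerloop().  Needs double-width (128-bit) division and a count-
     * leading-zeros builtin, neither of which is portable C. */
    static int
    power_clz_sketch(int64_t s1, int64_t n1, int64_t n2, int64_t n)
    {
        unsigned __int128 a = 2 * (unsigned __int128)s1 + n1; /* 2*(s1 + n1/2) */
        unsigned __int128 b = a + n1 + n2;                    /* 2*(s2 + n2/2) */
        /* Top 64 fractional bits of the midpoint quotients, i.e.
         * floor((a / (2*n)) * 2**64) and the same for b.  They can't be
         * equal, because the quotients differ by at least 1/n > 2**-64. */
        uint64_t qa = (uint64_t)((a << 63) / (unsigned __int128)n);
        uint64_t qb = (uint64_t)((b << 63) / (unsigned __int128)n);
        /* The power is the position of the first differing fraction bit. */
        return __builtin_clzll(qa ^ qb) + 1;
    }

Modulo boundary cases not checked here, this computes the same thing the
bit-at-a-time loop does, but the 128-bit division is exactly the kind of
double-width arithmetic the second point above says we can't count on.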
+
+Detail: why is Py_ssize_t "wide enough" in `powerloop()`? We do, after all,
+shift integers of that width left by 1. How do we know that won't spill into
+the sign bit? The trick is that we have some slop. `n` (the total list
+length) is the number of list elements, which is at most 4 times (on a 32-bit
+box, with 4-byte pointers) smaller than the largest size_t. So at least the
+leading two bits of the integers we're using are clear.
+
+Since we can't compute a run's power before seeing the run that follows it,
+the most recently identified run is never merged by `found_new_run()`.
+Instead the new run is only used to compute the 2nd-most-recent run's power.
+Then adjacent runs are merged so long as their saved power (tree depth) is
+greater than that newly computed power. Only after `found_new_run()` returns
+is the new run pushed onto the stack of pending runs.
+
+A key invariant is that the powers saved on the run stack are strictly
+decreasing from the top of the stack down to the bottom.
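To make the mechanism concrete, here's a small made-up example (not from the
paper). Say n = 32 and the natural runs, in the order found, have lengths
8, 4, 4 and 16:

    run A (starts at  0, len  8): pushed; no power assigned yet
    run B (starts at  8, len  4): found_new_run() gives A power 2
                                  stack is A(2) B
    run C (starts at 12, len  4): gives B power 3; 2 is not > 3, so no merge
                                  stack is A(2) B(3) C
    run D (starts at 16, len 16): gives C power 1; 3 > 1 merges B with C,
                                  then 2 > 1 merges A with that result
                                  stack is ABC(1) D

When the input is exhausted, whatever is left on the stack is merged by the
final collapse pass (which this diff doesn't touch), here ABC with D: a
perfectly balanced 16-by-16 final merge. Note how the arrival of the long
run D is what forced the earlier short runs to be merged among themselves
first.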
+
+Note that even powersort's strategy isn't always truly optimal. It can't be.
+Computing an optimal merge sequence can be done in time quadratic in the
+number of runs, which is very much slower, and also requires finding &
+remembering _all_ the runs' lengths (of which there may be billions) in
+advance. It's remarkable, though, how close to optimal this strategy gets.
+
+Curious factoid: of all the alternatives I've seen in the literature,
+powersort's is the only one that's always truly optimal for a collection of 3
+run lengths (for three lengths A B C, it's always optimal to first merge the
+shorter of A and C with B).
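Here's that factoid worked out on the 30-20-10 example from the old text
above (worked out here for illustration, not taken from the paper), with
n = 60:

    run A (starts at  0, len 30): gets power 1 when B is found
    run B (starts at 30, len 20): gets power 2 when C is found
    run C (starts at 50, len 10)

Since 1 is not greater than 2, found_new_run() never merges anything early
here. But the powers say the node joining B and C sits deeper in the merge
tree than the node joining A and B, so B and C are merged first (moving 30
elements) and the result is then merged with A (moving 60), for 90 elements
moved in total, versus 110 for merging A with B first. That is, the shorter
of A and C was merged with B, as claimed.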
Merge Memory