From 5cb4c672d855033592f0e05162f887def236c00a Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 6 Sep 2021 12:54:41 -0500 Subject: bpo-34561: Switch to Munro & Wild "powersort" merge strategy. (#28108) For list.sort(), replace our ad hoc merge ordering strategy with the principled, elegant, and provably near-optimal one from Munro and Wild's "powersort". --- .../2021-09-01-19-21-48.bpo-34561.uMAVA-.rst | 1 + Objects/listobject.c | 112 ++++++++++----- Objects/listsort.txt | 157 +++++++++++++-------- 3 files changed, 178 insertions(+), 92 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst b/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst new file mode 100644 index 0000000..7c48cb3 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst @@ -0,0 +1 @@ +List sorting now uses the merge-ordering strategy from Munro and Wild's ``powersort()``. Unlike the former strategy, this is provably near-optimal in the entropy of the distribution of run lengths. Most uses of ``list.sort()`` probably won't see a significant time difference, but may see significant improvements in cases where the former strategy was exceptionally poor. However, as these are all fast linear-time approximations to a problem that's inherently at best quadratic-time to solve truly optimally, it's also possible to contrive cases where the former strategy did better. \ No newline at end of file diff --git a/Objects/listobject.c b/Objects/listobject.c index 898cbc2..565c11e 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1139,12 +1139,11 @@ sortslice_advance(sortslice *slice, Py_ssize_t n) if (k) /* The maximum number of entries in a MergeState's pending-runs stack. - * This is enough to sort arrays of size up to about - * 32 * phi ** MAX_MERGE_PENDING - * where phi ~= 1.618. 
85 is ridiculouslylarge enough, good for an array - * with 2**64 elements. + * For a list with n elements, this needs at most floor(log2(n)) + 1 entries + * even if we didn't force runs to a minimal length. So the number of bits + * in a Py_ssize_t is plenty large enough for all cases. */ -#define MAX_MERGE_PENDING 85 +#define MAX_MERGE_PENDING (SIZEOF_SIZE_T * 8) /* When we get into galloping mode, we stay there until both runs win less * often than MIN_GALLOP consecutive times. See listsort.txt for more info. @@ -1159,7 +1158,8 @@ sortslice_advance(sortslice *slice, Py_ssize_t n) */ struct s_slice { sortslice base; - Py_ssize_t len; + Py_ssize_t len; /* length of run */ + int power; /* node "level" for powersort merge strategy */ }; typedef struct s_MergeState MergeState; @@ -1170,6 +1170,9 @@ struct s_MergeState { */ Py_ssize_t min_gallop; + Py_ssize_t listlen; /* len(input_list) - read only */ + PyObject **basekeys; /* base address of keys array - read only */ + /* 'a' is temp storage to help with merges. It contains room for * alloced entries. */ @@ -1513,7 +1516,8 @@ fail: /* Conceptually a MergeState's constructor. */ static void -merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc) +merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc, + sortslice *lo) { assert(ms != NULL); if (has_keyfunc) { @@ -1538,6 +1542,8 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc) ms->a.keys = ms->temparray; ms->n = 0; ms->min_gallop = MIN_GALLOP; + ms->listlen = list_size; + ms->basekeys = lo->keys; } /* Free all the temp memory owned by the MergeState. This must be called @@ -1920,37 +1926,74 @@ merge_at(MergeState *ms, Py_ssize_t i) return merge_hi(ms, ssa, na, ssb, nb); } -/* Examine the stack of runs waiting to be merged, merging adjacent runs - * until the stack invariants are re-established: - * - * 1. len[-3] > len[-2] + len[-1] - * 2. len[-2] > len[-1] +/* Two adjacent runs begin at index s1. 
The first run has length n1, and + * the second run (starting at index s1+n1) has length n2. The list has total + * length n. + * Compute the "power" of the first run. See listsort.txt for details. + */ +static int +powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n) +{ + int result = 0; + assert(s1 >= 0); + assert(n1 > 0 && n2 > 0); + assert(s1 + n1 + n2 <= n); + /* midpoints a and b: + * a = s1 + n1/2 + * b = s1 + n1 + n2/2 = a + (n1 + n2)/2 + * + * Those may not be integers, though, because of the "/2". So we work with + * 2*a and 2*b instead, which are necessarily integers. It makes no + * difference to the outcome, since the bits in the expansion of (2*i)/n + * are merely shifted one position from those of i/n. + */ + Py_ssize_t a = 2 * s1 + n1; /* 2*a */ + Py_ssize_t b = a + n1 + n2; /* 2*b */ + /* Emulate a/n and b/n one bit at a time, until bits differ. */ + for (;;) { + ++result; + if (a >= n) { /* both quotient bits are 1 */ + assert(b >= a); + a -= n; + b -= n; + } + else if (b >= n) { /* a/n bit is 0, b/n bit is 1 */ + break; + } /* else both quotient bits are 0 */ + assert(a < b && b < n); + a <<= 1; + b <<= 1; + } + return result; +} + +/* The next run has been identified, of length n2. + * If there's already a run on the stack, apply the "powersort" merge strategy: + * compute the topmost run's "power" (depth in a conceptual binary merge tree) + * and merge adjacent runs on the stack with greater power. See listsort.txt + * for more info. - * See listsort.txt for more info. + * It's the caller's responsibility to push the new run on the stack when this + * returns. * * Returns 0 on success, -1 on error. 
*/ static int -merge_collapse(MergeState *ms) +found_new_run(MergeState *ms, Py_ssize_t n2) { - struct s_slice *p = ms->pending; - assert(ms); - while (ms->n > 1) { - Py_ssize_t n = ms->n - 2; - if ((n > 0 && p[n-1].len <= p[n].len + p[n+1].len) || - (n > 1 && p[n-2].len <= p[n-1].len + p[n].len)) { - if (p[n-1].len < p[n+1].len) - --n; - if (merge_at(ms, n) < 0) + if (ms->n) { + assert(ms->n > 0); + struct s_slice *p = ms->pending; + Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; /* start index */ + Py_ssize_t n1 = p[ms->n - 1].len; + int power = powerloop(s1, n1, n2, ms->listlen); + while (ms->n > 1 && p[ms->n - 2].power > power) { + if (merge_at(ms, ms->n - 2) < 0) return -1; } - else if (p[n].len <= p[n+1].len) { - if (merge_at(ms, n) < 0) - return -1; - } - else - break; + assert(ms->n < 2 || p[ms->n - 2].power < power); + p[ms->n - 1].power = power; } return 0; } @@ -2357,7 +2400,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) } /* End of pre-sort check: ms is now set properly! */ - merge_init(&ms, saved_ob_size, keys != NULL); + merge_init(&ms, saved_ob_size, keys != NULL, &lo); nremaining = saved_ob_size; if (nremaining < 2) @@ -2393,13 +2436,16 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) goto fail; n = force; } - /* Push run onto pending-runs stack, and maybe merge. */ + /* Maybe merge pending runs. */ + assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + + ms.pending[ms.n-1].len == lo.keys); + if (found_new_run(&ms, n) < 0) + goto fail; + /* Push new run on stack. */ assert(ms.n < MAX_MERGE_PENDING); ms.pending[ms.n].base = lo; ms.pending[ms.n].len = n; ++ms.n; - if (merge_collapse(&ms) < 0) - goto fail; /* Advance to find next run. */ sortslice_advance(&lo, n); nremaining -= n; diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 174777a..306e5e4 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -318,65 +318,104 @@ merging must be done as (A+B)+C or A+(B+C) instead. 
So merging is always done on two consecutive runs at a time, and in-place, although this may require some temp memory (more on that later). -When a run is identified, its base address and length are pushed on a stack -in the MergeState struct. merge_collapse() is then called to potentially -merge runs on that stack. We would like to delay merging as long as possible -in order to exploit patterns that may come up later, but we like even more to -do merging as soon as possible to exploit that the run just found is still -high in the memory hierarchy. We also can't delay merging "too long" because -it consumes memory to remember the runs that are still unmerged, and the -stack has a fixed size. - -What turned out to be a good compromise maintains two invariants on the -stack entries, where A, B and C are the lengths of the three rightmost not-yet -merged slices: - -1. A > B+C -2. B > C - -Note that, by induction, #2 implies the lengths of pending runs form a -decreasing sequence. #1 implies that, reading the lengths right to left, -the pending-run lengths grow at least as fast as the Fibonacci numbers. -Therefore the stack can never grow larger than about log_base_phi(N) entries, -where phi = (1+sqrt(5))/2 ~= 1.618. Thus a small # of stack slots suffice -for very large arrays. - -If A <= B+C, the smaller of A and C is merged with B (ties favor C, for the -freshness-in-cache reason), and the new run replaces the A,B or B,C entries; -e.g., if the last 3 entries are - - A:30 B:20 C:10 - -then B is merged with C, leaving - - A:30 BC:30 - -on the stack. Or if they were - - A:500 B:400: C:1000 - -then A is merged with B, leaving - - AB:900 C:1000 - -on the stack. - -In both examples, the stack configuration after the merge still violates -invariant #2, and merge_collapse() goes on to continue merging runs until -both invariants are satisfied. As an extreme case, suppose we didn't do the -minrun gimmick, and natural runs were of lengths 128, 64, 32, 16, 8, 4, 2, -and 2. 
Nothing would get merged until the final 2 was seen, and that would -trigger 7 perfectly balanced merges. - -The thrust of these rules when they trigger merging is to balance the run -lengths as closely as possible, while keeping a low bound on the number of -runs we have to remember. This is maximally effective for random data, -where all runs are likely to be of (artificially forced) length minrun, and -then we get a sequence of perfectly balanced merges (with, perhaps, some -oddballs at the end). - -OTOH, one reason this sort is so good for partly ordered data has to do -with wildly unbalanced run lengths. +When a run is identified, its length is passed to found_new_run() to +potentially merge runs on a stack of pending runs. We would like to delay +merging as long as possible in order to exploit patterns that may come up +later, but we like even more to do merging as soon as possible to exploit +that the run just found is still high in the memory hierarchy. We also can't +delay merging "too long" because it consumes memory to remember the runs that +are still unmerged, and the stack has a fixed size. + +The original version of this code used the first thing I made up that didn't +obviously suck ;-) It was loosely based on invariants involving the Fibonacci +sequence. + +It worked OK, but it was hard to reason about, and was subtle enough that the +intended invariants weren't actually preserved. Researchers discovered that +when trying to complete a computer-generated correctness proof. That was +easily-enough repaired, but the discovery spurred quite a bit of academic +interest in truly good ways to manage incremental merging on the fly. + +At least a dozen different approaches were developed, some provably having +near-optimal worst case behavior with respect to the entropy of the +distribution of run lengths. Some details can be found in bpo-34561. 
+ +The code now uses the "powersort" merge strategy from: + + "Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods + That Optimally Adapt to Existing Runs" + J. Ian Munro and Sebastian Wild + +The code is pretty simple, but the justification is quite involved, as it's +based on fast approximations to optimal binary search trees, which are +substantial topics on their own. + +Here we'll just cover some pragmatic details: + +The `powerloop()` function computes a run's "power". Say two adjacent runs +begin at index s1. The first run has length n1, and the second run (starting +at index s1+n1, called "s2" below) has length n2. The list has total length n. +The "power" of the first run is a small integer, the depth of the node +connecting the two runs in an ideal binary merge tree, where power 1 is the +root node, and the power increases by 1 for each level deeper in the tree. + +The power is the least integer L such that the "midpoint interval" contains +a rational number of the form J/2**L. The midpoint interval is the semi- +closed interval: + + ((s1 + n1/2)/n, (s2 + n2/2)/n] + +Yes, that's brain-busting at first ;-) Concretely, if (s1 + n1/2)/n and +(s2 + n2/2)/n are computed to infinite precision in binary, the power L is +the first position at which the 2**-L bit differs between the expansions. +Since the left end of the interval is less than the right end, the first +differing bit must be a 0 bit in the left quotient and a 1 bit in the right +quotient. + +`powerloop()` emulates these divisions, 1 bit at a time, using comparisons, +subtractions, and shifts in a loop. + +You'll notice the paper uses an O(1) method instead, but that relies on two +things we don't have: + +- An O(1) "count leading zeroes" primitive. We can find such a thing as a C + extension on most platforms, but not all, and there's no uniform spelling + on the platforms that support it. + +- Integer division on an integer type twice as wide as needed to hold the 
But the latter is Py_ssize_t for us, and is typically the + widest native signed integer type the platform supports. + +But since runs in our algorithm are almost never very short, the once-per-run +overhead of `powerloop()` seems lost in the noise. + +Detail: why is Py_ssize_t "wide enough" in `powerloop()`? We do, after all, +shift integers of that width left by 1. How do we know that won't spill into +the sign bit? The trick is that we have some slop. `n` (the total list +length) is the number of list elements, which is at most 4 times (on a 32-box, +with 4-byte pointers) smaller than than the largest size_t. So at least the +leading two bits of the integers we're using are clear. + +Since we can't compute a run's power before seeing the run that follows it, +the most-recently identified run is never merged by `found_new_run()`. +Instead a new run is only used to compute the 2nd-most-recent run's power. +Then adjacent runs are merged so long as their saved power (tree depth) is +greater than that newly computed power. When found_new_run() returns, only +then is a new run pushed on to the stack of pending runs. + +A key invariant is that powers on the run stack are strictly decreasing +(starting from the run at the top of the stack). + +Note that even powersort's strategy isn't always truly optimal. It can't be. +Computing an optimal merge sequence can be done in time quadratic in the +number of runs, which is very much slower, and also requires finding & +remembering _all_ the runs' lengths (of which there may be billions) in +advance. It's remarkable, though, how close to optimal this strategy gets. + +Curious factoid: of all the alternatives I've seen in the literature, +powersort's is the only one that's always truly optimal for a collection of 3 +run lengths (for three lengths A B C, it's always optimal to first merge the +shorter of A and C with B). Merge Memory -- cgit v0.12