diff options
-rw-r--r-- | Objects/listobject.c | 84 | ||||
-rw-r--r-- | Objects/listsort.txt | 61 |
2 files changed, 92 insertions, 53 deletions
diff --git a/Objects/listobject.c b/Objects/listobject.c index 7fad905..cea8597 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1122,11 +1122,10 @@ fail: */ #define MAX_MERGE_PENDING 85 -/* If a run wins MIN_GALLOP times in a row, we switch to galloping mode, - * and stay there until both runs win less often than MIN_GALLOP - * consecutive times. See listsort.txt for more info. +/* When we get into galloping mode, we stay there until both runs win less + * often than MIN_GALLOP consecutive times. See listsort.txt for more info. */ -#define MIN_GALLOP 8 +#define MIN_GALLOP 7 /* Avoid malloc for small temp arrays. */ #define MERGESTATE_TEMP_SIZE 256 @@ -1134,10 +1133,21 @@ fail: /* One MergeState exists on the stack per invocation of mergesort. It's just * a convenient way to pass state around among the helper functions. */ +struct s_slice { + PyObject **base; + int len; +}; + typedef struct s_MergeState { /* The user-supplied comparison function. or NULL if none given. */ PyObject *compare; + /* This controls when we get *into* galloping mode. It's initialized + * to MIN_GALLOP. merge_lo and merge_hi tend to nudge it higher for + * random data, and lower for highly structured data. + */ + int min_gallop; + /* 'a' is temp storage to help with merges. It contains room for * alloced entries. */ @@ -1148,14 +1158,13 @@ typedef struct s_MergeState { * address base[i] and extends for len[i] elements. It's always * true (so long as the indices are in bounds) that * - * base[i] + len[i] == base[i+1] + * pending[i].base + pending[i].len == pending[i+1].base * * so we could cut the storage for this, but it's a minor amount, * and keeping all the info explicit simplifies the code. */ int n; - PyObject **base[MAX_MERGE_PENDING]; - int len[MAX_MERGE_PENDING]; + struct s_slice pending[MAX_MERGE_PENDING]; /* 'a' points to this when possible, rather than muck with malloc. */ PyObject *temparray[MERGESTATE_TEMP_SIZE]; @@ -1170,6 +1179,7 @@ merge_init(MergeState *ms, PyObject *compare) ms->a = ms->temparray; ms->alloced = MERGESTATE_TEMP_SIZE; ms->n = 0; + ms->min_gallop = MIN_GALLOP; } /* Free all the temp memory owned by the MergeState. This must be called @@ -1224,6 +1234,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) PyObject *compare; PyObject **dest; int result = -1; /* guilty until proved innocent */ + int min_gallop = ms->min_gallop; assert(ms && pa && pb && na > 0 && nb > 0 && pa + na == pb); if (MERGE_GETMEM(ms, na) < 0) @@ -1248,6 +1259,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * appears to win consistently. */ for (;;) { + assert(na > 1 && nb > 0); k = ISLT(*pb, *pa, compare); if (k) { if (k < 0) @@ -1258,7 +1270,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --nb; if (nb == 0) goto Succeed; - if (bcount >= MIN_GALLOP) + if (bcount >= min_gallop) break; } else { @@ -1268,7 +1280,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --na; if (na == 1) goto CopyB; - if (acount >= MIN_GALLOP) + if (acount >= min_gallop) break; } } @@ -1278,7 +1290,11 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * (if ever) neither run appears to be winning consistently * anymore. */ + ++min_gallop; do { + assert(na > 1 && nb > 0); + min_gallop -= min_gallop > 1; + ms->min_gallop = min_gallop; k = gallop_right(*pb, pa, na, 0, compare); acount = k; if (k) { @@ -1319,6 +1335,8 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) if (na == 1) goto CopyB; } while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP); + ++min_gallop; /* penalize it for leaving galloping mode */ + ms->min_gallop = min_gallop; } Succeed: result = 0; @@ -1349,6 +1367,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) int result = -1; /* guilty until proved innocent */ PyObject **basea; PyObject **baseb; + int min_gallop = ms->min_gallop; assert(ms && pa && pb && na > 0 && nb > 0 && pa + na == pb); if (MERGE_GETMEM(ms, nb) < 0) @@ -1376,6 +1395,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * appears to win consistently. */ for (;;) { + assert(na > 0 && nb > 1); k = ISLT(*pb, *pa, compare); if (k) { if (k < 0) @@ -1386,7 +1406,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --na; if (na == 0) goto Succeed; - if (acount >= MIN_GALLOP) + if (acount >= min_gallop) break; } else { @@ -1396,7 +1416,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --nb; if (nb == 1) goto CopyA; - if (bcount >= MIN_GALLOP) + if (bcount >= min_gallop) break; } } @@ -1406,7 +1426,11 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * (if ever) neither run appears to be winning consistently * anymore. */ + ++min_gallop; do { + assert(na > 0 && nb > 1); + min_gallop -= min_gallop > 1; + ms->min_gallop = min_gallop; k = gallop_right(*pb, basea, na, na-1, compare); if (k < 0) goto Fail; @@ -1449,6 +1473,8 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) if (na == 0) goto Succeed; } while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP); + ++min_gallop; /* penalize it for leaving galloping mode */ + ms->min_gallop = min_gallop; } Succeed: result = 0; @@ -1482,10 +1508,10 @@ merge_at(MergeState *ms, int i) assert(i >= 0); assert(i == ms->n - 2 || i == ms->n - 3); - pa = ms->base[i]; - pb = ms->base[i+1]; - na = ms->len[i]; - nb = ms->len[i+1]; + pa = ms->pending[i].base; + na = ms->pending[i].len; + pb = ms->pending[i+1].base; + nb = ms->pending[i+1].len; assert(na > 0 && nb > 0); assert(pa + na == pb); @@ -1493,11 +1519,9 @@ merge_at(MergeState *ms, int i) * run now, also slide over the last run (which isn't involved * in this merge). The current run i+1 goes away in any case. */ - if (i == ms->n - 3) { - ms->len[i+1] = ms->len[i+2]; - ms->base[i+1] = ms->base[i+2]; - } - ms->len[i] = na + nb; + ms->pending[i].len = na + nb; + if (i == ms->n - 3) + ms->pending[i+1] = ms->pending[i+2]; --ms->n; /* Where does b start in a? Elements in a before that can be @@ -1541,18 +1565,18 @@ merge_at(MergeState *ms, int i) static int merge_collapse(MergeState *ms) { - int *len = ms->len; + struct s_slice *p = ms->pending; assert(ms); while (ms->n > 1) { int n = ms->n - 2; - if (n > 0 && len[n-1] <= len[n] + len[n+1]) { - if (len[n-1] < len[n+1]) + if (n > 0 && p[n-1].len <= p[n].len + p[n+1].len) { + if (p[n-1].len < p[n+1].len) --n; if (merge_at(ms, n) < 0) return -1; } - else if (len[n] <= len[n+1]) { + else if (p[n].len <= p[n+1].len) { if (merge_at(ms, n) < 0) return -1; } @@ -1570,12 +1594,12 @@ merge_collapse(MergeState *ms) static int merge_force_collapse(MergeState *ms) { - int *len = ms->len; + struct s_slice *p = ms->pending; assert(ms); while (ms->n > 1) { int n = ms->n - 2; - if (n > 0 && len[n-1] < len[n+1]) + if (n > 0 && p[n-1].len < p[n+1].len) --n; if (merge_at(ms, n) < 0) return -1; @@ -1664,8 +1688,8 @@ listsort(PyListObject *self, PyObject *args) } /* Push run onto pending-runs stack, and maybe merge. */ assert(ms.n < MAX_MERGE_PENDING); - ms.base[ms.n] = lo; - ms.len[ms.n] = n; + ms.pending[ms.n].base = lo; + ms.pending[ms.n].len = n; ++ms.n; if (merge_collapse(&ms) < 0) goto fail; @@ -1678,8 +1702,8 @@ listsort(PyListObject *self, PyObject *args) if (merge_force_collapse(&ms) < 0) goto fail; assert(ms.n == 1); - assert(ms.base[0] == self->ob_item); - assert(ms.len[0] == self->ob_size); + assert(ms.pending[0].base == self->ob_item); + assert(ms.pending[0].len == self->ob_size); succeed: result = Py_None; diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 545ce51..c561288 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -95,31 +95,31 @@ Comparison with Python's Samplesort Hybrid below that, it's either astronomically lucky, or is finding exploitable structure in the data. - n lg(n!) *sort 3sort +sort %sort ~sort !sort -------- ------- ------ -------- ------- ------ ------- -------- - 32768 444255 453096 453614 32908 452871 130491 469141 old - 449235 33019 33016 51328 188720 65534 new - 0.86% 1273.80% -0.33% 782.31% -30.85% 615.87% %ch from new + n lg(n!) *sort 3sort +sort %sort ~sort !sort +------- ------- ------ ------- ------- ------ ------- -------- + 32768 444255 453096 453614 32908 452871 130491 469141 old + 448885 33016 33007 50426 182083 65534 new + 0.94% 1273.92% -0.30% 798.09% -28.33% 615.87% %ch from new 65536 954037 972699 981940 65686 973104 260029 1004607 - 963714 65824 65809 103409 377634 131070 - 0.93% 1391.77% -0.19% 841.02% -31.14% 666.47% + 962991 65821 65808 101667 364341 131070 + 1.01% 1391.83% -0.19% 857.15% -28.63% 666.47% 131072 2039137 2101881 2091491 131232 2092894 554790 2161379 - 2059092 131413 131362 209130 755476 262142 - 2.08% 1491.54% -0.10% 900.76% -26.56% 724.51% + 2057533 131410 131361 206193 728871 262142 + 2.16% 1491.58% -0.10% 915.02% -23.88% 724.51% 262144 4340409 4464460 4403233 262314 4445884 1107842 4584560 - 4380687 262440 262460 421998 1511174 524286 - 1.91% 1577.81% -0.06% 953.53% -26.69% 774.44% + 4377402 262437 262459 416347 1457945 524286 + 1.99% 1577.82% -0.06% 967.83% -24.01% 774.44% 524288 9205096 9453356 9408463 524468 9441930 2218577 9692015 - 9285709 524581 524634 848590 3022584 1048574 - 1.81% 1693.52% -0.03% 1012.66% -26.60% 824.30% + 9278734 524580 524633 837947 2916107 1048574 + 1.88% 1693.52% -0.03% 1026.79% -23.92% 824.30% 1048576 19458756 19950272 19838588 1048766 19912134 4430649 20434212 - 19621118 1048960 1048942 1715806 6045418 2097150 - 1.68% 1791.26% -0.02% 1060.51% -26.71% 874.38% + 19606028 1048958 1048941 1694896 5832445 2097150 + 1.76% 1791.27% -0.02% 1074.83% -24.03% 874.38% Discussion of cases: @@ -171,7 +171,6 @@ Comparison with Python's Samplesort Hybrid bytes each on this box) needed by each test, again with arguments "15 20 1": - 2**i *sort \sort /sort 3sort +sort %sort ~sort =sort !sort 32768 16384 0 0 6256 0 10821 12288 0 16383 65536 32766 0 0 21652 0 31276 24576 0 32767 @@ -430,6 +429,11 @@ etc. We stay in galloping mode until both searches find slices to copy less than MIN_GALLOP elements long, at which point we go back to one-pair- at-a-time mode. +A refinement: The MergeState struct contains the value of min_gallop that +controls when we enter galloping mode, initialized to MIN_GALLOP. +merge_lo() and merge_hi() adjust this higher when gallooping isn't paying +off, and lower when it is. + Galloping --------- @@ -536,13 +540,21 @@ at the other values. At and after i=6, galloping always wins. We can't guess in advance when it's going to win, though, so we do one pair at a time until the evidence seems strong that galloping may pay. MIN_GALLOP -is 8 as I type this, and that's pretty strong evidence. However, if the data -is random, it simply will trigger galloping mode purely by luck every now -and again, and it's quite likely to hit one of the losing cases next. 8 -favors protecting against a slowdown on random data at the expense of giving -up small wins on lightly clustered data, and tiny marginal wins on highly -clustered data (they win huge anyway, and if you're getting a factor of -10 speedup, another percent just isn't worth fighting for). +is 7, and that's pretty strong evidence. However, if the data is random, it +simply will trigger galloping mode purely by luck every now and again, and +it's quite likely to hit one of the losing cases next. On the other hand, +in cases like ~sort, galloping always pays, and MIN_GALLOP is larger than it +"should be" then. So the MergeState struct keeps a min_gallop variable +that merge_lo and merge_hi adjust: the longer we stay in galloping mode, +the smaller min_gallop gets, making it easier to transition back to +galloping mode (if we ever leave it in the current merge, and at the +start of the next merge). But whenever the gallop loop doesn't pay, +min_gallop is increased by one, making it harder to transition to back +to galloping mode (and again both within a merge and across merges). For +random data, this all but eliminates the gallop penalty: min_gallop grows +large enough that we almost never get into galloping mode. And for cases +like ~sort, min_gallop can fall to as low as 1. This seems to work well, +but in all it's a minor improvement over using a fixed MIN_GALLOP value. Galloping Complication @@ -567,6 +579,9 @@ wildly unbalanced runs already enjoys excellent performance. Comparing Average # of Compares on Random Arrays ------------------------------------------------ +[NOTE: This was done when the new algorithm used about 0.1% more compares + on random data than does its current incarnation.] + Here list.sort() is samplesort, and list.msort() this sort: """ |