summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2015-05-11 17:19:03 (GMT)
committerRaymond Hettinger <python@rcn.com>2015-05-11 17:19:03 (GMT)
commitbc33e57d56434d6feb99b77f10a3bc884f3a61ce (patch)
treef9a2e2571496c639ba80eb54be8900364f77dc4d
parenta33df31629f2f6ed85890baa9b4e71c30efa95a9 (diff)
downloadcpython-bc33e57d56434d6feb99b77f10a3bc884f3a61ce.zip
cpython-bc33e57d56434d6feb99b77f10a3bc884f3a61ce.tar.gz
cpython-bc33e57d56434d6feb99b77f10a3bc884f3a61ce.tar.bz2
Issue #24155: Optimize heapify for better cache utililzation.
-rw-r--r--Misc/NEWS3
-rw-r--r--Modules/_heapqmodule.c72
2 files changed, 75 insertions, 0 deletions
diff --git a/Misc/NEWS b/Misc/NEWS
index cc2ff1d..c8f66bb 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -41,6 +41,9 @@ Library
- Issue #21795: smtpd now supports the 8BITMIME extension whenever
the new *decode_data* constructor argument is set to False.
+- Issue #24155: optimize heapq.heapify() for better cache performance
+ when heapifying large lists.
+
- Issue #21800: imaplib now supports RFC 5161 (enable), RFC 6855
(utf8/internationalized email) and automatically encodes non-ASCII
usernames and passwords to UTF8.
diff --git a/Modules/_heapqmodule.c b/Modules/_heapqmodule.c
index 4372ad4..380489e 100644
--- a/Modules/_heapqmodule.c
+++ b/Modules/_heapqmodule.c
@@ -250,6 +250,71 @@ PyDoc_STRVAR(heappushpop_doc,
from the heap. The combined action runs more efficiently than\n\
heappush() followed by a separate call to heappop().");
+static Py_ssize_t
+keep_top_bit(Py_ssize_t n)
+{
+ int i = 0;
+
+ while (n > 1) {
+ i += 1;
+ n >>= 1;
+ }
+ return n << i;
+}
+
+/* Cache friendly version of heapify()
+ -----------------------------------
+
+ Build-up a heap in O(n) time by performing siftup() operations
+ on nodes whose children are already heaps.
+
+ The simplest way is to sift the nodes in reverse order from
+ n//2-1 to 0 inclusive. The downside is that children may be
+ out of cache by the time their parent is reached.
+
+ A better way is to not wait for the children to go out of cache.
+ Once a sibling pair of child nodes have been sifted, immediately
+ sift their parent node (while the children are still in cache).
+
+ Both ways build child heaps before their parents, so both ways
+ do the exact same number of comparisons and produce exactly
+ the same heap. The only difference is that the traversal
+ order is optimized for cache efficiency.
+*/
+
+static PyObject *
+cache_friendly_heapify(PyObject *heap, int siftup_func(PyListObject *, Py_ssize_t))
+{
+ Py_ssize_t i, j, m, mhalf, leftmost;
+
+ m = PyList_GET_SIZE(heap) >> 1; /* index of first childless node */
+ leftmost = keep_top_bit(m + 1) - 1; /* leftmost node in row of m */
+ mhalf = m >> 1; /* parent of first childless node */
+
+ for (i = leftmost - 1 ; i >= mhalf ; i--) {
+ j = i;
+ while (1) {
+ if (siftup_func((PyListObject *)heap, j))
+ return NULL;
+ if (!(j & 1))
+ break;
+ j >>= 1;
+ }
+ }
+
+ for (i = m - 1 ; i >= leftmost ; i--) {
+ j = i;
+ while (1) {
+ if (siftup_func((PyListObject *)heap, j))
+ return NULL;
+ if (!(j & 1))
+ break;
+ j >>= 1;
+ }
+ }
+ Py_RETURN_NONE;
+}
+
static PyObject *
heapify_internal(PyObject *heap, int siftup_func(PyListObject *, Py_ssize_t))
{
@@ -260,7 +325,14 @@ heapify_internal(PyObject *heap, int siftup_func(PyListObject *, Py_ssize_t))
return NULL;
}
+ /* For heaps likely to be bigger than L1 cache, we use the cache
+ friendly heapify function. For smaller heaps that fit entirely
+ in cache, we prefer the simpler algorithm with less branching.
+ */
n = PyList_GET_SIZE(heap);
+ if (n > 10000)
+ return cache_friendly_heapify(heap, siftup_func);
+
/* Transform bottom-up. The largest index there's any point to
looking at is the largest with a child index in-range, so must
have 2*i + 1 < n, or i < (n-1)/2. If n is even = 2*j, this is