summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2013-03-05 07:15:01 (GMT)
committerRaymond Hettinger <python@rcn.com>2013-03-05 07:15:01 (GMT)
commit3e6aafe20993f5b3690c870bf88eed3d8ba53492 (patch)
tree1419be731e75dd4da77ce5a35f2afdb154f46f5f
parentdce969d2b05238755771bd9c65d86df182ab8390 (diff)
downloadcpython-3e6aafe20993f5b3690c870bf88eed3d8ba53492.zip
cpython-3e6aafe20993f5b3690c870bf88eed3d8ba53492.tar.gz
cpython-3e6aafe20993f5b3690c870bf88eed3d8ba53492.tar.bz2
Issue #16098: Update heapq.nsmallest to use the same algorithm as nlargest.
This removes the dependency on bisect and it bring the pure Python code in-sync with the C code.
-rw-r--r--Lib/heapq.py84
1 files changed, 59 insertions, 25 deletions
diff --git a/Lib/heapq.py b/Lib/heapq.py
index 6a4e0f4..ca79db1 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -129,9 +129,8 @@ From all times, sorting has always been a Great Art! :-)
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
'nlargest', 'nsmallest', 'heappushpop']
-from itertools import islice, repeat, count, imap, izip, tee, chain
+from itertools import islice, count, imap, izip, tee, chain
from operator import itemgetter
-import bisect
def cmp_lt(x, y):
# Use __lt__ if available; otherwise, try __le__.
@@ -188,6 +187,19 @@ def heapify(x):
for i in reversed(xrange(n//2)):
_siftup(x, i)
+def _heappushpop_max(heap, item):
+ """Maxheap version of a heappush followed by a heappop."""
+ if heap and cmp_lt(item, heap[0]):
+ item, heap[0] = heap[0], item
+ _siftup_max(heap, 0)
+ return item
+
+def _heapify_max(x):
+ """Transform list into a maxheap, in-place, in O(len(x)) time."""
+ n = len(x)
+ for i in reversed(range(n//2)):
+ _siftup_max(x, i)
+
def nlargest(n, iterable):
"""Find the n largest elements in a dataset.
@@ -213,30 +225,16 @@ def nsmallest(n, iterable):
"""
if n < 0:
return []
- if hasattr(iterable, '__len__') and n * 10 <= len(iterable):
- # For smaller values of n, the bisect method is faster than a minheap.
- # It is also memory efficient, consuming only n elements of space.
- it = iter(iterable)
- result = sorted(islice(it, 0, n))
- if not result:
- return result
- insort = bisect.insort
- pop = result.pop
- los = result[-1] # los --> Largest of the nsmallest
- for elem in it:
- if cmp_lt(elem, los):
- insort(result, elem)
- pop()
- los = result[-1]
+ it = iter(iterable)
+ result = list(islice(it, n))
+ if not result:
return result
- # An alternative approach manifests the whole iterable in memory but
- # saves comparisons by heapifying all at once. Also, saves time
- # over bisect.insort() which has O(n) data movement time for every
- # insertion. Finding the n smallest of an m length iterable requires
- # O(m) + O(n log m) comparisons.
- h = list(iterable)
- heapify(h)
- return map(heappop, repeat(h, min(n, len(h))))
+ _heapify_max(result)
+ _heappushpop = _heappushpop_max
+ for elem in it:
+ _heappushpop(result, elem)
+ result.sort()
+ return result
# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos
# is the index of a leaf with a possibly out-of-order value. Restore the
@@ -314,6 +312,42 @@ def _siftup(heap, pos):
heap[pos] = newitem
_siftdown(heap, startpos, pos)
+def _siftdown_max(heap, startpos, pos):
+ 'Maxheap variant of _siftdown'
+ newitem = heap[pos]
+ # Follow the path to the root, moving parents down until finding a place
+ # newitem fits.
+ while pos > startpos:
+ parentpos = (pos - 1) >> 1
+ parent = heap[parentpos]
+ if cmp_lt(parent, newitem):
+ heap[pos] = parent
+ pos = parentpos
+ continue
+ break
+ heap[pos] = newitem
+
+def _siftup_max(heap, pos):
+ 'Maxheap variant of _siftup'
+ endpos = len(heap)
+ startpos = pos
+ newitem = heap[pos]
+ # Bubble up the larger child until hitting a leaf.
+ childpos = 2*pos + 1 # leftmost child position
+ while childpos < endpos:
+ # Set childpos to index of larger child.
+ rightpos = childpos + 1
+ if rightpos < endpos and not cmp_lt(heap[rightpos], heap[childpos]):
+ childpos = rightpos
+ # Move the larger child up.
+ heap[pos] = heap[childpos]
+ pos = childpos
+ childpos = 2*pos + 1
+ # The leaf at pos is empty now. Put newitem there, and bubble it up
+ # to its final resting place (by sifting its parents down).
+ heap[pos] = newitem
+ _siftdown_max(heap, startpos, pos)
+
# If available, use C implementation
try:
from _heapq import *