summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2013-03-05 06:36:30 (GMT)
committerRaymond Hettinger <python@rcn.com>2013-03-05 06:36:30 (GMT)
commitf6b26676bc97f2f023cbfeabec5583895e9e054f (patch)
tree310f331a8e844faa02891b9e73d029e1b20bb74d
parent31584e30ab3e8d01613bb774b1e3d28e73314096 (diff)
downloadcpython-f6b26676bc97f2f023cbfeabec5583895e9e054f.zip
cpython-f6b26676bc97f2f023cbfeabec5583895e9e054f.tar.gz
cpython-f6b26676bc97f2f023cbfeabec5583895e9e054f.tar.bz2
Issue #16098: Update heapq.nsmallest to use the same algorithm as nlargest.
This removes the dependency on bisect and it bring the pure Python code in-sync with the C code.
-rw-r--r--Lib/heapq.py84
1 files changed, 59 insertions, 25 deletions
diff --git a/Lib/heapq.py b/Lib/heapq.py
index dec15ae..6859821 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -127,8 +127,7 @@ From all times, sorting has always been a Great Art! :-)
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
'nlargest', 'nsmallest', 'heappushpop']
-from itertools import islice, repeat, count, tee, chain
-import bisect
+from itertools import islice, count, tee, chain
def heappush(heap, item):
"""Push item onto heap, maintaining the heap invariant."""
@@ -180,6 +179,19 @@ def heapify(x):
for i in reversed(range(n//2)):
_siftup(x, i)
+def _heappushpop_max(heap, item):
+ """Maxheap version of a heappush followed by a heappop."""
+ if heap and item < heap[0]:
+ item, heap[0] = heap[0], item
+ _siftup_max(heap, 0)
+ return item
+
+def _heapify_max(x):
+ """Transform list into a maxheap, in-place, in O(len(x)) time."""
+ n = len(x)
+ for i in reversed(range(n//2)):
+ _siftup_max(x, i)
+
def nlargest(n, iterable):
"""Find the n largest elements in a dataset.
@@ -205,30 +217,16 @@ def nsmallest(n, iterable):
"""
if n < 0:
return []
- if hasattr(iterable, '__len__') and n * 10 <= len(iterable):
- # For smaller values of n, the bisect method is faster than a minheap.
- # It is also memory efficient, consuming only n elements of space.
- it = iter(iterable)
- result = sorted(islice(it, 0, n))
- if not result:
- return result
- insort = bisect.insort
- pop = result.pop
- los = result[-1] # los --> Largest of the nsmallest
- for elem in it:
- if elem < los:
- insort(result, elem)
- pop()
- los = result[-1]
+ it = iter(iterable)
+ result = list(islice(it, n))
+ if not result:
return result
- # An alternative approach manifests the whole iterable in memory but
- # saves comparisons by heapifying all at once. Also, saves time
- # over bisect.insort() which has O(n) data movement time for every
- # insertion. Finding the n smallest of an m length iterable requires
- # O(m) + O(n log m) comparisons.
- h = list(iterable)
- heapify(h)
- return list(map(heappop, repeat(h, min(n, len(h)))))
+ _heapify_max(result)
+ _heappushpop = _heappushpop_max
+ for elem in it:
+ _heappushpop(result, elem)
+ result.sort()
+ return result
# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos
# is the index of a leaf with a possibly out-of-order value. Restore the
@@ -306,6 +304,42 @@ def _siftup(heap, pos):
heap[pos] = newitem
_siftdown(heap, startpos, pos)
+def _siftdown_max(heap, startpos, pos):
+ 'Maxheap variant of _siftdown'
+ newitem = heap[pos]
+ # Follow the path to the root, moving parents down until finding a place
+ # newitem fits.
+ while pos > startpos:
+ parentpos = (pos - 1) >> 1
+ parent = heap[parentpos]
+ if parent < newitem:
+ heap[pos] = parent
+ pos = parentpos
+ continue
+ break
+ heap[pos] = newitem
+
+def _siftup_max(heap, pos):
+ 'Minheap variant of _siftup'
+ endpos = len(heap)
+ startpos = pos
+ newitem = heap[pos]
+ # Bubble up the larger child until hitting a leaf.
+ childpos = 2*pos + 1 # leftmost child position
+ while childpos < endpos:
+ # Set childpos to index of larger child.
+ rightpos = childpos + 1
+ if rightpos < endpos and not heap[rightpos] < heap[childpos]:
+ childpos = rightpos
+ # Move the larger child up.
+ heap[pos] = heap[childpos]
+ pos = childpos
+ childpos = 2*pos + 1
+ # The leaf at pos is empty now. Put newitem there, and bubble it up
+ # to its final resting place (by sifting its parents down).
+ heap[pos] = newitem
+ _siftdown_max(heap, startpos, pos)
+
# If available, use C implementation
try:
from _heapq import *