summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2004-06-12 08:33:36 (GMT)
committerRaymond Hettinger <python@rcn.com>2004-06-12 08:33:36 (GMT)
commitb25aa36f83a3cd2200f2bc479e594458e27794a3 (patch)
treeafdd8051a7b152562b6a4a006fe7fe814113021e
parent2e6694086f07d293d1907891f68cec6076d44f73 (diff)
downloadcpython-b25aa36f83a3cd2200f2bc479e594458e27794a3.zip
cpython-b25aa36f83a3cd2200f2bc479e594458e27794a3.tar.gz
cpython-b25aa36f83a3cd2200f2bc479e594458e27794a3.tar.bz2
Improve the memory performance and speed of heapq.nsmallest() by using
an alternate algorithm when the number of selected items is small relative to the full iterable.
-rw-r--r--Lib/heapq.py23
-rw-r--r--Lib/test/test_heapq.py1
2 files changed, 24 insertions, 0 deletions
diff --git a/Lib/heapq.py b/Lib/heapq.py
index d1aad98..65f4155 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -130,6 +130,7 @@ __all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'nlargest',
'nsmallest']
from itertools import islice, repeat
+import bisect
def heappush(heap, item):
"""Push item onto heap, maintaining the heap invariant."""
@@ -196,6 +197,28 @@ def nsmallest(iterable, n):
Equivalent to: sorted(iterable)[:n]
"""
+ if hasattr(iterable, '__len__') and n * 10 <= len(iterable):
+ # For smaller values of n, the bisect method is faster than a minheap.
+ # It is also memory efficient, consuming only n elements of space.
+ it = iter(iterable)
+ result = sorted(islice(it, 0, n))
+ if not result:
+ return result
+ insort = bisect.insort
+ pop = result.pop
+ los = result[-1] # los --> Largest of the nsmallest
+ for elem in it:
+ if los <= elem:
+ continue
+ insort(result, elem)
+ pop()
+ los = result[-1]
+ return result
+ # An alternative approach manifests the whole iterable in memory but
+ # saves comparisons by heapifying all at once. Also, saves time
+ # over bisect.insort() which has O(n) data movement time for every
+ # insertion. Finding the n smallest of an m length iterable requires
+ # O(m) + O(n log m) comparisons.
h = list(iterable)
heapify(h)
return map(heappop, repeat(h, min(n, len(h))))
diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py
index 944b17d..1cdaabe 100644
--- a/Lib/test/test_heapq.py
+++ b/Lib/test/test_heapq.py
@@ -92,6 +92,7 @@ class TestHeap(unittest.TestCase):
def test_nsmallest(self):
data = [random.randrange(2000) for i in range(1000)]
self.assertEqual(nsmallest(data, 400), sorted(data)[:400])
+ self.assertEqual(nsmallest(data, 50), sorted(data)[:50])
def test_largest(self):
data = [random.randrange(2000) for i in range(1000)]