summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/lib/libheapq.tex15
-rw-r--r--Lib/heapq.py42
-rw-r--r--Lib/test/test_heapq.py10
-rw-r--r--Misc/NEWS2
4 files changed, 64 insertions, 5 deletions
diff --git a/Doc/lib/libheapq.tex b/Doc/lib/libheapq.tex
index 5f3d8c5..32cc25e 100644
--- a/Doc/lib/libheapq.tex
+++ b/Doc/lib/libheapq.tex
@@ -88,7 +88,18 @@ True
>>>
\end{verbatim}
-The module also offers two general purpose functions based on heaps.
+The module also offers three general purpose functions based on heaps.
+
+\begin{funcdesc}{merge}{*iterables}
+Merge multiple sorted inputs into a single sorted output (for example, merge
+timestamped entries from multiple log files). Returns an iterator over
+over the sorted values.
+
+Similar to \code{sorted(itertools.chain(*iterables))} but returns an iterable,
+does not pull the data into memory all at once, and reduces the number of
+comparisons by assuming that each of the input streams is already sorted.
+\versionadded{2.6}
+\end{funcdesc}
\begin{funcdesc}{nlargest}{n, iterable\optional{, key}}
Return a list with the \var{n} largest elements from the dataset defined
@@ -110,7 +121,7 @@ Equivalent to: \samp{sorted(iterable, key=key)[:n]}
\versionchanged[Added the optional \var{key} argument]{2.5}
\end{funcdesc}
-Both functions perform best for smaller values of \var{n}. For larger
+The latter two functions perform best for smaller values of \var{n}. For larger
values, it is more efficient to use the \function{sorted()} function. Also,
when \code{n==1}, it is more efficient to use the builtin \function{min()}
and \function{max()} functions.
diff --git a/Lib/heapq.py b/Lib/heapq.py
index 753c3b7..b56d0f9 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -126,8 +126,8 @@ Believe me, real good tape sorts were quite spectacular to watch!
From all times, sorting has always been a Great Art! :-)
"""
-__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'nlargest',
- 'nsmallest']
+__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
+ 'nlargest', 'nsmallest']
from itertools import islice, repeat, count, imap, izip, tee
from operator import itemgetter, neg
@@ -308,6 +308,41 @@ try:
except ImportError:
pass
+def merge(*iterables):
+ '''Merge multiple sorted inputs into a single sorted output.
+
+ Similar to sorted(itertools.chain(*iterables)) but returns an iterable,
+ does not pull the data into memory all at once, and reduces the number
+ of comparisons by assuming that each of the input streams is already sorted.
+
+ >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
+ [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
+
+ '''
+ _heappop, siftup, _StopIteration = heappop, _siftup, StopIteration
+
+ h = []
+ h_append = h.append
+ for it in map(iter, iterables):
+ try:
+ next = it.next
+ h_append([next(), next])
+ except _StopIteration:
+ pass
+ heapify(h)
+
+ while 1:
+ try:
+ while 1:
+ v, next = s = h[0] # raises IndexError when h is empty
+ yield v
+ s[0] = next() # raises StopIteration when exhausted
+ siftup(h, 0) # restore heap condition
+ except _StopIteration:
+ _heappop(h) # remove empty iterator
+ except IndexError:
+ return
+
# Extend the implementations of nsmallest and nlargest to use a key= argument
_nsmallest = nsmallest
def nsmallest(n, iterable, key=None):
@@ -341,3 +376,6 @@ if __name__ == "__main__":
while heap:
sort.append(heappop(heap))
print sort
+
+ import doctest
+ doctest.testmod()
diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py
index e9f2798..b7c3ab2 100644
--- a/Lib/test/test_heapq.py
+++ b/Lib/test/test_heapq.py
@@ -1,6 +1,6 @@
"""Unittests for heapq."""
-from heapq import heappush, heappop, heapify, heapreplace, nlargest, nsmallest
+from heapq import heappush, heappop, heapify, heapreplace, merge, nlargest, nsmallest
import random
import unittest
from test import test_support
@@ -103,6 +103,14 @@ class TestHeap(unittest.TestCase):
heap_sorted = [heappop(heap) for i in range(size)]
self.assertEqual(heap_sorted, sorted(data))
+ def test_merge(self):
+ inputs = []
+ for i in xrange(random.randrange(5)):
+ row = sorted(random.randrange(1000) for j in range(random.randrange(10)))
+ inputs.append(row)
+ self.assertEqual(sorted(chain(*inputs)), list(merge(*inputs)))
+ self.assertEqual(list(merge()), [])
+
def test_nsmallest(self):
data = [(random.randrange(2000), i) for i in range(1000)]
for f in (None, lambda x: x[0] * 547 % 2000):
diff --git a/Misc/NEWS b/Misc/NEWS
index dffd5b7..8f5d57b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -128,6 +128,8 @@ Core and builtins
Library
-------
+- Added heapq.merge() for merging sorted input streams.
+
- Have the encoding package's search function dynamically import using absolute
import semantics.