From 00166c5532fc7562c07383a0ae2985b3ffaf253a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 19 Feb 2007 04:08:43 +0000 Subject: Add merge() function to heapq. --- Doc/lib/libheapq.tex | 15 +++++++++++++-- Lib/heapq.py | 42 ++++++++++++++++++++++++++++++++++++++++-- Lib/test/test_heapq.py | 10 +++++++++- Misc/NEWS | 2 ++ 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/Doc/lib/libheapq.tex b/Doc/lib/libheapq.tex index 5f3d8c5..32cc25e 100644 --- a/Doc/lib/libheapq.tex +++ b/Doc/lib/libheapq.tex @@ -88,7 +88,18 @@ True >>> \end{verbatim} -The module also offers two general purpose functions based on heaps. +The module also offers three general purpose functions based on heaps. + +\begin{funcdesc}{merge}{*iterables} +Merge multiple sorted inputs into a single sorted output (for example, merge +timestamped entries from multiple log files). Returns an iterator over +over the sorted values. + +Similar to \code{sorted(itertools.chain(*iterables))} but returns an iterable, +does not pull the data into memory all at once, and reduces the number of +comparisons by assuming that each of the input streams is already sorted. +\versionadded{2.6} +\end{funcdesc} \begin{funcdesc}{nlargest}{n, iterable\optional{, key}} Return a list with the \var{n} largest elements from the dataset defined @@ -110,7 +121,7 @@ Equivalent to: \samp{sorted(iterable, key=key)[:n]} \versionchanged[Added the optional \var{key} argument]{2.5} \end{funcdesc} -Both functions perform best for smaller values of \var{n}. For larger +The latter two functions perform best for smaller values of \var{n}. For larger values, it is more efficient to use the \function{sorted()} function. Also, when \code{n==1}, it is more efficient to use the builtin \function{min()} and \function{max()} functions. diff --git a/Lib/heapq.py b/Lib/heapq.py index 753c3b7..b56d0f9 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -126,8 +126,8 @@ Believe me, real good tape sorts were quite spectacular to watch! From all times, sorting has always been a Great Art! :-) """ -__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'nlargest', - 'nsmallest'] +__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', + 'nlargest', 'nsmallest'] from itertools import islice, repeat, count, imap, izip, tee from operator import itemgetter, neg @@ -308,6 +308,41 @@ try: except ImportError: pass +def merge(*iterables): + '''Merge multiple sorted inputs into a single sorted output. + + Similar to sorted(itertools.chain(*iterables)) but returns an iterable, + does not pull the data into memory all at once, and reduces the number + of comparisons by assuming that each of the input streams is already sorted. + + >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) + [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] + + ''' + _heappop, siftup, _StopIteration = heappop, _siftup, StopIteration + + h = [] + h_append = h.append + for it in map(iter, iterables): + try: + next = it.next + h_append([next(), next]) + except _StopIteration: + pass + heapify(h) + + while 1: + try: + while 1: + v, next = s = h[0] # raises IndexError when h is empty + yield v + s[0] = next() # raises StopIteration when exhausted + siftup(h, 0) # restore heap condition + except _StopIteration: + _heappop(h) # remove empty iterator + except IndexError: + return + # Extend the implementations of nsmallest and nlargest to use a key= argument _nsmallest = nsmallest def nsmallest(n, iterable, key=None): @@ -341,3 +376,6 @@ if __name__ == "__main__": while heap: sort.append(heappop(heap)) print sort + + import doctest + doctest.testmod() diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py index e9f2798..b7c3ab2 100644 --- a/Lib/test/test_heapq.py +++ b/Lib/test/test_heapq.py @@ -1,6 +1,6 @@ """Unittests for heapq.""" -from heapq import heappush, heappop, heapify, heapreplace, nlargest, nsmallest +from heapq import heappush, heappop, heapify, heapreplace, merge, nlargest, nsmallest import random import unittest from test import test_support @@ -103,6 +103,14 @@ class TestHeap(unittest.TestCase): heap_sorted = [heappop(heap) for i in range(size)] self.assertEqual(heap_sorted, sorted(data)) + def test_merge(self): + inputs = [] + for i in xrange(random.randrange(5)): + row = sorted(random.randrange(1000) for j in range(random.randrange(10))) + inputs.append(row) + self.assertEqual(sorted(chain(*inputs)), list(merge(*inputs))) + self.assertEqual(list(merge()), []) + def test_nsmallest(self): data = [(random.randrange(2000), i) for i in range(1000)] for f in (None, lambda x: x[0] * 547 % 2000): diff --git a/Misc/NEWS b/Misc/NEWS index dffd5b7..8f5d57b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -128,6 +128,8 @@ Core and builtins Library ------- +- Added heapq.merge() for merging sorted input streams. + - Have the encoding package's search function dynamically import using absolute import semantics. -- cgit v0.12