From 33ecffb65ae43ece95e4d828f95819395187d579 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 10 Jun 2004 05:03:17 +0000 Subject: SF patch #969791: Add nlargest() and nsmallest() to heapq. --- Doc/lib/libheapq.tex | 24 ++++++++++++++++++++++++ Doc/whatsnew/whatsnew24.tex | 5 ++++- Lib/heapq.py | 36 ++++++++++++++++++++++++++++++++++-- Lib/test/test_heapq.py | 11 ++++++++++- Misc/NEWS | 4 +++- 5 files changed, 75 insertions(+), 5 deletions(-) diff --git a/Doc/lib/libheapq.tex b/Doc/lib/libheapq.tex index 38f9b1a..4585058 100644 --- a/Doc/lib/libheapq.tex +++ b/Doc/lib/libheapq.tex @@ -83,6 +83,30 @@ True >>> \end{verbatim} +The module also offers two general purpose functions based on heaps. + +\begin{funcdesc}{nlargest}{iterable, n} +Return a list with the \var{n} largest elements from the dataset defined +by \var{iterable}. Equivalent to: \code{sorted(iterable, reverse=True)[:n]} +\versionadded{2.4} +\end{funcdesc} + +\begin{funcdesc}{nsmallest}{iterable, n} +Return a list with the \var{n} smallest elements from the dataset defined +by \var{iterable}. Equivalent to: \code{sorted(iterable)[:n]} +\versionadded{2.4} +\end{funcdesc} + +Though the above functions appear symmetrical, they each have different +speed and space requirements. In particular, \function{nsmallest()} +operates on a full copy of the dataset. In contrast, \function{nlargest()} +only requires storage space for \var{n} elements. + +Both functions perform best for smaller values of \var{n}. For larger +values, it is more efficient to use the \function{sorted()} function. Also, +when \code{n==1}, it is more efficient to use the builtin \function{min()} +and \function{max()} functions. + \subsection{Theory} diff --git a/Doc/whatsnew/whatsnew24.tex b/Doc/whatsnew/whatsnew24.tex index 3f4151f..4a67916 100644 --- a/Doc/whatsnew/whatsnew24.tex +++ b/Doc/whatsnew/whatsnew24.tex @@ -449,7 +449,10 @@ improved performance: \module{Queue}, \module{mutex}, \module{shlex} \item The \module{heapq} module has been converted to C. The resulting tenfold improvement in speed makes the module suitable for handling - high volumes of data. + high volumes of data. In addition, the module has two new functions + \function{nlargest()} and \function{nsmallest()} that use heaps to + find the largest or smallest n values in a dataset without the + expense of a full sort. \item The \module{imaplib} module now supports IMAP's THREAD command. (Contributed by Yves Dionne.) diff --git a/Lib/heapq.py b/Lib/heapq.py index 3eb69d8..d1aad98 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -30,7 +30,7 @@ without surprises: heap[0] is the smallest item, and heap.sort() maintains the heap invariant! """ -# Original code by Kevin O'Connor, augmented by Tim Peters +# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger __about__ = """Heap queues @@ -126,7 +126,10 @@ Believe me, real good tape sorts were quite spectacular to watch! From all times, sorting has always been a Great Art! :-) """ -__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace'] +__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'nlargest', + 'nsmallest'] + +from itertools import islice, repeat def heappush(heap, item): """Push item onto heap, maintaining the heap invariant.""" @@ -168,6 +171,35 @@ def heapify(x): for i in reversed(xrange(n//2)): _siftup(x, i) +def nlargest(iterable, n): + """Find the n largest elements in a dataset. + + Equivalent to: sorted(iterable, reverse=True)[:n] + """ + it = iter(iterable) + result = list(islice(it, n)) + if not result: + return result + heapify(result) + _heapreplace = heapreplace + sol = result[0] # sol --> smallest of the nlargest + for elem in it: + if elem <= sol: + continue + _heapreplace(result, elem) + sol = result[0] + result.sort(reverse=True) + return result + +def nsmallest(iterable, n): + """Find the n smallest elements in a dataset. + + Equivalent to: sorted(iterable)[:n] + """ + h = list(iterable) + heapify(h) + return map(heappop, repeat(h, min(n, len(h)))) + # 'heap' is a heap at all indices >= startpos, except possibly for pos. pos # is the index of a leaf with a possibly out-of-order value. Restore the # heap invariant. diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py index 8f3c6f9..f04ea94 100644 --- a/Lib/test/test_heapq.py +++ b/Lib/test/test_heapq.py @@ -2,7 +2,7 @@ from test.test_support import verify, vereq, verbose, TestFailed -from heapq import heappush, heappop, heapify, heapreplace +from heapq import heappush, heappop, heapify, heapreplace, nlargest, nsmallest import random def check_invariant(heap): @@ -84,6 +84,15 @@ def test_main(): data.sort() sorted = [heappop(heap) for i in range(size)] vereq(data, sorted) + + # 7) Check nlargest() and nsmallest() + data = [random.randrange(2000) for i in range(1000)] + copy = data[:] + copy.sort(reverse=True) + vereq(nlargest(data, 400), copy[:400]) + copy.sort() + vereq(nsmallest(data, 400), copy[:400]) + # Make user happy if verbose: print "All OK" diff --git a/Misc/NEWS b/Misc/NEWS index a93ae18..dd75f61 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -416,7 +416,9 @@ Library os.path.exists(), switched to using os.lstat() directly if possible. - bisect.py and heapq.py now have underlying C implementations - for better performance + for better performance. + +- heapq.py has two new functions, nsmallest() and nlargest(). - traceback.format_exc has been added (similar to print_exc but it returns a string). -- cgit v0.12