summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2004-06-10 05:03:17 (GMT)
committerRaymond Hettinger <python@rcn.com>2004-06-10 05:03:17 (GMT)
commit33ecffb65ae43ece95e4d828f95819395187d579 (patch)
tree499adce5b4fc964973a8e72baf2c6214bcef89e3
parent7d019664d7fcd3692eafef668fbc2e17126dee14 (diff)
downloadcpython-33ecffb65ae43ece95e4d828f95819395187d579.zip
cpython-33ecffb65ae43ece95e4d828f95819395187d579.tar.gz
cpython-33ecffb65ae43ece95e4d828f95819395187d579.tar.bz2
SF patch #969791: Add nlargest() and nsmallest() to heapq.
-rw-r--r--Doc/lib/libheapq.tex24
-rw-r--r--Doc/whatsnew/whatsnew24.tex5
-rw-r--r--Lib/heapq.py36
-rw-r--r--Lib/test/test_heapq.py11
-rw-r--r--Misc/NEWS4
5 files changed, 75 insertions, 5 deletions
diff --git a/Doc/lib/libheapq.tex b/Doc/lib/libheapq.tex
index 38f9b1a..4585058 100644
--- a/Doc/lib/libheapq.tex
+++ b/Doc/lib/libheapq.tex
@@ -83,6 +83,30 @@ True
>>>
\end{verbatim}
+The module also offers two general purpose functions based on heaps.
+
+\begin{funcdesc}{nlargest}{iterable, n}
+Return a list with the \var{n} largest elements from the dataset defined
+by \var{iterable}. Equivalent to: \code{sorted(iterable, reverse=True)[:n]}
+\versionadded{2.4}
+\end{funcdesc}
+
+\begin{funcdesc}{nsmallest}{iterable, n}
+Return a list with the \var{n} smallest elements from the dataset defined
+by \var{iterable}. Equivalent to: \code{sorted(iterable)[:n]}
+\versionadded{2.4}
+\end{funcdesc}
+
+Though the above functions appear symmetrical, they each have different
+speed and space requirements. In particular, \function{nsmallest()}
+operates on a full copy of the dataset. In contrast, \function{nlargest()}
+only requires storage space for \var{n} elements.
+
+Both functions perform best for smaller values of \var{n}. For larger
+values, it is more efficient to use the \function{sorted()} function. Also,
+when \code{n==1}, it is more efficient to use the builtin \function{min()}
+and \function{max()} functions.
+
\subsection{Theory}
diff --git a/Doc/whatsnew/whatsnew24.tex b/Doc/whatsnew/whatsnew24.tex
index 3f4151f..4a67916 100644
--- a/Doc/whatsnew/whatsnew24.tex
+++ b/Doc/whatsnew/whatsnew24.tex
@@ -449,7 +449,10 @@ improved performance: \module{Queue}, \module{mutex}, \module{shlex}
\item The \module{heapq} module has been converted to C. The resulting
tenfold improvement in speed makes the module suitable for handling
- high volumes of data.
+ high volumes of data. In addition, the module has two new functions
+ \function{nlargest()} and \function{nsmallest()} that use heaps to
+ find the largest or smallest n values in a dataset without the
+ expense of a full sort.
\item The \module{imaplib} module now supports IMAP's THREAD command.
(Contributed by Yves Dionne.)
diff --git a/Lib/heapq.py b/Lib/heapq.py
index 3eb69d8..d1aad98 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -30,7 +30,7 @@ without surprises: heap[0] is the smallest item, and heap.sort()
maintains the heap invariant!
"""
-# Original code by Kevin O'Connor, augmented by Tim Peters
+# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger
__about__ = """Heap queues
@@ -126,7 +126,10 @@ Believe me, real good tape sorts were quite spectacular to watch!
From all times, sorting has always been a Great Art! :-)
"""
-__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace']
+__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'nlargest',
+ 'nsmallest']
+
+from itertools import islice, repeat
def heappush(heap, item):
"""Push item onto heap, maintaining the heap invariant."""
@@ -168,6 +171,35 @@ def heapify(x):
for i in reversed(xrange(n//2)):
_siftup(x, i)
+def nlargest(iterable, n):
+ """Find the n largest elements in a dataset.
+
+ Equivalent to: sorted(iterable, reverse=True)[:n]
+ """
+ it = iter(iterable)
+ result = list(islice(it, n))
+ if not result:
+ return result
+ heapify(result)
+ _heapreplace = heapreplace
+ sol = result[0] # sol --> smallest of the nlargest
+ for elem in it:
+ if elem <= sol:
+ continue
+ _heapreplace(result, elem)
+ sol = result[0]
+ result.sort(reverse=True)
+ return result
+
+def nsmallest(iterable, n):
+ """Find the n smallest elements in a dataset.
+
+ Equivalent to: sorted(iterable)[:n]
+ """
+ h = list(iterable)
+ heapify(h)
+ return map(heappop, repeat(h, min(n, len(h))))
+
# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos
# is the index of a leaf with a possibly out-of-order value. Restore the
# heap invariant.
diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py
index 8f3c6f9..f04ea94 100644
--- a/Lib/test/test_heapq.py
+++ b/Lib/test/test_heapq.py
@@ -2,7 +2,7 @@
from test.test_support import verify, vereq, verbose, TestFailed
-from heapq import heappush, heappop, heapify, heapreplace
+from heapq import heappush, heappop, heapify, heapreplace, nlargest, nsmallest
import random
def check_invariant(heap):
@@ -84,6 +84,15 @@ def test_main():
data.sort()
sorted = [heappop(heap) for i in range(size)]
vereq(data, sorted)
+
+ # 7) Check nlargest() and nsmallest()
+ data = [random.randrange(2000) for i in range(1000)]
+ copy = data[:]
+ copy.sort(reverse=True)
+ vereq(nlargest(data, 400), copy[:400])
+ copy.sort()
+ vereq(nsmallest(data, 400), copy[:400])
+
# Make user happy
if verbose:
print "All OK"
diff --git a/Misc/NEWS b/Misc/NEWS
index a93ae18..dd75f61 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -416,7 +416,9 @@ Library
os.path.exists(), switched to using os.lstat() directly if possible.
- bisect.py and heapq.py now have underlying C implementations
- for better performance
+ for better performance.
+
+- heapq.py has two new functions, nsmallest() and nlargest().
- traceback.format_exc has been added (similar to print_exc but it returns
a string).