From f24eb35d185c0623315cfbd9977d37c509860dcf Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 12 Nov 2002 17:41:57 +0000 Subject: SF patch 629637: Add sample(population, k) method to the random module. Used for random sampling without replacement. --- Doc/lib/librandom.tex | 19 +++++++++++++++++ Lib/random.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++-- Misc/NEWS | 3 +++ 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/Doc/lib/librandom.tex b/Doc/lib/librandom.tex index 846b931..15e477b 100644 --- a/Doc/lib/librandom.tex +++ b/Doc/lib/librandom.tex @@ -179,6 +179,25 @@ Functions for sequences: long sequence can never be generated. \end{funcdesc} +\begin{funcdesc}{sample}{population, k} + Return a \var{k} length list of unique elements chosen from the + population sequence. Used for random sampling without replacement. + + Returns a new list containing elements from the population. The + list itself is in random order so that all sub-slices are also + random samples. The original sequence is left undisturbed. + + If the population has repeated elements, then each occurence is a + possible selection in the sample. + + If indices are needed for a large population, use \function{xrange} + as an argument: \code{sample(xrange(10000000), 60)}. + + Optional argument random is a 0-argument function returning a random + float in [0.0, 1.0); by default, the standard random.random. + \versionadded{2.3} +\end{funcdesc} + The following functions generate specific real-valued distributions. Function parameters are named after the corresponding variables in the diff --git a/Lib/random.py b/Lib/random.py index 4d29080..e2c675c 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -7,6 +7,7 @@ sequences --------- pick random element + pick random sample generate random permutation distributions on the real line: @@ -77,7 +78,7 @@ from math import log as _log, exp as _exp, pi as _pi, e as _e from math import sqrt as _sqrt, acos as _acos, cos as _cos, sin as _sin from math import floor as _floor -__all__ = ["Random","seed","random","uniform","randint","choice", +__all__ = ["Random","seed","random","uniform","randint","choice","sample", "randrange","shuffle","normalvariate","lognormvariate", "cunifvariate","expovariate","vonmisesvariate","gammavariate", "stdgamma","gauss","betavariate","paretovariate","weibullvariate", @@ -373,6 +374,43 @@ class Random: j = int(random() * (i+1)) x[i], x[j] = x[j], x[i] + def sample(self, population, k, random=None, int=int): + """Chooses k unique random elements from a population sequence. + + Returns a new list containing elements from the population. The + list itself is in random order so that all sub-slices are also + random samples. The original sequence is left undisturbed. + + If the population has repeated elements, then each occurence is + a possible selection in the sample. + + If indices are needed for a large population, use xrange as an + argument: sample(xrange(10000000), 60) + + Optional arg random is a 0-argument function returning a random + float in [0.0, 1.0); by default, the standard random.random. + """ + + n = len(population) + if not 0 <= k <= n: + raise ValueError, "sample larger than population" + if random is None: + random = self.random + if n < 6 * k: # if n len list takes less space than a k len dict + pool = list(population) + for i in xrange(n-1, n-k-1, -1): + j = int(random() * (i+1)) + pool[i], pool[j] = pool[j], pool[i] + return pool[-k:] + inorder = [None] * k + selections = {} + for i in xrange(k): + j = int(random() * n) + while j in selections: + j = int(random() * n) + selections[j] = inorder[i] = population[j] + return inorder # return selections in the order they were picked + ## -------------------- real-valued distributions ------------------- ## -------------------- uniform distribution ------------------- @@ -711,7 +749,19 @@ def _test_generator(n, funccall): print 'avg %g, stddev %g, min %g, max %g' % \ (avg, stddev, smallest, largest) -def _test(N=20000): +def _test_sample(n): + # For the entire allowable range of 0 <= k <= n, validate that + # the sample is of the correct length and contains only unique items + population = xrange(n) + for k in xrange(n+1): + s = sample(population, k) + assert len(dict([(elem,True) for elem in s])) == len(s) == k + +def _sample_generator(n, k): + # Return a fixed element from the sample. Validates random ordering. + return sample(xrange(n), k)[k//2] + +def _test(N=2000): print 'TWOPI =', TWOPI print 'LOG4 =', LOG4 print 'NV_MAGICCONST =', NV_MAGICCONST @@ -735,6 +785,9 @@ def _test(N=20000): _test_generator(N, 'betavariate(3.0, 3.0)') _test_generator(N, 'paretovariate(1.0)') _test_generator(N, 'weibullvariate(1.0, 1.0)') + _test_generator(N, '_sample_generator(50, 5)') # expected s.d.: 14.4 + _test_generator(N, '_sample_generator(50, 45)') # expected s.d.: 14.4 + _test_sample(1000) # Test jumpahead. s = getstate() @@ -760,6 +813,7 @@ uniform = _inst.uniform randint = _inst.randint choice = _inst.choice randrange = _inst.randrange +sample = _inst.sample shuffle = _inst.shuffle normalvariate = _inst.normalvariate lognormvariate = _inst.lognormvariate diff --git a/Misc/NEWS b/Misc/NEWS index 89f1e2f..addd5ae 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -427,6 +427,9 @@ Library - Added operator.pow(a,b) which is equivalent to a**b. +- Added random.sample(population,k) for random sampling without replacement. + Returns a k length list of unique elements chosen from the population. + - random.randrange(-sys.maxint-1, sys.maxint) no longer raises OverflowError. That is, it now accepts any combination of 'start' and 'stop' arguments so long as each is in the range of Python's -- cgit v0.12