summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2002-11-13 15:26:37 (GMT)
committerRaymond Hettinger <python@rcn.com>2002-11-13 15:26:37 (GMT)
commitc0b4034b8165ce958a23f2c865b51ae0f52040f5 (patch)
treebe7ce4addc8878e1d0f009202906b34d86b50f79 /Lib
parent674dae245a61b6519238cac8529c6ac83721f880 (diff)
downloadcpython-c0b4034b8165ce958a23f2c865b51ae0f52040f5.zip
cpython-c0b4034b8165ce958a23f2c865b51ae0f52040f5.tar.gz
cpython-c0b4034b8165ce958a23f2c865b51ae0f52040f5.tar.bz2
Improved clarity and thoroughness of docstring.
Added design notes in comments. Used better variable names. Eliminated the unsavory "pool[-k:]" which was an aspiring bug (for k==0). Used if/else to show the two algorithms in parallel style. Added one more test assertion.
Diffstat (limited to 'Lib')
-rw-r--r--Lib/random.py61
1 files changed, 41 insertions, 20 deletions
diff --git a/Lib/random.py b/Lib/random.py
index 1e832e2..f57ddb7 100644
--- a/Lib/random.py
+++ b/Lib/random.py
@@ -377,39 +377,59 @@ class Random:
def sample(self, population, k, random=None, int=int):
"""Chooses k unique random elements from a population sequence.
- Returns a new list containing elements from the population. The
- list itself is in random order so that all sub-slices are also
- random samples. The original sequence is left undisturbed.
+ Returns a new list containing elements from the population while
+ leaving the original population unchanged. The resulting list is
+ in selection order so that all sub-slices will also be valid random
+ samples. This allows raffle winners (the sample) to be partitioned
+ into grand prize and second place winners (the subslices).
- If the population has repeated elements, then each occurrence is
- a possible selection in the sample.
+ Members of the population need not be hashable or unique. If the
+ population contains repeats, then each occurrence is a possible
+ selection in the sample.
- If indices are needed for a large population, use xrange as an
- argument: sample(xrange(10000000), 60)
+ To choose a sample in a range of integers, use xrange as an argument.
+ This is especially fast and space efficient for sampling from a
+ large population: sample(xrange(10000000), 60)
Optional arg random is a 0-argument function returning a random
float in [0.0, 1.0); by default, the standard random.random.
"""
+ # Sampling without replacement entails tracking either potential
+ # selections (the pool) or previous selections.
+
+ # Pools are stored in lists which provide __getitem__ for selection
+ # and provide a way to remove selections. But each list.remove()
+ # rebuilds the entire list, so it is better to rearrange the list,
+ # placing non-selected elements at the head of the list. Tracking
+ # the selection pool is only space efficient with small populations.
+
+ # Previous selections are stored in dictionaries which provide
+ # __contains__ for detecting repeat selections. Discarding repeats
+ # is efficient unless most of the population has already been chosen.
+ # So, tracking selections is useful when sample sizes are much
+ # smaller than the total population.
+
n = len(population)
if not 0 <= k <= n:
raise ValueError, "sample larger than population"
if random is None:
random = self.random
+ result = [None] * k
if n < 6 * k: # if n len list takes less space than a k len dict
- pool = list(population)
- for i in xrange(n-1, n-k-1, -1):
- j = int(random() * (i+1))
- pool[i], pool[j] = pool[j], pool[i]
- return pool[-k:]
- inorder = [None] * k
- selections = {}
- for i in xrange(k):
- j = int(random() * n)
- while j in selections:
+ pool = list(population) # track potential selections
+ for i in xrange(k):
+ j = int(random() * (n-i)) # non-selected at [0,n-i)
+ result[i] = pool[j] # save selected element
+ pool[j] = pool[n-i-1] # non-selected to head of list
+ else:
+ selected = {} # track previous selections
+ for i in xrange(k):
j = int(random() * n)
- selections[j] = inorder[i] = population[j]
- return inorder # return selections in the order they were picked
+ while j in selected: # discard and replace repeats
+ j = int(random() * n)
+ result[i] = selected[j] = population[j]
+ return result # return selections in the order they were picked
## -------------------- real-valued distributions -------------------
@@ -756,6 +776,7 @@ def _test_sample(n):
for k in xrange(n+1):
s = sample(population, k)
assert len(dict([(elem,True) for elem in s])) == len(s) == k
+ assert None not in s
def _sample_generator(n, k):
# Return a fixed element from the sample. Validates random ordering.
@@ -787,7 +808,7 @@ def _test(N=2000):
_test_generator(N, 'weibullvariate(1.0, 1.0)')
_test_generator(N, '_sample_generator(50, 5)') # expected s.d.: 14.4
_test_generator(N, '_sample_generator(50, 45)') # expected s.d.: 14.4
- _test_sample(1000)
+ _test_sample(500)
# Test jumpahead.
s = getstate()