summaryrefslogtreecommitdiffstats
path: root/Lib/test/test_tuple.py
diff options
context:
space:
mode:
authorjdemeyer <jdemeyer@cage.ugent.be>2018-10-28 00:06:38 (GMT)
committerRaymond Hettinger <rhettinger@users.noreply.github.com>2018-10-28 00:06:38 (GMT)
commitaeb1be5868623c9cd9cf6d7de3015a43fb005815 (patch)
tree8a4254e9e0077067e9d58525d3a8c8897a20aced /Lib/test/test_tuple.py
parent53125a53f483db0af76249b6af6efcdc200eb421 (diff)
downloadcpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.zip
cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.gz
cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.bz2
bpo-34751: improved hash function for tuples (GH-9471)
Diffstat (limited to 'Lib/test/test_tuple.py')
-rw-r--r--Lib/test/test_tuple.py111
1 files changed, 93 insertions, 18 deletions
diff --git a/Lib/test/test_tuple.py b/Lib/test/test_tuple.py
index 84c064f..929f853 100644
--- a/Lib/test/test_tuple.py
+++ b/Lib/test/test_tuple.py
@@ -62,29 +62,104 @@ class TupleTest(seq_tests.CommonTest):
yield i
self.assertEqual(list(tuple(f())), list(range(1000)))
- def test_hash(self):
- # See SF bug 942952: Weakness in tuple hash
- # The hash should:
- # be non-commutative
- # should spread-out closely spaced values
- # should not exhibit cancellation in tuples like (x,(x,y))
- # should be distinct from element hashes: hash(x)!=hash((x,))
- # This test exercises those cases.
- # For a pure random hash and N=50, the expected number of occupied
- # buckets when tossing 252,600 balls into 2**32 buckets
- # is 252,592.6, or about 7.4 expected collisions. The
- # standard deviation is 2.73. On a box with 64-bit hash
- # codes, no collisions are expected. Here we accept no
- # more than 15 collisions. Any worse and the hash function
- # is sorely suspect.
-
+ # Various tests for hashing of tuples to check that we get few collisions.
+ #
+ # Earlier versions of the tuple hash algorithm had collisions
+ # reported at:
+ # - https://bugs.python.org/issue942952
+ # - https://bugs.python.org/issue34751
+ #
+ # Notes:
+ # - The hash of tuples is deterministic: if the test passes once on a given
+ # system, it will always pass. So the probabilities mentioned in the
+ # test_hash functions below should be interpreted assuming that the
+ # hashes are random.
+ # - Due to the structure in the testsuite inputs, collisions are not
+ # independent. For example, if hash((a,b)) == hash((c,d)), then also
+ # hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
+ # independence anyway.
+ # - We limit the hash to 32 bits in the tests to have a good test on
+ # 64-bit systems too. Furthermore, this is also a sanity check that the
+ # lower 32 bits of a 64-bit hash are sufficiently random too.
+ def test_hash1(self):
+ # Check for hash collisions between small integers in range(50) and
+ # certain tuples and nested tuples of such integers.
N=50
base = list(range(N))
xp = [(i, j) for i in base for j in base]
inps = base + [(i, j) for i in base for j in xp] + \
[(i, j) for i in xp for j in base] + xp + list(zip(base))
- collisions = len(inps) - len(set(map(hash, inps)))
- self.assertTrue(collisions <= 15)
+ self.assertEqual(len(inps), 252600)
+ hashes = set(hash(x) % 2**32 for x in inps)
+ collisions = len(inps) - len(hashes)
+
+ # For a pure random 32-bit hash and N = 252,600 test items, the
+ # expected number of collisions equals
+ #
+ # 2**(-32) * N(N-1)/2 = 7.4
+ #
+ # We allow up to 15 collisions, which suffices to make the test
+ # pass with 99.5% confidence.
+ self.assertLessEqual(collisions, 15)
+
+ def test_hash2(self):
+ # Check for hash collisions between small integers (positive and
+ # negative), tuples and nested tuples of such integers.
+
+ # All numbers in the interval [-n, ..., n] except -1 because
+ # hash(-1) == hash(-2).
+ n = 5
+ A = [x for x in range(-n, n+1) if x != -1]
+
+ B = A + [(a,) for a in A]
+
+ L2 = [(a,b) for a in A for b in A]
+ L3 = L2 + [(a,b,c) for a in A for b in A for c in A]
+ L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A]
+
+ # T = list of testcases. These consist of all (possibly nested
+ # at most 2 levels deep) tuples containing at most 4 items from
+ # the set A.
+ T = A
+ T += [(a,) for a in B + L4]
+ T += [(a,b) for a in L3 for b in B]
+ T += [(a,b) for a in L2 for b in L2]
+ T += [(a,b) for a in B for b in L3]
+ T += [(a,b,c) for a in B for b in B for c in L2]
+ T += [(a,b,c) for a in B for b in L2 for c in B]
+ T += [(a,b,c) for a in L2 for b in B for c in B]
+ T += [(a,b,c,d) for a in B for b in B for c in B for d in B]
+ self.assertEqual(len(T), 345130)
+ hashes = set(hash(x) % 2**32 for x in T)
+ collisions = len(T) - len(hashes)
+
+ # For a pure random 32-bit hash and N = 345,130 test items, the
+ # expected number of collisions equals
+ #
+ # 2**(-32) * N(N-1)/2 = 13.9
+ #
+ # We allow up to 20 collisions, which suffices to make the test
+ # pass with 95.5% confidence.
+ self.assertLessEqual(collisions, 20)
+
+ def test_hash3(self):
+ # Check for hash collisions between tuples containing 0.0 and 0.5.
+ # The hashes of 0.0 and 0.5 itself differ only in one high bit.
+ # So this implicitly tests propagation of high bits to low bits.
+ from itertools import product
+ T = list(product([0.0, 0.5], repeat=18))
+ self.assertEqual(len(T), 262144)
+ hashes = set(hash(x) % 2**32 for x in T)
+ collisions = len(T) - len(hashes)
+
+ # For a pure random 32-bit hash and N = 262,144 test items, the
+ # expected number of collisions equals
+ #
+ # 2**(-32) * N(N-1)/2 = 8.0
+ #
+ # We allow up to 15 collisions, which suffices to make the test
+ # pass with 99.1% confidence.
+ self.assertLessEqual(collisions, 15)
def test_repr(self):
l0 = tuple()