author | Tim Peters <tim.peters@gmail.com> | 2019-02-08 21:09:26 (GMT) |
---|---|---|
committer | Raymond Hettinger <rhettinger@users.noreply.github.com> | 2019-02-08 21:09:26 (GMT) |
commit | 7ab3d1573c123fdd582a239d01f8475651df38f2 (patch) | |
tree | 5304e087b566e0750c67285240188b9fb77eea4a /Lib/test/test_tuple.py | |
parent | 5741c45acf9b0ce22ff0dbf56322fe0ff16cfcfc (diff) | |
download | cpython-7ab3d1573c123fdd582a239d01f8475651df38f2.zip cpython-7ab3d1573c123fdd582a239d01f8475651df38f2.tar.gz cpython-7ab3d1573c123fdd582a239d01f8475651df38f2.tar.bz2 |
Rework tuple hash tests. (GH-10161)
Add tooling that will be useful in future updates, paying particular attention to difficult cases where only the upper bits of the input vary.
Diffstat (limited to 'Lib/test/test_tuple.py')
-rw-r--r-- | Lib/test/test_tuple.py | 351 |
1 file changed, 267 insertions, 84 deletions
```diff
diff --git a/Lib/test/test_tuple.py b/Lib/test/test_tuple.py
index 929f853..ca46d0b 100644
--- a/Lib/test/test_tuple.py
+++ b/Lib/test/test_tuple.py
@@ -4,6 +4,17 @@ import unittest
 import gc
 import pickle
 
+# For tuple hashes, we normally only run a test to ensure that we get
+# the same results across platforms in a handful of cases. If that's
+# so, there's no real point to running more. Set RUN_ALL_HASH_TESTS to
+# run more anyway. That's usually of real interest only when analyzing,
+# or changing, the hash algorithm. In which case it's usually also
+# most useful to set JUST_SHOW_HASH_RESULTS, to see all the results
+# instead of wrestling with test "failures". See the bottom of the
+# file for extensive notes on what we're testing here and why.
+RUN_ALL_HASH_TESTS = False
+JUST_SHOW_HASH_RESULTS = False  # if RUN_ALL_HASH_TESTS, just display
+
 class TupleTest(seq_tests.CommonTest):
     type2test = tuple
 
@@ -62,104 +73,183 @@ class TupleTest(seq_tests.CommonTest):
                 yield i
         self.assertEqual(list(tuple(f())), list(range(1000)))
 
+    # We expect tuples whose base components have deterministic hashes to
+    # have deterministic hashes too - and, indeed, the same hashes across
+    # platforms with hash codes of the same bit width.
+    def test_hash_exact(self):
+        def check_one_exact(t, e32, e64):
+            got = hash(t)
+            expected = e32 if support.NHASHBITS == 32 else e64
+            if got != expected:
+                msg = f"FAIL hash({t!r}) == {got} != {expected}"
+                self.fail(msg)
+
+        check_one_exact((), 750394483, 5740354900026072187)
+        check_one_exact((0,), 1214856301, -8753497827991233192)
+        check_one_exact((0, 0), -168982784, -8458139203682520985)
+        check_one_exact((0.5,), 2077348973, -408149959306781352)
+        check_one_exact((0.5, (), (-2, 3, (4, 6))), 714642271,
+                        -1845940830829704396)
+
     # Various tests for hashing of tuples to check that we get few collisions.
+    # Does something only if RUN_ALL_HASH_TESTS is true.
     #
-    # Earlier versions of the tuple hash algorithm had collisions
+    # Earlier versions of the tuple hash algorithm had massive collisions
     # reported at:
     # - https://bugs.python.org/issue942952
     # - https://bugs.python.org/issue34751
-    #
-    # Notes:
-    # - The hash of tuples is deterministic: if the test passes once on a given
-    #   system, it will always pass. So the probabilities mentioned in the
-    #   test_hash functions below should be interpreted assuming that the
-    #   hashes are random.
-    # - Due to the structure in the testsuite inputs, collisions are not
-    #   independent. For example, if hash((a,b)) == hash((c,d)), then also
-    #   hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
-    #   independence anyway.
-    # - We limit the hash to 32 bits in the tests to have a good test on
-    #   64-bit systems too. Furthermore, this is also a sanity check that the
-    #   lower 32 bits of a 64-bit hash are sufficiently random too.
-    def test_hash1(self):
-        # Check for hash collisions between small integers in range(50) and
-        # certain tuples and nested tuples of such integers.
-        N=50
+    def test_hash_optional(self):
+        from itertools import product
+
+        if not RUN_ALL_HASH_TESTS:
+            return
+
+        # If specified, `expected` is a 2-tuple of expected
+        # (number_of_collisions, pileup) values, and the test fails if
+        # those aren't the values we get. Also if specified, the test
+        # fails if z > `zlimit`.
+        def tryone_inner(tag, nbins, hashes, expected=None, zlimit=None):
+            from collections import Counter
+
+            nballs = len(hashes)
+            mean, sdev = support.collision_stats(nbins, nballs)
+            c = Counter(hashes)
+            collisions = nballs - len(c)
+            z = (collisions - mean) / sdev
+            pileup = max(c.values()) - 1
+            del c
+            got = (collisions, pileup)
+            failed = False
+            prefix = ""
+            if zlimit is not None and z > zlimit:
+                failed = True
+                prefix = f"FAIL z > {zlimit}; "
+            if expected is not None and got != expected:
+                failed = True
+                prefix += f"FAIL {got} != {expected}; "
+            if failed or JUST_SHOW_HASH_RESULTS:
+                msg = f"{prefix}{tag}; pileup {pileup:,} mean {mean:.1f} "
+                msg += f"coll {collisions:,} z {z:+.1f}"
+                if JUST_SHOW_HASH_RESULTS:
+                    import sys
+                    print(msg, file=sys.__stdout__)
+                else:
+                    self.fail(msg)
+
+        def tryone(tag, xs,
+                   native32=None, native64=None, hi32=None, lo32=None,
+                   zlimit=None):
+            NHASHBITS = support.NHASHBITS
+            hashes = list(map(hash, xs))
+            tryone_inner(tag + f"; {NHASHBITS}-bit hash codes",
+                         1 << NHASHBITS,
+                         hashes,
+                         native32 if NHASHBITS == 32 else native64,
+                         zlimit)
+
+            if NHASHBITS > 32:
+                shift = NHASHBITS - 32
+                tryone_inner(tag + "; 32-bit upper hash codes",
+                             1 << 32,
+                             [h >> shift for h in hashes],
+                             hi32,
+                             zlimit)
+
+                mask = (1 << 32) - 1
+                tryone_inner(tag + "; 32-bit lower hash codes",
+                             1 << 32,
+                             [h & mask for h in hashes],
+                             lo32,
+                             zlimit)
+
+        # Tuples of smallish positive integers are common - nice if we
+        # get "better than random" for these.
+        tryone("range(100) by 3", list(product(range(100), repeat=3)),
+               (0, 0), (0, 0), (4, 1), (0, 0))
+
+        # A previous hash had systematic problems when mixing integers of
+        # similar magnitude but opposite sign, obscurely related to that
+        # j ^ -2 == -j when j is odd.
+        cands = list(range(-10, -1)) + list(range(9))
+
+        # Note: -1 is omitted because hash(-1) == hash(-2) == -2, and
+        # there's nothing the tuple hash can do to avoid collisions
+        # inherited from collisions in the tuple components' hashes.
+        tryone("-10 .. 8 by 4", list(product(cands, repeat=4)),
+               (0, 0), (0, 0), (0, 0), (0, 0))
+        del cands
+
+        # The hashes here are a weird mix of values where all the
+        # variation is in the lowest bits and across a single high-order
+        # bit - the middle bits are all zeroes. A decent hash has to
+        # both propagate low bits to the left and high bits to the
+        # right. This is also complicated a bit in that there are
+        # collisions among the hashes of the integers in L alone.
+        L = [n << 60 for n in range(100)]
+        tryone("0..99 << 60 by 3", list(product(L, repeat=3)),
+               (0, 0), (0, 0), (0, 0), (324, 1))
+        del L
+
+        # Used to suffer a massive number of collisions.
+        tryone("[-3, 3] by 18", list(product([-3, 3], repeat=18)),
+               (7, 1), (0, 0), (7, 1), (6, 1))
+
+        # And even worse. hash(0.5) has only a single bit set, at the
+        # high end. A decent hash needs to propagate high bits right.
+        tryone("[0, 0.5] by 18", list(product([0, 0.5], repeat=18)),
+               (5, 1), (0, 0), (9, 1), (12, 1))
+
+        # Hashes of ints and floats are the same across platforms.
+        # String hashes vary even on a single platform across runs, due
+        # to hash randomization for strings. So we can't say exactly
+        # what this should do. Instead we insist that the # of
+        # collisions is no more than 4 sdevs above the theoretically
+        # random mean. Even if the tuple hash can't achieve that on its
+        # own, the string hash is trying to be decently pseudo-random
+        # (in all bit positions) on _its_ own. We can at least test
+        # that the tuple hash doesn't systematically ruin that.
+        tryone("4-char tuples",
+               list(product("abcdefghijklmnopqrstuvwxyz", repeat=4)),
+               zlimit=4.0)
+
+        # The "old tuple test". See https://bugs.python.org/issue942952.
+        # Ensures, for example, that the hash:
+        #     is non-commutative
+        #     spreads closely spaced values
+        #     doesn't exhibit cancellation in tuples like (x,(x,y))
+        N = 50
         base = list(range(N))
-        xp = [(i, j) for i in base for j in base]
-        inps = base + [(i, j) for i in base for j in xp] + \
-             [(i, j) for i in xp for j in base] + xp + list(zip(base))
-        self.assertEqual(len(inps), 252600)
-        hashes = set(hash(x) % 2**32 for x in inps)
-        collisions = len(inps) - len(hashes)
-
-        # For a pure random 32-bit hash and N = 252,600 test items, the
-        # expected number of collisions equals
-        #
-        #   2**(-32) * N(N-1)/2 = 7.4
-        #
-        # We allow up to 15 collisions, which suffices to make the test
-        # pass with 99.5% confidence.
-        self.assertLessEqual(collisions, 15)
-
-    def test_hash2(self):
-        # Check for hash collisions between small integers (positive and
-        # negative), tuples and nested tuples of such integers.
-
-        # All numbers in the interval [-n, ..., n] except -1 because
-        # hash(-1) == hash(-2).
+        xp = list(product(base, repeat=2))
+        inps = base + list(product(base, xp)) + \
+                     list(product(xp, base)) + xp + list(zip(base))
+        tryone("old tuple test", inps,
+               (2, 1), (0, 0), (52, 49), (7, 1))
+        del base, xp, inps
+
+        # The "new tuple test". See https://bugs.python.org/issue34751.
+        # Even more tortured nesting, and a mix of signed ints of very
+        # small magnitude.
         n = 5
         A = [x for x in range(-n, n+1) if x != -1]
-
         B = A + [(a,) for a in A]
-
-        L2 = [(a,b) for a in A for b in A]
-        L3 = L2 + [(a,b,c) for a in A for b in A for c in A]
-        L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A]
-
+        L2 = list(product(A, repeat=2))
+        L3 = L2 + list(product(A, repeat=3))
+        L4 = L3 + list(product(A, repeat=4))
         # T = list of testcases. These consist of all (possibly nested
         # at most 2 levels deep) tuples containing at most 4 items from
         # the set A.
         T = A
         T += [(a,) for a in B + L4]
-        T += [(a,b) for a in L3 for b in B]
-        T += [(a,b) for a in L2 for b in L2]
-        T += [(a,b) for a in B for b in L3]
-        T += [(a,b,c) for a in B for b in B for c in L2]
-        T += [(a,b,c) for a in B for b in L2 for c in B]
-        T += [(a,b,c) for a in L2 for b in B for c in B]
-        T += [(a,b,c,d) for a in B for b in B for c in B for d in B]
-        self.assertEqual(len(T), 345130)
-        hashes = set(hash(x) % 2**32 for x in T)
-        collisions = len(T) - len(hashes)
-
-        # For a pure random 32-bit hash and N = 345,130 test items, the
-        # expected number of collisions equals
-        #
-        #   2**(-32) * N(N-1)/2 = 13.9
-        #
-        # We allow up to 20 collisions, which suffices to make the test
-        # pass with 95.5% confidence.
-        self.assertLessEqual(collisions, 20)
-
-    def test_hash3(self):
-        # Check for hash collisions between tuples containing 0.0 and 0.5.
-        # The hashes of 0.0 and 0.5 itself differ only in one high bit.
-        # So this implicitly tests propagation of high bits to low bits.
-        from itertools import product
-        T = list(product([0.0, 0.5], repeat=18))
-        self.assertEqual(len(T), 262144)
-        hashes = set(hash(x) % 2**32 for x in T)
-        collisions = len(T) - len(hashes)
-
-        # For a pure random 32-bit hash and N = 262,144 test items, the
-        # expected number of collisions equals
-        #
-        #   2**(-32) * N(N-1)/2 = 8.0
-        #
-        # We allow up to 15 collisions, which suffices to make the test
-        # pass with 99.1% confidence.
-        self.assertLessEqual(collisions, 15)
+        T += product(L3, B)
+        T += product(L2, repeat=2)
+        T += product(B, L3)
+        T += product(B, B, L2)
+        T += product(B, L2, B)
+        T += product(L2, B, B)
+        T += product(B, repeat=4)
+        assert len(T) == 345130
+        tryone("new tuple test", T,
+               (9, 1), (0, 0), (21, 5), (6, 1))
 
     def test_repr(self):
         l0 = tuple()
@@ -298,5 +388,98 @@ class TupleTest(seq_tests.CommonTest):
         self.assertLess(a, b)
         self.assertLess(b, c)
 
+# Notes on testing hash codes. The primary thing is that Python doesn't
+# care about "random" hash codes. To the contrary, we like them to be
+# very regular when possible, so that the low-order bits are as evenly
+# distributed as possible. For integers this is easy: hash(i) == i for
+# all not-huge i except i==-1.
+#
+# For tuples of mixed type there's really no hope of that, so we want
+# "randomish" here instead. But getting close to pseudo-random in all
+# bit positions is more expensive than we've been willing to pay for.
+#
+# We can tolerate large deviations from random - what we don't want is
+# catastrophic pileups on a relative handful of hash codes. The dict
+# and set lookup routines remain effective provided that full-width hash
+# codes for not-equal objects are distinct.
+#
+# So we compute various statistics here based on what a "truly random"
+# hash would do, but don't automate "pass or fail" based on those
+# results. Instead those are viewed as inputs to human judgment, and the
+# automated tests merely ensure we get the _same_ results across
+# platforms. In fact, we normally don't bother to run them at all -
+# set RUN_ALL_HASH_TESTS to force it.
+#
+# When global JUST_SHOW_HASH_RESULTS is True, the tuple hash statistics
+# are just displayed to stdout. A typical output line looks like:
+#
+# old tuple test; 32-bit upper hash codes; \
+#     pileup 49 mean 7.4 coll 52 z +16.4
+#
+# "old tuple test" is just a string name for the test being run.
+#
+# "32-bit upper hash codes" means this was run under a 64-bit build and
+# we've shifted away the lower 32 bits of the hash codes.
+#
+# "pileup" is 0 if there were no collisions across those hash codes.
+# It's 1 less than the maximum number of times any single hash code was
+# seen. So in this case, there was (at least) one hash code that was
+# seen 50 times: that hash code "piled up" 49 more times than ideal.
+#
+# "mean" is the number of collisions a perfectly random hash function
+# would have yielded, on average.
+#
+# "coll" is the number of collisions actually seen.
+#
+# "z" is "coll - mean" divided by the standard deviation of the number
+# of collisions a perfectly random hash function would suffer. A
+# positive value is "worse than random", and a negative value "better
+# than random". Anything of magnitude greater than 3 would be highly
+# suspect for a hash function that claimed to be random. It's
+# essentially impossible that a truly random function would deliver a
+# result 16.4 sdevs "worse than random".
+#
+# But we don't care here! That's why the test isn't coded to fail.
+# Knowing something about how the high-order hash code bits behave
+# provides insight, but is irrelevant to how the dict and set lookup
+# code performs. The low-order bits are much more important to that,
+# and on the same test those did "just like random":
+#
+# old tuple test; 32-bit lower hash codes; \
+#     pileup 1 mean 7.4 coll 7 z -0.2
+#
+# So there are always tradeoffs to consider. For another:
+#
+# 0..99 << 60 by 3; 32-bit hash codes; \
+#     pileup 0 mean 116.4 coll 0 z -10.8
+#
+# That was run under a 32-bit build, and is spectacularly "better than
+# random". On a 64-bit build the wider hash codes are fine too:
+#
+# 0..99 << 60 by 3; 64-bit hash codes; \
+#     pileup 0 mean 0.0 coll 0 z -0.0
+#
+# but their lower 32 bits are poor:
+#
+# 0..99 << 60 by 3; 32-bit lower hash codes; \
+#     pileup 1 mean 116.4 coll 324 z +19.2
+#
+# In a statistical sense that's waaaaay too many collisions, but (a) 324
+# collisions out of a million hash codes isn't anywhere near being a
+# real problem; and, (b) the worst pileup on a single hash code is a
+# measly 1 extra. It's a relatively poor case for the tuple hash, but
+# still fine for practical use.
+#
+# This one isn't fine, though: it's what Python 3.7.1 produced for the
+# hashes of itertools.product([0, 0.5], repeat=18). Even with a fat
+# 64-bit hashcode, the highest pileup was over 16,000 - making a
+# dict/set lookup on one of the colliding values thousands of times
+# slower (on average) than we expect.
+#
+# [0, 0.5] by 18; 64-bit hash codes; \
+#     pileup 16,383 mean 0.0 coll 262,128 z +6073641856.9
+# [0, 0.5] by 18; 32-bit lower hash codes; \
+#     pileup 262,143 mean 8.0 coll 262,143 z +92683.6
+
 if __name__ == "__main__":
     unittest.main()
```
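The new tests lean on two helpers this diff doesn't show, `support.NHASHBITS` and `support.collision_stats`, presumably added elsewhere in this commit (the diffstat above is filtered to `test_tuple.py`). The sketch below is a rough stand-in, not the actual `Lib/test/support` code: it takes the hash width from `sys.hash_info.width` and uses the birthday-problem approximation the deleted comments themselves relied on (`2**(-32) * N(N-1)/2`), treating the collision count as roughly Poisson so that sdev ≈ sqrt(mean). The names `approx_collision_stats` and `summarize` are invented here for illustration.

```python
import sys
from collections import Counter
from math import sqrt

# Stand-in for support.NHASHBITS: the bit width of hash() results.
NHASHBITS = sys.hash_info.width  # 64 on typical builds, 32 on 32-bit builds

def approx_collision_stats(nbins, nballs):
    """Approximate (mean, sdev) of the number of collisions when
    `nballs` balls land uniformly at random in `nbins` bins.

    For nballs << nbins, each of the nballs*(nballs-1)/2 pairs collides
    with probability 1/nbins, so the count is roughly Poisson with
    mean nballs*(nballs-1)/(2*nbins) and sdev sqrt(mean).
    """
    mean = nballs * (nballs - 1) / 2 / nbins
    return mean, sqrt(mean)

def summarize(hashes, nbins):
    """Produce the same report line as tryone_inner above.

    Assumes at least two hashes, so sdev is nonzero.
    """
    c = Counter(hashes)
    collisions = len(hashes) - len(c)   # balls minus occupied bins
    pileup = max(c.values()) - 1        # extra repeats of the worst hash code
    mean, sdev = approx_collision_stats(nbins, len(hashes))
    z = (collisions - mean) / sdev      # sdevs away from a truly random hash
    return f"pileup {pileup:,} mean {mean:.1f} coll {collisions:,} z {z:+.1f}"

# Sanity check against the notes: the "old tuple test" drops 252,600
# hashes into 2**32 bins, and 252600*252599/2 / 2**32 is about 7.4,
# matching the "mean 7.4" in the quoted output lines.
print(approx_collision_stats(1 << 32, 252600))  # -> (~7.43, ~2.73)

# Same report format as tryone_inner, on toy data.
print(summarize([hash(t) for t in ((), (0,), (0, 0))], nbins=1 << NHASHBITS))
```

The Poisson approximation matches the quoted outputs only while collisions are scarce; it degrades for the collision-heavy 32-bit rows of the million-ball tests, which is presumably why the real helper computes the occupancy statistics more carefully.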
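The catastrophic case called out at the end of the notes is also easy to replay. This hypothetical snippet recomputes the `[0, 0.5] by 18` numbers on whatever interpreter runs it; per the notes, Python 3.7.1 on a 64-bit build yielded 262,128 collisions with a pileup of 16,383, while the reworked test expects `(0, 0)` for native 64-bit hash codes once the hash fix landed.

```python
from collections import Counter
from itertools import product

# 2**18 tuples whose elements (0 and 0.5) hash to values that differ
# only in a single high-order bit.
T = list(product([0, 0.5], repeat=18))
assert len(T) == 262144

c = Counter(map(hash, T))
collisions = len(T) - len(c)
pileup = max(c.values()) - 1
print(f"collisions {collisions:,}; pileup {pileup:,}")
# Python 3.7.1, 64-bit build: collisions 262,128; pileup 16,383 (per the notes)
# After this change, 64-bit build: collisions 0; pileup 0 (the test's native64
# expectation)
```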