diff options
author | jdemeyer <jdemeyer@cage.ugent.be> | 2018-10-28 00:06:38 (GMT) |
---|---|---|
committer | Raymond Hettinger <rhettinger@users.noreply.github.com> | 2018-10-28 00:06:38 (GMT) |
commit | aeb1be5868623c9cd9cf6d7de3015a43fb005815 (patch) | |
tree | 8a4254e9e0077067e9d58525d3a8c8897a20aced | |
parent | 53125a53f483db0af76249b6af6efcdc200eb421 (diff) | |
download | cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.zip cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.gz cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.bz2 |
bpo-34751: improved hash function for tuples (GH-9471)
-rw-r--r-- | Lib/test/test_tuple.py | 111 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2018-09-20-15-41-58.bpo-34751.Yiv0pV.rst | 4 | ||||
-rw-r--r-- | Objects/tupleobject.c | 71 |
3 files changed, 143 insertions, 43 deletions
diff --git a/Lib/test/test_tuple.py b/Lib/test/test_tuple.py index 84c064f..929f853 100644 --- a/Lib/test/test_tuple.py +++ b/Lib/test/test_tuple.py @@ -62,29 +62,104 @@ class TupleTest(seq_tests.CommonTest): yield i self.assertEqual(list(tuple(f())), list(range(1000))) - def test_hash(self): - # See SF bug 942952: Weakness in tuple hash - # The hash should: - # be non-commutative - # should spread-out closely spaced values - # should not exhibit cancellation in tuples like (x,(x,y)) - # should be distinct from element hashes: hash(x)!=hash((x,)) - # This test exercises those cases. - # For a pure random hash and N=50, the expected number of occupied - # buckets when tossing 252,600 balls into 2**32 buckets - # is 252,592.6, or about 7.4 expected collisions. The - # standard deviation is 2.73. On a box with 64-bit hash - # codes, no collisions are expected. Here we accept no - # more than 15 collisions. Any worse and the hash function - # is sorely suspect. - + # Various tests for hashing of tuples to check that we get few collisions. + # + # Earlier versions of the tuple hash algorithm had collisions + # reported at: + # - https://bugs.python.org/issue942952 + # - https://bugs.python.org/issue34751 + # + # Notes: + # - The hash of tuples is deterministic: if the test passes once on a given + # system, it will always pass. So the probabilities mentioned in the + # test_hash functions below should be interpreted assuming that the + # hashes are random. + # - Due to the structure in the testsuite inputs, collisions are not + # independent. For example, if hash((a,b)) == hash((c,d)), then also + # hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume + # independence anyway. + # - We limit the hash to 32 bits in the tests to have a good test on + # 64-bit systems too. Furthermore, this is also a sanity check that the + # lower 32 bits of a 64-bit hash are sufficiently random too. + def test_hash1(self): + # Check for hash collisions between small integers in range(50) and + # certain tuples and nested tuples of such integers. N=50 base = list(range(N)) xp = [(i, j) for i in base for j in base] inps = base + [(i, j) for i in base for j in xp] + \ [(i, j) for i in xp for j in base] + xp + list(zip(base)) - collisions = len(inps) - len(set(map(hash, inps))) - self.assertTrue(collisions <= 15) + self.assertEqual(len(inps), 252600) + hashes = set(hash(x) % 2**32 for x in inps) + collisions = len(inps) - len(hashes) + + # For a pure random 32-bit hash and N = 252,600 test items, the + # expected number of collisions equals + # + # 2**(-32) * N(N-1)/2 = 7.4 + # + # We allow up to 15 collisions, which suffices to make the test + # pass with 99.5% confidence. + self.assertLessEqual(collisions, 15) + + def test_hash2(self): + # Check for hash collisions between small integers (positive and + # negative), tuples and nested tuples of such integers. + + # All numbers in the interval [-n, ..., n] except -1 because + # hash(-1) == hash(-2). + n = 5 + A = [x for x in range(-n, n+1) if x != -1] + + B = A + [(a,) for a in A] + + L2 = [(a,b) for a in A for b in A] + L3 = L2 + [(a,b,c) for a in A for b in A for c in A] + L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A] + + # T = list of testcases. These consist of all (possibly nested + # at most 2 levels deep) tuples containing at most 4 items from + # the set A. + T = A + T += [(a,) for a in B + L4] + T += [(a,b) for a in L3 for b in B] + T += [(a,b) for a in L2 for b in L2] + T += [(a,b) for a in B for b in L3] + T += [(a,b,c) for a in B for b in B for c in L2] + T += [(a,b,c) for a in B for b in L2 for c in B] + T += [(a,b,c) for a in L2 for b in B for c in B] + T += [(a,b,c,d) for a in B for b in B for c in B for d in B] + self.assertEqual(len(T), 345130) + hashes = set(hash(x) % 2**32 for x in T) + collisions = len(T) - len(hashes) + + # For a pure random 32-bit hash and N = 345,130 test items, the + # expected number of collisions equals + # + # 2**(-32) * N(N-1)/2 = 13.9 + # + # We allow up to 20 collisions, which suffices to make the test + # pass with 95.5% confidence. + self.assertLessEqual(collisions, 20) + + def test_hash3(self): + # Check for hash collisions between tuples containing 0.0 and 0.5. + # The hashes of 0.0 and 0.5 itself differ only in one high bit. + # So this implicitly tests propagation of high bits to low bits. + from itertools import product + T = list(product([0.0, 0.5], repeat=18)) + self.assertEqual(len(T), 262144) + hashes = set(hash(x) % 2**32 for x in T) + collisions = len(T) - len(hashes) + + # For a pure random 32-bit hash and N = 262,144 test items, the + # expected number of collisions equals + # + # 2**(-32) * N(N-1)/2 = 8.0 + # + # We allow up to 15 collisions, which suffices to make the test + # pass with 99.1% confidence. + self.assertLessEqual(collisions, 15) def test_repr(self): l0 = tuple() diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-09-20-15-41-58.bpo-34751.Yiv0pV.rst b/Misc/NEWS.d/next/Core and Builtins/2018-09-20-15-41-58.bpo-34751.Yiv0pV.rst new file mode 100644 index 0000000..b2ba514 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-09-20-15-41-58.bpo-34751.Yiv0pV.rst @@ -0,0 +1,4 @@ +The hash function for tuples is now based on xxHash +which gives better collision results on (formerly) pathological cases. +Additionally, on 64-bit systems it improves tuple hashes in general. +Patch by Jeroen Demeyer with substantial contributions by Tim Peters. diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c index eaf92d5..2e32406 100644 --- a/Objects/tupleobject.c +++ b/Objects/tupleobject.c @@ -333,39 +333,60 @@ error: return NULL; } -/* The addend 82520, was selected from the range(0, 1000000) for - generating the greatest number of prime multipliers for tuples - up to length eight: - 1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533, - 1330111, 1412633, 1165069, 1247599, 1495177, 1577699 - - Tests have shown that it's not worth to cache the hash value, see - issue #9685. +/* Hash for tuples. This is a slightly simplified version of the xxHash + non-cryptographic hash: + - we do not use any parallellism, there is only 1 accumulator. + - we drop the final mixing since this is just a permutation of the + output space: it does not help against collisions. + - at the end, we mangle the length with a single constant. + For the xxHash specification, see + https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md + + Below are the official constants from the xxHash specification. Optimizing + compilers should emit a single "rotate" instruction for the + _PyHASH_XXROTATE() expansion. If that doesn't happen for some important + platform, the macro could be changed to expand to a platform-specific rotate + spelling instead. */ +#if SIZEOF_PY_UHASH_T > 4 +#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) +#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) +#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL) +#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ +#else +#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) +#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) +#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) +#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ +#endif +/* Tests have shown that it's not worth to cache the hash value, see + https://bugs.python.org/issue9685 */ static Py_hash_t tuplehash(PyTupleObject *v) { - Py_uhash_t x; /* Unsigned for defined overflow behavior. */ - Py_hash_t y; - Py_ssize_t len = Py_SIZE(v); - PyObject **p; - Py_uhash_t mult = _PyHASH_MULTIPLIER; - x = 0x345678UL; - p = v->ob_item; - while (--len >= 0) { - y = PyObject_Hash(*p++); - if (y == -1) + Py_ssize_t i, len = Py_SIZE(v); + PyObject **item = v->ob_item; + + Py_uhash_t acc = _PyHASH_XXPRIME_5; + for (i = 0; i < len; i++) { + Py_uhash_t lane = PyObject_Hash(item[i]); + if (lane == (Py_uhash_t)-1) { return -1; - x = (x ^ y) * mult; - /* the cast might truncate len; that doesn't change hash stability */ - mult += (Py_hash_t)(82520UL + len + len); + } + acc += lane * _PyHASH_XXPRIME_2; + acc = _PyHASH_XXROTATE(acc); + acc *= _PyHASH_XXPRIME_1; + } + + /* Add input length, mangled to keep the historical value of hash(()). */ + acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL); + + if (acc == (Py_uhash_t)-1) { + return 1546275796; } - x += 97531UL; - if (x == (Py_uhash_t)-1) - x = -2; - return x; + return acc; } static Py_ssize_t |