author    | Raymond Hettinger <python@rcn.com> | 2015-08-01 16:53:00 (GMT)
committer | Raymond Hettinger <python@rcn.com> | 2015-08-01 16:53:00 (GMT)
commit    | fbffdef47dd711d2441ff415f00c72ae7102e80f (patch)
tree      | 3338cac7486563319eb09225d22a5d90c2058e34 /Objects/setobject.c
parent    | 99b80b507246af3c293a0aa8eeaac4788b703200 (diff)
Issue #24762: Speed-up frozenset_hash() and greatly beef-up the comments.
Diffstat (limited to 'Objects/setobject.c')
-rw-r--r-- | Objects/setobject.c | 70
1 file changed, 43 insertions, 27 deletions
```diff
diff --git a/Objects/setobject.c b/Objects/setobject.c
index c590bbf..2d4d8cd 100644
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -739,41 +739,57 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)
     return 0;
 }
 
-static Py_hash_t
-frozenset_hash(PyObject *self)
+/* Work to increase the bit dispersion for closely spaced hash values.
+   This is important because some use cases have many combinations of a
+   small number of elements with nearby hashes so that many distinct
+   combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
 {
-    /* Most of the constants in this hash algorithm are randomly choosen
-       large primes with "interesting bit patterns" and that passed
-       tests for good collision statistics on a variety of problematic
-       datasets such as:
+    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
 
-      ps = []
-      for r in range(21):
-          ps += itertools.combinations(range(20), r)
-      num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+/* Most of the constants in this hash algorithm are randomly chosen
+   large primes with "interesting bit patterns" and that passed tests
+   for good collision statistics on a variety of problematic datasets
+   including powersets and graph structures (such as David Eppstein's
+   graph recipes in Lib/test/test_set.py) */
 
-    */
+static Py_hash_t
+frozenset_hash(PyObject *self)
+{
     PySetObject *so = (PySetObject *)self;
-    Py_uhash_t h, hash = 1927868237UL;
+    Py_uhash_t hash = 1927868237UL;
     setentry *entry;
-    Py_ssize_t pos = 0;
-
-    if (so->hash != -1)
-        return so->hash;
 
+    /* Make hash(frozenset({0})) distinct from hash(frozenset()) */
     hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
-    while (set_next(so, &pos, &entry)) {
-        /* Work to increase the bit dispersion for closely spaced hash
-           values.  This is important because some use cases have many
-           combinations of a small number of elements with nearby
-           hashes so that many distinct combinations collapse to only
-           a handful of distinct hash values. */
-        h = entry->hash;
-        hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
-    }
-    /* Make the final result spread-out in a different pattern
-       than the algorithm for tuples or other python objects. */
+
+    /* Xor-in shuffled bits from every entry's hash field because xor is
+       commutative and a frozenset hash should be independent of order.
+
+       For speed, include null entries and dummy entries and then
+       subtract out their effect afterwards so that the final hash
+       depends only on active entries.  This allows the code to be
+       vectorized by the compiler and it saves the unpredictable
+       branches that would arise when trying to exclude null and dummy
+       entries on every iteration. */
+
+    for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+        hash ^= _shuffle_bits(entry->hash);
+
+    /* Remove the effect of an odd number of NULL entries */
+    if ((so->mask + 1 - so->fill) & 1)
+        hash ^= _shuffle_bits(0);
+
+    /* Remove the effect of an odd number of dummy entries */
+    if ((so->fill - so->used) & 1)
+        hash ^= _shuffle_bits(-1);
+
+    /* Disperse patterns arising in nested frozensets */
     hash = hash * 69069U + 907133923UL;
+
     if (hash == (Py_uhash_t)-1)
         hash = 590923713UL;
     so->hash = hash;
```
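To see the new strategy in isolation, here is a small standalone C sketch, not CPython code: the toy table, the slot counts, and the name `shuffle_bits` are invented for illustration, while the constants and correction logic follow the patch above. It shuffles every slot's hash, XORs them together over the whole table, cancels the parity contribution of empty and dummy slots, and cross-checks the result against a computation that touches only the active entries.

```c
/* Standalone illustration (not CPython code).  Table contents and
 * slot counts are made up; constants mirror the patch above. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t uhash_t;

/* Same bit-dispersion step as _shuffle_bits() in the patch. */
static uhash_t
shuffle_bits(uhash_t h)
{
    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
}

int
main(void)
{
    /* Toy table: 8 slots, 4 active hashes, 1 dummy (-1), 3 empty (0).
       In CPython terms: mask + 1 == 8, fill == 5 (active + dummy),
       used == 4 (active only). */
    uhash_t table[8] = { 101, 0, 202, (uhash_t)-1, 0, 303, 404, 0 };
    size_t slots = 8, fill = 5, used = 4;

    uhash_t hash = 1927868237UL;
    hash *= (uhash_t)used + 1;   /* distinguish {0} from the empty set */

    /* Branch-free pass over every slot, active or not. */
    for (size_t i = 0; i < slots; i++)
        hash ^= shuffle_bits(table[i]);

    /* Cancel an odd number of empty slots ... */
    if ((slots - fill) & 1)
        hash ^= shuffle_bits(0);
    /* ... and an odd number of dummy slots. */
    if ((fill - used) & 1)
        hash ^= shuffle_bits((uhash_t)-1);

    /* Final dispersion step, as in the patch. */
    hash = hash * 69069U + 907133923UL;

    /* Cross-check: hashing only the four active entries gives the
       same value, since xor-ed contributions of the non-active slots
       have been cancelled out. */
    uhash_t check = 1927868237UL * ((uhash_t)used + 1);
    check ^= shuffle_bits(101) ^ shuffle_bits(202)
           ^ shuffle_bits(303) ^ shuffle_bits(404);
    check = check * 69069U + 907133923UL;

    printf("demo hash: %llu\n", (unsigned long long)hash);
    printf("matches active-only hash: %s\n", hash == check ? "yes" : "no");
    return 0;
}
```

Because XOR is commutative, reordering the slots in `table[]` changes nothing, and because any even number of identical empty or dummy contributions already cancels itself in the loop, the two parity corrections are all that is needed to make the result depend only on the active entries.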