author    Raymond Hettinger <python@rcn.com>  2015-08-01 16:53:00 (GMT)
committer Raymond Hettinger <python@rcn.com>  2015-08-01 16:53:00 (GMT)
commit    fbffdef47dd711d2441ff415f00c72ae7102e80f (patch)
tree      3338cac7486563319eb09225d22a5d90c2058e34 /Objects/setobject.c
parent    99b80b507246af3c293a0aa8eeaac4788b703200 (diff)
Issue #24762: Speed-up frozenset_hash() and greatly beef-up the comments.
Diffstat (limited to 'Objects/setobject.c')
-rw-r--r--  Objects/setobject.c | 70
1 file changed, 43 insertions(+), 27 deletions(-)
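The collision problem named in the commit message is easy to reproduce outside CPython. Below is a small standalone C sketch (not part of the commit; shuffle_bits copies the constants from _shuffle_bits in the diff, while cmp, count_distinct, and the subset driver are illustrative helpers). It hashes all 256 subsets of {0..7} once by raw xor of the element hashes and once by xor of shuffled hashes; since CPython's small ints hash to themselves, raw xor collapses the 256 subsets onto only 8 values, while the shuffled variant should come out at or near 256.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef uint64_t uhash_t;                     /* stand-in for Py_uhash_t */

/* Same constants as _shuffle_bits() in the diff below. */
static uhash_t shuffle_bits(uhash_t h)
{
    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
}

static int cmp(const void *a, const void *b)
{
    uhash_t x = *(const uhash_t *)a, y = *(const uhash_t *)b;
    return (x > y) - (x < y);
}

static size_t count_distinct(uhash_t *v, size_t n)
{
    size_t i, distinct = 1;
    qsort(v, n, sizeof *v, cmp);
    for (i = 1; i < n; i++)
        distinct += (v[i] != v[i - 1]);
    return distinct;
}

int main(void)
{
    enum { N = 8, SUBSETS = 1 << N };
    uhash_t plain[SUBSETS], shuffled[SUBSETS];
    int mask, i;

    /* Hash every subset of {0, ..., 7}; small ints hash to themselves. */
    for (mask = 0; mask < SUBSETS; mask++) {
        uhash_t hp = 0, hs = 0;
        for (i = 0; i < N; i++) {
            if (mask & (1 << i)) {
                hp ^= (uhash_t)i;                /* naive: raw xor */
                hs ^= shuffle_bits((uhash_t)i);  /* the diff's approach */
            }
        }
        plain[mask] = hp;
        shuffled[mask] = hs;
    }
    printf("raw xor:      %zu distinct of %d\n",
           count_distinct(plain, SUBSETS), SUBSETS);
    printf("shuffled xor: %zu distinct of %d\n",
           count_distinct(shuffled, SUBSETS), SUBSETS);
    return 0;
}

The old code already applied this shuffle inline in its while-loop; the commit hoists it into the _shuffle_bits() helper so the new branch-free loop can reuse it.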
diff --git a/Objects/setobject.c b/Objects/setobject.c
index c590bbf..2d4d8cd 100644
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -739,41 +739,57 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)
     return 0;
 }
 
-static Py_hash_t
-frozenset_hash(PyObject *self)
+/* Work to increase the bit dispersion for closely spaced hash values.
+   This is important because some use cases have many combinations of a
+   small number of elements with nearby hashes so that many distinct
+   combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
 {
-    /* Most of the constants in this hash algorithm are randomly choosen
-       large primes with "interesting bit patterns" and that passed
-       tests for good collision statistics on a variety of problematic
-       datasets such as:
+    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
 
-          ps = []
-          for r in range(21):
-              ps += itertools.combinations(range(20), r)
-          num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+/* Most of the constants in this hash algorithm are randomly chosen
+   large primes with "interesting bit patterns" and that passed tests
+   for good collision statistics on a variety of problematic datasets
+   including powersets and graph structures (such as David Eppstein's
+   graph recipes in Lib/test/test_set.py) */
 
-    */
+static Py_hash_t
+frozenset_hash(PyObject *self)
+{
     PySetObject *so = (PySetObject *)self;
-    Py_uhash_t h, hash = 1927868237UL;
+    Py_uhash_t hash = 1927868237UL;
     setentry *entry;
-    Py_ssize_t pos = 0;
-
-    if (so->hash != -1)
-        return so->hash;
 
+    /* Make hash(frozenset({0})) distinct from hash(frozenset()) */
     hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
-    while (set_next(so, &pos, &entry)) {
-        /* Work to increase the bit dispersion for closely spaced hash
-           values.  This is important because some use cases have many
-           combinations of a small number of elements with nearby
-           hashes so that many distinct combinations collapse to only
-           a handful of distinct hash values. */
-        h = entry->hash;
-        hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
-    }
-    /* Make the final result spread-out in a different pattern
-       than the algorithm for tuples or other python objects. */
+
+    /* Xor-in shuffled bits from every entry's hash field because xor is
+       commutative and a frozenset hash should be independent of order.
+
+       For speed, include null entries and dummy entries and then
+       subtract out their effect afterwards so that the final hash
+       depends only on active entries.  This allows the code to be
+       vectorized by the compiler and it saves the unpredictable
+       branches that would arise when trying to exclude null and dummy
+       entries on every iteration. */
+
+    for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+        hash ^= _shuffle_bits(entry->hash);
+
+    /* Remove the effect of an odd number of NULL entries */
+    if ((so->mask + 1 - so->fill) & 1)
+        hash ^= _shuffle_bits(0);
+
+    /* Remove the effect of an odd number of dummy entries */
+    if ((so->fill - so->used) & 1)
+        hash ^= _shuffle_bits(-1);
+
+    /* Disperse patterns arising in nested frozensets */
     hash = hash * 69069U + 907133923UL;
+
     if (hash == (Py_uhash_t)-1)
         hash = 590923713UL;
     so->hash = hash;
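A note on the cancellation trick in the new loop: because x ^ x == 0, xor contributions from pairs of identical slots vanish on their own, so after xor-ing every slot it is enough to xor one extra correction term when the count of empty (or dummy) slots is odd. The sketch below (not CPython source) checks that equivalence against a naive loop. It assumes, as the diff implies, that an empty slot contributes hash 0 and a dummy slot hash -1; identifying slots by those sentinel values is a toy simplification, since the real table distinguishes them by the entry's key pointer.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t uhash_t;

#define EMPTY_HASH ((uhash_t)0)   /* toy model: sentinel for an empty slot */
#define DUMMY_HASH ((uhash_t)-1)  /* toy model: sentinel for a deleted slot */

static uhash_t shuffle_bits(uhash_t h)
{
    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
}

int main(void)
{
    /* A tiny 8-slot table: 3 active entries, 2 dummies, 3 empties. */
    uhash_t slots[8] = { 12345, EMPTY_HASH, DUMMY_HASH, 777,
                         EMPTY_HASH, 424242, DUMMY_HASH, EMPTY_HASH };
    size_t nslots = 8, nempty = 3, ndummy = 2, i;
    uhash_t branchy = 0, branchfree = 0;

    /* Reference: test every slot, xor only the active entries. */
    for (i = 0; i < nslots; i++)
        if (slots[i] != EMPTY_HASH && slots[i] != DUMMY_HASH)
            branchy ^= shuffle_bits(slots[i]);

    /* The diff's approach: xor everything, then cancel by parity;
       even counts of identical terms already cancel via x ^ x == 0. */
    for (i = 0; i < nslots; i++)
        branchfree ^= shuffle_bits(slots[i]);
    if (nempty & 1)
        branchfree ^= shuffle_bits(EMPTY_HASH);
    if (ndummy & 1)
        branchfree ^= shuffle_bits(DUMMY_HASH);

    assert(branchy == branchfree);
    printf("hashes agree: %016llx\n", (unsigned long long)branchfree);
    return 0;
}

Trading a per-slot branch for two parity-conditional xors after the loop is what makes the main loop vectorizable, as the new comment in the diff notes.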