diff options
| author | Raymond Hettinger <python@rcn.com> | 2015-08-01 16:53:00 (GMT) | 
|---|---|---|
| committer | Raymond Hettinger <python@rcn.com> | 2015-08-01 16:53:00 (GMT) | 
| commit | fbffdef47dd711d2441ff415f00c72ae7102e80f (patch) | |
| tree | 3338cac7486563319eb09225d22a5d90c2058e34 /Objects/setobject.c | |
| parent | 99b80b507246af3c293a0aa8eeaac4788b703200 (diff) | |
| download | cpython-fbffdef47dd711d2441ff415f00c72ae7102e80f.zip cpython-fbffdef47dd711d2441ff415f00c72ae7102e80f.tar.gz cpython-fbffdef47dd711d2441ff415f00c72ae7102e80f.tar.bz2 | |
Issue #24762:  Speed-up frozenset_hash() and greatly beef-up the comments.
Diffstat (limited to 'Objects/setobject.c')
| -rw-r--r-- | Objects/setobject.c | 70 | 
1 files changed, 43 insertions, 27 deletions
| diff --git a/Objects/setobject.c b/Objects/setobject.c index c590bbf..2d4d8cd 100644 --- a/Objects/setobject.c +++ b/Objects/setobject.c @@ -739,41 +739,57 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)      return 0;  } -static Py_hash_t -frozenset_hash(PyObject *self) +/* Work to increase the bit dispersion for closely spaced hash values. +   This is important because some use cases have many combinations of a +   small number of elements with nearby hashes so that many distinct +   combinations collapse to only a handful of distinct hash values. */ + +static Py_uhash_t +_shuffle_bits(Py_uhash_t h)  { -    /* Most of the constants in this hash algorithm are randomly choosen -       large primes with "interesting bit patterns" and that passed -       tests for good collision statistics on a variety of problematic -       datasets such as: +    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL; +} -          ps = [] -          for r in range(21): -              ps += itertools.combinations(range(20), r) -          num_distinct_hashes = len({hash(frozenset(s)) for s in ps}) +/* Most of the constants in this hash algorithm are randomly chosen +   large primes with "interesting bit patterns" and that passed tests +   for good collision statistics on a variety of problematic datasets +   including powersets and graph structures (such as David Eppstein's +   graph recipes in Lib/test/test_set.py) */ -    */ +static Py_hash_t +frozenset_hash(PyObject *self) +{      PySetObject *so = (PySetObject *)self; -    Py_uhash_t h, hash = 1927868237UL; +    Py_uhash_t hash = 1927868237UL;      setentry *entry; -    Py_ssize_t pos = 0; - -    if (so->hash != -1) -        return so->hash; +    /* Make hash(frozenset({0})) distinct from hash(frozenset()) */      hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1; -    while (set_next(so, &pos, &entry)) { -        /* Work to increase the bit dispersion for closely spaced hash -           values.  This is important because some use cases have many -           combinations of a small number of elements with nearby -           hashes so that many distinct combinations collapse to only -           a handful of distinct hash values. */ -        h = entry->hash; -        hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL; -    } -    /* Make the final result spread-out in a different pattern -       than the algorithm for tuples or other python objects. */ + +    /* Xor-in shuffled bits from every entry's hash field because xor is +       commutative and a frozenset hash should be independent of order. + +       For speed, include null entries and dummy entries and then +       subtract out their effect afterwards so that the final hash +       depends only on active entries.  This allows the code to be +       vectorized by the compiler and it saves the unpredictable +       branches that would arise when trying to exclude null and dummy +       entries on every iteration. */ + +    for (entry = so->table; entry <= &so->table[so->mask]; entry++) +        hash ^= _shuffle_bits(entry->hash); + +    /* Remove the effect of an odd number NULL entries */ +    if ((so->mask + 1 - so->fill) & 1) +        hash ^= _shuffle_bits(0); + +    /* Remove the effect of an odd number of dummy entries */ +    if ((so->fill - so->used) & 1) +        hash ^= _shuffle_bits(-1); + +    /* Disperse patterns arising in nested frozensets */      hash = hash * 69069U + 907133923UL; +      if (hash == (Py_uhash_t)-1)          hash = 590923713UL;      so->hash = hash; | 
