diff options
author | jdemeyer <jdemeyer@cage.ugent.be> | 2018-10-28 00:06:38 (GMT) |
---|---|---|
committer | Raymond Hettinger <rhettinger@users.noreply.github.com> | 2018-10-28 00:06:38 (GMT) |
commit | aeb1be5868623c9cd9cf6d7de3015a43fb005815 (patch) | |
tree | 8a4254e9e0077067e9d58525d3a8c8897a20aced /Objects/tupleobject.c | |
parent | 53125a53f483db0af76249b6af6efcdc200eb421 (diff) | |
download | cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.zip cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.gz cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.bz2 |
bpo-34751: improved hash function for tuples (GH-9471)
Diffstat (limited to 'Objects/tupleobject.c')
-rw-r--r-- | Objects/tupleobject.c | 71 |
1 files changed, 46 insertions, 25 deletions
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c index eaf92d5..2e32406 100644 --- a/Objects/tupleobject.c +++ b/Objects/tupleobject.c @@ -333,39 +333,60 @@ error: return NULL; } -/* The addend 82520, was selected from the range(0, 1000000) for - generating the greatest number of prime multipliers for tuples - up to length eight: - 1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533, - 1330111, 1412633, 1165069, 1247599, 1495177, 1577699 - - Tests have shown that it's not worth to cache the hash value, see - issue #9685. +/* Hash for tuples. This is a slightly simplified version of the xxHash + non-cryptographic hash: + - we do not use any parallellism, there is only 1 accumulator. + - we drop the final mixing since this is just a permutation of the + output space: it does not help against collisions. + - at the end, we mangle the length with a single constant. + For the xxHash specification, see + https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md + + Below are the official constants from the xxHash specification. Optimizing + compilers should emit a single "rotate" instruction for the + _PyHASH_XXROTATE() expansion. If that doesn't happen for some important + platform, the macro could be changed to expand to a platform-specific rotate + spelling instead. */ +#if SIZEOF_PY_UHASH_T > 4 +#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) +#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) +#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL) +#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ +#else +#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) +#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) +#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) +#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ +#endif +/* Tests have shown that it's not worth to cache the hash value, see + https://bugs.python.org/issue9685 */ static Py_hash_t tuplehash(PyTupleObject *v) { - Py_uhash_t x; /* Unsigned for defined overflow behavior. */ - Py_hash_t y; - Py_ssize_t len = Py_SIZE(v); - PyObject **p; - Py_uhash_t mult = _PyHASH_MULTIPLIER; - x = 0x345678UL; - p = v->ob_item; - while (--len >= 0) { - y = PyObject_Hash(*p++); - if (y == -1) + Py_ssize_t i, len = Py_SIZE(v); + PyObject **item = v->ob_item; + + Py_uhash_t acc = _PyHASH_XXPRIME_5; + for (i = 0; i < len; i++) { + Py_uhash_t lane = PyObject_Hash(item[i]); + if (lane == (Py_uhash_t)-1) { return -1; - x = (x ^ y) * mult; - /* the cast might truncate len; that doesn't change hash stability */ - mult += (Py_hash_t)(82520UL + len + len); + } + acc += lane * _PyHASH_XXPRIME_2; + acc = _PyHASH_XXROTATE(acc); + acc *= _PyHASH_XXPRIME_1; + } + + /* Add input length, mangled to keep the historical value of hash(()). */ + acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL); + + if (acc == (Py_uhash_t)-1) { + return 1546275796; } - x += 97531UL; - if (x == (Py_uhash_t)-1) - x = -2; - return x; + return acc; } static Py_ssize_t |