summaryrefslogtreecommitdiffstats
path: root/Objects/tupleobject.c
diff options
context:
space:
mode:
authorjdemeyer <jdemeyer@cage.ugent.be>2018-10-28 00:06:38 (GMT)
committerRaymond Hettinger <rhettinger@users.noreply.github.com>2018-10-28 00:06:38 (GMT)
commitaeb1be5868623c9cd9cf6d7de3015a43fb005815 (patch)
tree8a4254e9e0077067e9d58525d3a8c8897a20aced /Objects/tupleobject.c
parent53125a53f483db0af76249b6af6efcdc200eb421 (diff)
downloadcpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.zip
cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.gz
cpython-aeb1be5868623c9cd9cf6d7de3015a43fb005815.tar.bz2
bpo-34751: improved hash function for tuples (GH-9471)
Diffstat (limited to 'Objects/tupleobject.c')
-rw-r--r--Objects/tupleobject.c71
1 files changed, 46 insertions, 25 deletions
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
index eaf92d5..2e32406 100644
--- a/Objects/tupleobject.c
+++ b/Objects/tupleobject.c
@@ -333,39 +333,60 @@ error:
return NULL;
}
-/* The addend 82520, was selected from the range(0, 1000000) for
- generating the greatest number of prime multipliers for tuples
- up to length eight:
- 1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533,
- 1330111, 1412633, 1165069, 1247599, 1495177, 1577699
-
- Tests have shown that it's not worth to cache the hash value, see
- issue #9685.
+/* Hash for tuples. This is a slightly simplified version of the xxHash
+ non-cryptographic hash:
+ - we do not use any parallellism, there is only 1 accumulator.
+ - we drop the final mixing since this is just a permutation of the
+ output space: it does not help against collisions.
+ - at the end, we mangle the length with a single constant.
+ For the xxHash specification, see
+ https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md
+
+ Below are the official constants from the xxHash specification. Optimizing
+ compilers should emit a single "rotate" instruction for the
+ _PyHASH_XXROTATE() expansion. If that doesn't happen for some important
+ platform, the macro could be changed to expand to a platform-specific rotate
+ spelling instead.
*/
+#if SIZEOF_PY_UHASH_T > 4
+#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
+#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
+#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
+#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
+#else
+#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
+#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
+#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
+#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
+#endif
+/* Tests have shown that it's not worth to cache the hash value, see
+ https://bugs.python.org/issue9685 */
static Py_hash_t
tuplehash(PyTupleObject *v)
{
- Py_uhash_t x; /* Unsigned for defined overflow behavior. */
- Py_hash_t y;
- Py_ssize_t len = Py_SIZE(v);
- PyObject **p;
- Py_uhash_t mult = _PyHASH_MULTIPLIER;
- x = 0x345678UL;
- p = v->ob_item;
- while (--len >= 0) {
- y = PyObject_Hash(*p++);
- if (y == -1)
+ Py_ssize_t i, len = Py_SIZE(v);
+ PyObject **item = v->ob_item;
+
+ Py_uhash_t acc = _PyHASH_XXPRIME_5;
+ for (i = 0; i < len; i++) {
+ Py_uhash_t lane = PyObject_Hash(item[i]);
+ if (lane == (Py_uhash_t)-1) {
return -1;
- x = (x ^ y) * mult;
- /* the cast might truncate len; that doesn't change hash stability */
- mult += (Py_hash_t)(82520UL + len + len);
+ }
+ acc += lane * _PyHASH_XXPRIME_2;
+ acc = _PyHASH_XXROTATE(acc);
+ acc *= _PyHASH_XXPRIME_1;
+ }
+
+ /* Add input length, mangled to keep the historical value of hash(()). */
+ acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL);
+
+ if (acc == (Py_uhash_t)-1) {
+ return 1546275796;
}
- x += 97531UL;
- if (x == (Py_uhash_t)-1)
- x = -2;
- return x;
+ return acc;
}
static Py_ssize_t