diff options
author | Christian Heimes <christian@cheimes.de> | 2013-11-20 10:46:18 (GMT) |
---|---|---|
committer | Christian Heimes <christian@cheimes.de> | 2013-11-20 10:46:18 (GMT) |
commit | 985ecdcfc29adfc36ce2339acf03f819ad414869 (patch) | |
tree | 06a11f82271e768dbe49469c8736b65b083f671c /Objects | |
parent | fe32aec25a8b36498d840bd69485e9bc94195b9c (diff) | |
download | cpython-985ecdcfc29adfc36ce2339acf03f819ad414869.zip cpython-985ecdcfc29adfc36ce2339acf03f819ad414869.tar.gz cpython-985ecdcfc29adfc36ce2339acf03f819ad414869.tar.bz2 |
ssue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'.
Python now uses SipHash24 on all major platforms.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/bytesobject.c | 2 | ||||
-rw-r--r-- | Objects/memoryobject.c | 2 | ||||
-rw-r--r-- | Objects/object.c | 146 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 35 |
4 files changed, 4 insertions, 181 deletions
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index efa0192..8217b1e 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -897,7 +897,7 @@ bytes_hash(PyBytesObject *a) { if (a->ob_shash == -1) { /* Can't fail */ - a->ob_shash = _Py_HashBytes((unsigned char *) a->ob_sval, Py_SIZE(a)); + a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a)); } return a->ob_shash; } diff --git a/Objects/memoryobject.c b/Objects/memoryobject.c index 1d52d9d..cb644b8 100644 --- a/Objects/memoryobject.c +++ b/Objects/memoryobject.c @@ -2742,7 +2742,7 @@ memory_hash(PyMemoryViewObject *self) } /* Can't fail */ - self->hash = _Py_HashBytes((unsigned char *)mem, view->len); + self->hash = _Py_HashBytes(mem, view->len); if (mem != view->buf) PyMem_Free(mem); diff --git a/Objects/object.c b/Objects/object.c index acc34af..395e28d 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -731,150 +731,6 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op) return ok; } -/* Set of hash utility functions to help maintaining the invariant that - if a==b then hash(a)==hash(b) - - All the utility functions (_Py_Hash*()) return "-1" to signify an error. -*/ - -/* For numeric types, the hash of a number x is based on the reduction - of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that - hash(x) == hash(y) whenever x and y are numerically equal, even if - x and y have different types. - - A quick summary of the hashing strategy: - - (1) First define the 'reduction of x modulo P' for any rational - number x; this is a standard extension of the usual notion of - reduction modulo P for integers. If x == p/q (written in lowest - terms), the reduction is interpreted as the reduction of p times - the inverse of the reduction of q, all modulo P; if q is exactly - divisible by P then define the reduction to be infinity. So we've - got a well-defined map - - reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. - - (2) Now for a rational number x, define hash(x) by: - - reduce(x) if x >= 0 - -reduce(-x) if x < 0 - - If the result of the reduction is infinity (this is impossible for - integers, floats and Decimals) then use the predefined hash value - _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead. - _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the - hashes of float and Decimal infinities and nans. - - A selling point for the above strategy is that it makes it possible - to compute hashes of decimal and binary floating-point numbers - efficiently, even if the exponent of the binary or decimal number - is large. The key point is that - - reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS) - - provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a - binary or decimal float is never infinity, since the denominator is a power - of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, - for nonnegative x, - - reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS - - reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS - - and reduce(10**e) can be computed efficiently by the usual modular - exponentiation algorithm. For reduce(2**e) it's even better: since - P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication - by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. - - */ - -Py_hash_t -_Py_HashDouble(double v) -{ - int e, sign; - double m; - Py_uhash_t x, y; - - if (!Py_IS_FINITE(v)) { - if (Py_IS_INFINITY(v)) - return v > 0 ? _PyHASH_INF : -_PyHASH_INF; - else - return _PyHASH_NAN; - } - - m = frexp(v, &e); - - sign = 1; - if (m < 0) { - sign = -1; - m = -m; - } - - /* process 28 bits at a time; this should work well both for binary - and hexadecimal floating point. */ - x = 0; - while (m) { - x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28); - m *= 268435456.0; /* 2**28 */ - e -= 28; - y = (Py_uhash_t)m; /* pull out integer part */ - m -= y; - x += y; - if (x >= _PyHASH_MODULUS) - x -= _PyHASH_MODULUS; - } - - /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */ - e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS); - x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e); - - x = x * sign; - if (x == (Py_uhash_t)-1) - x = (Py_uhash_t)-2; - return (Py_hash_t)x; -} - -Py_hash_t -_Py_HashPointer(void *p) -{ - Py_hash_t x; - size_t y = (size_t)p; - /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid - excessive hash collisions for dicts and sets */ - y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4)); - x = (Py_hash_t)y; - if (x == -1) - x = -2; - return x; -} - -Py_hash_t -_Py_HashBytes(unsigned char *p, Py_ssize_t len) -{ - Py_uhash_t x; - Py_ssize_t i; - - /* - We make the hash of the empty string be 0, rather than using - (prefix ^ suffix), since this slightly obfuscates the hash secret - */ -#ifdef Py_DEBUG - assert(_Py_HashSecret_Initialized); -#endif - if (len == 0) { - return 0; - } - x = (Py_uhash_t) _Py_HashSecret.prefix; - x ^= (Py_uhash_t) *p << 7; - for (i = 0; i < len; i++) - x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++; - x ^= (Py_uhash_t) len; - x ^= (Py_uhash_t) _Py_HashSecret.suffix; - if (x == -1) - x = -2; - return x; -} - Py_hash_t PyObject_HashNotImplemented(PyObject *v) { @@ -883,8 +739,6 @@ PyObject_HashNotImplemented(PyObject *v) return -1; } -_Py_HashSecret_t _Py_HashSecret; - Py_hash_t PyObject_Hash(PyObject *v) { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 880889e..3644db3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11386,39 +11386,8 @@ unicode_hash(PyObject *self) _PyUnicode_HASH(self) = 0; return 0; } - - /* The hash function as a macro, gets expanded three times below. */ -#define HASH(P) \ - x ^= (Py_uhash_t) *P << 7; \ - while (--len >= 0) \ - x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ - - x = (Py_uhash_t) _Py_HashSecret.prefix; - switch (PyUnicode_KIND(self)) { - case PyUnicode_1BYTE_KIND: { - const unsigned char *c = PyUnicode_1BYTE_DATA(self); - HASH(c); - break; - } - case PyUnicode_2BYTE_KIND: { - const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); - HASH(s); - break; - } - default: { - Py_UCS4 *l; - assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && - "Impossible switch case in unicode_hash"); - l = PyUnicode_4BYTE_DATA(self); - HASH(l); - break; - } - } - x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); - x ^= (Py_uhash_t) _Py_HashSecret.suffix; - - if (x == -1) - x = -2; + x = _Py_HashBytes(PyUnicode_DATA(self), + PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); _PyUnicode_HASH(self) = x; return x; } |