diff options
author | Mark Dickinson <dickinsm@gmail.com> | 2010-05-23 13:33:13 (GMT) |
---|---|---|
committer | Mark Dickinson <dickinsm@gmail.com> | 2010-05-23 13:33:13 (GMT) |
commit | dc787d2055a7b562b64ca91b8f1af6d49fa39f1c (patch) | |
tree | f6a3868e8134c25c662868f19306bfea76b0ab45 /Objects | |
parent | 03721133a68814696e3eee75b1eb09f5016ff078 (diff) | |
download | cpython-dc787d2055a7b562b64ca91b8f1af6d49fa39f1c.zip cpython-dc787d2055a7b562b64ca91b8f1af6d49fa39f1c.tar.gz cpython-dc787d2055a7b562b64ca91b8f1af6d49fa39f1c.tar.bz2 |
Issue #8188: Introduce a new scheme for computing hashes of numbers
(instances of int, float, complex, decimal.Decimal and
fractions.Fraction) that makes it easy to maintain the invariant that
hash(x) == hash(y) whenever x and y have equal value.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/complexobject.c | 18 | ||||
-rw-r--r-- | Objects/longobject.c | 39 | ||||
-rw-r--r-- | Objects/object.c | 136 | ||||
-rw-r--r-- | Objects/typeobject.c | 26 |
4 files changed, 145 insertions, 74 deletions
diff --git a/Objects/complexobject.c b/Objects/complexobject.c index 9e1e217..7594c88 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -403,12 +403,12 @@ complex_str(PyComplexObject *v) static long complex_hash(PyComplexObject *v) { - long hashreal, hashimag, combined; - hashreal = _Py_HashDouble(v->cval.real); - if (hashreal == -1) + unsigned long hashreal, hashimag, combined; + hashreal = (unsigned long)_Py_HashDouble(v->cval.real); + if (hashreal == (unsigned long)-1) return -1; - hashimag = _Py_HashDouble(v->cval.imag); - if (hashimag == -1) + hashimag = (unsigned long)_Py_HashDouble(v->cval.imag); + if (hashimag == (unsigned long)-1) return -1; /* Note: if the imaginary part is 0, hashimag is 0 now, * so the following returns hashreal unchanged. This is @@ -416,10 +416,10 @@ complex_hash(PyComplexObject *v) * compare equal must have the same hash value, so that * hash(x + 0*j) must equal hash(x). */ - combined = hashreal + 1000003 * hashimag; - if (combined == -1) - combined = -2; - return combined; + combined = hashreal + _PyHASH_IMAG * hashimag; + if (combined == (unsigned long)-1) + combined = (unsigned long)-2; + return (long)combined; } /* This macro may return! */ diff --git a/Objects/longobject.c b/Objects/longobject.c index 850396b..564d1a0 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -2571,18 +2571,37 @@ long_hash(PyLongObject *v) sign = -1; i = -(i); } - /* The following loop produces a C unsigned long x such that x is - congruent to the absolute value of v modulo ULONG_MAX. The - resulting x is nonzero if and only if v is. */ while (--i >= 0) { - /* Force a native long #-bits (32 or 64) circular shift */ - x = (x >> (8*SIZEOF_LONG-PyLong_SHIFT)) | (x << PyLong_SHIFT); + /* Here x is a quantity in the range [0, _PyHASH_MODULUS); we + want to compute x * 2**PyLong_SHIFT + v->ob_digit[i] modulo + _PyHASH_MODULUS. + + The computation of x * 2**PyLong_SHIFT % _PyHASH_MODULUS + amounts to a rotation of the bits of x. To see this, write + + x * 2**PyLong_SHIFT = y * 2**_PyHASH_BITS + z + + where y = x >> (_PyHASH_BITS - PyLong_SHIFT) gives the top + PyLong_SHIFT bits of x (those that are shifted out of the + original _PyHASH_BITS bits, and z = (x << PyLong_SHIFT) & + _PyHASH_MODULUS gives the bottom _PyHASH_BITS - PyLong_SHIFT + bits of x, shifted up. Then since 2**_PyHASH_BITS is + congruent to 1 modulo _PyHASH_MODULUS, y*2**_PyHASH_BITS is + congruent to y modulo _PyHASH_MODULUS. So + + x * 2**PyLong_SHIFT = y + z (mod _PyHASH_MODULUS). + + The right-hand side is just the result of rotating the + _PyHASH_BITS bits of x left by PyLong_SHIFT places; since + not all _PyHASH_BITS bits of x are 1s, the same is true + after rotation, so 0 <= y+z < _PyHASH_MODULUS and y + z is + the reduction of x*2**PyLong_SHIFT modulo + _PyHASH_MODULUS. */ + x = ((x << PyLong_SHIFT) & _PyHASH_MODULUS) | + (x >> (_PyHASH_BITS - PyLong_SHIFT)); x += v->ob_digit[i]; - /* If the addition above overflowed we compensate by - incrementing. This preserves the value modulo - ULONG_MAX. */ - if (x < v->ob_digit[i]) - x++; + if (x >= _PyHASH_MODULUS) + x -= _PyHASH_MODULUS; } x = x * sign; if (x == (unsigned long)-1) diff --git a/Objects/object.c b/Objects/object.c index 0802348..76d018f 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -647,63 +647,101 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op) All the utility functions (_Py_Hash*()) return "-1" to signify an error. */ +/* For numeric types, the hash of a number x is based on the reduction + of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that + hash(x) == hash(y) whenever x and y are numerically equal, even if + x and y have different types. + + A quick summary of the hashing strategy: + + (1) First define the 'reduction of x modulo P' for any rational + number x; this is a standard extension of the usual notion of + reduction modulo P for integers. If x == p/q (written in lowest + terms), the reduction is interpreted as the reduction of p times + the inverse of the reduction of q, all modulo P; if q is exactly + divisible by P then define the reduction to be infinity. So we've + got a well-defined map + + reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. + + (2) Now for a rational number x, define hash(x) by: + + reduce(x) if x >= 0 + -reduce(-x) if x < 0 + + If the result of the reduction is infinity (this is impossible for + integers, floats and Decimals) then use the predefined hash value + _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead. + _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the + hashes of float and Decimal infinities and nans. + + A selling point for the above strategy is that it makes it possible + to compute hashes of decimal and binary floating-point numbers + efficiently, even if the exponent of the binary or decimal number + is large. The key point is that + + reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS) + + provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a + binary or decimal float is never infinity, since the denominator is a power + of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, + for nonnegative x, + + reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS + + reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS + + and reduce(10**e) can be computed efficiently by the usual modular + exponentiation algorithm. For reduce(2**e) it's even better: since + P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication + by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. + + */ + long _Py_HashDouble(double v) { - double intpart, fractpart; - int expo; - long hipart; - long x; /* the final hash value */ - /* This is designed so that Python numbers of different types - * that compare equal hash to the same value; otherwise comparisons - * of mapping keys will turn out weird. - */ + int e, sign; + double m; + unsigned long x, y; if (!Py_IS_FINITE(v)) { if (Py_IS_INFINITY(v)) - return v < 0 ? -271828 : 314159; + return v > 0 ? _PyHASH_INF : -_PyHASH_INF; else - return 0; + return _PyHASH_NAN; } - fractpart = modf(v, &intpart); - if (fractpart == 0.0) { - /* This must return the same hash as an equal int or long. */ - if (intpart > LONG_MAX/2 || -intpart > LONG_MAX/2) { - /* Convert to long and use its hash. */ - PyObject *plong; /* converted to Python long */ - plong = PyLong_FromDouble(v); - if (plong == NULL) - return -1; - x = PyObject_Hash(plong); - Py_DECREF(plong); - return x; - } - /* Fits in a C long == a Python int, so is its own hash. */ - x = (long)intpart; - if (x == -1) - x = -2; - return x; - } - /* The fractional part is non-zero, so we don't have to worry about - * making this match the hash of some other type. - * Use frexp to get at the bits in the double. - * Since the VAX D double format has 56 mantissa bits, which is the - * most of any double format in use, each of these parts may have as - * many as (but no more than) 56 significant bits. - * So, assuming sizeof(long) >= 4, each part can be broken into two - * longs; frexp and multiplication are used to do that. - * Also, since the Cray double format has 15 exponent bits, which is - * the most of any double format in use, shifting the exponent field - * left by 15 won't overflow a long (again assuming sizeof(long) >= 4). - */ - v = frexp(v, &expo); - v *= 2147483648.0; /* 2**31 */ - hipart = (long)v; /* take the top 32 bits */ - v = (v - (double)hipart) * 2147483648.0; /* get the next 32 bits */ - x = hipart + (long)v + (expo << 15); - if (x == -1) - x = -2; - return x; + + m = frexp(v, &e); + + sign = 1; + if (m < 0) { + sign = -1; + m = -m; + } + + /* process 28 bits at a time; this should work well both for binary + and hexadecimal floating point. */ + x = 0; + while (m) { + x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28); + m *= 268435456.0; /* 2**28 */ + e -= 28; + y = (unsigned long)m; /* pull out integer part */ + m -= y; + x += y; + if (x >= _PyHASH_MODULUS) + x -= _PyHASH_MODULUS; + } + + /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */ + e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS); + x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e); + + x = x * sign; + if (x == (unsigned long)-1) + x = (unsigned long)-2; + return (long)x; } long diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 369bac6..adfb0ec 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -4921,6 +4921,7 @@ slot_tp_hash(PyObject *self) PyObject *func, *res; static PyObject *hash_str; long h; + int overflow; func = lookup_method(self, "__hash__", &hash_str); @@ -4937,14 +4938,27 @@ slot_tp_hash(PyObject *self) Py_DECREF(func); if (res == NULL) return -1; - if (PyLong_Check(res)) + + if (!PyLong_Check(res)) { + PyErr_SetString(PyExc_TypeError, + "__hash__ method should return an integer"); + return -1; + } + /* Transform the PyLong `res` to a C long `h`. For an existing + hashable Python object x, hash(x) will always lie within the range + of a C long. Therefore our transformation must preserve values + that already lie within this range, to ensure that if x.__hash__() + returns hash(y) then hash(x) == hash(y). */ + h = PyLong_AsLongAndOverflow(res, &overflow); + if (overflow) + /* res was not within the range of a C long, so we're free to + use any sufficiently bit-mixing transformation; + long.__hash__ will do nicely. */ h = PyLong_Type.tp_hash(res); - else - h = PyLong_AsLong(res); Py_DECREF(res); - if (h == -1 && !PyErr_Occurred()) - h = -2; - return h; + if (h == -1 && !PyErr_Occurred()) + h = -2; + return h; } static PyObject * |