summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorMark Dickinson <dickinsm@gmail.com>2010-05-23 13:33:13 (GMT)
committerMark Dickinson <dickinsm@gmail.com>2010-05-23 13:33:13 (GMT)
commitdc787d2055a7b562b64ca91b8f1af6d49fa39f1c (patch)
treef6a3868e8134c25c662868f19306bfea76b0ab45 /Objects
parent03721133a68814696e3eee75b1eb09f5016ff078 (diff)
downloadcpython-dc787d2055a7b562b64ca91b8f1af6d49fa39f1c.zip
cpython-dc787d2055a7b562b64ca91b8f1af6d49fa39f1c.tar.gz
cpython-dc787d2055a7b562b64ca91b8f1af6d49fa39f1c.tar.bz2
Issue #8188: Introduce a new scheme for computing hashes of numbers
(instances of int, float, complex, decimal.Decimal and fractions.Fraction) that makes it easy to maintain the invariant that hash(x) == hash(y) whenever x and y have equal value.
Diffstat (limited to 'Objects')
-rw-r--r--Objects/complexobject.c18
-rw-r--r--Objects/longobject.c39
-rw-r--r--Objects/object.c136
-rw-r--r--Objects/typeobject.c26
4 files changed, 145 insertions, 74 deletions
diff --git a/Objects/complexobject.c b/Objects/complexobject.c
index 9e1e217..7594c88 100644
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -403,12 +403,12 @@ complex_str(PyComplexObject *v)
static long
complex_hash(PyComplexObject *v)
{
- long hashreal, hashimag, combined;
- hashreal = _Py_HashDouble(v->cval.real);
- if (hashreal == -1)
+ unsigned long hashreal, hashimag, combined;
+ hashreal = (unsigned long)_Py_HashDouble(v->cval.real);
+ if (hashreal == (unsigned long)-1)
return -1;
- hashimag = _Py_HashDouble(v->cval.imag);
- if (hashimag == -1)
+ hashimag = (unsigned long)_Py_HashDouble(v->cval.imag);
+ if (hashimag == (unsigned long)-1)
return -1;
/* Note: if the imaginary part is 0, hashimag is 0 now,
* so the following returns hashreal unchanged. This is
@@ -416,10 +416,10 @@ complex_hash(PyComplexObject *v)
* compare equal must have the same hash value, so that
* hash(x + 0*j) must equal hash(x).
*/
- combined = hashreal + 1000003 * hashimag;
- if (combined == -1)
- combined = -2;
- return combined;
+ combined = hashreal + _PyHASH_IMAG * hashimag;
+ if (combined == (unsigned long)-1)
+ combined = (unsigned long)-2;
+ return (long)combined;
}
/* This macro may return! */
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 850396b..564d1a0 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -2571,18 +2571,37 @@ long_hash(PyLongObject *v)
sign = -1;
i = -(i);
}
- /* The following loop produces a C unsigned long x such that x is
- congruent to the absolute value of v modulo ULONG_MAX. The
- resulting x is nonzero if and only if v is. */
while (--i >= 0) {
- /* Force a native long #-bits (32 or 64) circular shift */
- x = (x >> (8*SIZEOF_LONG-PyLong_SHIFT)) | (x << PyLong_SHIFT);
+ /* Here x is a quantity in the range [0, _PyHASH_MODULUS); we
+ want to compute x * 2**PyLong_SHIFT + v->ob_digit[i] modulo
+ _PyHASH_MODULUS.
+
+ The computation of x * 2**PyLong_SHIFT % _PyHASH_MODULUS
+ amounts to a rotation of the bits of x. To see this, write
+
+ x * 2**PyLong_SHIFT = y * 2**_PyHASH_BITS + z
+
+ where y = x >> (_PyHASH_BITS - PyLong_SHIFT) gives the top
+ PyLong_SHIFT bits of x (those that are shifted out of the
+ original _PyHASH_BITS bits, and z = (x << PyLong_SHIFT) &
+ _PyHASH_MODULUS gives the bottom _PyHASH_BITS - PyLong_SHIFT
+ bits of x, shifted up. Then since 2**_PyHASH_BITS is
+ congruent to 1 modulo _PyHASH_MODULUS, y*2**_PyHASH_BITS is
+ congruent to y modulo _PyHASH_MODULUS. So
+
+ x * 2**PyLong_SHIFT = y + z (mod _PyHASH_MODULUS).
+
+ The right-hand side is just the result of rotating the
+ _PyHASH_BITS bits of x left by PyLong_SHIFT places; since
+ not all _PyHASH_BITS bits of x are 1s, the same is true
+ after rotation, so 0 <= y+z < _PyHASH_MODULUS and y + z is
+ the reduction of x*2**PyLong_SHIFT modulo
+ _PyHASH_MODULUS. */
+ x = ((x << PyLong_SHIFT) & _PyHASH_MODULUS) |
+ (x >> (_PyHASH_BITS - PyLong_SHIFT));
x += v->ob_digit[i];
- /* If the addition above overflowed we compensate by
- incrementing. This preserves the value modulo
- ULONG_MAX. */
- if (x < v->ob_digit[i])
- x++;
+ if (x >= _PyHASH_MODULUS)
+ x -= _PyHASH_MODULUS;
}
x = x * sign;
if (x == (unsigned long)-1)
diff --git a/Objects/object.c b/Objects/object.c
index 0802348..76d018f 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -647,63 +647,101 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
All the utility functions (_Py_Hash*()) return "-1" to signify an error.
*/
+/* For numeric types, the hash of a number x is based on the reduction
+ of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
+ hash(x) == hash(y) whenever x and y are numerically equal, even if
+ x and y have different types.
+
+ A quick summary of the hashing strategy:
+
+ (1) First define the 'reduction of x modulo P' for any rational
+ number x; this is a standard extension of the usual notion of
+ reduction modulo P for integers. If x == p/q (written in lowest
+ terms), the reduction is interpreted as the reduction of p times
+ the inverse of the reduction of q, all modulo P; if q is exactly
+ divisible by P then define the reduction to be infinity. So we've
+ got a well-defined map
+
+ reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
+
+ (2) Now for a rational number x, define hash(x) by:
+
+ reduce(x) if x >= 0
+ -reduce(-x) if x < 0
+
+ If the result of the reduction is infinity (this is impossible for
+ integers, floats and Decimals) then use the predefined hash value
+ _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
+ _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
+ hashes of float and Decimal infinities and nans.
+
+ A selling point for the above strategy is that it makes it possible
+ to compute hashes of decimal and binary floating-point numbers
+ efficiently, even if the exponent of the binary or decimal number
+ is large. The key point is that
+
+ reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
+
+ provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
+ binary or decimal float is never infinity, since the denominator is a power
+ of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
+ for nonnegative x,
+
+ reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
+
+ reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
+
+ and reduce(10**e) can be computed efficiently by the usual modular
+ exponentiation algorithm. For reduce(2**e) it's even better: since
+ P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
+ by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
+
+ */
+
long
_Py_HashDouble(double v)
{
- double intpart, fractpart;
- int expo;
- long hipart;
- long x; /* the final hash value */
- /* This is designed so that Python numbers of different types
- * that compare equal hash to the same value; otherwise comparisons
- * of mapping keys will turn out weird.
- */
+ int e, sign;
+ double m;
+ unsigned long x, y;
if (!Py_IS_FINITE(v)) {
if (Py_IS_INFINITY(v))
- return v < 0 ? -271828 : 314159;
+ return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
else
- return 0;
+ return _PyHASH_NAN;
}
- fractpart = modf(v, &intpart);
- if (fractpart == 0.0) {
- /* This must return the same hash as an equal int or long. */
- if (intpart > LONG_MAX/2 || -intpart > LONG_MAX/2) {
- /* Convert to long and use its hash. */
- PyObject *plong; /* converted to Python long */
- plong = PyLong_FromDouble(v);
- if (plong == NULL)
- return -1;
- x = PyObject_Hash(plong);
- Py_DECREF(plong);
- return x;
- }
- /* Fits in a C long == a Python int, so is its own hash. */
- x = (long)intpart;
- if (x == -1)
- x = -2;
- return x;
- }
- /* The fractional part is non-zero, so we don't have to worry about
- * making this match the hash of some other type.
- * Use frexp to get at the bits in the double.
- * Since the VAX D double format has 56 mantissa bits, which is the
- * most of any double format in use, each of these parts may have as
- * many as (but no more than) 56 significant bits.
- * So, assuming sizeof(long) >= 4, each part can be broken into two
- * longs; frexp and multiplication are used to do that.
- * Also, since the Cray double format has 15 exponent bits, which is
- * the most of any double format in use, shifting the exponent field
- * left by 15 won't overflow a long (again assuming sizeof(long) >= 4).
- */
- v = frexp(v, &expo);
- v *= 2147483648.0; /* 2**31 */
- hipart = (long)v; /* take the top 32 bits */
- v = (v - (double)hipart) * 2147483648.0; /* get the next 32 bits */
- x = hipart + (long)v + (expo << 15);
- if (x == -1)
- x = -2;
- return x;
+
+ m = frexp(v, &e);
+
+ sign = 1;
+ if (m < 0) {
+ sign = -1;
+ m = -m;
+ }
+
+ /* process 28 bits at a time; this should work well both for binary
+ and hexadecimal floating point. */
+ x = 0;
+ while (m) {
+ x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
+ m *= 268435456.0; /* 2**28 */
+ e -= 28;
+ y = (unsigned long)m; /* pull out integer part */
+ m -= y;
+ x += y;
+ if (x >= _PyHASH_MODULUS)
+ x -= _PyHASH_MODULUS;
+ }
+
+ /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
+ e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
+ x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
+
+ x = x * sign;
+ if (x == (unsigned long)-1)
+ x = (unsigned long)-2;
+ return (long)x;
}
long
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 369bac6..adfb0ec 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -4921,6 +4921,7 @@ slot_tp_hash(PyObject *self)
PyObject *func, *res;
static PyObject *hash_str;
long h;
+ int overflow;
func = lookup_method(self, "__hash__", &hash_str);
@@ -4937,14 +4938,27 @@ slot_tp_hash(PyObject *self)
Py_DECREF(func);
if (res == NULL)
return -1;
- if (PyLong_Check(res))
+
+ if (!PyLong_Check(res)) {
+ PyErr_SetString(PyExc_TypeError,
+ "__hash__ method should return an integer");
+ return -1;
+ }
+ /* Transform the PyLong `res` to a C long `h`. For an existing
+ hashable Python object x, hash(x) will always lie within the range
+ of a C long. Therefore our transformation must preserve values
+ that already lie within this range, to ensure that if x.__hash__()
+ returns hash(y) then hash(x) == hash(y). */
+ h = PyLong_AsLongAndOverflow(res, &overflow);
+ if (overflow)
+ /* res was not within the range of a C long, so we're free to
+ use any sufficiently bit-mixing transformation;
+ long.__hash__ will do nicely. */
h = PyLong_Type.tp_hash(res);
- else
- h = PyLong_AsLong(res);
Py_DECREF(res);
- if (h == -1 && !PyErr_Occurred())
- h = -2;
- return h;
+ if (h == -1 && !PyErr_Occurred())
+ h = -2;
+ return h;
}
static PyObject *