summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Misc/NEWS14
-rw-r--r--Objects/dictobject.c287
2 files changed, 130 insertions, 171 deletions
diff --git a/Misc/NEWS b/Misc/NEWS
index be58d95..14fa35e 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -15,7 +15,7 @@ Core
To enhance the usability of the .encode() method, the special
casing of Unicode object return values was dropped (Unicode objects
were auto-magically converted to string using the default encoding).
-
+
Both methods will now return whatever the codec in charge of the
requested encoding returns as object, e.g. Unicode codecs will
return Unicode objects when decoding is requested ("äöü".decode("latin-1")
@@ -116,13 +116,11 @@ Core
to crash if the element comparison routines for the dict keys and/or
values mutated the dicts. Making the code bulletproof slowed it down.
-- Collisions in dicts now use polynomial division instead of multiplication
- to generate the probe sequence, following an idea of Christian Tismer's.
- This allows all bits of the hash code to come into play. It should have
- little or no effect on speed in ordinary cases, but can help dramatically
- in bad cases. For example, looking up every key in a dict d with
- d.keys() = [i << 16 for i in range(20000)] is approximately 500x faster
- now.
+- Collisions in dicts are resolved via a new approach, which can help
+ dramatically in bad cases. For example, looking up every key in a dict
+ d with d.keys() = [i << 16 for i in range(20000)] is approximately 500x
+ faster now. Thanks to Christian Tismer for pointing out the cause and
+ the nature of an effective cure (last December! better late than never).
Library
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 5944b6e..754c750 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -3,121 +3,116 @@
#include "Python.h"
-
-/*
- * MINSIZE is the minimum size of a dictionary. This many slots are
+/* MINSIZE is the minimum size of a dictionary. This many slots are
* allocated directly in the dict object (in the ma_smalltable member).
- * This must be a power of 2, and the first entry in the polys[] vector must
- * match.
+ * It must be a power of 2, and at least 4. 8 allows dicts with no more than
+ * 5 active entries to live in ma_smalltable (and so avoid an additional
+ * malloc); instrumentation suggested this suffices for the majority of
+ * dicts (consisting mostly of usually-small instance dicts and usually-small
+ * dicts created to pass keyword arguments).
*/
#define MINSIZE 8
-/* define this out if you don't want conversion statistics on exit */
+/* Define this out if you don't want conversion statistics on exit. */
#undef SHOW_CONVERSION_COUNTS
+/* See large comment block below. This must be >= 1. */
+#define PERTURB_SHIFT 5
+
/*
-Table of irreducible polynomials to efficiently cycle through
-GF(2^n)-{0}, 2<=n<=30. A table size is always a power of 2.
-For a table size of 2**i, the polys entry is 2**i + j for some j in 1 thru
-2**i-1 inclusive. The polys[] entries here happen to add in the smallest j
-values "that work". Work means this: given any integer k in 1 thru 2**i-1
-inclusive, a poly works if & only if repeating this code:
- print k
- k <<= 1
- if k >= 2**i:
- k ^= poly
-prints every integer in 1 thru 2**i-1 inclusive exactly once before printing
-k a second time. Theory can be used to find such polys efficiently, but the
-operational defn. of "works" is sufficient to find them in reasonable time
-via brute force program (hint: any poly that has an even number of 1 bits
-cannot work; ditto any poly with low bit 0; exploit those).
-
-Some major subtleties: Most hash schemes depend on having a "good" hash
-function, in the sense of simulating randomness. Python doesn't: some of
-its hash functions are trivial, such as hash(i) == i for ints i (excepting
-i == -1, because -1 is the "error occurred" return value from tp_hash).
-
-This isn't necessarily bad! To the contrary, that our hash tables are powers
-of 2 in size, and that we take the low-order bits as the initial table index,
-means that there are no collisions at all for dicts indexed by a contiguous
-range of ints. This is "better than random" behavior, and that's very
-desirable.
-
-On the other hand, when collisions occur, the tendency to fill contiguous
-slices of the hash table makes a good collision resolution strategy crucial;
-e.g., linear probing is right out.
-
-Reimer Behrends contributed the idea of using a polynomial-based approach,
-using repeated multiplication by x in GF(2**n) where a polynomial is chosen
-such that x is a primitive root. This visits every table location exactly
-once, and the sequence of locations probed is highly non-linear.
-
-The same is also largely true of quadratic probing for power-of-2 tables, of
-the specific
-
- (i + comb(1, 2)) mod size
- (i + comb(2, 2)) mod size
- (i + comb(3, 2)) mod size
- (i + comb(4, 2)) mod size
- ...
- (i + comb(j, 2)) mod size
-
-flavor. The polynomial approach "scrambles" the probe indices better, but
-more importantly allows to get *some* additional bits of the hash code into
-play via computing the initial increment, thus giving a weak form of double
-hashing. Quadratic probing cannot be extended that way (the first probe
-offset must be 1, the second 3, the third 6, etc).
-
-Christian Tismer later contributed the idea of using polynomial division
-instead of multiplication. The problem is that the multiplicative method
-can't get *all* the bits of the hash code into play without expensive
-computations that slow down the initial index and/or initial increment
-computation. For a set of keys like [i << 16 for i in range(20000)], under
-the multiplicative method the initial index and increment were the same for
-all keys, so every key followed exactly the same probe sequence, and so
-this degenerated into a (very slow) linear search. The division method uses
-all the bits of the hash code naturally in the increment, although it *may*
-visit locations more than once until such time as all the high bits of the
-increment have been shifted away. It's also impossible to tell in advance
-whether incr is congruent to 0 modulo poly, so each iteration of the loop has
-to guard against incr becoming 0. These are minor costs, as we usually don't
-get into the probe loop, and when we do we usually get out on its first
-iteration.
+Major subtleties ahead: Most hash schemes depend on having a "good" hash
+function, in the sense of simulating randomness. Python doesn't: its most
+important hash functions (for strings and ints) are very regular in common
+cases:
+
+>>> map(hash, (0, 1, 2, 3))
+[0, 1, 2, 3]
+>>> map(hash, ("namea", "nameb", "namec", "named"))
+[-1658398457, -1658398460, -1658398459, -1658398462]
+>>>
+
+This isn't necessarily bad! To the contrary, in a table of size 2**i, taking
+the low-order i bits as the initial table index is extremely fast, and there
+are no collisions at all for dicts indexed by a contiguous range of ints.
+The same is approximately true when keys are "consecutive" strings. So this
+gives better-than-random behavior in common cases, and that's very desirable.
+
+OTOH, when collisions occur, the tendency to fill contiguous slices of the
+hash table makes a good collision resolution strategy crucial. Taking only
+the last i bits of the hash code is also vulnerable: for example, consider
+[i << 16 for i in range(20000)] as a set of keys. Since ints are their own
+hash codes, and this fits in a dict of size 2**15, the last 15 bits of every
+hash code are all 0: they *all* map to the same table index.
+
+But catering to unusual cases should not slow the usual ones, so we just take
+the last i bits anyway. It's up to collision resolution to do the rest. If
+we *usually* find the key we're looking for on the first try (and, it turns
+out, we usually do -- the table load factor is kept under 2/3, so the odds
+are solidly in our favor), then it makes best sense to keep the initial index
+computation dirt cheap.
+
+The first half of collision resolution is to visit table indices via this
+recurrence:
+
+ j = ((5*j) + 1) mod 2**i
+
+For any initial j in range(2**i), repeating that 2**i times generates each
+int in range(2**i) exactly once (see any text on random-number generation for
+proof). By itself, this doesn't help much: like linear probing (setting
+j += 1, or j -= 1, on each loop trip), it scans the table entries in a fixed
+order. This would be bad, except that's not the only thing we do, and it's
+actually *good* in the common cases where hash keys are consecutive. In an
+example that's really too small to make this entirely clear, for a table of
+size 2**3 the order of indices is:
+
+ 0 -> 1 -> 6 -> 7 -> 4 -> 5 -> 2 -> 3 -> 0 [and here it's repeating]
+
+If two things come in at index 5, the first place we look after is index 2,
+not 6, so if another comes in at index 6 the collision at 5 didn't hurt it.
+Linear probing is deadly in this case because there the fixed probe order
+is the *same* as the order consecutive keys are likely to arrive. But it's
+extremely unlikely hash codes will follow a 5*j+1 recurrence by accident,
+and certain that consecutive hash codes do not.
+
+The other half of the strategy is to get the other bits of the hash code
+into play. This is done by initializing a (unsigned) vrbl "perturb" to the
+full hash code, and changing the recurrence to:
+
+ j = (5*j) + 1 + perturb;
+ perturb >>= PERTURB_SHIFT;
+ use j % 2**i as the next table index;
+
+Now the probe sequence depends (eventually) on every bit in the hash code,
+and the pseudo-scrambling property of recurring on 5*j+1 is more valuable,
+because it quickly magnifies small differences in the bits that didn't affect
+the initial index. Note that because perturb is unsigned, if the recurrence
+is executed often enough perturb eventually becomes and remains 0. At that
+point (very rarely reached) the recurrence is on (just) 5*j+1 again, and
+that's certain to find an empty slot eventually (since it generates every int
+in range(2**i), and we make sure there's always at least one empty slot).
+
+Selecting a good value for PERTURB_SHIFT is a balancing act. You want it
+small so that the high bits of the hash code continue to affect the probe
+sequence across iterations; but you want it large so that in really bad cases
+the high-order hash bits have an effect on early iterations. 5 was "the
+best" in minimizing total collisions across experiments Tim Peters ran (on
+both normal and pathological cases), but 4 and 6 weren't significantly worse.
+
+Historical: Reimer Behrends contributed the idea of using a polynomial-based
+approach, using repeated multiplication by x in GF(2**n) where an irreducible
+polynomial for each table size was chosen such that x was a primitive root.
+Christian Tismer later extended that to use division by x instead, as an
+efficient way to get the high bits of the hash code into play. This scheme
+also gave excellent collision statistics, but was more expensive: two
+if-tests were required inside the loop; computing "the next" index took about
+the same number of operations but without as much potential parallelism
+(e.g., computing 5*j can go on at the same time as computing 1+perturb in the
+above, and then shifting perturb can be done while the table index is being
+masked); and the dictobject struct required a member to hold the table's
+polynomial. In Tim's experiments the current scheme ran faster, produced
+equally good collision statistics, needed less code & used less memory.
*/
-static long polys[] = {
-/* 4 + 3, */ /* first active entry if MINSIZE == 4 */
- 8 + 3, /* first active entry if MINSIZE == 8 */
- 16 + 3,
- 32 + 5,
- 64 + 3,
- 128 + 3,
- 256 + 29,
- 512 + 17,
- 1024 + 9,
- 2048 + 5,
- 4096 + 83,
- 8192 + 27,
- 16384 + 43,
- 32768 + 3,
- 65536 + 45,
- 131072 + 9,
- 262144 + 39,
- 524288 + 39,
- 1048576 + 9,
- 2097152 + 5,
- 4194304 + 3,
- 8388608 + 33,
- 16777216 + 27,
- 33554432 + 9,
- 67108864 + 71,
- 134217728 + 39,
- 268435456 + 9,
- 536870912 + 5,
- 1073741824 + 83
- /* 2147483648 + 9 -- if we ever boost this to unsigned long */
-};
-
/* Object used as dummy key to fill deleted entries */
static PyObject *dummy; /* Initialized by first call to newdictobject() */
@@ -168,7 +163,6 @@ struct dictobject {
int ma_fill; /* # Active + # Dummy */
int ma_used; /* # Active */
int ma_size; /* total # slots in ma_table */
- int ma_poly; /* appopriate entry from polys vector */
/* ma_table points to ma_smalltable for small tables, else to
* additional malloc'ed memory. ma_table is never NULL! This rule
* saves repeated runtime null-tests in the workhorse getitem and
@@ -202,8 +196,6 @@ show_counts(void)
(mp)->ma_table = (mp)->ma_smalltable; \
(mp)->ma_size = MINSIZE; \
(mp)->ma_used = (mp)->ma_fill = 0; \
- (mp)->ma_poly = polys[0]; \
- assert(MINSIZE < (mp)->ma_poly && (mp)->ma_poly < MINSIZE*2); \
} while(0)
PyObject *
@@ -235,28 +227,26 @@ The basic lookup function used by all operations.
This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4.
Open addressing is preferred over chaining since the link overhead for
chaining would be substantial (100% with typical malloc overhead).
-However, instead of going through the table at constant steps, we cycle
-through the values of GF(2^n). This avoids modulo computations, being
-much cheaper on RISC machines, without leading to clustering.
-The initial probe index is computed as hash mod the table size.
-Subsequent probe indices use the values of x^i in GF(2^n)-{0} as an offset,
-where x is a root. The initial offset is derived from hash, too.
+The initial probe index is computed as hash mod the table size. Subsequent
+probe indices are computed as explained earlier.
All arithmetic on hash should ignore overflow.
-(This version is due to Reimer Behrends, some ideas are also due to
-Jyrki Alakuijala and Vladimir Marangozov.)
+(The details in this version are due to Tim Peters, building on many past
+contributions by Reimer Behrends, Jyrki Alakuijala, Vladimir Marangozov and
+Christian Tismer).
This function must never return NULL; failures are indicated by returning
a dictentry* for which the me_value field is NULL. Exceptions are never
reported by this function, and outstanding exceptions are maintained.
*/
+
static dictentry *
lookdict(dictobject *mp, PyObject *key, register long hash)
{
register int i;
- register unsigned int incr;
+ register unsigned int perturb;
register dictentry *freeslot;
register unsigned int mask = mp->ma_size-1;
dictentry *ep0 = mp->ma_table;
@@ -265,9 +255,7 @@ lookdict(dictobject *mp, PyObject *key, register long hash)
register int checked_error = 0;
register int cmp;
PyObject *err_type, *err_value, *err_tb;
- /* We must come up with (i, incr) such that 0 <= i < ma_size
- and 0 < incr < ma_size and both are a function of hash.
- i is the initial table index and incr the initial probe offset. */
+
i = hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
@@ -294,16 +282,12 @@ lookdict(dictobject *mp, PyObject *key, register long hash)
}
freeslot = NULL;
}
- /* Derive incr from hash, just to make it more arbitrary. Note that
- incr must not be 0, or we will get into an infinite loop.*/
- incr = hash ^ ((unsigned long)hash >> 3);
/* In the loop, me_key == dummy is by far (factor of 100s) the
least likely outcome, so test for that last. */
- for (;;) {
- if (!incr)
- incr = 1; /* and incr will never be 0 again */
- ep = &ep0[(i + incr) & mask];
+ for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
+ i = (i << 2) + i + perturb + 1;
+ ep = &ep0[i & mask];
if (ep->me_key == NULL) {
if (restore_error)
PyErr_Restore(err_type, err_value, err_tb);
@@ -335,10 +319,6 @@ lookdict(dictobject *mp, PyObject *key, register long hash)
}
else if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
- /* Cycle through GF(2**n). */
- if (incr & 1)
- incr ^= mp->ma_poly; /* clears the lowest bit */
- incr >>= 1;
}
}
@@ -356,7 +336,7 @@ static dictentry *
lookdict_string(dictobject *mp, PyObject *key, register long hash)
{
register int i;
- register unsigned int incr;
+ register unsigned int perturb;
register dictentry *freeslot;
register unsigned int mask = mp->ma_size-1;
dictentry *ep0 = mp->ma_table;
@@ -370,8 +350,6 @@ lookdict_string(dictobject *mp, PyObject *key, register long hash)
mp->ma_lookup = lookdict;
return lookdict(mp, key, hash);
}
- /* We must come up with (i, incr) such that 0 <= i < ma_size
- and 0 < incr < ma_size and both are a function of hash */
i = hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
@@ -385,16 +363,12 @@ lookdict_string(dictobject *mp, PyObject *key, register long hash)
}
freeslot = NULL;
}
- /* Derive incr from hash, just to make it more arbitrary. Note that
- incr must not be 0, or we will get into an infinite loop.*/
- incr = hash ^ ((unsigned long)hash >> 3);
/* In the loop, me_key == dummy is by far (factor of 100s) the
least likely outcome, so test for that last. */
- for (;;) {
- if (!incr)
- incr = 1; /* and incr will never be 0 again */
- ep = &ep0[(i + incr) & mask];
+ for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
+ i = (i << 2) + i + perturb + 1;
+ ep = &ep0[i & mask];
if (ep->me_key == NULL)
return freeslot == NULL ? ep : freeslot;
if (ep->me_key == key
@@ -404,10 +378,6 @@ lookdict_string(dictobject *mp, PyObject *key, register long hash)
return ep;
if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
- /* Cycle through GF(2**n). */
- if (incr & 1)
- incr ^= mp->ma_poly; /* clears the lowest bit */
- incr >>= 1;
}
}
@@ -448,7 +418,7 @@ actually be smaller than the old one.
static int
dictresize(dictobject *mp, int minused)
{
- int newsize, newpoly;
+ int newsize;
dictentry *oldtable, *newtable, *ep;
int i;
int is_oldtable_malloced;
@@ -456,20 +426,12 @@ dictresize(dictobject *mp, int minused)
assert(minused >= 0);
- /* Find the smallest table size > minused, and its poly[] entry. */
- newpoly = 0;
- newsize = MINSIZE;
- for (i = 0; i < sizeof(polys)/sizeof(polys[0]); ++i) {
- if (newsize > minused) {
- newpoly = polys[i];
- break;
- }
- newsize <<= 1;
- if (newsize < 0) /* overflow */
- break;
- }
- if (newpoly == 0) {
- /* Ran out of polynomials or newsize overflowed. */
+ /* Find the smallest table size > minused. */
+ for (newsize = MINSIZE;
+ newsize <= minused && newsize > 0;
+ newsize <<= 1)
+ ;
+ if (newsize <= 0) {
PyErr_NoMemory();
return -1;
}
@@ -511,7 +473,6 @@ dictresize(dictobject *mp, int minused)
mp->ma_table = newtable;
mp->ma_size = newsize;
memset(newtable, 0, sizeof(dictentry) * newsize);
- mp->ma_poly = newpoly;
mp->ma_used = 0;
i = mp->ma_fill;
mp->ma_fill = 0;
@@ -1255,7 +1216,7 @@ dict_equal(dictobject *a, dictobject *b)
if (a->ma_used != b->ma_used)
/* can't be equal if # of entries differ */
return 0;
-
+
/* Same # of entries -- check all of 'em. Exit early on any diff. */
for (i = 0; i < a->ma_size; i++) {
PyObject *aval = a->ma_table[i].me_value;