diff options
author | dkf <donal.k.fellows@manchester.ac.uk> | 2010-02-07 09:10:32 (GMT) |
---|---|---|
committer | dkf <donal.k.fellows@manchester.ac.uk> | 2010-02-07 09:10:32 (GMT) |
commit | baafc87e3c7fe20ed29437a7e7b44f9f06f4e974 (patch) | |
tree | d772606db0a33eea164149ef48f66190338f8673 | |
parent | 307233f5f049dea251e0d858e301565c0ed88117 (diff) | |
download | tcl-baafc87e3c7fe20ed29437a7e7b44f9f06f4e974.zip tcl-baafc87e3c7fe20ed29437a7e7b44f9f06f4e974.tar.gz tcl-baafc87e3c7fe20ed29437a7e7b44f9f06f4e974.tar.bz2 |
Upgrade Tcl's hash function to use the FNV-32 algorithm. This is marginally
faster and gives a bit better distribution of keys (especially in large hash
tables) but does change hash iteration order.
-rw-r--r-- | ChangeLog | 17 | ||||
-rw-r--r-- | generic/tclHash.c | 33 | ||||
-rw-r--r-- | generic/tclObj.c | 35 | ||||
-rw-r--r-- | generic/tclVar.c | 38 |
4 files changed, 47 insertions, 76 deletions
@@ -1,3 +1,20 @@ +2010-02-06 Donal K. Fellows <dkf@users.sf.net> + + * generic/tclHash.c (HashStringKey): Replace Tcl's crusty old hash + * generic/tclObj.c (TclHashObjKey): function with the algorithm + due to Fowler, Noll and Vo. This is slightly faster (assuming the + presence of hardware multiply) and has somewhat better distribution + properties of the resulting hash values. Note that we only ever used + the 32-bit version of the FNV algorithm; Tcl's core hash engine + assumes that hash values are simple unsigned ints. + + ***POTENTIAL INCOMPATIBILITY*** + Code that depends on hash iteration order (especially tests) may well + be disrupted by this. Where a definite order is required, the fix is + usually to just sort the results after extracting them from the hash. + Where this is insufficient, the code that has ceased working was + always wrong and was only working by chance. + 2010-02-05 Donal K. Fellows <dkf@users.sf.net> * generic/tclCompCmds.c (TclCompileErrorCmd): Added compilation of the diff --git a/generic/tclHash.c b/generic/tclHash.c index 9ed941e..f3c4b3a 100644 --- a/generic/tclHash.c +++ b/generic/tclHash.c @@ -10,7 +10,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclHash.c,v 1.39 2009/07/16 21:24:39 dgp Exp $ + * RCS: @(#) $Id: tclHash.c,v 1.40 2010/02/07 09:10:33 dkf Exp $ */ #include "tclInt.h" @@ -866,36 +866,29 @@ CompareStringKeys( *---------------------------------------------------------------------- */ -static unsigned int +static unsigned HashStringKey( Tcl_HashTable *tablePtr, /* Hash table. */ void *keyPtr) /* Key from which to compute hash value. */ { register const char *string = (const char *) keyPtr; - register unsigned int result; + register unsigned result = 0; register int c; /* - * I tried a zillion different hash functions and asked many other people - * for advice. Many people had their own favorite functions, all - * different, but no-one had much idea why they were good ones. I chose - * the one below (multiply by 9 and add new character) because of the - * following reasons: + * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the + * property of being a reasonably good non-cryptographic hash function for + * short string words, i.e., virtually all command and namespace names. It + * is also faster than Tcl's original algorithm on Intel x86, where there + * is a fast built-in multiply assembly instruction. * - * 1. Multiplying by 10 is perfect for keys that are decimal strings, and - * multiplying by 9 is just about as good. - * 2. Times-9 is (shift-left-3) plus (old). This means that each - * character's bits hang around in the low-order bits of the hash value - * for ever, plus they spread fairly rapidly up to the high-order bits - * to fill out the hash value. This seems works well both for decimal - * and non-decimal strings, but isn't strong against maliciously-chosen - * keys. + * Derived from Public Domain implementation by Landon Curt Noll at: + * http://www.isthe.com/chongo/src/fnv/hash_32.c */ - result = 0; - - for (c=*string++ ; c ; c=*string++) { - result += (result<<3) + c; +#define FNV_32_PRIME ((unsigned) 0x01000193) + while ((c=*string++)) { + result = (result * FNV_32_PRIME) ^ c; } return result; } diff --git a/generic/tclObj.c b/generic/tclObj.c index 28d7a8a..6287b24 100644 --- a/generic/tclObj.c +++ b/generic/tclObj.c @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclObj.c,v 1.166 2009/12/29 16:54:44 dkf Exp $ + * RCS: @(#) $Id: tclObj.c,v 1.167 2010/02/07 09:10:33 dkf Exp $ */ #include "tclInt.h" @@ -4037,30 +4037,25 @@ TclHashObjKey( Tcl_HashTable *tablePtr, /* Hash table. */ void *keyPtr) /* Key from which to compute hash value. */ { - Tcl_Obj *objPtr = (Tcl_Obj *) keyPtr; - const char *string = TclGetString(objPtr); - int length = objPtr->length; - unsigned int result = 0; - int i; + Tcl_Obj *objPtr = keyPtr; + register unsigned result = 0; + const unsigned char *string = (unsigned char *) TclGetString(objPtr); + const unsigned char *last = string + objPtr->length; /* - * I tried a zillion different hash functions and asked many other people - * for advice. Many people had their own favorite functions, all - * different, but no-one had much idea why they were good ones. I chose - * the one below (multiply by 9 and add new character) because of the - * following reasons: + * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the + * property of being a reasonably good non-cryptographic hash function for + * short string words, i.e., virtually all names used in practice. It is + * also faster than Tcl's original algorithm on Intel x86, where there is + * a fast built-in multiply assembly instruction. * - * 1. Multiplying by 10 is perfect for keys that are decimal strings, and - * multiplying by 9 is just about as good. - * 2. Times-9 is (shift-left-3) plus (old). This means that each - * character's bits hang around in the low-order bits of the hash value - * for ever, plus they spread fairly rapidly up to the high-order bits - * to fill out the hash value. This seems works well both for decimal - * and *non-decimal strings. + * Derived from Public Domain implementation by Landon Curt Noll at: + * http://www.isthe.com/chongo/src/fnv/hash_32.c */ - for (i=0 ; i<length ; i++) { - result += (result << 3) + string[i]; +#define FNV_32_PRIME ((unsigned) 0x01000193) + while (string < last) { + result = (result * FNV_32_PRIME) ^ (*string++); } return result; } diff --git a/generic/tclVar.c b/generic/tclVar.c index 79409b4..3f6130e 100644 --- a/generic/tclVar.c +++ b/generic/tclVar.c @@ -16,7 +16,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclVar.c,v 1.195 2010/02/05 14:33:09 dkf Exp $ + * RCS: @(#) $Id: tclVar.c,v 1.196 2010/02/07 09:10:33 dkf Exp $ */ #include "tclInt.h" @@ -28,12 +28,11 @@ static Tcl_HashEntry * AllocVarEntry(Tcl_HashTable *tablePtr, void *keyPtr); static void FreeVarEntry(Tcl_HashEntry *hPtr); static int CompareVarKeys(void *keyPtr, Tcl_HashEntry *hPtr); -static unsigned HashVarKey(Tcl_HashTable *tablePtr, void *keyPtr); static const Tcl_HashKeyType tclVarHashKeyType = { TCL_HASH_KEY_TYPE_VERSION, /* version */ 0, /* flags */ - HashVarKey, /* hashKeyProc */ + TclHashObjKey, /* hashKeyProc */ CompareVarKeys, /* compareKeysProc */ AllocVarEntry, /* allocEntryProc */ FreeVarEntry /* freeEntryProc */ @@ -6453,39 +6452,6 @@ CompareVarKeys( return 0; } - -static unsigned -HashVarKey( - Tcl_HashTable *tablePtr, /* Hash table. */ - void *keyPtr) /* Key from which to compute hash value. */ -{ - Tcl_Obj *objPtr = keyPtr; - const char *string = TclGetString(objPtr); - int length = objPtr->length; - register unsigned result = 0; - int i; - - /* - * I tried a zillion different hash functions and asked many other people - * for advice. Many people had their own favorite functions, all - * different, but no-one had much idea why they were good ones. I chose - * the one below (multiply by 9 and add new character) because of the - * following reasons: - * - * 1. Multiplying by 10 is perfect for keys that are decimal strings, and - * multiplying by 9 is just about as good. - * 2. Times-9 is (shift-left-3) plus (old). This means that each - * character's bits hang around in the low-order bits of the hash value - * for ever, plus they spread fairly rapidly up to the high-order bits - * to fill out the hash value. This seems works well both for decimal - * and non-decimal strings. - */ - - for (i=0 ; i<length ; i++) { - result += (result << 3) + string[i]; - } - return result; -} /* * Local Variables: |