From baafc87e3c7fe20ed29437a7e7b44f9f06f4e974 Mon Sep 17 00:00:00 2001
From: dkf <donal.k.fellows@manchester.ac.uk>
Date: Sun, 7 Feb 2010 09:10:32 +0000
Subject: Upgrade Tcl's hash function to use the FNV-32 algorithm. This is
 marginally faster and gives a bit better distribution of keys (especially in
 large hash tables) but does change hash iteration order.

---
 ChangeLog         | 17 +++++++++++++++++
 generic/tclHash.c | 33 +++++++++++++--------------------
 generic/tclObj.c  | 35 +++++++++++++++--------------------
 generic/tclVar.c  | 38 ++------------------------------------
 4 files changed, 47 insertions(+), 76 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 045d394..ce3e2dc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2010-02-06  Donal K. Fellows  <dkf@users.sf.net>
+
+	* generic/tclHash.c (HashStringKey):	Replace Tcl's crusty old hash
+	* generic/tclObj.c (TclHashObjKey):	function with the algorithm
+	due to Fowler, Noll and Vo. This is slightly faster (assuming the
+	presence of hardware multiply) and has somewhat better distribution
+	properties of the resulting hash values. Note that we only ever used
+	the 32-bit version of the FNV algorithm; Tcl's core hash engine
+	assumes that hash values are simple unsigned ints.
+
+	***POTENTIAL INCOMPATIBILITY***
+	Code that depends on hash iteration order (especially tests) may well
+	be disrupted by this. Where a definite order is required, the fix is
+	usually to just sort the results after extracting them from the hash.
+	Where this is insufficient, the code that has ceased working was
+	always wrong and was only working by chance.
+
 2010-02-05  Donal K. Fellows  <dkf@users.sf.net>
 
 	* generic/tclCompCmds.c (TclCompileErrorCmd): Added compilation of the
diff --git a/generic/tclHash.c b/generic/tclHash.c
index 9ed941e..f3c4b3a 100644
--- a/generic/tclHash.c
+++ b/generic/tclHash.c
@@ -10,7 +10,7 @@
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclHash.c,v 1.39 2009/07/16 21:24:39 dgp Exp $
+ * RCS: @(#) $Id: tclHash.c,v 1.40 2010/02/07 09:10:33 dkf Exp $
  */
 
 #include "tclInt.h"
@@ -866,36 +866,29 @@ CompareStringKeys(
  *----------------------------------------------------------------------
  */
 
-static unsigned int
+static unsigned
 HashStringKey(
     Tcl_HashTable *tablePtr,	/* Hash table. */
     void *keyPtr)		/* Key from which to compute hash value. */
 {
     register const char *string = (const char *) keyPtr;
-    register unsigned int result;
+    register unsigned result = 0;
     register int c;
 
     /*
-     * I tried a zillion different hash functions and asked many other people
-     * for advice. Many people had their own favorite functions, all
-     * different, but no-one had much idea why they were good ones. I chose
-     * the one below (multiply by 9 and add new character) because of the
-     * following reasons:
+     * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the
+     * property of being a reasonably good non-cryptographic hash function for
+     * short string words, i.e., virtually all command and namespace names. It
+     * is also faster than Tcl's original algorithm on Intel x86, where there
+     * is a fast built-in multiply assembly instruction.
      *
-     * 1. Multiplying by 10 is perfect for keys that are decimal strings, and
-     *	  multiplying by 9 is just about as good.
-     * 2. Times-9 is (shift-left-3) plus (old). This means that each
-     *	  character's bits hang around in the low-order bits of the hash value
-     *	  for ever, plus they spread fairly rapidly up to the high-order bits
-     *	  to fill out the hash value. This seems works well both for decimal
-     *	  and non-decimal strings, but isn't strong against maliciously-chosen
-     *	  keys.
+     * Derived from Public Domain implementation by Landon Curt Noll at:
+     * http://www.isthe.com/chongo/src/fnv/hash_32.c
      */
 
-    result = 0;
-
-    for (c=*string++ ; c ; c=*string++) {
-	result += (result<<3) + c;
+#define FNV_32_PRIME	((unsigned) 0x01000193)
+    while ((c=*string++)) {
+	result = (result * FNV_32_PRIME) ^ c;
     }
     return result;
 }
diff --git a/generic/tclObj.c b/generic/tclObj.c
index 28d7a8a..6287b24 100644
--- a/generic/tclObj.c
+++ b/generic/tclObj.c
@@ -13,7 +13,7 @@
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclObj.c,v 1.166 2009/12/29 16:54:44 dkf Exp $
+ * RCS: @(#) $Id: tclObj.c,v 1.167 2010/02/07 09:10:33 dkf Exp $
  */
 
 #include "tclInt.h"
@@ -4037,30 +4037,25 @@ TclHashObjKey(
     Tcl_HashTable *tablePtr,	/* Hash table. */
     void *keyPtr)		/* Key from which to compute hash value. */
 {
-    Tcl_Obj *objPtr = (Tcl_Obj *) keyPtr;
-    const char *string = TclGetString(objPtr);
-    int length = objPtr->length;
-    unsigned int result = 0;
-    int i;
+    Tcl_Obj *objPtr = keyPtr;
+    register unsigned result = 0;
+    const unsigned char *string = (unsigned char *) TclGetString(objPtr);
+    const unsigned char *last = string + objPtr->length;
 
     /*
-     * I tried a zillion different hash functions and asked many other people
-     * for advice. Many people had their own favorite functions, all
-     * different, but no-one had much idea why they were good ones. I chose
-     * the one below (multiply by 9 and add new character) because of the
-     * following reasons:
+     * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the
+     * property of being a reasonably good non-cryptographic hash function for
+     * short string words, i.e., virtually all names used in practice. It is
+     * also faster than Tcl's original algorithm on Intel x86, where there is
+     * a fast built-in multiply assembly instruction.
      *
-     * 1. Multiplying by 10 is perfect for keys that are decimal strings, and
-     *	  multiplying by 9 is just about as good.
-     * 2. Times-9 is (shift-left-3) plus (old). This means that each
-     *	  character's bits hang around in the low-order bits of the hash value
-     *	  for ever, plus they spread fairly rapidly up to the high-order bits
-     *	  to fill out the hash value. This seems works well both for decimal
-     *	  and *non-decimal strings.
+     * Derived from Public Domain implementation by Landon Curt Noll at:
+     * http://www.isthe.com/chongo/src/fnv/hash_32.c
      */
 
-    for (i=0 ; i<length ; i++) {
-	result += (result << 3) + string[i];
+#define FNV_32_PRIME	((unsigned) 0x01000193)
+    while (string < last) {
+	result = (result * FNV_32_PRIME) ^ (*string++);
     }
     return result;
 }
diff --git a/generic/tclVar.c b/generic/tclVar.c
index 79409b4..3f6130e 100644
--- a/generic/tclVar.c
+++ b/generic/tclVar.c
@@ -16,7 +16,7 @@
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclVar.c,v 1.195 2010/02/05 14:33:09 dkf Exp $
+ * RCS: @(#) $Id: tclVar.c,v 1.196 2010/02/07 09:10:33 dkf Exp $
  */
 
 #include "tclInt.h"
@@ -28,12 +28,11 @@
 static Tcl_HashEntry *	AllocVarEntry(Tcl_HashTable *tablePtr, void *keyPtr);
 static void		FreeVarEntry(Tcl_HashEntry *hPtr);
 static int		CompareVarKeys(void *keyPtr, Tcl_HashEntry *hPtr);
-static unsigned		HashVarKey(Tcl_HashTable *tablePtr, void *keyPtr);
 
 static const Tcl_HashKeyType tclVarHashKeyType = {
     TCL_HASH_KEY_TYPE_VERSION,	/* version */
     0,				/* flags */
-    HashVarKey,			/* hashKeyProc */
+    TclHashObjKey,		/* hashKeyProc */
     CompareVarKeys,		/* compareKeysProc */
     AllocVarEntry,		/* allocEntryProc */
     FreeVarEntry		/* freeEntryProc */
@@ -6453,39 +6452,6 @@ CompareVarKeys(
 
     return 0;
 }
-
-static unsigned
-HashVarKey(
-    Tcl_HashTable *tablePtr,	/* Hash table. */
-    void *keyPtr)		/* Key from which to compute hash value. */
-{
-    Tcl_Obj *objPtr = keyPtr;
-    const char *string = TclGetString(objPtr);
-    int length = objPtr->length;
-    register unsigned result = 0;
-    int i;
-
-    /*
-     * I tried a zillion different hash functions and asked many other people
-     * for advice. Many people had their own favorite functions, all
-     * different, but no-one had much idea why they were good ones. I chose
-     * the one below (multiply by 9 and add new character) because of the
-     * following reasons:
-     *
-     * 1. Multiplying by 10 is perfect for keys that are decimal strings, and
-     *	  multiplying by 9 is just about as good.
-     * 2. Times-9 is (shift-left-3) plus (old). This means that each
-     *	  character's bits hang around in the low-order bits of the hash value
-     *	  for ever, plus they spread fairly rapidly up to the high-order bits
-     *	  to fill out the hash value. This seems works well both for decimal
-     *	  and non-decimal strings.
-     */
-
-    for (i=0 ; i<length ; i++) {
-	result += (result << 3) + string[i];
-    }
-    return result;
-}
 
 /*
  * Local Variables:
-- 
cgit v0.12