summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordkf <donal.k.fellows@manchester.ac.uk>2010-02-17 21:58:08 (GMT)
committerdkf <donal.k.fellows@manchester.ac.uk>2010-02-17 21:58:08 (GMT)
commit75bf2508e3fdda7b0fb86b63d1cb8e14954cd880 (patch)
treeab11411b42bc7dae1ca8db2a091543842cce9d0a
parent0198d4b07964264dc869e7b3ec88e8b7fd25d18f (diff)
downloadtcl-75bf2508e3fdda7b0fb86b63d1cb8e14954cd880.zip
tcl-75bf2508e3fdda7b0fb86b63d1cb8e14954cd880.tar.gz
tcl-75bf2508e3fdda7b0fb86b63d1cb8e14954cd880.tar.bz2
Return to using the classic hash function. Now with *extensive* notes in the
comments about why this function is preferred.
-rw-r--r--ChangeLog8
-rw-r--r--generic/tclHash.c46
-rw-r--r--generic/tclLiteral.c42
-rw-r--r--generic/tclObj.c48
4 files changed, 103 insertions, 41 deletions
diff --git a/ChangeLog b/ChangeLog
index 9a983fb..ea6906a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
2010-02-17 Donal K. Fellows <dkf@users.sf.net>
+ * generic/tclHash.c (HashStringKey): Restore these hash functions
+ * generic/tclLiteral.c (HashString): to use the classic algorithm.
+ * generic/tclObj.c (TclHashObjKey): Community felt normal case
+ speed to be more important than resistance to malicious cases. For
+ now, hashes that need to deal with the malicious case can use a custom
+ hash table and install their own hash function, though that is not
+ functionality exposed to the script level.
+
* generic/tclCompCmds.c (TclCompileDictUpdateCmd): Stack depth must be
correctly described when compiling a body to prevent crashes in some
debugging modes.
diff --git a/generic/tclHash.c b/generic/tclHash.c
index 47d8fba..99c4b67 100644
--- a/generic/tclHash.c
+++ b/generic/tclHash.c
@@ -10,7 +10,7 @@
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclHash.c,v 1.42 2010/02/10 16:29:49 dkf Exp $
+ * RCS: @(#) $Id: tclHash.c,v 1.43 2010/02/17 21:58:11 dkf Exp $
*/
#include "tclInt.h"
@@ -871,24 +871,42 @@ HashStringKey(
Tcl_HashTable *tablePtr, /* Hash table. */
void *keyPtr) /* Key from which to compute hash value. */
{
- const unsigned char *string = keyPtr;
- unsigned result = 0x811c9dc5;
- unsigned c;
+ register const char *string = (const char *) keyPtr;
+ register unsigned int result = 0;
+ register char c;
/*
- * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the
- * property of being a reasonably good non-cryptographic hash function for
- * short string words, i.e., virtually all command and namespace names. It
- * is also faster than Tcl's original algorithm on Intel x86, where there
- * is a fast built-in multiply assembly instruction.
+ * I tried a zillion different hash functions and asked many other people
+ * for advice. Many people had their own favorite functions, all
+ * different, but no-one had much idea why they were good ones. I chose
+ * the one below (multiply by 9 and add new character) because of the
+ * following reasons:
*
- * Derived from Public Domain implementation by Landon Curt Noll at:
- * http://www.isthe.com/chongo/src/fnv/hash_32.c
+ * 1. Multiplying by 10 is perfect for keys that are decimal strings, and
+ * multiplying by 9 is just about as good.
+ * 2. Times-9 is (shift-left-3) plus (old). This means that each
+ * character's bits hang around in the low-order bits of the hash value
+ * for ever, plus they spread fairly rapidly up to the high-order bits
+ * to fill out the hash value. This seems works well both for decimal
+ * and non-decimal strings, but isn't strong against maliciously-chosen
+ * keys.
+ *
+ * Note that this function is very weak against malicious strings; it's
+ * very easy to generate multiple keys that have the same hashcode. On the
+ * other hand, that hardly ever actually occurs and this function *is*
+ * very cheap, even by comparison with industry-standard hashes like FNV.
+ * If real strength of hash is required though, use a custom hash based on
+ * Bob Jenkins's lookup3(), but be aware that it's significantly slower.
+ * Since Tcl command and namespace names are usually reasonably-named (the
+ * main use for string hashes in modern Tcl) speed is far more important
+ * than strength.
+ *
+ * See also HashString in tclLiteral.c.
+ * See also TclObjHashKey in tclObj.c.
*/
-#define FNV_32_PRIME ((unsigned) 0x01000193)
- while ((c=*string++)) {
- result = (result * FNV_32_PRIME) ^ c;
+ for (; (c=*string++) != 0 ;) {
+ result += (result<<3) + UCHAR(c);
}
return result;
}
diff --git a/generic/tclLiteral.c b/generic/tclLiteral.c
index 05e1dba..67d24a5 100644
--- a/generic/tclLiteral.c
+++ b/generic/tclLiteral.c
@@ -13,7 +13,7 @@
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclLiteral.c,v 1.39 2010/02/17 15:40:54 dgp Exp $
+ * RCS: @(#) $Id: tclLiteral.c,v 1.40 2010/02/17 21:58:11 dkf Exp $
*/
#include "tclInt.h"
@@ -894,23 +894,39 @@ HashString(
register const char *bytes, /* String for which to compute hash value. */
int length) /* Number of bytes in the string. */
{
- unsigned result = 0x811c9dc5;
- const char *last = bytes + length;
+ register unsigned int result = 0;
+ register int i;
/*
- * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the
- * property of being a reasonably good non-cryptographic hash function for
- * short string words, i.e., virtually all names used in practice. It is
- * also faster than Tcl's original algorithm on Intel x86, where there is
- * a fast built-in multiply assembly instruction.
+ * I tried a zillion different hash functions and asked many other people
+ * for advice. Many people had their own favorite functions, all
+ * different, but no-one had much idea why they were good ones. I chose
+ * the one below (multiply by 9 and add new character) because of the
+ * following reasons:
+ *
+ * 1. Multiplying by 10 is perfect for keys that are decimal strings, and
+ * multiplying by 9 is just about as good.
+ * 2. Times-9 is (shift-left-3) plus (old). This means that each
+ * character's bits hang around in the low-order bits of the hash value
+ * for ever, plus they spread fairly rapidly up to the high-order bits
+ * to fill out the hash value. This seems works well both for decimal
+ * and non-decimal strings.
+ *
+ * Note that this function is very weak against malicious strings; it's
+ * very easy to generate multiple keys that have the same hashcode. On the
+ * other hand, that hardly ever actually occurs and this function *is*
+ * very cheap, even by comparison with industry-standard hashes like FNV.
+ * If real strength of hash is required though, use a custom hash based on
+ * Bob Jenkins's lookup3(), but be aware that it's significantly slower.
+ * Tcl scripts tend to not have a big issue in this area, and literals
+ * mostly aren't looked up by name anyway.
*
- * Derived from Public Domain implementation by Landon Curt Noll at:
- * http://www.isthe.com/chongo/src/fnv/hash_32.c
+ * See also HashStringKey in tclHash.c.
+ * See also TclObjHashKey in tclObj.c.
*/
-#define FNV_32_PRIME ((unsigned) 0x01000193)
- while (bytes < last) {
- result = (result * FNV_32_PRIME) ^ UCHAR(*bytes++);
+ for (i=0; i<length ; i++) {
+ result += (result<<3) + UCHAR(bytes[i]);
}
return result;
}
diff --git a/generic/tclObj.c b/generic/tclObj.c
index b45d1c8..721de46 100644
--- a/generic/tclObj.c
+++ b/generic/tclObj.c
@@ -13,7 +13,7 @@
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclObj.c,v 1.168 2010/02/10 16:29:49 dkf Exp $
+ * RCS: @(#) $Id: tclObj.c,v 1.169 2010/02/17 21:58:11 dkf Exp $
*/
#include "tclInt.h"
@@ -4038,24 +4038,44 @@ TclHashObjKey(
void *keyPtr) /* Key from which to compute hash value. */
{
Tcl_Obj *objPtr = keyPtr;
- register unsigned result = 0x811c9dc5;
- const unsigned char *string = (unsigned char *) TclGetString(objPtr);
- const unsigned char *last = string + objPtr->length;
+ const char *string = TclGetString(objPtr);
+ unsigned int result = 0;
+ const char *end = string + objPtr->length;
/*
- * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the
- * property of being a reasonably good non-cryptographic hash function for
- * short string words, i.e., virtually all names used in practice. It is
- * also faster than Tcl's original algorithm on Intel x86, where there is
- * a fast built-in multiply assembly instruction.
+ * I tried a zillion different hash functions and asked many other people
+ * for advice. Many people had their own favorite functions, all
+ * different, but no-one had much idea why they were good ones. I chose
+ * the one below (multiply by 9 and add new character) because of the
+ * following reasons:
*
- * Derived from Public Domain implementation by Landon Curt Noll at:
- * http://www.isthe.com/chongo/src/fnv/hash_32.c
+ * 1. Multiplying by 10 is perfect for keys that are decimal strings, and
+ * multiplying by 9 is just about as good.
+ * 2. Times-9 is (shift-left-3) plus (old). This means that each
+ * character's bits hang around in the low-order bits of the hash value
+ * for ever, plus they spread fairly rapidly up to the high-order bits
+ * to fill out the hash value. This seems works well both for decimal
+ * and non-decimal strings.
+ *
+ * Note that this function is very weak against malicious strings; it's
+ * very easy to generate multiple keys that have the same hashcode. On the
+ * other hand, that hardly ever actually occurs and this function *is*
+ * very cheap, even by comparison with industry-standard hashes like FNV.
+ * If real strength of hash is required though, use a custom hash based on
+ * Bob Jenkins's lookup3(), but be aware that it's significantly slower.
+ * Tcl does not use that level of strength because it typically does not
+ * need it (and some of the aspects of that strength are genuinely
+ * unnecessary given the rest of Tcl's hash machinery, and the fact that
+ * we do not either transfer hashes to another machine, use them as a true
+ * substitute for equality, or attempt to minimize work in rebuilding the
+ * hash table).
+ *
+ * See also HashStringKey in tclHash.c.
+ * See also HashString in tclLiteral.c.
*/
-#define FNV_32_PRIME ((unsigned) 0x01000193)
- while (string < last) {
- result = (result * FNV_32_PRIME) ^ (*string++);
+ while (string < end) {
+ result += (result << 3) + UCHAR(*string++);
}
return result;
}