From 75bf2508e3fdda7b0fb86b63d1cb8e14954cd880 Mon Sep 17 00:00:00 2001 From: dkf Date: Wed, 17 Feb 2010 21:58:08 +0000 Subject: Return to using the classic hash function. Now with *extensive* notes in the comments about why this function is preferred. --- ChangeLog | 8 ++++++++ generic/tclHash.c | 46 ++++++++++++++++++++++++++++++++-------------- generic/tclLiteral.c | 42 +++++++++++++++++++++++++++++------------- generic/tclObj.c | 48 ++++++++++++++++++++++++++++++++++-------------- 4 files changed, 103 insertions(+), 41 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9a983fb..ea6906a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,13 @@ 2010-02-17 Donal K. Fellows + * generic/tclHash.c (HashStringKey): Restore these hash functions + * generic/tclLiteral.c (HashString): to use the classic algorithm. + * generic/tclObj.c (TclHashObjKey): Community felt normal case + speed to be more important than resistance to malicious cases. For + now, hashes that need to deal with the malicious case can use a custom + hash table and install their own hash function, though that is not + functionality exposed to the script level. + * generic/tclCompCmds.c (TclCompileDictUpdateCmd): Stack depth must be correctly described when compiling a body to prevent crashes in some debugging modes. diff --git a/generic/tclHash.c b/generic/tclHash.c index 47d8fba..99c4b67 100644 --- a/generic/tclHash.c +++ b/generic/tclHash.c @@ -10,7 +10,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclHash.c,v 1.42 2010/02/10 16:29:49 dkf Exp $ + * RCS: @(#) $Id: tclHash.c,v 1.43 2010/02/17 21:58:11 dkf Exp $ */ #include "tclInt.h" @@ -871,24 +871,42 @@ HashStringKey( Tcl_HashTable *tablePtr, /* Hash table. */ void *keyPtr) /* Key from which to compute hash value. */ { - const unsigned char *string = keyPtr; - unsigned result = 0x811c9dc5; - unsigned c; + register const char *string = (const char *) keyPtr; + register unsigned int result = 0; + register char c; /* - * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the - * property of being a reasonably good non-cryptographic hash function for - * short string words, i.e., virtually all command and namespace names. It - * is also faster than Tcl's original algorithm on Intel x86, where there - * is a fast built-in multiply assembly instruction. + * I tried a zillion different hash functions and asked many other people + * for advice. Many people had their own favorite functions, all + * different, but no-one had much idea why they were good ones. I chose + * the one below (multiply by 9 and add new character) because of the + * following reasons: * - * Derived from Public Domain implementation by Landon Curt Noll at: - * http://www.isthe.com/chongo/src/fnv/hash_32.c + * 1. Multiplying by 10 is perfect for keys that are decimal strings, and + * multiplying by 9 is just about as good. + * 2. Times-9 is (shift-left-3) plus (old). This means that each + * character's bits hang around in the low-order bits of the hash value + * for ever, plus they spread fairly rapidly up to the high-order bits + * to fill out the hash value. This seems works well both for decimal + * and non-decimal strings, but isn't strong against maliciously-chosen + * keys. + * + * Note that this function is very weak against malicious strings; it's + * very easy to generate multiple keys that have the same hashcode. On the + * other hand, that hardly ever actually occurs and this function *is* + * very cheap, even by comparison with industry-standard hashes like FNV. + * If real strength of hash is required though, use a custom hash based on + * Bob Jenkins's lookup3(), but be aware that it's significantly slower. + * Since Tcl command and namespace names are usually reasonably-named (the + * main use for string hashes in modern Tcl) speed is far more important + * than strength. + * + * See also HashString in tclLiteral.c. + * See also TclObjHashKey in tclObj.c. */ -#define FNV_32_PRIME ((unsigned) 0x01000193) - while ((c=*string++)) { - result = (result * FNV_32_PRIME) ^ c; + for (; (c=*string++) != 0 ;) { + result += (result<<3) + UCHAR(c); } return result; } diff --git a/generic/tclLiteral.c b/generic/tclLiteral.c index 05e1dba..67d24a5 100644 --- a/generic/tclLiteral.c +++ b/generic/tclLiteral.c @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclLiteral.c,v 1.39 2010/02/17 15:40:54 dgp Exp $ + * RCS: @(#) $Id: tclLiteral.c,v 1.40 2010/02/17 21:58:11 dkf Exp $ */ #include "tclInt.h" @@ -894,23 +894,39 @@ HashString( register const char *bytes, /* String for which to compute hash value. */ int length) /* Number of bytes in the string. */ { - unsigned result = 0x811c9dc5; - const char *last = bytes + length; + register unsigned int result = 0; + register int i; /* - * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the - * property of being a reasonably good non-cryptographic hash function for - * short string words, i.e., virtually all names used in practice. It is - * also faster than Tcl's original algorithm on Intel x86, where there is - * a fast built-in multiply assembly instruction. + * I tried a zillion different hash functions and asked many other people + * for advice. Many people had their own favorite functions, all + * different, but no-one had much idea why they were good ones. I chose + * the one below (multiply by 9 and add new character) because of the + * following reasons: + * + * 1. Multiplying by 10 is perfect for keys that are decimal strings, and + * multiplying by 9 is just about as good. + * 2. Times-9 is (shift-left-3) plus (old). This means that each + * character's bits hang around in the low-order bits of the hash value + * for ever, plus they spread fairly rapidly up to the high-order bits + * to fill out the hash value. This seems works well both for decimal + * and non-decimal strings. + * + * Note that this function is very weak against malicious strings; it's + * very easy to generate multiple keys that have the same hashcode. On the + * other hand, that hardly ever actually occurs and this function *is* + * very cheap, even by comparison with industry-standard hashes like FNV. + * If real strength of hash is required though, use a custom hash based on + * Bob Jenkins's lookup3(), but be aware that it's significantly slower. + * Tcl scripts tend to not have a big issue in this area, and literals + * mostly aren't looked up by name anyway. * - * Derived from Public Domain implementation by Landon Curt Noll at: - * http://www.isthe.com/chongo/src/fnv/hash_32.c + * See also HashStringKey in tclHash.c. + * See also TclObjHashKey in tclObj.c. */ -#define FNV_32_PRIME ((unsigned) 0x01000193) - while (bytes < last) { - result = (result * FNV_32_PRIME) ^ UCHAR(*bytes++); + for (i=0; ilength; + const char *string = TclGetString(objPtr); + unsigned int result = 0; + const char *end = string + objPtr->length; /* - * This is the (32-bit) Fowler/Noll/Vo hash algorithm. This has the - * property of being a reasonably good non-cryptographic hash function for - * short string words, i.e., virtually all names used in practice. It is - * also faster than Tcl's original algorithm on Intel x86, where there is - * a fast built-in multiply assembly instruction. + * I tried a zillion different hash functions and asked many other people + * for advice. Many people had their own favorite functions, all + * different, but no-one had much idea why they were good ones. I chose + * the one below (multiply by 9 and add new character) because of the + * following reasons: * - * Derived from Public Domain implementation by Landon Curt Noll at: - * http://www.isthe.com/chongo/src/fnv/hash_32.c + * 1. Multiplying by 10 is perfect for keys that are decimal strings, and + * multiplying by 9 is just about as good. + * 2. Times-9 is (shift-left-3) plus (old). This means that each + * character's bits hang around in the low-order bits of the hash value + * for ever, plus they spread fairly rapidly up to the high-order bits + * to fill out the hash value. This seems works well both for decimal + * and non-decimal strings. + * + * Note that this function is very weak against malicious strings; it's + * very easy to generate multiple keys that have the same hashcode. On the + * other hand, that hardly ever actually occurs and this function *is* + * very cheap, even by comparison with industry-standard hashes like FNV. + * If real strength of hash is required though, use a custom hash based on + * Bob Jenkins's lookup3(), but be aware that it's significantly slower. + * Tcl does not use that level of strength because it typically does not + * need it (and some of the aspects of that strength are genuinely + * unnecessary given the rest of Tcl's hash machinery, and the fact that + * we do not either transfer hashes to another machine, use them as a true + * substitute for equality, or attempt to minimize work in rebuilding the + * hash table). + * + * See also HashStringKey in tclHash.c. + * See also HashString in tclLiteral.c. */ -#define FNV_32_PRIME ((unsigned) 0x01000193) - while (string < last) { - result = (result * FNV_32_PRIME) ^ (*string++); + while (string < end) { + result += (result << 3) + UCHAR(*string++); } return result; } -- cgit v0.12