- Changed the hash calculation for Unicode strings. The new value is calculated from the character values, in a way that makes sure an 8-bit ASCII string and a Unicode string with the same contents get the same hash value. (As a side effect, this also works for ISO Latin-1 strings.) For more details, see the python-dev discussion.
author: Fredrik Lundh <fredrik@pythonware.com> 2000-07-10 18:27:47 (GMT)
committer: Fredrik Lundh <fredrik@pythonware.com> 2000-07-10 18:27:47 (GMT)
commit: dde61644024185891e1c41d6f5de29f8780b1cff (patch)
tree: 729c72530031a4394196f53a802132cc87954d1c
parent: 417c489defd33ceb4a3cb76efaf4975ee886221e (diff)
download: cpython-dde61644024185891e1c41d6f5de29f8780b1cff.zip
cpython-dde61644024185891e1c41d6f5de29f8780b1cff.tar.gz
cpython-dde61644024185891e1c41d6f5de29f8780b1cff.tar.bz2
1 files changed, 20 insertions, 18 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 70afd0d..7737057 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3471,26 +3471,28 @@ unicode_getitem(PyUnicodeObject *self, int index)
 static long
 unicode_hash(PyUnicodeObject *self)
 {
-    long hash;
-    PyObject *utf8;
-
-    /* Since Unicode objects compare equal to their UTF-8 string
-       counterparts, they should also use the UTF-8 strings as basis
-       for their hash value. This is needed to assure that strings and
-       Unicode objects behave in the same way as dictionary
-       keys. Unfortunately, this costs some performance and also some
-       memory if the cached UTF-8 representation is not used later
-       on. */
+    /* Since Unicode objects compare equal to their ASCII string
+       counterparts, they should use the individual character values
+       as basis for their hash value.  This is needed to assure that
+       strings and Unicode objects behave in the same way as
+       dictionary keys. */
+
+    register int len;
+    register Py_UNICODE *p;
+    register long x;
+
     if (self->hash != -1)
 	return self->hash;
-    utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
-    if (utf8 == NULL)
-	return -1;
-    hash = PyObject_Hash(utf8);
-    if (hash == -1)
-	return -1;
-    self->hash = hash;
-    return hash;
+    len = PyUnicode_GET_SIZE(self);
+    p = PyUnicode_AS_UNICODE(self);
+    x = *p << 7;
+    while (--len >= 0)
+	x = (1000003*x) ^ *p++;
+    x ^= PyUnicode_GET_SIZE(self);
+    if (x == -1)
+	x = -2;
+    self->hash = x;
+    return x;
 }
 
 static char index__doc__[] =
author	Fredrik Lundh <fredrik@pythonware.com>	2000-07-10 18:27:47 (GMT)
committer	Fredrik Lundh <fredrik@pythonware.com>	2000-07-10 18:27:47 (GMT)
commit	dde61644024185891e1c41d6f5de29f8780b1cff (patch)
tree	729c72530031a4394196f53a802132cc87954d1c
parent	417c489defd33ceb4a3cb76efaf4975ee886221e (diff)
download	cpython-dde61644024185891e1c41d6f5de29f8780b1cff.zip cpython-dde61644024185891e1c41d6f5de29f8780b1cff.tar.gz cpython-dde61644024185891e1c41d6f5de29f8780b1cff.tar.bz2