diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-10 18:27:47 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-10 18:27:47 (GMT) |
commit | dde61644024185891e1c41d6f5de29f8780b1cff (patch) | |
tree | 729c72530031a4394196f53a802132cc87954d1c /Objects | |
parent | 417c489defd33ceb4a3cb76efaf4975ee886221e (diff) | |
download | cpython-dde61644024185891e1c41d6f5de29f8780b1cff.zip cpython-dde61644024185891e1c41d6f5de29f8780b1cff.tar.gz cpython-dde61644024185891e1c41d6f5de29f8780b1cff.tar.bz2 |
- Changed the hash calculation for Unicode strings. The new
value is calculated from the character values, in a way
that ensures an 8-bit ASCII string and a Unicode string
with the same contents get the same hash value.
(As a side effect, this also works for ISO Latin-1 strings.)
For more details, see the python-dev discussion.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 38 |
1 files changed, 20 insertions, 18 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 70afd0d..7737057 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3471,26 +3471,28 @@ unicode_getitem(PyUnicodeObject *self, int index) static long unicode_hash(PyUnicodeObject *self) { - long hash; - PyObject *utf8; - - /* Since Unicode objects compare equal to their UTF-8 string - counterparts, they should also use the UTF-8 strings as basis - for their hash value. This is needed to assure that strings and - Unicode objects behave in the same way as dictionary - keys. Unfortunately, this costs some performance and also some - memory if the cached UTF-8 representation is not used later - on. */ + /* Since Unicode objects compare equal to their ASCII string + counterparts, they should use the individual character values + as basis for their hash value. This is needed to assure that + strings and Unicode objects behave in the same way as + dictionary keys. */ + + register int len; + register Py_UNICODE *p; + register long x; + if (self->hash != -1) return self->hash; - utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL); - if (utf8 == NULL) - return -1; - hash = PyObject_Hash(utf8); - if (hash == -1) - return -1; - self->hash = hash; - return hash; + len = PyUnicode_GET_SIZE(self); + p = PyUnicode_AS_UNICODE(self); + x = *p << 7; + while (--len >= 0) + x = (1000003*x) ^ *p++; + x ^= PyUnicode_GET_SIZE(self); + if (x == -1) + x = -2; + self->hash = x; + return x; } static char index__doc__[] = |