From 2f228e75e4d5ac8c3eb4a6334dbc43243bff1095 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 13 May 2001 00:19:31 +0000 Subject: Get rid of the superstitious "~" in dict hashing's "i = (~hash) & mask". The comment following used to say: /* We use ~hash instead of hash, as degenerate hash functions, such as for ints , can have lots of leading zeros. It's not really a performance risk, but better safe than sorry. 12-Dec-00 tim: so ~hash produces lots of leading ones instead -- what's the gain? */ That is, there was never a good reason for doing it. And to the contrary, as explained on Python-Dev last December, it tended to make the *sum* (i + incr) & mask (which is the first table index examined in case of collison) the same "too often" across distinct hashes. Changing to the simpler "i = hash & mask" reduced the number of string-dict collisions (== # number of times we go around the lookup for-loop) from about 6 million to 5 million during a full run of the test suite (these are approximate because the test suite does some random stuff from run to run). The number of collisions in non-string dicts also decreased, but not as dramatically. Note that this may, for a given dict, change the order (wrt previous releases) of entries exposed by .keys(), .values() and .items(). A number of std tests suffered bogus failures as a result. For dicts keyed by small ints, or (less so) by characters, the order is much more likely to be in increasing order of key now; e.g., >>> d = {} >>> for i in range(10): ... d[i] = i ... >>> d {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9} >>> Unfortunately. people may latch on to that in small examples and draw a bogus conclusion. test_support.py Moved test_extcall's sortdict() into test_support, made it stronger, and imported sortdict into other std tests that needed it. test_unicode.py Excluced cp875 from the "roundtrip over range(128)" test, because cp875 doesn't have a well-defined inverse for unicode("?", "cp875"). See Python-Dev for excruciating details. Cookie.py Chaged various output functions to sort dicts before building strings from them. test_extcall Fiddled the expected-result file. This remains sensitive to native dict ordering, because, e.g., if there are multiple errors in a keyword-arg dict (and test_extcall sets up many cases like that), the specific error Python complains about first depends on native dict ordering. --- Lib/Cookie.py | 20 ++++++++++++++------ Lib/test/output/test_cookie | 6 +++--- Lib/test/output/test_extcall | 20 ++++++++++---------- Lib/test/test_cookie.py | 4 +++- Lib/test/test_extcall.py | 15 ++++----------- Lib/test/test_pyexpat.py | 4 +++- Lib/test/test_regex.py | 4 ++-- Lib/test/test_support.py | 8 ++++++++ Lib/test/test_unicode.py | 7 +++++-- Misc/NEWS | 10 ++++++++++ Objects/dictobject.c | 12 ++---------- 11 files changed, 64 insertions(+), 46 deletions(-) diff --git a/Lib/Cookie.py b/Lib/Cookie.py index b7ee19c..dd116b7 100644 --- a/Lib/Cookie.py +++ b/Lib/Cookie.py @@ -70,8 +70,8 @@ a dictionary. >>> C["fig"] = "newton" >>> C["sugar"] = "wafer" >>> print C - Set-Cookie: sugar=wafer; Set-Cookie: fig=newton; + Set-Cookie: sugar=wafer; Notice that the printable representation of a Cookie is the appropriate format for a Set-Cookie: header. This is the @@ -93,8 +93,8 @@ HTTP_COOKIE environment variable. >>> C = Cookie.SmartCookie() >>> C.load("chips=ahoy; vienna=finger") >>> print C - Set-Cookie: vienna=finger; Set-Cookie: chips=ahoy; + Set-Cookie: vienna=finger; The load() method is darn-tootin smart about identifying cookies within a string. Escaped quotation marks, nested semicolons, and other @@ -493,7 +493,9 @@ class Morsel(UserDict): # Now add any defined attributes if attrs is None: attrs = self._reserved_keys - for K,V in self.items(): + items = self.items() + items.sort() + for K,V in items: if V == "": continue if K not in attrs: continue if K == "expires" and type(V) == type(1): @@ -586,7 +588,9 @@ class BaseCookie(UserDict): def output(self, attrs=None, header="Set-Cookie:", sep="\n"): """Return a string suitable for HTTP.""" result = [] - for K,V in self.items(): + items = self.items() + items.sort() + for K,V in items: result.append( V.output(attrs, header) ) return string.join(result, sep) # end output @@ -595,14 +599,18 @@ class BaseCookie(UserDict): def __repr__(self): L = [] - for K,V in self.items(): + items = self.items() + items.sort() + for K,V in items: L.append( '%s=%s' % (K,repr(V.value) ) ) return '<%s: %s>' % (self.__class__.__name__, string.join(L)) def js_output(self, attrs=None): """Return a string suitable for JavaScript.""" result = [] - for K,V in self.items(): + items = self.items() + items.sort() + for K,V in items: result.append( V.js_output(attrs) ) return string.join(result, "") # end js_output diff --git a/Lib/test/output/test_cookie b/Lib/test/output/test_cookie index 64b9d04..221c72a 100644 --- a/Lib/test/output/test_cookie +++ b/Lib/test/output/test_cookie @@ -1,11 +1,11 @@ test_cookie - -Set-Cookie: vienna=finger; + Set-Cookie: chips=ahoy; - vienna 'finger' 'finger' Set-Cookie: vienna=finger; chips 'ahoy' 'ahoy' Set-Cookie: chips=ahoy; + vienna 'finger' 'finger' +Set-Cookie: vienna=finger; Set-Cookie: keebler="E=mc2; L=\"Loves\"; fudge=\012;"; keebler 'E=mc2; L="Loves"; fudge=\n;' 'E=mc2; L="Loves"; fudge=\n;' diff --git a/Lib/test/output/test_extcall b/Lib/test/output/test_extcall index 58f52d4..24e8c8c 100644 --- a/Lib/test/output/test_extcall +++ b/Lib/test/output/test_extcall @@ -40,7 +40,7 @@ za () {} -> za() takes exactly 1 argument (0 given) za () {'a': 'aa'} -> ok za aa B D E V a za () {'d': 'dd'} -> za() got an unexpected keyword argument 'd' za () {'a': 'aa', 'd': 'dd'} -> za() got an unexpected keyword argument 'd' -za () {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> za() got an unexpected keyword argument 'd' +za () {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> za() got an unexpected keyword argument 'b' za (1, 2) {} -> za() takes exactly 1 argument (2 given) za (1, 2) {'a': 'aa'} -> za() takes exactly 1 non-keyword argument (2 given) za (1, 2) {'d': 'dd'} -> za() takes exactly 1 non-keyword argument (2 given) @@ -59,8 +59,8 @@ zade () {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zade() got an unexpected zade (1, 2) {} -> ok zade 1 B 2 e V e zade (1, 2) {'a': 'aa'} -> zade() got multiple values for keyword argument 'a' zade (1, 2) {'d': 'dd'} -> zade() got multiple values for keyword argument 'd' -zade (1, 2) {'a': 'aa', 'd': 'dd'} -> zade() got multiple values for keyword argument 'd' -zade (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zade() got multiple values for keyword argument 'd' +zade (1, 2) {'a': 'aa', 'd': 'dd'} -> zade() got multiple values for keyword argument 'a' +zade (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zade() got multiple values for keyword argument 'a' zade (1, 2, 3, 4, 5) {} -> zade() takes at most 3 arguments (5 given) zade (1, 2, 3, 4, 5) {'a': 'aa'} -> zade() takes at most 3 non-keyword arguments (5 given) zade (1, 2, 3, 4, 5) {'d': 'dd'} -> zade() takes at most 3 non-keyword arguments (5 given) @@ -75,7 +75,7 @@ zabk (1, 2) {} -> ok zabk 1 2 D E V {} zabk (1, 2) {'a': 'aa'} -> zabk() got multiple values for keyword argument 'a' zabk (1, 2) {'d': 'dd'} -> ok zabk 1 2 D E V {'d': 'dd'} zabk (1, 2) {'a': 'aa', 'd': 'dd'} -> zabk() got multiple values for keyword argument 'a' -zabk (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabk() got multiple values for keyword argument 'b' +zabk (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabk() got multiple values for keyword argument 'a' zabk (1, 2, 3, 4, 5) {} -> zabk() takes exactly 2 arguments (5 given) zabk (1, 2, 3, 4, 5) {'a': 'aa'} -> zabk() takes exactly 2 non-keyword arguments (5 given) zabk (1, 2, 3, 4, 5) {'d': 'dd'} -> zabk() takes exactly 2 non-keyword arguments (5 given) @@ -90,12 +90,12 @@ zabdv (1, 2) {} -> ok zabdv 1 2 d E () e zabdv (1, 2) {'a': 'aa'} -> zabdv() got multiple values for keyword argument 'a' zabdv (1, 2) {'d': 'dd'} -> ok zabdv 1 2 dd E () d zabdv (1, 2) {'a': 'aa', 'd': 'dd'} -> zabdv() got multiple values for keyword argument 'a' -zabdv (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdv() got an unexpected keyword argument 'e' +zabdv (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdv() got multiple values for keyword argument 'a' zabdv (1, 2, 3, 4, 5) {} -> ok zabdv 1 2 3 E (4, 5) e zabdv (1, 2, 3, 4, 5) {'a': 'aa'} -> zabdv() got multiple values for keyword argument 'a' zabdv (1, 2, 3, 4, 5) {'d': 'dd'} -> zabdv() got multiple values for keyword argument 'd' -zabdv (1, 2, 3, 4, 5) {'a': 'aa', 'd': 'dd'} -> zabdv() got multiple values for keyword argument 'd' -zabdv (1, 2, 3, 4, 5) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdv() got multiple values for keyword argument 'd' +zabdv (1, 2, 3, 4, 5) {'a': 'aa', 'd': 'dd'} -> zabdv() got multiple values for keyword argument 'a' +zabdv (1, 2, 3, 4, 5) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdv() got multiple values for keyword argument 'a' zabdevk () {} -> zabdevk() takes at least 2 arguments (0 given) zabdevk () {'a': 'aa'} -> zabdevk() takes at least 2 non-keyword arguments (1 given) zabdevk () {'d': 'dd'} -> zabdevk() takes at least 2 non-keyword arguments (0 given) @@ -105,9 +105,9 @@ zabdevk (1, 2) {} -> ok zabdevk 1 2 d e () {} zabdevk (1, 2) {'a': 'aa'} -> zabdevk() got multiple values for keyword argument 'a' zabdevk (1, 2) {'d': 'dd'} -> ok zabdevk 1 2 dd e () {} zabdevk (1, 2) {'a': 'aa', 'd': 'dd'} -> zabdevk() got multiple values for keyword argument 'a' -zabdevk (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdevk() got multiple values for keyword argument 'b' +zabdevk (1, 2) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdevk() got multiple values for keyword argument 'a' zabdevk (1, 2, 3, 4, 5) {} -> ok zabdevk 1 2 3 4 (5,) {} zabdevk (1, 2, 3, 4, 5) {'a': 'aa'} -> zabdevk() got multiple values for keyword argument 'a' zabdevk (1, 2, 3, 4, 5) {'d': 'dd'} -> zabdevk() got multiple values for keyword argument 'd' -zabdevk (1, 2, 3, 4, 5) {'a': 'aa', 'd': 'dd'} -> zabdevk() got multiple values for keyword argument 'd' -zabdevk (1, 2, 3, 4, 5) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdevk() got multiple values for keyword argument 'd' +zabdevk (1, 2, 3, 4, 5) {'a': 'aa', 'd': 'dd'} -> zabdevk() got multiple values for keyword argument 'a' +zabdevk (1, 2, 3, 4, 5) {'a': 'aa', 'b': 'bb', 'd': 'dd', 'e': 'ee'} -> zabdevk() got multiple values for keyword argument 'a' diff --git a/Lib/test/test_cookie.py b/Lib/test/test_cookie.py index 2063dbf..40c881f 100644 --- a/Lib/test/test_cookie.py +++ b/Lib/test/test_cookie.py @@ -20,7 +20,9 @@ for data, dict in cases: C = Cookie.SimpleCookie() ; C.load(data) print repr(C) print str(C) - for k, v in dict.items(): + items = dict.items() + items.sort() + for k, v in items: print ' ', k, repr( C[k].value ), repr(v) verify(C[k].value == v) print C[k] diff --git a/Lib/test/test_extcall.py b/Lib/test/test_extcall.py index 274e943..9effac7 100644 --- a/Lib/test/test_extcall.py +++ b/Lib/test/test_extcall.py @@ -1,14 +1,6 @@ -from test_support import verify, verbose, TestFailed +from test_support import verify, verbose, TestFailed, sortdict from UserList import UserList -def sortdict(d): - keys = d.keys() - keys.sort() - lst = [] - for k in keys: - lst.append("%r: %r" % (k, d[k])) - return "{%s}" % ", ".join(lst) - def f(*a, **k): print a, sortdict(k) @@ -228,8 +220,9 @@ for args in ['', 'a', 'ab']: lambda x: '%s="%s"' % (x, x), defargs) if vararg: arglist.append('*' + vararg) if kwarg: arglist.append('**' + kwarg) - decl = 'def %s(%s): print "ok %s", a, b, d, e, v, k' % ( - name, ', '.join(arglist), name) + decl = (('def %s(%s): print "ok %s", a, b, d, e, v, ' + + 'type(k) is type ("") and k or sortdict(k)') + % (name, ', '.join(arglist), name)) exec(decl) func = eval(name) funcs.append(func) diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index fecb27e..873dd69 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -5,9 +5,11 @@ from xml.parsers import expat +from test_support import sortdict + class Outputter: def StartElementHandler(self, name, attrs): - print 'Start element:\n\t', repr(name), attrs + print 'Start element:\n\t', repr(name), sortdict(attrs) def EndElementHandler(self, name): print 'End element:\n\t', repr(name) diff --git a/Lib/test/test_regex.py b/Lib/test/test_regex.py index a324821..b6260d2 100644 --- a/Lib/test/test_regex.py +++ b/Lib/test/test_regex.py @@ -1,4 +1,4 @@ -from test_support import verbose +from test_support import verbose, sortdict import warnings warnings.filterwarnings("ignore", "the regex module is deprecated", DeprecationWarning, __name__) @@ -40,7 +40,7 @@ print cre.group('one') print cre.group(1, 2) print cre.group('one', 'two') print 'realpat:', cre.realpat -print 'groupindex:', cre.groupindex +print 'groupindex:', sortdict(cre.groupindex) re = 'world' cre = regex.compile(re) diff --git a/Lib/test/test_support.py b/Lib/test/test_support.py index 943ba8a..330b9c8 100644 --- a/Lib/test/test_support.py +++ b/Lib/test/test_support.py @@ -90,6 +90,14 @@ def verify(condition, reason='test failed'): if not condition: raise TestFailed(reason) +def sortdict(dict): + "Like repr(dict), but in sorted order." + items = dict.items() + items.sort() + reprpairs = ["%r: %r" % pair for pair in items] + withcommas = ", ".join(reprpairs) + return "{%s}" % withcommas + def check_syntax(statement): try: compile(statement, '', 'exec') diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 0df8217..c82ac69 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -5,7 +5,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" -from test_support import verify, verbose +from test_support import verify, verbose, TestFailed import sys def test(method, input, output, *args): @@ -493,11 +493,14 @@ for encoding in ( 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', - 'cp1006', 'cp875', 'iso8859_8', + 'cp1006', 'iso8859_8', ### These have undefined mappings: #'cp424', + ### These fail the round-trip: + #'cp875' + ): try: verify(unicode(s,encoding).encode(encoding) == s) diff --git a/Misc/NEWS b/Misc/NEWS index b485959..3491109 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -23,6 +23,16 @@ Core usually for the better, but may also cause numerically unstable algorithms to break. +- The implementation of dicts suffers fewer collisions, which has speed + benefits. However, the order in which dict entries appear in dict.keys(), + dict.values() and dict.items() may differ from previous releases for a + given dict. Nothing is defined about this order, so no program should + rely on it. Nevertheless, it's easy to write test cases that rely on the + order by accident, typically because of printing the str() or repr() of a + dict to an "expected results" file. See Lib/test/test_support.py's new + sortdict(dict) function for a simple way to display a dict in sorted + order. + - Dictionary objects now support the "in" operator: "x in dict" means the same as dict.has_key(x). diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 9b5b9f4..65c09d6 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -188,12 +188,7 @@ lookdict(dictobject *mp, PyObject *key, register long hash) /* We must come up with (i, incr) such that 0 <= i < ma_size and 0 < incr < ma_size and both are a function of hash. i is the initial table index and incr the initial probe offset. */ - i = (~hash) & mask; - /* We use ~hash instead of hash, as degenerate hash functions, such - as for ints , can have lots of leading zeros. It's not - really a performance risk, but better safe than sorry. - 12-Dec-00 tim: so ~hash produces lots of leading ones instead -- - what's the gain? */ + i = hash & mask; ep = &ep0[i]; if (ep->me_key == NULL || ep->me_key == key) return ep; @@ -301,10 +296,7 @@ lookdict_string(dictobject *mp, PyObject *key, register long hash) } /* We must come up with (i, incr) such that 0 <= i < ma_size and 0 < incr < ma_size and both are a function of hash */ - i = (~hash) & mask; - /* We use ~hash instead of hash, as degenerate hash functions, such - as for ints , can have lots of leading zeros. It's not - really a performance risk, but better safe than sorry. */ + i = hash & mask; ep = &ep0[i]; if (ep->me_key == NULL || ep->me_key == key) return ep; -- cgit v0.12