diff options
author | Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com> | 2022-12-23 18:15:47 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-23 18:15:47 (GMT) |
commit | a98d9ea56e7b473af54438ecc487a6bf1b4d6530 (patch) | |
tree | 792d718b70d08874bd011cbf800af000abdca7e9 | |
parent | 36d358348de8efad75ebcf55dad8ed4a4f6dcda9 (diff) | |
download | cpython-a98d9ea56e7b473af54438ecc487a6bf1b4d6530.zip cpython-a98d9ea56e7b473af54438ecc487a6bf1b4d6530.tar.gz cpython-a98d9ea56e7b473af54438ecc487a6bf1b4d6530.tar.bz2 |
gh-94155: Reduce hash collisions for code objects (#100183)
* Uses a better hashing algorithm to get better dispersion and remove commutativity.
* Incorporates `co_firstlineno`, `Py_SIZE(co)`, and bytecode instructions.
* This is now the entire set of criteria used in `code_richcompare`, except for `_PyCode_ConstantKey` (which would incorporate the types of `co_consts` rather than just their values).
-rw-r--r-- | Lib/test/test_code.py | 26 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst | 1 | ||||
-rw-r--r-- | Objects/codeobject.c | 53 |
3 files changed, 60 insertions, 20 deletions
diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py index 02ab8fb..b13d577 100644 --- a/Lib/test/test_code.py +++ b/Lib/test/test_code.py @@ -465,6 +465,32 @@ class CodeTest(unittest.TestCase): self.assertNotEqual(code_b, code_d) self.assertNotEqual(code_c, code_d) + def test_code_hash_uses_firstlineno(self): + c1 = (lambda: 1).__code__ + c2 = (lambda: 1).__code__ + self.assertNotEqual(c1, c2) + self.assertNotEqual(hash(c1), hash(c2)) + c3 = c1.replace(co_firstlineno=17) + self.assertNotEqual(c1, c3) + self.assertNotEqual(hash(c1), hash(c3)) + + def test_code_hash_uses_order(self): + # Swapping posonlyargcount and kwonlyargcount should change the hash. + c = (lambda x, y, *, z=1, w=1: 1).__code__ + self.assertEqual(c.co_argcount, 2) + self.assertEqual(c.co_posonlyargcount, 0) + self.assertEqual(c.co_kwonlyargcount, 2) + swapped = c.replace(co_posonlyargcount=2, co_kwonlyargcount=0) + self.assertNotEqual(c, swapped) + self.assertNotEqual(hash(c), hash(swapped)) + + def test_code_hash_uses_bytecode(self): + c = (lambda x, y: x + y).__code__ + d = (lambda x, y: x * y).__code__ + c1 = c.replace(co_code=d.co_code) + self.assertNotEqual(c, c1) + self.assertNotEqual(hash(c), hash(c1)) + def isinterned(s): return s is sys.intern(('_' + s + '_')[1:-1]) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst b/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst new file mode 100644 index 0000000..e7c7ed2 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst @@ -0,0 +1 @@ +Improved the hashing algorithm for code objects, mitigating some hash collisions. diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 1e5a922..e174c6f 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -1842,28 +1842,41 @@ code_richcompare(PyObject *self, PyObject *other, int op) static Py_hash_t code_hash(PyCodeObject *co) { - Py_hash_t h, h0, h1, h2, h3; - h0 = PyObject_Hash(co->co_name); - if (h0 == -1) return -1; - h1 = PyObject_Hash(co->co_consts); - if (h1 == -1) return -1; - h2 = PyObject_Hash(co->co_names); - if (h2 == -1) return -1; - h3 = PyObject_Hash(co->co_localsplusnames); - if (h3 == -1) return -1; - Py_hash_t h4 = PyObject_Hash(co->co_linetable); - if (h4 == -1) { - return -1; + Py_uhash_t uhash = 20221211; + #define SCRAMBLE_IN(H) do { \ + uhash ^= (Py_uhash_t)(H); \ + uhash *= _PyHASH_MULTIPLIER; \ + } while (0) + #define SCRAMBLE_IN_HASH(EXPR) do { \ + Py_hash_t h = PyObject_Hash(EXPR); \ + if (h == -1) { \ + return -1; \ + } \ + SCRAMBLE_IN(h); \ + } while (0) + + SCRAMBLE_IN_HASH(co->co_name); + SCRAMBLE_IN_HASH(co->co_consts); + SCRAMBLE_IN_HASH(co->co_names); + SCRAMBLE_IN_HASH(co->co_localsplusnames); + SCRAMBLE_IN_HASH(co->co_linetable); + SCRAMBLE_IN_HASH(co->co_exceptiontable); + SCRAMBLE_IN(co->co_argcount); + SCRAMBLE_IN(co->co_posonlyargcount); + SCRAMBLE_IN(co->co_kwonlyargcount); + SCRAMBLE_IN(co->co_flags); + SCRAMBLE_IN(co->co_firstlineno); + SCRAMBLE_IN(Py_SIZE(co)); + for (int i = 0; i < Py_SIZE(co); i++) { + int deop = _PyOpcode_Deopt[_Py_OPCODE(_PyCode_CODE(co)[i])]; + SCRAMBLE_IN(deop); + SCRAMBLE_IN(_Py_OPARG(_PyCode_CODE(co)[i])); + i += _PyOpcode_Caches[deop]; } - Py_hash_t h5 = PyObject_Hash(co->co_exceptiontable); - if (h5 == -1) { - return -1; + if ((Py_hash_t)uhash == -1) { + return -2; } - h = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ - co->co_argcount ^ co->co_posonlyargcount ^ co->co_kwonlyargcount ^ - co->co_flags; - if (h == -1) h = -2; - return h; + return (Py_hash_t)uhash; } |