From b0a7a037b8fde56b62f886d5188bced7776777b4 Mon Sep 17 00:00:00 2001 From: Yury Selivanov Date: Mon, 22 Jan 2018 11:54:41 -0500 Subject: bpo-31179: Make dict.copy() up to 5.5 times faster. (#3067) --- Lib/test/test_dict.py | 50 ++++++++++++++- .../2017-08-10-17-32-48.bpo-31179.XcgLYI.rst | 1 + Objects/dictobject.c | 73 ++++++++++++++++++++++ 3 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2017-08-10-17-32-48.bpo-31179.XcgLYI.rst diff --git a/Lib/test/test_dict.py b/Lib/test/test_dict.py index 4386eda..aa149d3 100644 --- a/Lib/test/test_dict.py +++ b/Lib/test/test_dict.py @@ -271,11 +271,57 @@ class DictTest(unittest.TestCase): self.assertEqual(baddict3.fromkeys({"a", "b", "c"}), res) def test_copy(self): - d = {1:1, 2:2, 3:3} - self.assertEqual(d.copy(), {1:1, 2:2, 3:3}) + d = {1: 1, 2: 2, 3: 3} + self.assertIsNot(d.copy(), d) + self.assertEqual(d.copy(), d) + self.assertEqual(d.copy(), {1: 1, 2: 2, 3: 3}) + + copy = d.copy() + d[4] = 4 + self.assertNotEqual(copy, d) + self.assertEqual({}.copy(), {}) self.assertRaises(TypeError, d.copy, None) + def test_copy_fuzz(self): + for dict_size in [10, 100, 1000, 10000, 100000]: + dict_size = random.randrange( + dict_size // 2, dict_size + dict_size // 2) + with self.subTest(dict_size=dict_size): + d = {} + for i in range(dict_size): + d[i] = i + + d2 = d.copy() + self.assertIsNot(d2, d) + self.assertEqual(d, d2) + d2['key'] = 'value' + self.assertNotEqual(d, d2) + self.assertEqual(len(d2), len(d) + 1) + + def test_copy_maintains_tracking(self): + class A: + pass + + key = A() + + for d in ({}, {'a': 1}, {key: 'val'}): + d2 = d.copy() + self.assertEqual(gc.is_tracked(d), gc.is_tracked(d2)) + + def test_copy_noncompact(self): + # Dicts don't compact themselves on del/pop operations. + # Copy will use a slow merging strategy that produces + # a compacted copy when roughly 33% of dict is a non-used + # keys-space (to optimize memory footprint). 
+ # In this test we want to hit the slow/compacting + # branch of dict.copy() and make sure it works OK. + d = {k: k for k in range(1000)} + for k in range(950): + del d[k] + d2 = d.copy() + self.assertEqual(d2, d) + def test_get(self): d = {} self.assertIs(d.get('c'), None) diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-08-10-17-32-48.bpo-31179.XcgLYI.rst b/Misc/NEWS.d/next/Core and Builtins/2017-08-10-17-32-48.bpo-31179.XcgLYI.rst new file mode 100644 index 0000000..12ebc8b --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2017-08-10-17-32-48.bpo-31179.XcgLYI.rst @@ -0,0 +1 @@ +Make dict.copy() up to 5.5 times faster. diff --git a/Objects/dictobject.c b/Objects/dictobject.c index b20b85c..259fa25 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -615,6 +615,52 @@ new_dict_with_shared_keys(PyDictKeysObject *keys) return new_dict(keys, values); } + +static PyObject * +clone_combined_dict(PyDictObject *orig) +{ + assert(PyDict_CheckExact(orig)); + assert(orig->ma_values == NULL); + assert(orig->ma_keys->dk_refcnt == 1); + + Py_ssize_t keys_size = _PyDict_KeysSize(orig->ma_keys); + PyDictKeysObject *keys = PyObject_Malloc(keys_size); + if (keys == NULL) { + PyErr_NoMemory(); + return NULL; + } + + memcpy(keys, orig->ma_keys, keys_size); + + /* After copying key/value pairs, we need to incref all + keys and values and they are about to be co-owned by a + new dict object. */ + PyDictKeyEntry *ep0 = DK_ENTRIES(keys); + Py_ssize_t n = keys->dk_nentries; + for (Py_ssize_t i = 0; i < n; i++) { + PyDictKeyEntry *entry = &ep0[i]; + PyObject *value = entry->me_value; + if (value != NULL) { + Py_INCREF(value); + Py_INCREF(entry->me_key); + } + } + + PyDictObject *new = (PyDictObject *)new_dict(keys, NULL); + if (new == NULL) { + /* In case of an error, `new_dict()` takes care of + cleaning up `keys`. 
*/ + return NULL; + } + new->ma_used = orig->ma_used; + assert(_PyDict_CheckConsistency(new)); + if (_PyObject_GC_IS_TRACKED(orig)) { + /* Maintain tracking. */ + _PyObject_GC_TRACK(new); + } + return (PyObject *)new; +} + PyObject * PyDict_New(void) { @@ -2484,7 +2530,13 @@ PyDict_Copy(PyObject *o) PyErr_BadInternalCall(); return NULL; } + mp = (PyDictObject *)o; + if (mp->ma_used == 0) { + /* The dict is empty; just return a new dict. */ + return PyDict_New(); + } + if (_PyDict_HasSplitTable(mp)) { PyDictObject *split_copy; Py_ssize_t size = USABLE_FRACTION(DK_SIZE(mp->ma_keys)); @@ -2510,6 +2562,27 @@ PyDict_Copy(PyObject *o) _PyObject_GC_TRACK(split_copy); return (PyObject *)split_copy; } + + if (PyDict_CheckExact(mp) && mp->ma_values == NULL && + (mp->ma_used >= (mp->ma_keys->dk_nentries * 2) / 3)) + { + /* Use fast-copy if: + + (1) 'mp' is an exact dict instance, not a dict subclass; and + + (2) 'mp' is not a split-dict; and + + (3) if 'mp' is non-compact ('del' operation does not resize dicts), + do fast-copy only if it has at most 1/3 non-used keys. + + The last condition (3) is important to guard against a pathological + case when a large dict is almost emptied with multiple del/pop + operations and copied after that. In cases like this, we defer to + PyDict_Merge, which produces a compacted copy. + */ + return clone_combined_dict(mp); + } + copy = PyDict_New(); if (copy == NULL) return NULL; -- cgit v0.12