diff options
-rw-r--r-- | Doc/library/difflib.rst | 10 | ||||
-rw-r--r-- | Lib/difflib.py | 10 | ||||
-rw-r--r-- | Lib/test/test_difflib.py | 17 |
3 files changed, 28 insertions, 9 deletions
diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst
index 1d623ce..a0afe81 100644
--- a/Doc/library/difflib.rst
+++ b/Doc/library/difflib.rst
@@ -359,11 +359,11 @@ The :class:`SequenceMatcher` class has this constructor:
       The *autojunk* parameter.
 
    SequenceMatcher objects get three data attributes: *bjunk* is the
-   set of elements of b for which *isjunk* is True; *bpopular* is the set of non-
-   junk elements considered popular by the heuristic (if it is not disabled);
-   *b2j* is a dict mapping the remaining elements of b to a list of positions where
-   they occur. All three are reset whenever *b* is reset with :meth:`set_seqs`
-   or :meth:`set_seq2`.
+   set of elements of *b* for which *isjunk* is True; *bpopular* is the set of
+   non-junk elements considered popular by the heuristic (if it is not
+   disabled); *b2j* is a dict mapping the remaining elements of *b* to a list
+   of positions where they occur. All three are reset whenever *b* is reset
+   with :meth:`set_seqs` or :meth:`set_seq2`.
 
    .. versionadded:: 3.2
       The *bjunk* and *bpopular* attributes.
diff --git a/Lib/difflib.py b/Lib/difflib.py
index a1c5ec0..381721a 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -320,20 +320,22 @@ class SequenceMatcher:
         self.bjunk = junk = set()
         isjunk = self.isjunk
         if isjunk:
-            for elt in list(b2j.keys()): # using list() since b2j is modified
+            for elt in b2j.keys():
                 if isjunk(elt):
                     junk.add(elt)
-                    del b2j[elt]
+            for elt in junk: # separate loop avoids separate list of keys
+                del b2j[elt]
 
         # Purge popular elements that are not junk
         self.bpopular = popular = set()
         n = len(b)
         if self.autojunk and n >= 200:
             ntest = n // 100 + 1
-            for elt, idxs in list(b2j.items()):
+            for elt, idxs in b2j.items():
                 if len(idxs) > ntest:
                     popular.add(elt)
-                    del b2j[elt]
+            for elt in popular: # ditto; as fast for 1% deletion
+                del b2j[elt]
 
     def isbjunk(self, item):
         "Deprecated; use 'item in SequenceMatcher().bjunk'."
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index e72df26..a263ee6 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -12,12 +12,14 @@ class TestWithAscii(unittest.TestCase):
         self.assertEqual(list(sm.get_opcodes()),
             [ ('insert', 0, 0, 0, 1),
                 ('equal', 0, 100, 1, 101)])
+        self.assertEqual(sm.bpopular, set())
         sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
         self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
         self.assertEqual(list(sm.get_opcodes()),
             [ ('equal', 0, 50, 0, 50),
                 ('insert', 50, 50, 50, 51),
                 ('equal', 50, 100, 51, 101)])
+        self.assertEqual(sm.bpopular, set())
 
     def test_one_delete(self):
         sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
@@ -27,6 +29,19 @@ class TestWithAscii(unittest.TestCase):
                 ('delete', 40, 41, 40, 40),
                 ('equal', 41, 81, 40, 80)])
 
+    def test_bjunk(self):
+        sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ',
+                a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40)
+        self.assertEqual(sm.bjunk, set())
+
+        sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ',
+                a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20)
+        self.assertEqual(sm.bjunk, {' '})
+
+        sm = difflib.SequenceMatcher(isjunk=lambda x: x in [' ', 'b'],
+                a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20)
+        self.assertEqual(sm.bjunk, {' ', 'b'})
+
 
 class TestAutojunk(unittest.TestCase):
     """Tests for the autojunk parameter added in 2.7"""
@@ -38,10 +53,12 @@ class TestAutojunk(unittest.TestCase):
 
         sm = difflib.SequenceMatcher(None, seq1, seq2)
         self.assertAlmostEqual(sm.ratio(), 0, places=3)
+        self.assertEqual(sm.bpopular, {'b'})
 
         # Now turn the heuristic off
         sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
         self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
+        self.assertEqual(sm.bpopular, set())
 
 
 class TestSFbugs(unittest.TestCase):