summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/library/difflib.rst 10
-rw-r--r-- Lib/difflib.py 10
-rw-r--r-- Lib/test/test_difflib.py 17
3 files changed, 28 insertions, 9 deletions
diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst
index 1d623ce..a0afe81 100644
--- a/Doc/library/difflib.rst
+++ b/Doc/library/difflib.rst
@@ -359,11 +359,11 @@ The :class:`SequenceMatcher` class has this constructor:
The *autojunk* parameter.
SequenceMatcher objects get three data attributes: *bjunk* is the
- set of elements of b for which *isjunk* is True; *bpopular* is the set of non-
- junk elements considered popular by the heuristic (if it is not disabled);
- *b2j* is a dict mapping the remaining elements of b to a list of positions where
- they occur. All three are reset whenever *b* is reset with :meth:`set_seqs`
- or :meth:`set_seq2`.
+ set of elements of *b* for which *isjunk* is True; *bpopular* is the set of
+ non-junk elements considered popular by the heuristic (if it is not
+ disabled); *b2j* is a dict mapping the remaining elements of *b* to a list
+ of positions where they occur. All three are reset whenever *b* is reset
+ with :meth:`set_seqs` or :meth:`set_seq2`.
.. versionadded:: 3.2
The *bjunk* and *bpopular* attributes.
diff --git a/Lib/difflib.py b/Lib/difflib.py
index a1c5ec0..381721a 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -320,20 +320,22 @@ class SequenceMatcher:
self.bjunk = junk = set()
isjunk = self.isjunk
if isjunk:
- for elt in list(b2j.keys()): # using list() since b2j is modified
+ for elt in b2j.keys():
if isjunk(elt):
junk.add(elt)
- del b2j[elt]
+ for elt in junk: # separate loop avoids separate list of keys
+ del b2j[elt]
# Purge popular elements that are not junk
self.bpopular = popular = set()
n = len(b)
if self.autojunk and n >= 200:
ntest = n // 100 + 1
- for elt, idxs in list(b2j.items()):
+ for elt, idxs in b2j.items():
if len(idxs) > ntest:
popular.add(elt)
- del b2j[elt]
+ for elt in popular: # ditto; as fast for 1% deletion
+ del b2j[elt]
def isbjunk(self, item):
"Deprecated; use 'item in SequenceMatcher().bjunk'."
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index e72df26..a263ee6 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -12,12 +12,14 @@ class TestWithAscii(unittest.TestCase):
self.assertEqual(list(sm.get_opcodes()),
[ ('insert', 0, 0, 0, 1),
('equal', 0, 100, 1, 101)])
+ self.assertEqual(sm.bpopular, set())
sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
self.assertEqual(list(sm.get_opcodes()),
[ ('equal', 0, 50, 0, 50),
('insert', 50, 50, 50, 51),
('equal', 50, 100, 51, 101)])
+ self.assertEqual(sm.bpopular, set())
def test_one_delete(self):
sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
@@ -27,6 +29,19 @@ class TestWithAscii(unittest.TestCase):
('delete', 40, 41, 40, 40),
('equal', 41, 81, 40, 80)])
+ def test_bjunk(self):
+ sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ',
+ a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40)
+ self.assertEqual(sm.bjunk, set())
+
+ sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ',
+ a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20)
+ self.assertEqual(sm.bjunk, {' '})
+
+ sm = difflib.SequenceMatcher(isjunk=lambda x: x in [' ', 'b'],
+ a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20)
+ self.assertEqual(sm.bjunk, {' ', 'b'})
+
class TestAutojunk(unittest.TestCase):
"""Tests for the autojunk parameter added in 2.7"""
@@ -38,10 +53,12 @@ class TestAutojunk(unittest.TestCase):
sm = difflib.SequenceMatcher(None, seq1, seq2)
self.assertAlmostEqual(sm.ratio(), 0, places=3)
+ self.assertEqual(sm.bpopular, {'b'})
# Now turn the heuristic off
sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
+ self.assertEqual(sm.bpopular, set())
class TestSFbugs(unittest.TestCase):