#2986 Add autojunk parameter to SequenceMatcher to optionally disable 'popular == junk' heuristic.

author: Terry Reedy <tjreedy@udel.edu> 2010-11-11 23:22:19 (GMT)
committer: Terry Reedy <tjreedy@udel.edu> 2010-11-11 23:22:19 (GMT)
commit: d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef (patch)
tree: 3d0e6661b6a54c6257786c700b9ba36664642365
parent: 6c2e0224ffb739a3db3984aa8beb68db3338b3f1 (diff)
download: cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.zip
cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.tar.gz
cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.tar.bz2
4 files changed, 97 insertions, 40 deletions
diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst
index 8556e1d..4d19b40 100644
--- a/Doc/library/difflib.rst
+++ b/Doc/library/difflib.rst
@@ -37,6 +37,16 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
    complicated way on how many elements the sequences have in common; best case
    time is linear.
 
+   **Automatic junk heuristic:** :class:`SequenceMatcher` supports a heuristic that
+   automatically treats certain sequence items as junk. The heuristic counts how many
+   times each individual item appears in the sequence. If an item's duplicates (after
+   the first one) account for more than 1% of the sequence and the sequence is at least
+   200 items long, this item is marked as "popular" and is treated as junk for
+   the purpose of sequence matching. This heuristic can be turned off by setting
+   the ``autojunk`` argument to ``False`` when creating the :class:`SequenceMatcher`.
+
+   .. versionadded:: 2.7
+      The *autojunk* parameter.
 
 .. class:: Differ
 
@@ -334,7 +344,7 @@ SequenceMatcher Objects
 The :class:`SequenceMatcher` class has this constructor:
 
 
-.. class:: SequenceMatcher([isjunk[, a[, b]]])
+.. class:: SequenceMatcher([isjunk[, a[, b[, autojunk=True]]]])
 
    Optional argument *isjunk* must be ``None`` (the default) or a one-argument
    function that takes a sequence element and returns true if and only if the
@@ -350,6 +360,9 @@ The :class:`SequenceMatcher` class has this constructor:
    The optional arguments *a* and *b* are sequences to be compared; both default to
    empty strings.  The elements of both sequences must be :term:`hashable`.
 
+   The optional argument *autojunk* can be used to disable the automatic junk
+   heuristic.
+
    :class:`SequenceMatcher` objects have the following methods:
 
 
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 3cc5f6b..e5910bc 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -151,7 +151,7 @@ class SequenceMatcher:
         Return an upper bound on ratio() very quickly.
     """
 
-    def __init__(self, isjunk=None, a='', b=''):
+    def __init__(self, isjunk=None, a='', b='', autojunk=True):
         """Construct a SequenceMatcher.
 
         Optional arg isjunk is None (the default), or a one-argument
@@ -169,6 +169,10 @@ class SequenceMatcher:
         Optional arg b is the second of two sequences to be compared.  By
         default, an empty string.  The elements of b must be hashable. See
         also .set_seqs() and .set_seq2().
+
+        Optional arg autojunk should be set to False to disable the
+        "automatic junk heuristic" that treats popular elements as junk
+        (see module documentation for more information).
         """
 
         # Members:
@@ -207,11 +211,13 @@ class SequenceMatcher:
         #      DOES NOT WORK for x in a!
         # isbpopular
         #      for x in b, isbpopular(x) is true iff b is reasonably long
-        #      (at least 200 elements) and x accounts for more than 1% of
-        #      its elements.  DOES NOT WORK for x in a!
+        #      (at least 200 elements) and x accounts for more than 1 + 1% of
+        #      its elements (when autojunk is enabled).
+        #      DOES NOT WORK for x in a!
 
         self.isjunk = isjunk
         self.a = self.b = None
+        self.autojunk = autojunk
         self.set_seqs(a, b)
 
     def set_seqs(self, a, b):
@@ -288,7 +294,7 @@ class SequenceMatcher:
     # from starting any matching block at a junk element ...
     # also creates the fast isbjunk function ...
     # b2j also does not contain entries for "popular" elements, meaning
-    # elements that account for more than 1% of the total elements, and
+    # elements that account for more than 1 + 1% of the total elements, and
     # when the sequence is reasonably large (>= 200 elements); this can
     # be viewed as an adaptive notion of semi-junk, and yields an enormous
     # speedup when, e.g., comparing program files with hundreds of
@@ -309,44 +315,37 @@ class SequenceMatcher:
         # out the junk later is much cheaper than building b2j "right"
         # from the start.
         b = self.b
-        n = len(b)
         self.b2j = b2j = {}
-        populardict = {}
-        for i, elt in enumerate(b):
-            if elt in b2j:
-                indices = b2j[elt]
-                if n >= 200 and len(indices) * 100 > n:
-                    populardict[elt] = 1
-                    del indices[:]
-                else:
-                    indices.append(i)
-            else:
-                b2j[elt] = [i]
 
-        # Purge leftover indices for popular elements.
-        for elt in populardict:
-            del b2j[elt]
+        for i, elt in enumerate(b):
+            indices = b2j.setdefault(elt, [])
+            indices.append(i)
 
-        # Now b2j.keys() contains elements uniquely, and especially when
-        # the sequence is a string, that's usually a good deal smaller
-        # than len(string).  The difference is the number of isjunk calls
-        # saved.
+        # Purge junk elements
+        junk = set()
         isjunk = self.isjunk
-        junkdict = {}
         if isjunk:
-            for d in populardict, b2j:
-                for elt in d.keys():
-                    if isjunk(elt):
-                        junkdict[elt] = 1
-                        del d[elt]
-
-        # Now for x in b, isjunk(x) == x in junkdict, but the
-        # latter is much faster.  Note too that while there may be a
-        # lot of junk in the sequence, the number of *unique* junk
-        # elements is probably small.  So the memory burden of keeping
-        # this dict alive is likely trivial compared to the size of b2j.
-        self.isbjunk = junkdict.__contains__
-        self.isbpopular = populardict.__contains__
+            for elt in list(b2j.keys()):  # using list() since b2j is modified
+                if isjunk(elt):
+                    junk.add(elt)
+                    del b2j[elt]
+
+        # Purge popular elements that are not junk
+        popular = set()
+        n = len(b)
+        if self.autojunk and n >= 200:
+            ntest = n // 100 + 1
+            for elt, idxs in list(b2j.items()):
+                if len(idxs) > ntest:
+                    popular.add(elt)
+                    del b2j[elt]
+
+        # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
+        # Sicne the number of *unique* junk elements is probably small, the
+        # memory burden of keeping this set alive is likely trivial compared to
+        # the size of b2j.
+        self.isbjunk = junk.__contains__
+        self.isbpopular = popular.__contains__
 
     def find_longest_match(self, alo, ahi, blo, bhi):
         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index 1b828bb..3533451 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -4,8 +4,47 @@ import unittest
 import doctest
 import sys
 
-class TestSFbugs(unittest.TestCase):
 
+class TestWithAscii(unittest.TestCase):
+    def test_one_insert(self):
+        sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
+        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
+        self.assertEqual(list(sm.get_opcodes()),
+            [   ('insert', 0, 0, 0, 1),
+                ('equal', 0, 100, 1, 101)])
+        sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
+        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
+        self.assertEqual(list(sm.get_opcodes()),
+            [   ('equal', 0, 50, 0, 50),
+                ('insert', 50, 50, 50, 51),
+                ('equal', 50, 100, 51, 101)])
+
+    def test_one_delete(self):
+        sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
+        self.assertAlmostEqual(sm.ratio(), 0.994, places=3)
+        self.assertEqual(list(sm.get_opcodes()),
+            [   ('equal', 0, 40, 0, 40),
+                ('delete', 40, 41, 40, 40),
+                ('equal', 41, 81, 40, 80)])
+
+
+class TestAutojunk(unittest.TestCase):
+    """Tests for the autojunk parameter added in 2.7"""
+    def test_one_insert_homogenous_sequence(self):
+        # By default autojunk=True and the heuristic kicks in for a sequence
+        # of length 200+
+        seq1 = 'b' * 200
+        seq2 = 'a' + 'b' * 200
+
+        sm = difflib.SequenceMatcher(None, seq1, seq2)
+        self.assertAlmostEqual(sm.ratio(), 0, places=3)
+
+        # Now turn the heuristic off
+        sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
+        self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
+
+
+class TestSFbugs(unittest.TestCase):
     def test_ratio_for_null_seqn(self):
         # Check clearing of SF bug 763023
         s = difflib.SequenceMatcher(None, [], [])
@@ -184,7 +223,9 @@ class TestOutputFormat(unittest.TestCase):
 def test_main():
     difflib.HtmlDiff._default_prefix = 0
     Doctests = doctest.DocTestSuite(difflib)
-    run_unittest(TestSFpatches, TestSFbugs, TestOutputFormat, Doctests)
+    run_unittest(
+        TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
+        TestOutputFormat, Doctests)
 
 if __name__ == '__main__':
     test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index ab8e270..509959e 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -213,6 +213,10 @@ Library
 - Issue #808164: Fixed socket.close to avoid references to globals, to
   avoid issues when socket.close is called from a __del__ method.
 
+- Issue #2986: difflib.SequenceMatcher gets a new parameter, autojunk, which
+  can be set to False to turn off the previously undocumented 'popularity'
+  heuristic. Patch by Terry Reedy and Eli Bendersky
+
 - Issue #8797: urllib2 does a retry for Basic Authentication failure instead of
   falling into recursion.
author	Terry Reedy <tjreedy@udel.edu>	2010-11-11 23:22:19 (GMT)
committer	Terry Reedy <tjreedy@udel.edu>	2010-11-11 23:22:19 (GMT)
commit	d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef (patch)
tree	3d0e6661b6a54c6257786c700b9ba36664642365
parent	6c2e0224ffb739a3db3984aa8beb68db3338b3f1 (diff)
download	cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.zip cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.tar.gz cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.tar.bz2