summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTerry Reedy <tjreedy@udel.edu>2010-11-11 23:22:19 (GMT)
committerTerry Reedy <tjreedy@udel.edu>2010-11-11 23:22:19 (GMT)
commitd2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef (patch)
tree3d0e6661b6a54c6257786c700b9ba36664642365
parent6c2e0224ffb739a3db3984aa8beb68db3338b3f1 (diff)
downloadcpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.zip
cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.tar.gz
cpython-d2d2ae91c5ea2d226a6aae7afc8c8b05200c4eef.tar.bz2
#2986 Add autojunk parameter to SequenceMatcher to optionally disable 'popular == junk' heuristic.
-rw-r--r--Doc/library/difflib.rst15
-rw-r--r--Lib/difflib.py73
-rw-r--r--Lib/test/test_difflib.py45
-rw-r--r--Misc/NEWS4
4 files changed, 97 insertions, 40 deletions
diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst
index 8556e1d..4d19b40 100644
--- a/Doc/library/difflib.rst
+++ b/Doc/library/difflib.rst
@@ -37,6 +37,16 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
complicated way on how many elements the sequences have in common; best case
time is linear.
+ **Automatic junk heuristic:** :class:`SequenceMatcher` supports a heuristic that
+ automatically treats certain sequence items as junk. The heuristic counts how many
+ times each individual item appears in the sequence. If an item's duplicates (after
+ the first one) account for more than 1% of the sequence and the sequence is at least
+ 200 items long, this item is marked as "popular" and is treated as junk for
+ the purpose of sequence matching. This heuristic can be turned off by setting
+ the ``autojunk`` argument to ``False`` when creating the :class:`SequenceMatcher`.
+
+ .. versionadded:: 2.7
+ The *autojunk* parameter.
.. class:: Differ
@@ -334,7 +344,7 @@ SequenceMatcher Objects
The :class:`SequenceMatcher` class has this constructor:
-.. class:: SequenceMatcher([isjunk[, a[, b]]])
+.. class:: SequenceMatcher([isjunk[, a[, b[, autojunk=True]]]])
Optional argument *isjunk* must be ``None`` (the default) or a one-argument
function that takes a sequence element and returns true if and only if the
@@ -350,6 +360,9 @@ The :class:`SequenceMatcher` class has this constructor:
The optional arguments *a* and *b* are sequences to be compared; both default to
empty strings. The elements of both sequences must be :term:`hashable`.
+ The optional argument *autojunk* can be used to disable the automatic junk
+ heuristic.
+
:class:`SequenceMatcher` objects have the following methods:
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 3cc5f6b..e5910bc 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -151,7 +151,7 @@ class SequenceMatcher:
Return an upper bound on ratio() very quickly.
"""
- def __init__(self, isjunk=None, a='', b=''):
+ def __init__(self, isjunk=None, a='', b='', autojunk=True):
"""Construct a SequenceMatcher.
Optional arg isjunk is None (the default), or a one-argument
@@ -169,6 +169,10 @@ class SequenceMatcher:
Optional arg b is the second of two sequences to be compared. By
default, an empty string. The elements of b must be hashable. See
also .set_seqs() and .set_seq2().
+
+ Optional arg autojunk should be set to False to disable the
+ "automatic junk heuristic" that treats popular elements as junk
+ (see module documentation for more information).
"""
# Members:
@@ -207,11 +211,13 @@ class SequenceMatcher:
# DOES NOT WORK for x in a!
# isbpopular
# for x in b, isbpopular(x) is true iff b is reasonably long
- # (at least 200 elements) and x accounts for more than 1% of
- # its elements. DOES NOT WORK for x in a!
+ # (at least 200 elements) and x accounts for more than 1 + 1% of
+ # its elements (when autojunk is enabled).
+ # DOES NOT WORK for x in a!
self.isjunk = isjunk
self.a = self.b = None
+ self.autojunk = autojunk
self.set_seqs(a, b)
def set_seqs(self, a, b):
@@ -288,7 +294,7 @@ class SequenceMatcher:
# from starting any matching block at a junk element ...
# also creates the fast isbjunk function ...
# b2j also does not contain entries for "popular" elements, meaning
- # elements that account for more than 1% of the total elements, and
+ # elements that account for more than 1 + 1% of the total elements, and
# when the sequence is reasonably large (>= 200 elements); this can
# be viewed as an adaptive notion of semi-junk, and yields an enormous
# speedup when, e.g., comparing program files with hundreds of
@@ -309,44 +315,37 @@ class SequenceMatcher:
# out the junk later is much cheaper than building b2j "right"
# from the start.
b = self.b
- n = len(b)
self.b2j = b2j = {}
- populardict = {}
- for i, elt in enumerate(b):
- if elt in b2j:
- indices = b2j[elt]
- if n >= 200 and len(indices) * 100 > n:
- populardict[elt] = 1
- del indices[:]
- else:
- indices.append(i)
- else:
- b2j[elt] = [i]
- # Purge leftover indices for popular elements.
- for elt in populardict:
- del b2j[elt]
+ for i, elt in enumerate(b):
+ indices = b2j.setdefault(elt, [])
+ indices.append(i)
- # Now b2j.keys() contains elements uniquely, and especially when
- # the sequence is a string, that's usually a good deal smaller
- # than len(string). The difference is the number of isjunk calls
- # saved.
+ # Purge junk elements
+ junk = set()
isjunk = self.isjunk
- junkdict = {}
if isjunk:
- for d in populardict, b2j:
- for elt in d.keys():
- if isjunk(elt):
- junkdict[elt] = 1
- del d[elt]
-
- # Now for x in b, isjunk(x) == x in junkdict, but the
- # latter is much faster. Note too that while there may be a
- # lot of junk in the sequence, the number of *unique* junk
- # elements is probably small. So the memory burden of keeping
- # this dict alive is likely trivial compared to the size of b2j.
- self.isbjunk = junkdict.__contains__
- self.isbpopular = populardict.__contains__
+ for elt in list(b2j.keys()): # using list() since b2j is modified
+ if isjunk(elt):
+ junk.add(elt)
+ del b2j[elt]
+
+ # Purge popular elements that are not junk
+ popular = set()
+ n = len(b)
+ if self.autojunk and n >= 200:
+ ntest = n // 100 + 1
+ for elt, idxs in list(b2j.items()):
+ if len(idxs) > ntest:
+ popular.add(elt)
+ del b2j[elt]
+
+ # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
+ # Sicne the number of *unique* junk elements is probably small, the
+ # memory burden of keeping this set alive is likely trivial compared to
+ # the size of b2j.
+ self.isbjunk = junk.__contains__
+ self.isbpopular = popular.__contains__
def find_longest_match(self, alo, ahi, blo, bhi):
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index 1b828bb..3533451 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -4,8 +4,47 @@ import unittest
import doctest
import sys
-class TestSFbugs(unittest.TestCase):
+class TestWithAscii(unittest.TestCase):
+ def test_one_insert(self):
+ sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
+ self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
+ self.assertEqual(list(sm.get_opcodes()),
+ [ ('insert', 0, 0, 0, 1),
+ ('equal', 0, 100, 1, 101)])
+ sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
+ self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
+ self.assertEqual(list(sm.get_opcodes()),
+ [ ('equal', 0, 50, 0, 50),
+ ('insert', 50, 50, 50, 51),
+ ('equal', 50, 100, 51, 101)])
+
+ def test_one_delete(self):
+ sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
+ self.assertAlmostEqual(sm.ratio(), 0.994, places=3)
+ self.assertEqual(list(sm.get_opcodes()),
+ [ ('equal', 0, 40, 0, 40),
+ ('delete', 40, 41, 40, 40),
+ ('equal', 41, 81, 40, 80)])
+
+
+class TestAutojunk(unittest.TestCase):
+ """Tests for the autojunk parameter added in 2.7"""
+ def test_one_insert_homogenous_sequence(self):
+ # By default autojunk=True and the heuristic kicks in for a sequence
+ # of length 200+
+ seq1 = 'b' * 200
+ seq2 = 'a' + 'b' * 200
+
+ sm = difflib.SequenceMatcher(None, seq1, seq2)
+ self.assertAlmostEqual(sm.ratio(), 0, places=3)
+
+ # Now turn the heuristic off
+ sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
+ self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
+
+
+class TestSFbugs(unittest.TestCase):
def test_ratio_for_null_seqn(self):
# Check clearing of SF bug 763023
s = difflib.SequenceMatcher(None, [], [])
@@ -184,7 +223,9 @@ class TestOutputFormat(unittest.TestCase):
def test_main():
difflib.HtmlDiff._default_prefix = 0
Doctests = doctest.DocTestSuite(difflib)
- run_unittest(TestSFpatches, TestSFbugs, TestOutputFormat, Doctests)
+ run_unittest(
+ TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
+ TestOutputFormat, Doctests)
if __name__ == '__main__':
test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index ab8e270..509959e 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -213,6 +213,10 @@ Library
- Issue #808164: Fixed socket.close to avoid references to globals, to
avoid issues when socket.close is called from a __del__ method.
+- Issue #2986: difflib.SequenceMatcher gets a new parameter, autojunk, which
+ can be set to False to turn off the previously undocumented 'popularity'
+ heuristic. Patch by Terry Reedy and Eli Bendersky
+
- Issue #8797: urllib2 does a retry for Basic Authentication failure instead of
falling into recursion.