path: root/Lib/difflib.py
author     Terry Reedy <tjreedy@udel.edu>   2010-11-25 06:12:34 (GMT)
committer  Terry Reedy <tjreedy@udel.edu>   2010-11-25 06:12:34 (GMT)
commit     99f9637de8893ecdb08ade606fe3a988e6a8c848 (patch)
tree       0808c60efc0a58b46601438d012aa29ee73afced /Lib/difflib.py
parent     bd86301070e38726532ae57e7d1bdc01143b298b (diff)
Issue 2986: Add autojunk parameter to SequenceMatcher to turn off heuristic. Patch by Terry Reedy, Eli Bendersky, and Simon Cross
Diffstat (limited to 'Lib/difflib.py')
-rw-r--r--  Lib/difflib.py  73
1 file changed, 36 insertions, 37 deletions
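
To see what the new keyword does, here is a minimal sketch run against the patched module (the 210-element strings are made-up test data):

    from difflib import SequenceMatcher

    a = "x" * 210
    b = "x" * 200 + "y" * 10

    # With the default autojunk=True, b has n = 210 >= 200 elements and both
    # "x" (200 occurrences) and "y" (10) occur more than n // 100 + 1 = 3
    # times, so the heuristic purges them from b2j and nothing matches.
    print(SequenceMatcher(None, a, b).ratio())                  # 0.0

    # autojunk=False turns the heuristic off; the 200 shared "x" elements
    # match, giving 2 * 200 / (210 + 210) == 0.952...
    print(SequenceMatcher(None, a, b, autojunk=False).ratio())  # ~0.952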
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 92d58fa..fa8c287 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -150,7 +150,7 @@ class SequenceMatcher:
         Return an upper bound on ratio() very quickly.
     """
 
-    def __init__(self, isjunk=None, a='', b=''):
+    def __init__(self, isjunk=None, a='', b='', autojunk=True):
         """Construct a SequenceMatcher.
 
         Optional arg isjunk is None (the default), or a one-argument
@@ -168,6 +168,10 @@ class SequenceMatcher:
         Optional arg b is the second of two sequences to be compared. By
         default, an empty string. The elements of b must be hashable. See
         also .set_seqs() and .set_seq2().
+
+        Optional arg autojunk should be set to False to disable the
+        "automatic junk heuristic" that treats popular elements as junk
+        (see module documentation for more information).
         """
 
         # Members:
@@ -206,11 +210,13 @@ class SequenceMatcher:
         #      DOES NOT WORK for x in a!
         # isbpopular
         #      for x in b, isbpopular(x) is true iff b is reasonably long
-        #      (at least 200 elements) and x accounts for more than 1% of
-        #      its elements. DOES NOT WORK for x in a!
+        #      (at least 200 elements) and x accounts for more than 1 + 1% of
+        #      its elements (when autojunk is enabled).
+        #      DOES NOT WORK for x in a!
 
         self.isjunk = isjunk
         self.a = self.b = None
+        self.autojunk = autojunk
         self.set_seqs(a, b)
 
     def set_seqs(self, a, b):
@@ -287,7 +293,7 @@ class SequenceMatcher:
         # from starting any matching block at a junk element ...
         # also creates the fast isbjunk function ...
         # b2j also does not contain entries for "popular" elements, meaning
-        # elements that account for more than 1% of the total elements, and
+        # elements that account for more than 1 + 1% of the total elements, and
         # when the sequence is reasonably large (>= 200 elements); this can
         # be viewed as an adaptive notion of semi-junk, and yields an enormous
         # speedup when, e.g., comparing program files with hundreds of
@@ -308,44 +314,37 @@ class SequenceMatcher:
         # out the junk later is much cheaper than building b2j "right"
         # from the start.
         b = self.b
-        n = len(b)
         self.b2j = b2j = {}
-        populardict = {}
-        for i, elt in enumerate(b):
-            if elt in b2j:
-                indices = b2j[elt]
-                if n >= 200 and len(indices) * 100 > n:
-                    populardict[elt] = 1
-                    del indices[:]
-                else:
-                    indices.append(i)
-            else:
-                b2j[elt] = [i]
-
-        # Purge leftover indices for popular elements.
-        for elt in populardict:
-            del b2j[elt]
+
+        for i, elt in enumerate(b):
+            indices = b2j.setdefault(elt, [])
+            indices.append(i)
 
-        # Now b2j.keys() contains elements uniquely, and especially when
-        # the sequence is a string, that's usually a good deal smaller
-        # than len(string). The difference is the number of isjunk calls
-        # saved.
+        # Purge junk elements
+        junk = set()
         isjunk = self.isjunk
-        junkdict = {}
         if isjunk:
-            for d in populardict, b2j:
-                for elt in list(d.keys()):
-                    if isjunk(elt):
-                        junkdict[elt] = 1
-                        del d[elt]
-
-        # Now for x in b, isjunk(x) == x in junkdict, but the
-        # latter is much faster. Note too that while there may be a
-        # lot of junk in the sequence, the number of *unique* junk
-        # elements is probably small. So the memory burden of keeping
-        # this dict alive is likely trivial compared to the size of b2j.
-        self.isbjunk = junkdict.__contains__
-        self.isbpopular = populardict.__contains__
+            for elt in list(b2j.keys()):  # using list() since b2j is modified
+                if isjunk(elt):
+                    junk.add(elt)
+                    del b2j[elt]
+
+        # Purge popular elements that are not junk
+        popular = set()
+        n = len(b)
+        if self.autojunk and n >= 200:
+            ntest = n // 100 + 1
+            for elt, idxs in list(b2j.items()):
+                if len(idxs) > ntest:
+                    popular.add(elt)
+                    del b2j[elt]
+
+        # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
+        # Since the number of *unique* junk elements is probably small, the
+        # memory burden of keeping this set alive is likely trivial compared to
+        # the size of b2j.
+        self.isbjunk = junk.__contains__
+        self.isbpopular = popular.__contains__
 
     def find_longest_match(self, alo, ahi, blo, bhi):
         """Find longest matching block in a[alo:ahi] and b[blo:bhi].