summaryrefslogtreecommitdiffstats
path: root/Lib/difflib.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/difflib.py')
-rw-r--r--Lib/difflib.py116
1 files changed, 45 insertions, 71 deletions
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 79b446c..0443963 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -908,79 +908,52 @@ class Differ:
+ abcdefGhijkl
? ^ ^ ^
"""
- from operator import ge, gt
- # Don't synch up unless the lines have a similarity score of at
- # least cutoff; best_ratio tracks the best score seen so far.
- # Keep track of all index pairs achieving the best ratio and
- # deal with them here. Previously only the smallest pair was
- # handled here, and if there are many pairs with the best ratio,
- # recursion could grow very deep, and runtime cubic. See:
+ # Don't synch up unless the lines have a similarity score above
+ # cutoff. Previously only the smallest pair was handled here,
+ # and if there are many pairs with the best ratio, recursion
+ # could grow very deep, and runtime cubic. See:
# https://github.com/python/cpython/issues/119105
- best_ratio, cutoff = 0.74, 0.75
+ #
+ # Later, more pathological cases prompted removing recursion
+ # entirely.
+ cutoff = 0.74999
cruncher = SequenceMatcher(self.charjunk)
- eqi, eqj = None, None # 1st indices of equal lines (if any)
- # List of index pairs achieving best_ratio. Strictly increasing
- # in both index positions.
- max_pairs = []
- maxi = -1 # `i` index of last pair in max_pairs
-
- # search for the pair that matches best without being identical
- # (identical lines must be junk lines, & we don't want to synch
- # up on junk -- unless we have to)
crqr = cruncher.real_quick_ratio
cqr = cruncher.quick_ratio
cr = cruncher.ratio
+
+ WINDOW = 10
+ best_i = best_j = None
+ dump_i, dump_j = alo, blo # smallest indices not yet resolved
for j in range(blo, bhi):
- bj = b[j]
- cruncher.set_seq2(bj)
- # Find new best, if possible. Else search for the smallest i
- # (if any) > maxi that equals the best ratio
- search_equal = True
- for i in range(alo, ahi):
- ai = a[i]
- if ai == bj:
- if eqi is None:
- eqi, eqj = i, j
- continue
- cruncher.set_seq1(ai)
- # computing similarity is expensive, so use the quick
- # upper bounds first -- have seen this speed up messy
- # compares by a factor of 3.
- cmp = ge if search_equal and i > maxi else gt
- if (cmp(crqr(), best_ratio)
- and cmp(cqr(), best_ratio)
- and cmp((ratio := cr()), best_ratio)):
- if ratio > best_ratio:
- best_ratio = ratio
- max_pairs.clear()
- else:
- assert best_ratio == ratio and search_equal
- assert i > maxi
- max_pairs.append((i, j))
- maxi = i
- search_equal = False
- if best_ratio < cutoff:
- assert not max_pairs
- # no non-identical "pretty close" pair
- if eqi is None:
- # no identical pair either -- treat it as a straight replace
- yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
- return
- # no close pair, but an identical pair -- synch up on that
- max_pairs = [(eqi, eqj)]
- else:
- # there's a close pair, so forget the identical pair (if any)
- assert max_pairs
- eqi = None
-
- last_i, last_j = alo, blo
- for this_i, this_j in max_pairs:
- # pump out diffs from before the synch point
- yield from self._fancy_helper(a, last_i, this_i,
- b, last_j, this_j)
+ cruncher.set_seq2(b[j])
+ # Search the corresponding i's within WINDOW for rhe highest
+ # ratio greater than `cutoff`.
+ aequiv = alo + (j - blo)
+ arange = range(max(aequiv - WINDOW, dump_i),
+ min(aequiv + WINDOW + 1, ahi))
+ if not arange: # likely exit if `a` is shorter than `b`
+ break
+ best_ratio = cutoff
+ for i in arange:
+ cruncher.set_seq1(a[i])
+ # Ordering by cheapest to most expensive ratio is very
+ # valuable, most often getting out early.
+ if (crqr() > best_ratio
+ and cqr() > best_ratio
+ and cr() > best_ratio):
+ best_i, best_j, best_ratio = i, j, cr()
+
+ if best_i is None:
+ # found nothing to synch on yet - move to next j
+ continue
+
+ # pump out straight replace from before this synch pair
+ yield from self._fancy_helper(a, dump_i, best_i,
+ b, dump_j, best_j)
# do intraline marking on the synch pair
- aelt, belt = a[this_i], b[this_j]
- if eqi is None:
+ aelt, belt = a[best_i], b[best_j]
+ if aelt != belt:
# pump out a '-', '?', '+', '?' quad for the synched lines
atags = btags = ""
cruncher.set_seqs(aelt, belt)
@@ -1002,17 +975,18 @@ class Differ:
else:
# the synch pair is identical
yield ' ' + aelt
- last_i, last_j = this_i + 1, this_j + 1
+ dump_i, dump_j = best_i + 1, best_j + 1
+ best_i = best_j = None
- # pump out diffs from after the last synch point
- yield from self._fancy_helper(a, last_i, ahi,
- b, last_j, bhi)
+ # pump out straight replace from after the last synch pair
+ yield from self._fancy_helper(a, dump_i, ahi,
+ b, dump_j, bhi)
def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
g = []
if alo < ahi:
if blo < bhi:
- g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
+ g = self._plain_replace(a, alo, ahi, b, blo, bhi)
else:
g = self._dump('-', a, alo, ahi)
elif blo < bhi: