diff options
Diffstat (limited to 'Lib/difflib.py')
-rw-r--r-- | Lib/difflib.py | 116 |
1 files changed, 45 insertions, 71 deletions
diff --git a/Lib/difflib.py b/Lib/difflib.py index 79b446c..0443963 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -908,79 +908,52 @@ class Differ: + abcdefGhijkl ? ^ ^ ^ """ - from operator import ge, gt - # Don't synch up unless the lines have a similarity score of at - # least cutoff; best_ratio tracks the best score seen so far. - # Keep track of all index pairs achieving the best ratio and - # deal with them here. Previously only the smallest pair was - # handled here, and if there are many pairs with the best ratio, - # recursion could grow very deep, and runtime cubic. See: + # Don't synch up unless the lines have a similarity score above + # cutoff. Previously only the smallest pair was handled here, + # and if there are many pairs with the best ratio, recursion + # could grow very deep, and runtime cubic. See: # https://github.com/python/cpython/issues/119105 - best_ratio, cutoff = 0.74, 0.75 + # + # Later, more pathological cases prompted removing recursion + # entirely. + cutoff = 0.74999 cruncher = SequenceMatcher(self.charjunk) - eqi, eqj = None, None # 1st indices of equal lines (if any) - # List of index pairs achieving best_ratio. Strictly increasing - # in both index positions. - max_pairs = [] - maxi = -1 # `i` index of last pair in max_pairs - - # search for the pair that matches best without being identical - # (identical lines must be junk lines, & we don't want to synch - # up on junk -- unless we have to) crqr = cruncher.real_quick_ratio cqr = cruncher.quick_ratio cr = cruncher.ratio + + WINDOW = 10 + best_i = best_j = None + dump_i, dump_j = alo, blo # smallest indices not yet resolved for j in range(blo, bhi): - bj = b[j] - cruncher.set_seq2(bj) - # Find new best, if possible. Else search for the smallest i - # (if any) > maxi that equals the best ratio - search_equal = True - for i in range(alo, ahi): - ai = a[i] - if ai == bj: - if eqi is None: - eqi, eqj = i, j - continue - cruncher.set_seq1(ai) - # computing similarity is expensive, so use the quick - # upper bounds first -- have seen this speed up messy - # compares by a factor of 3. - cmp = ge if search_equal and i > maxi else gt - if (cmp(crqr(), best_ratio) - and cmp(cqr(), best_ratio) - and cmp((ratio := cr()), best_ratio)): - if ratio > best_ratio: - best_ratio = ratio - max_pairs.clear() - else: - assert best_ratio == ratio and search_equal - assert i > maxi - max_pairs.append((i, j)) - maxi = i - search_equal = False - if best_ratio < cutoff: - assert not max_pairs - # no non-identical "pretty close" pair - if eqi is None: - # no identical pair either -- treat it as a straight replace - yield from self._plain_replace(a, alo, ahi, b, blo, bhi) - return - # no close pair, but an identical pair -- synch up on that - max_pairs = [(eqi, eqj)] - else: - # there's a close pair, so forget the identical pair (if any) - assert max_pairs - eqi = None - - last_i, last_j = alo, blo - for this_i, this_j in max_pairs: - # pump out diffs from before the synch point - yield from self._fancy_helper(a, last_i, this_i, - b, last_j, this_j) + cruncher.set_seq2(b[j]) + # Search the corresponding i's within WINDOW for rhe highest + # ratio greater than `cutoff`. + aequiv = alo + (j - blo) + arange = range(max(aequiv - WINDOW, dump_i), + min(aequiv + WINDOW + 1, ahi)) + if not arange: # likely exit if `a` is shorter than `b` + break + best_ratio = cutoff + for i in arange: + cruncher.set_seq1(a[i]) + # Ordering by cheapest to most expensive ratio is very + # valuable, most often getting out early. + if (crqr() > best_ratio + and cqr() > best_ratio + and cr() > best_ratio): + best_i, best_j, best_ratio = i, j, cr() + + if best_i is None: + # found nothing to synch on yet - move to next j + continue + + # pump out straight replace from before this synch pair + yield from self._fancy_helper(a, dump_i, best_i, + b, dump_j, best_j) # do intraline marking on the synch pair - aelt, belt = a[this_i], b[this_j] - if eqi is None: + aelt, belt = a[best_i], b[best_j] + if aelt != belt: # pump out a '-', '?', '+', '?' quad for the synched lines atags = btags = "" cruncher.set_seqs(aelt, belt) @@ -1002,17 +975,18 @@ class Differ: else: # the synch pair is identical yield ' ' + aelt - last_i, last_j = this_i + 1, this_j + 1 + dump_i, dump_j = best_i + 1, best_j + 1 + best_i = best_j = None - # pump out diffs from after the last synch point - yield from self._fancy_helper(a, last_i, ahi, - b, last_j, bhi) + # pump out straight replace from after the last synch pair + yield from self._fancy_helper(a, dump_i, ahi, + b, dump_j, bhi) def _fancy_helper(self, a, alo, ahi, b, blo, bhi): g = [] if alo < ahi: if blo < bhi: - g = self._fancy_replace(a, alo, ahi, b, blo, bhi) + g = self._plain_replace(a, alo, ahi, b, blo, bhi) else: g = self._dump('-', a, alo, ahi) elif blo < bhi: |