summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTim Peters <tim.peters@gmail.com>2024-05-22 23:25:08 (GMT)
committerGitHub <noreply@github.com>2024-05-22 23:25:08 (GMT)
commit07df93de73b6bdd4f80cd9e29834d761a0882622 (patch)
tree81e895b32b7126edfae99ca869e02c0420b2ed63
parente3bf5381fd056d0bbdd775463e3724aab2012e45 (diff)
downloadcpython-07df93de73b6bdd4f80cd9e29834d761a0882622.zip
cpython-07df93de73b6bdd4f80cd9e29834d761a0882622.tar.gz
cpython-07df93de73b6bdd4f80cd9e29834d761a0882622.tar.bz2
gh-119105: difflib.py Differ.compare is too slow [for degenerate cases] (#119376)
Track all pairs achieving the best ratio in Differ(). This repairs the "very deep recursion and cubic time" bad cases in a way that preserves previous output.
-rw-r--r--Lib/difflib.py128
-rw-r--r--Lib/test/test_difflib.py20
2 files changed, 89 insertions, 59 deletions
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 54ca33d..79b446c 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -908,29 +908,34 @@ class Differ:
+ abcdefGhijkl
? ^ ^ ^
"""
-
- # don't synch up unless the lines have a similarity score of at
- # least cutoff; best_ratio tracks the best score seen so far
- # best_ratio is a tuple storing the best .ratio() seen so far, and
- # a measure of how far the indices are from their index range
- # midpoints. The latter is used to resolve ratio ties. Favoring
- # indices near the midpoints tends to cut the ranges in half. Else,
- # if there are many pairs with the best ratio, recursion can grow
- # very deep, and runtime becomes cubic. See:
+ from operator import ge, gt
+ # Don't synch up unless the lines have a similarity score of at
+ # least cutoff; best_ratio tracks the best score seen so far.
+ # Keep track of all index pairs achieving the best ratio and
+ # deal with them here. Previously only the smallest pair was
+ # handled here, and if there are many pairs with the best ratio,
+ # recursion could grow very deep, and runtime cubic. See:
# https://github.com/python/cpython/issues/119105
- best_ratio, cutoff = (0.74, 0), 0.75
+ best_ratio, cutoff = 0.74, 0.75
cruncher = SequenceMatcher(self.charjunk)
eqi, eqj = None, None # 1st indices of equal lines (if any)
+ # List of index pairs achieving best_ratio. Strictly increasing
+ # in both index positions.
+ max_pairs = []
+ maxi = -1 # `i` index of last pair in max_pairs
# search for the pair that matches best without being identical
- # (identical lines must be junk lines, & we don't want to synch up
- # on junk -- unless we have to)
- amid = (alo + ahi - 1) / 2
- bmid = (blo + bhi - 1) / 2
+ # (identical lines must be junk lines, & we don't want to synch
+ # up on junk -- unless we have to)
+ crqr = cruncher.real_quick_ratio
+ cqr = cruncher.quick_ratio
+ cr = cruncher.ratio
for j in range(blo, bhi):
bj = b[j]
cruncher.set_seq2(bj)
- weight_j = - abs(j - bmid)
+ # Find new best, if possible. Else search for the smallest i
+ # (if any) > maxi that equals the best ratio
+ search_equal = True
for i in range(alo, ahi):
ai = a[i]
if ai == bj:
@@ -938,65 +943,70 @@ class Differ:
eqi, eqj = i, j
continue
cruncher.set_seq1(ai)
- # weight is used to balance the recursion by prioritizing
- # i and j in the middle of their ranges
- weight = weight_j - abs(i - amid)
# computing similarity is expensive, so use the quick
# upper bounds first -- have seen this speed up messy
# compares by a factor of 3.
- # note that ratio() is only expensive to compute the first
- # time it's called on a sequence pair; the expensive part
- # of the computation is cached by cruncher
- if (cruncher.real_quick_ratio(), weight) > best_ratio and \
- (cruncher.quick_ratio(), weight) > best_ratio and \
- (cruncher.ratio(), weight) > best_ratio:
- best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j
- best_ratio, _ = best_ratio
+ cmp = ge if search_equal and i > maxi else gt
+ if (cmp(crqr(), best_ratio)
+ and cmp(cqr(), best_ratio)
+ and cmp((ratio := cr()), best_ratio)):
+ if ratio > best_ratio:
+ best_ratio = ratio
+ max_pairs.clear()
+ else:
+ assert best_ratio == ratio and search_equal
+ assert i > maxi
+ max_pairs.append((i, j))
+ maxi = i
+ search_equal = False
if best_ratio < cutoff:
+ assert not max_pairs
# no non-identical "pretty close" pair
if eqi is None:
# no identical pair either -- treat it as a straight replace
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
return
# no close pair, but an identical pair -- synch up on that
- best_i, best_j, best_ratio = eqi, eqj, 1.0
+ max_pairs = [(eqi, eqj)]
else:
# there's a close pair, so forget the identical pair (if any)
+ assert max_pairs
eqi = None
- # a[best_i] very similar to b[best_j]; eqi is None iff they're not
- # identical
-
- # pump out diffs from before the synch point
- yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)
-
- # do intraline marking on the synch pair
- aelt, belt = a[best_i], b[best_j]
- if eqi is None:
- # pump out a '-', '?', '+', '?' quad for the synched lines
- atags = btags = ""
- cruncher.set_seqs(aelt, belt)
- for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
- la, lb = ai2 - ai1, bj2 - bj1
- if tag == 'replace':
- atags += '^' * la
- btags += '^' * lb
- elif tag == 'delete':
- atags += '-' * la
- elif tag == 'insert':
- btags += '+' * lb
- elif tag == 'equal':
- atags += ' ' * la
- btags += ' ' * lb
- else:
- raise ValueError('unknown tag %r' % (tag,))
- yield from self._qformat(aelt, belt, atags, btags)
- else:
- # the synch pair is identical
- yield ' ' + aelt
+ last_i, last_j = alo, blo
+ for this_i, this_j in max_pairs:
+ # pump out diffs from before the synch point
+ yield from self._fancy_helper(a, last_i, this_i,
+ b, last_j, this_j)
+ # do intraline marking on the synch pair
+ aelt, belt = a[this_i], b[this_j]
+ if eqi is None:
+ # pump out a '-', '?', '+', '?' quad for the synched lines
+ atags = btags = ""
+ cruncher.set_seqs(aelt, belt)
+ for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
+ la, lb = ai2 - ai1, bj2 - bj1
+ if tag == 'replace':
+ atags += '^' * la
+ btags += '^' * lb
+ elif tag == 'delete':
+ atags += '-' * la
+ elif tag == 'insert':
+ btags += '+' * lb
+ elif tag == 'equal':
+ atags += ' ' * la
+ btags += ' ' * lb
+ else:
+ raise ValueError('unknown tag %r' % (tag,))
+ yield from self._qformat(aelt, belt, atags, btags)
+ else:
+ # the synch pair is identical
+ yield ' ' + aelt
+ last_i, last_j = this_i + 1, this_j + 1
- # pump out diffs from after the synch point
- yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
+ # pump out diffs from after the last synch point
+ yield from self._fancy_helper(a, last_i, ahi,
+ b, last_j, bhi)
def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
g = []
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index 6afd90a..bf6e5b1 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -272,6 +272,26 @@ class TestSFpatches(unittest.TestCase):
self.assertIn('content="text/html; charset=us-ascii"', output)
self.assertIn('&#305;mpl&#305;c&#305;t', output)
+class TestDiffer(unittest.TestCase):
+ def test_close_matches_aligned(self):
+ # Of the 4 closely matching pairs, we want 1 to match with 3,
+ # and 2 with 4, to align with a "top to bottom" mental model.
+ a = ["cat\n", "dog\n", "close match 1\n", "close match 2\n"]
+ b = ["close match 3\n", "close match 4\n", "kitten\n", "puppy\n"]
+ m = difflib.Differ().compare(a, b)
+ self.assertEqual(list(m),
+ ['- cat\n',
+ '- dog\n',
+ '- close match 1\n',
+ '? ^\n',
+ '+ close match 3\n',
+ '? ^\n',
+ '- close match 2\n',
+ '? ^\n',
+ '+ close match 4\n',
+ '? ^\n',
+ '+ kitten\n',
+ '+ puppy\n'])
class TestOutputFormat(unittest.TestCase):
def test_tab_delimiter(self):