summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpulkin <gpulkin@gmail.com>2024-05-19 21:46:37 (GMT)
committerGitHub <noreply@github.com>2024-05-19 21:46:37 (GMT)
commit0abf997e75bd3a8b76d920d33cc64d5e6c2d380f (patch)
tree3c53ca1a45a4706d853c0d5f56e59a208fe1408c
parent3c28510b984392b8dac87a17dfc5887366d5c4ab (diff)
downloadcpython-0abf997e75bd3a8b76d920d33cc64d5e6c2d380f.zip
cpython-0abf997e75bd3a8b76d920d33cc64d5e6c2d380f.tar.gz
cpython-0abf997e75bd3a8b76d920d33cc64d5e6c2d380f.tar.bz2
gh-119105: difflib: improve recursion for degenerate cases (#119131)
Code from https://github.com/pulkin, in PR https://github.com/python/cpython/pull/119131 Greatly speeds `Differ` when there are many identically scoring pairs, by splitting the recursion near the inputs' midpoints instead of degenerating (as now) into just peeling off the first two lines. Co-authored-by: Tim Peters <tim.peters@gmail.com>
-rw-r--r--Lib/difflib.py24
-rw-r--r--Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst1
2 files changed, 20 insertions, 5 deletions
diff --git a/Lib/difflib.py b/Lib/difflib.py
index ba0b256..54ca33d 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -911,16 +911,26 @@ class Differ:
# don't synch up unless the lines have a similarity score of at
# least cutoff; best_ratio tracks the best score seen so far
- best_ratio, cutoff = 0.74, 0.75
+ # best_ratio is a tuple storing the best .ratio() seen so far, and
+ # a measure of how far the indices are from their index range
+ # midpoints. The latter is used to resolve ratio ties. Favoring
+ # indices near the midpoints tends to cut the ranges in half. Else,
+ # if there are many pairs with the best ratio, recursion can grow
+ # very deep, and runtime becomes cubic. See:
+ # https://github.com/python/cpython/issues/119105
+ best_ratio, cutoff = (0.74, 0), 0.75
cruncher = SequenceMatcher(self.charjunk)
eqi, eqj = None, None # 1st indices of equal lines (if any)
# search for the pair that matches best without being identical
# (identical lines must be junk lines, & we don't want to synch up
# on junk -- unless we have to)
+ amid = (alo + ahi - 1) / 2
+ bmid = (blo + bhi - 1) / 2
for j in range(blo, bhi):
bj = b[j]
cruncher.set_seq2(bj)
+ weight_j = - abs(j - bmid)
for i in range(alo, ahi):
ai = a[i]
if ai == bj:
@@ -928,16 +938,20 @@ class Differ:
eqi, eqj = i, j
continue
cruncher.set_seq1(ai)
+ # weight is used to balance the recursion by prioritizing
+ # i and j in the middle of their ranges
+ weight = weight_j - abs(i - amid)
# computing similarity is expensive, so use the quick
# upper bounds first -- have seen this speed up messy
# compares by a factor of 3.
# note that ratio() is only expensive to compute the first
# time it's called on a sequence pair; the expensive part
# of the computation is cached by cruncher
- if cruncher.real_quick_ratio() > best_ratio and \
- cruncher.quick_ratio() > best_ratio and \
- cruncher.ratio() > best_ratio:
- best_ratio, best_i, best_j = cruncher.ratio(), i, j
+ if (cruncher.real_quick_ratio(), weight) > best_ratio and \
+ (cruncher.quick_ratio(), weight) > best_ratio and \
+ (cruncher.ratio(), weight) > best_ratio:
+ best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j
+ best_ratio, _ = best_ratio
if best_ratio < cutoff:
# no non-identical "pretty close" pair
if eqi is None:
diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
new file mode 100644
index 0000000..30b5f97
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
@@ -0,0 +1 @@
+``difflib.Differ`` is much faster for some cases of diffs where many pairs of lines are equally similar.