summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Tim Peters <tim.peters@gmail.com> 2020-05-06 02:28:24 (GMT)
committer GitHub <noreply@github.com> 2020-05-06 02:28:24 (GMT)
commit b9c46a2c2d7fc68457bff641f78932d66f5e5f59 (patch)
tree 7f7798c3f7ef5c476705bf2fdb349adddf14c354
parent 96074de573f82fc66a2bd73c36905141a3f1d5c1 (diff)
download cpython-b9c46a2c2d7fc68457bff641f78932d66f5e5f59.zip
cpython-b9c46a2c2d7fc68457bff641f78932d66f5e5f59.tar.gz
cpython-b9c46a2c2d7fc68457bff641f78932d66f5e5f59.tar.bz2
bpo-40480 "fnmatch" exponential execution time (GH-19908)
bpo-40480: create different regexps in the presence of multiple `*` patterns to prevent fnmatch() from taking exponential time.
-rw-r--r-- Lib/fnmatch.py 60
-rw-r--r-- Lib/test/test_fnmatch.py 17
-rw-r--r-- Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst 1
3 files changed, 71 insertions, 7 deletions
diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py
index b98e641..d7d915d 100644
--- a/Lib/fnmatch.py
+++ b/Lib/fnmatch.py
@@ -77,15 +77,19 @@ def translate(pat):
There is no way to quote meta-characters.
"""
+ STAR = object()
+ res = []
+ add = res.append
i, n = 0, len(pat)
- res = ''
while i < n:
c = pat[i]
i = i+1
if c == '*':
- res = res + '.*'
+ # compress consecutive `*` into one
+ if (not res) or res[-1] is not STAR:
+ add(STAR)
elif c == '?':
- res = res + '.'
+ add('.')
elif c == '[':
j = i
if j < n and pat[j] == '!':
@@ -95,7 +99,7 @@ def translate(pat):
while j < n and pat[j] != ']':
j = j+1
if j >= n:
- res = res + '\\['
+ add('\\[')
else:
stuff = pat[i:j]
if '--' not in stuff:
@@ -122,7 +126,49 @@ def translate(pat):
stuff = '^' + stuff[1:]
elif stuff[0] in ('^', '['):
stuff = '\\' + stuff
- res = '%s[%s]' % (res, stuff)
+ add(f'[{stuff}]')
else:
- res = res + re.escape(c)
- return r'(?s:%s)\Z' % res
+ add(re.escape(c))
+ assert i == n
+
+ # Deal with STARs.
+ inp = res
+ res = []
+ add = res.append
+ i, n = 0, len(inp)
+ # Fixed pieces at the start?
+ while i < n and inp[i] is not STAR:
+ add(inp[i])
+ i += 1
+ # Now deal with STAR fixed STAR fixed ...
+ # For an interior `STAR fixed` pairing, we want to do a minimal
+ # .*? match followed by `fixed`, with no possibility of backtracking.
+ # We can't spell that directly, but can trick it into working by matching
+ # .*?fixed
+ # in a lookahead assertion, save the matched part in a group, then
+ # consume that group via a backreference. If the overall match fails,
+ # the lookahead assertion won't try alternatives. So the translation is:
+ # (?=(?P<name>.*?fixed))(?P=name)
+ # Group names are created as needed: g1, g2, g3, ...
+ groupnum = 0
+ while i < n:
+ assert inp[i] is STAR
+ i += 1
+ if i == n:
+ add(".*")
+ break
+ assert inp[i] is not STAR
+ fixed = []
+ while i < n and inp[i] is not STAR:
+ fixed.append(inp[i])
+ i += 1
+ fixed = "".join(fixed)
+ if i == n:
+ add(".*")
+ add(fixed)
+ else:
+ groupnum += 1
+ add(f"(?=(?P<g{groupnum}>.*?{fixed}))(?P=g{groupnum})")
+ assert i == n
+ res = "".join(res)
+ return fr'(?s:{res})\Z'
diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py
index 55f9f0d..4c17306 100644
--- a/Lib/test/test_fnmatch.py
+++ b/Lib/test/test_fnmatch.py
@@ -45,6 +45,13 @@ class FnmatchTestCase(unittest.TestCase):
check('\nfoo', 'foo*', False)
check('\n', '*')
+ def test_slow_fnmatch(self):
+ check = self.check_match
+ check('a' * 50, '*a*a*a*a*a*a*a*a*a*a')
+ # The next "takes forever" if the regexp translation is
+ # straightforward. See bpo-40480.
+ check('a' * 50 + 'b', '*a*a*a*a*a*a*a*a*a*a', False)
+
def test_mix_bytes_str(self):
self.assertRaises(TypeError, fnmatch, 'test', b'*')
self.assertRaises(TypeError, fnmatch, b'test', '*')
@@ -107,6 +114,16 @@ class TranslateTestCase(unittest.TestCase):
self.assertEqual(translate('[!x]'), r'(?s:[^x])\Z')
self.assertEqual(translate('[^x]'), r'(?s:[\^x])\Z')
self.assertEqual(translate('[x'), r'(?s:\[x)\Z')
+ # from the docs
+ self.assertEqual(translate('*.txt'), r'(?s:.*\.txt)\Z')
+ # squash consecutive stars
+ self.assertEqual(translate('*********'), r'(?s:.*)\Z')
+ self.assertEqual(translate('A*********'), r'(?s:A.*)\Z')
+ self.assertEqual(translate('*********A'), r'(?s:.*A)\Z')
+ self.assertEqual(translate('A*********?[?]?'), r'(?s:A.*.[?].)\Z')
+ # fancy translation to prevent exponential-time match failure
+ self.assertEqual(translate('**a*a****a'),
+ r'(?s:(?=(?P<g1>.*?a))(?P=g1)(?=(?P<g2>.*?a))(?P=g2).*a)\Z')
class FilterTestCase(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst b/Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst
new file mode 100644
index 0000000..d046b14
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-05-04-21-21-43.bpo-40480.mjldWa.rst
@@ -0,0 +1 @@
+``fnmatch.fnmatch()`` could take exponential time in the presence of multiple ``*`` pattern characters. This was repaired by generating more elaborate regular expressions to avoid futile backtracking.
\ No newline at end of file