summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Ma Lin <animalize@users.noreply.github.com> 2022-03-29 14:31:01 (GMT)
committer: GitHub <noreply@github.com> 2022-03-29 14:31:01 (GMT)
commit: 356997cccc21a3391175d20e9ef03d434675b496 (patch)
tree: 16392c0b0212d7680d04f0ccb85fa6e13d812a9a /Lib
parent: 788154919c2d843a0a995994bf2aed2d074761ec (diff)
download: cpython-356997cccc21a3391175d20e9ef03d434675b496.zip
cpython-356997cccc21a3391175d20e9ef03d434675b496.tar.gz
cpython-356997cccc21a3391175d20e9ef03d434675b496.tar.bz2
bpo-35859: Fix a few long-standing bugs in re engine (GH-12427)
In rare cases, a capturing group could get the wrong result. The regular expression engines in Perl and Java have similar bugs. The new behavior matches that of more modern RE engines: the regex module and the engines in PHP, Ruby and Node.js.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/test/test_re.py 69
1 files changed, 69 insertions, 0 deletions
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index fd6db6a..85716fb 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -2033,6 +2033,75 @@ class ReTests(unittest.TestCase):
{'tag': 'foo', 'text': None},
{'tag': 'foo', 'text': None}])
+ def test_MARK_PUSH_macro_bug(self):
+ # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
+ # was the only available mark.
+ self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
+ self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
+ self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
+ self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))
+
+ def test_MIN_UNTIL_mark_bug(self):
+ # Fixed in issue35859, reported in issue9134.
+ # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
+ s = 'axxzbcz'
+ p = r'(?:(?:a|bc)*?(xx)??z)*'
+ self.assertEqual(re.match(p, s).groups(), ('xx',))
+
+ # test-case provided by issue9134
+ s = 'xtcxyzxc'
+ p = r'((x|yz)+?(t)??c)*'
+ m = re.match(p, s)
+ self.assertEqual(m.span(), (0, 8))
+ self.assertEqual(m.span(2), (6, 7))
+ self.assertEqual(m.groups(), ('xyzxc', 'x', 't'))
+
+ def test_REPEAT_ONE_mark_bug(self):
+ # issue35859
+ # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
+ s = 'aabaab'
+ p = r'(?:[^b]*a(?=(b)|(a))ab)*'
+ m = re.match(p, s)
+ self.assertEqual(m.span(), (0, 6))
+ self.assertEqual(m.span(2), (4, 5))
+ self.assertEqual(m.groups(), (None, 'a'))
+
+ # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
+ s = 'abab'
+ p = r'(?:[^b]*(?=(b)|(a))ab)*'
+ m = re.match(p, s)
+ self.assertEqual(m.span(), (0, 4))
+ self.assertEqual(m.span(2), (2, 3))
+ self.assertEqual(m.groups(), (None, 'a'))
+
+ self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))
+
+ def test_MIN_REPEAT_ONE_mark_bug(self):
+ # issue35859
+ # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
+ s = 'abab'
+ p = r'(?:.*?(?=(a)|(b))b)*'
+ m = re.match(p, s)
+ self.assertEqual(m.span(), (0, 4))
+ self.assertEqual(m.span(2), (3, 4))
+ self.assertEqual(m.groups(), (None, 'b'))
+
+ s = 'axxzaz'
+ p = r'(?:a*?(xx)??z)*'
+ self.assertEqual(re.match(p, s).groups(), ('xx',))
+
+ def test_ASSERT_NOT_mark_bug(self):
+ # Fixed in issue35859, reported in issue725149.
+ # JUMP_ASSERT_NOT should LASTMARK_SAVE()
+ self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))
+
+ # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
+ m = re.match(r'((?!(ab)c)(.))*', 'abab')
+ self.assertEqual(m.span(), (0, 4))
+ self.assertEqual(m.span(1), (3, 4))
+ self.assertEqual(m.span(3), (3, 4))
+ self.assertEqual(m.groups(), ('b', None, 'b'))
+
def test_bug_40736(self):
with self.assertRaisesRegex(TypeError, "got 'int'"):
re.search("x*", 5)