summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2022-03-21 16:28:22 (GMT)
committerGitHub <noreply@github.com>2022-03-21 16:28:22 (GMT)
commit345b390ed69f36681dbc41187bc8f49cd9135b54 (patch)
tree31ce6451bed718405b29bdb32c7eb4ff96fe5697
parent2bde6827ea4f136297b2d882480b981ff26262b6 (diff)
downloadcpython-345b390ed69f36681dbc41187bc8f49cd9135b54.zip
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.gz
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.bz2
bpo-433030: Add support of atomic grouping in regular expressions (GH-31982)
* Atomic grouping: (?>...). * Possessive quantifiers: x++, x*+, x?+, x{m,n}+. Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}). Co-authored-by: Jeffrey C. Jacobs <timehorse@users.sourceforge.net>
-rw-r--r--Doc/library/re.rst54
-rw-r--r--Doc/whatsnew/3.11.rst6
-rw-r--r--Lib/sre_compile.py38
-rw-r--r--Lib/sre_constants.py5
-rw-r--r--Lib/sre_parse.py32
-rw-r--r--Lib/test/test_re.py294
-rw-r--r--Misc/ACKS1
-rw-r--r--Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst2
-rw-r--r--Modules/_sre.c24
-rw-r--r--Modules/sre_constants.h31
-rw-r--r--Modules/sre_lib.h198
11 files changed, 593 insertions, 92 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 950a5b1..02c0a84 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -155,6 +155,30 @@ The special characters are:
only ``'<a>'``.
.. index::
+ single: *+; in regular expressions
+ single: ++; in regular expressions
+ single: ?+; in regular expressions
+
+``*+``, ``++``, ``?+``
+ Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is
+ appended also match as many times as possible.
+ However, unlike the true greedy qualifiers, these do not allow
+ back-tracking when the expression following it fails to match.
+ These are known as :dfn:`possessive` qualifiers.
+ For example, ``a*a`` will match ``'aaaa'`` because the ``a*`` will match
+ all 4 ``'a'``s, but, when the final ``'a'`` is encountered, the
+ expression is backtracked so that in the end the ``a*`` ends up matching
+ 3 ``'a'``s total, and the fourth ``'a'`` is matched by the final ``'a'``.
+ However, when ``a*+a`` is used to match ``'aaaa'``, the ``a*+`` will
+ match all 4 ``'a'``, but when the final ``'a'`` fails to find any more
+ characters to match, the expression cannot be backtracked and will thus
+ fail to match.
+ ``x*+``, ``x++`` and ``x?+`` are equivalent to ``(?>x*)``, ``(?>x+)``
+ and ``(?>x?)`` correspondigly.
+
+ .. versionadded:: 3.11
+
+.. index::
single: {} (curly brackets); in regular expressions
``{m}``
@@ -178,6 +202,21 @@ The special characters are:
6-character string ``'aaaaaa'``, ``a{3,5}`` will match 5 ``'a'`` characters,
while ``a{3,5}?`` will only match 3 characters.
+``{m,n}+``
+ Causes the resulting RE to match from *m* to *n* repetitions of the
+ preceding RE, attempting to match as many repetitions as possible
+ *without* establishing any backtracking points.
+ This is the possessive version of the qualifier above.
+ For example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa``
+ attempt to match 5 ``'a'`` characters, then, requiring 2 more ``'a'``s,
+ will need more characters than available and thus fail, while
+ ``a{3,5}aa`` will match with ``a{3,5}`` capturing 5, then 4 ``'a'``s
+ by backtracking and then the final 2 ``'a'``s are matched by the final
+ ``aa`` in the pattern.
+ ``x{m,n}+`` is equivalent to ``(?>x{m,n})``.
+
+ .. versionadded:: 3.11
+
.. index:: single: \ (backslash); in regular expressions
``\``
@@ -336,6 +375,21 @@ The special characters are:
.. versionchanged:: 3.7
The letters ``'a'``, ``'L'`` and ``'u'`` also can be used in a group.
+``(?>...)``
+ Attempts to match ``...`` as if it was a separate regular expression, and
+ if successful, continues to match the rest of the pattern following it.
+ If the subsequent pattern fails to match, the stack can only be unwound
+ to a point *before* the ``(?>...)`` because once exited, the expression,
+ known as an :dfn:`atomic group`, has thrown away all stack points within
+ itself.
+ Thus, ``(?>.*).`` would never match anything because first the ``.*``
+ would match all characters possible, then, having nothing left to match,
+ the final ``.`` would fail to match.
+ Since there are no stack points saved in the Atomic Group, and there is
+ no stack point before it, the entire expression would thus fail to match.
+
+ .. versionadded:: 3.11
+
.. index:: single: (?P<; in regular expressions
``(?P<name>...)``
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index b7e9dc6..ca7699d 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -295,6 +295,12 @@ os
instead of ``CryptGenRandom()`` which is deprecated.
(Contributed by Dong-hee Na in :issue:`44611`.)
+re
+--
+
+* Atomic grouping (``(?>...)``) and possessive qualifiers (``*+``, ``++``,
+ ``?+``, ``{m,n}+``) are now supported in regular expressions.
+ (Contributed by Jeffrey C. Jacobs and Serhiy Storchaka in :issue:`433030`.)
shutil
------
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index c6398bf..0867200 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -17,11 +17,16 @@ from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
-_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
+_REPEATING_CODES = {
+ MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
+ MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
+ POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
+}
+
# Sets of lowercase characters which have the same uppercase.
_equivalences = (
# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
@@ -138,10 +143,7 @@ def _compile(code, pattern, flags):
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,))
if _simple(av[2]):
- if op is MAX_REPEAT:
- emit(REPEAT_ONE)
- else:
- emit(MIN_REPEAT_ONE)
+ emit(REPEATING_CODES[op][2])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
@@ -149,16 +151,13 @@ def _compile(code, pattern, flags):
emit(SUCCESS)
code[skip] = _len(code) - skip
else:
- emit(REPEAT)
+ emit(REPEATING_CODES[op][0])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
- if op is MAX_REPEAT:
- emit(MAX_UNTIL)
- else:
- emit(MIN_UNTIL)
+ emit(REPEATING_CODES[op][1])
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
if group:
@@ -169,6 +168,17 @@ def _compile(code, pattern, flags):
if group:
emit(MARK)
emit((group-1)*2+1)
+ elif op is ATOMIC_GROUP:
+ # Atomic Groups are handled by starting with an Atomic
+ # Group op code, then putting in the atomic group pattern
+ # and finally a success op code to tell any repeat
+ # operations within the Atomic Group to stop eating and
+ # pop their stack if they reach it
+ emit(ATOMIC_GROUP)
+ skip = _len(code); emit(0)
+ _compile(code, av, flags)
+ emit(SUCCESS)
+ code[skip] = _len(code) - skip
elif op in SUCCESS_CODES:
emit(op)
elif op in ASSERT_CODES:
@@ -709,7 +719,8 @@ def dis(code):
else:
print_(FAILURE)
i += 1
- elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
+ elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
+ POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
skip, min, max = code[i: i+3]
if max == MAXREPEAT:
max = 'MAXREPEAT'
@@ -725,6 +736,11 @@ def dis(code):
print_(op, skip, arg, to=i+skip)
dis_(i+2, i+skip)
i += skip
+ elif op is ATOMIC_GROUP:
+ skip = code[i]
+ print_(op, skip, to=i+skip)
+ dis_(i+1, i+skip)
+ i += skip
elif op is INFO:
skip, flags, min, max = code[i: i+4]
if max == MAXREPEAT:
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 8e613cb..a00b017 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20171005
+MAGIC = 20220318
from _sre import MAXREPEAT, MAXGROUPS
@@ -97,6 +97,9 @@ OPCODES = _makecodes("""
REPEAT_ONE
SUBPATTERN
MIN_REPEAT_ONE
+ ATOMIC_GROUP
+ POSSESSIVE_REPEAT
+ POSSESSIVE_REPEAT_ONE
GROUPREF_IGNORE
IN_IGNORE
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index bb95107..b91082e 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -25,7 +25,7 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = frozenset(" \t\n\r\v\f")
-_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
+_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
ESCAPES = {
@@ -190,6 +190,10 @@ class SubPattern:
i, j = av.getwidth()
lo = lo + i
hi = hi + j
+ elif op is ATOMIC_GROUP:
+ i, j = av.getwidth()
+ lo = lo + i
+ hi = hi + j
elif op is SUBPATTERN:
i, j = av[-1].getwidth()
lo = lo + i
@@ -675,8 +679,13 @@ def _parse(source, state, verbose, nested, first=False):
if group is None and not add_flags and not del_flags:
item = p
if sourcematch("?"):
+ # Non-Greedy Match
subpattern[-1] = (MIN_REPEAT, (min, max, item))
+ elif sourcematch("+"):
+ # Possessive Match (Always Greedy)
+ subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
else:
+ # Greedy Match
subpattern[-1] = (MAX_REPEAT, (min, max, item))
elif this == ".":
@@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False):
elif this == "(":
start = source.tell() - 1
- group = True
+ capture = True
+ atomic = False
name = None
add_flags = 0
del_flags = 0
@@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False):
len(char) + 2)
elif char == ":":
# non-capturing group
- group = None
+ capture = False
elif char == "#":
# comment
while True:
@@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False):
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
continue
+ elif char == ">":
+ # non-capturing, atomic group
+ capture = False
+ atomic = True
elif char in FLAGS or char == "-":
# flags
flags = _parse_flags(source, state, char)
@@ -813,17 +827,19 @@ def _parse(source, state, verbose, nested, first=False):
continue
add_flags, del_flags = flags
- group = None
+ capture = False
else:
raise source.error("unknown extension ?" + char,
len(char) + 1)
# parse group contents
- if group is not None:
+ if capture:
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1) from None
+ else:
+ group = None
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose, nested + 1)
@@ -832,7 +848,11 @@ def _parse(source, state, verbose, nested, first=False):
source.tell() - start)
if group is not None:
state.closegroup(group, p)
- subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
+ if atomic:
+ assert group is None
+ subpatternappend((ATOMIC_GROUP, p))
+ else:
+ subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index f8bbe51..bde7509 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -83,6 +83,23 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
self.assertIsNone(re.match('a+', 'xxx'))
+ def test_branching(self):
+ """Test Branching
+ Test expressions using the OR ('|') operator."""
+ self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2))
+ self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(),
+ (0, 3))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(),
+ (0, 3))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(),
+ (0, 2))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(),
+ (0, 2))
+ self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1))
+ self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1))
+ self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1))
+
def bump_num(self, matchobj):
int_value = int(matchobj.group(0))
return str(int_value + 1)
@@ -1239,11 +1256,13 @@ class ReTests(unittest.TestCase):
'nothing to repeat', 3)
def test_multiple_repeat(self):
- for outer_reps in '*', '+', '{1,2}':
- for outer_mod in '', '?':
+ for outer_reps in '*', '+', '?', '{1,2}':
+ for outer_mod in '', '?', '+':
outer_op = outer_reps + outer_mod
for inner_reps in '*', '+', '?', '{1,2}':
- for inner_mod in '', '?':
+ for inner_mod in '', '?', '+':
+ if inner_mod + outer_reps in ('?', '+'):
+ continue
inner_op = inner_reps + inner_mod
self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
'multiple repeat', 1 + len(inner_op))
@@ -1458,7 +1477,8 @@ class ReTests(unittest.TestCase):
def test_dollar_matches_twice(self):
- "$ matches the end of string, and just before the terminating \n"
+ r"""Test that $ does not include \n
+ $ matches the end of string, and just before the terminating \n"""
pattern = re.compile('$')
self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
@@ -1774,60 +1794,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(m.group(1), "")
self.assertEqual(m.group(2), "y")
- @cpython_only
- def test_debug_flag(self):
- pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
- with captured_stdout() as out:
- re.compile(pat, re.DEBUG)
- self.maxDiff = None
- dump = '''\
-SUBPATTERN 1 0 0
- LITERAL 46
-BRANCH
- IN
- LITERAL 99
- LITERAL 104
-OR
- LITERAL 112
- LITERAL 121
-GROUPREF_EXISTS 1
- AT AT_END
-ELSE
- LITERAL 58
- LITERAL 32
-
- 0. INFO 8 0b1 2 5 (to 9)
- prefix_skip 0
- prefix [0x2e] ('.')
- overlap [0]
- 9: MARK 0
-11. LITERAL 0x2e ('.')
-13. MARK 1
-15. BRANCH 10 (to 26)
-17. IN 6 (to 24)
-19. LITERAL 0x63 ('c')
-21. LITERAL 0x68 ('h')
-23. FAILURE
-24: JUMP 9 (to 34)
-26: branch 7 (to 33)
-27. LITERAL 0x70 ('p')
-29. LITERAL 0x79 ('y')
-31. JUMP 2 (to 34)
-33: FAILURE
-34: GROUPREF_EXISTS 0 6 (to 41)
-37. AT END
-39. JUMP 5 (to 45)
-41: LITERAL 0x3a (':')
-43. LITERAL 0x20 (' ')
-45: SUCCESS
-'''
- self.assertEqual(out.getvalue(), dump)
- # Debug output is output again even a second time (bypassing
- # the cache -- issue #20426).
- with captured_stdout() as out:
- re.compile(pat, re.DEBUG)
- self.assertEqual(out.getvalue(), dump)
-
def test_keyword_parameters(self):
# Issue #20283: Accepting the string keyword parameter.
pat = re.compile(r'(ab)')
@@ -2072,6 +2038,218 @@ ELSE
with self.assertRaisesRegex(TypeError, "got 'type'"):
re.search("x*", type)
+ def test_possessive_qualifiers(self):
+ """Test Possessive Qualifiers
+ Test qualifiers of the form @+ for some repetition operator @,
+ e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed
+ without creating a stack frame for rolling the stack back and
+ trying 1 or more fewer matches."""
+ self.assertIsNone(re.match('e*+e', 'eeee'))
+ self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea')
+ self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea')
+ self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea')
+ self.assertIsNone(re.match('(.)++.', 'ee'))
+ self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',))
+ self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(),
+ ('ae',))
+ self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(),
+ ('',))
+ self.assertEqual(re.match('()*+a', 'a').groups(), ('',))
+ self.assertEqual(re.search('x*+', 'axx').span(), (0, 0))
+ self.assertEqual(re.search('x++', 'axx').span(), (1, 3))
+ self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0))
+ self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3))
+ self.assertIsNone(re.match('a++', 'xxx'))
+ self.assertIsNone(re.match(r"^(\w){1}+$", "abc"))
+ self.assertIsNone(re.match(r"^(\w){1,2}+$", "abc"))
+
+ self.assertEqual(re.match(r"^(\w){3}+$", "abc").group(1), "c")
+ self.assertEqual(re.match(r"^(\w){1,3}+$", "abc").group(1), "c")
+ self.assertEqual(re.match(r"^(\w){1,4}+$", "abc").group(1), "c")
+
+ self.assertIsNone(re.match("^x{1}+$", "xxx"))
+ self.assertIsNone(re.match("^x{1,2}+$", "xxx"))
+
+ self.assertTrue(re.match("^x{3}+$", "xxx"))
+ self.assertTrue(re.match("^x{1,3}+$", "xxx"))
+ self.assertTrue(re.match("^x{1,4}+$", "xxx"))
+
+ self.assertIsNone(re.match("^x{}+$", "xxx"))
+ self.assertTrue(re.match("^x{}+$", "x{}"))
+
+ def test_fullmatch_possessive_qualifiers(self):
+ self.assertTrue(re.fullmatch(r'a++', 'a'))
+ self.assertTrue(re.fullmatch(r'a*+', 'a'))
+ self.assertTrue(re.fullmatch(r'a?+', 'a'))
+ self.assertTrue(re.fullmatch(r'a{1,3}+', 'a'))
+ self.assertIsNone(re.fullmatch(r'a++', 'ab'))
+ self.assertIsNone(re.fullmatch(r'a*+', 'ab'))
+ self.assertIsNone(re.fullmatch(r'a?+', 'ab'))
+ self.assertIsNone(re.fullmatch(r'a{1,3}+', 'ab'))
+
+ self.assertTrue(re.fullmatch(r'(?:ab)++', 'ab'))
+ self.assertTrue(re.fullmatch(r'(?:ab)*+', 'ab'))
+ self.assertTrue(re.fullmatch(r'(?:ab)?+', 'ab'))
+ self.assertTrue(re.fullmatch(r'(?:ab){1,3}+', 'ab'))
+ self.assertIsNone(re.fullmatch(r'(?:ab)++', 'abc'))
+ self.assertIsNone(re.fullmatch(r'(?:ab)*+', 'abc'))
+ self.assertIsNone(re.fullmatch(r'(?:ab)?+', 'abc'))
+ self.assertIsNone(re.fullmatch(r'(?:ab){1,3}+', 'abc'))
+
+ def test_findall_possessive_qualifiers(self):
+ self.assertEqual(re.findall(r'a++', 'aab'), ['aa'])
+ self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', ''])
+ self.assertEqual(re.findall(r'a?+', 'aab'), ['a', 'a', '', ''])
+ self.assertEqual(re.findall(r'a{1,3}+', 'aab'), ['aa'])
+
+ self.assertEqual(re.findall(r'(?:ab)++', 'ababc'), ['abab'])
+ self.assertEqual(re.findall(r'(?:ab)*+', 'ababc'), ['abab', '', ''])
+ self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', ''])
+ self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab'])
+
+ def test_atomic_grouping(self):
+ """Test Atomic Grouping
+ Test non-capturing groups of the form (?>...), which does
+ not maintain any stack point created within the group once the
+ group is finished being evaluated."""
+ pattern1 = re.compile(r'a(?>bc|b)c')
+ self.assertIsNone(pattern1.match('abc'))
+ self.assertTrue(pattern1.match('abcc'))
+ self.assertIsNone(re.match(r'(?>.*).', 'abc'))
+ self.assertTrue(re.match(r'(?>x)++', 'xxx'))
+ self.assertTrue(re.match(r'(?>x++)', 'xxx'))
+ self.assertIsNone(re.match(r'(?>x)++x', 'xxx'))
+ self.assertIsNone(re.match(r'(?>x++)x', 'xxx'))
+
+ def test_fullmatch_atomic_grouping(self):
+ self.assertTrue(re.fullmatch(r'(?>a+)', 'a'))
+ self.assertTrue(re.fullmatch(r'(?>a*)', 'a'))
+ self.assertTrue(re.fullmatch(r'(?>a?)', 'a'))
+ self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a'))
+ self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab'))
+ self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab'))
+ self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab'))
+ self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab'))
+
+ self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab'))
+ self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab'))
+ self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab'))
+ self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab'))
+ self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc'))
+ self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc'))
+ self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc'))
+ self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc'))
+
+ def test_findall_atomic_grouping(self):
+ self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa'])
+ self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', ''])
+ self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 'a', '', ''])
+ self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa'])
+
+ self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab'])
+ self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', ''])
+ self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', ''])
+ self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab'])
+
+
+def get_debug_out(pat):
+ with captured_stdout() as out:
+ re.compile(pat, re.DEBUG)
+ return out.getvalue()
+
+
+@cpython_only
+class DebugTests(unittest.TestCase):
+ maxDiff = None
+
+ def test_debug_flag(self):
+ pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
+ dump = '''\
+SUBPATTERN 1 0 0
+ LITERAL 46
+BRANCH
+ IN
+ LITERAL 99
+ LITERAL 104
+OR
+ LITERAL 112
+ LITERAL 121
+GROUPREF_EXISTS 1
+ AT AT_END
+ELSE
+ LITERAL 58
+ LITERAL 32
+
+ 0. INFO 8 0b1 2 5 (to 9)
+ prefix_skip 0
+ prefix [0x2e] ('.')
+ overlap [0]
+ 9: MARK 0
+11. LITERAL 0x2e ('.')
+13. MARK 1
+15. BRANCH 10 (to 26)
+17. IN 6 (to 24)
+19. LITERAL 0x63 ('c')
+21. LITERAL 0x68 ('h')
+23. FAILURE
+24: JUMP 9 (to 34)
+26: branch 7 (to 33)
+27. LITERAL 0x70 ('p')
+29. LITERAL 0x79 ('y')
+31. JUMP 2 (to 34)
+33: FAILURE
+34: GROUPREF_EXISTS 0 6 (to 41)
+37. AT END
+39. JUMP 5 (to 45)
+41: LITERAL 0x3a (':')
+43. LITERAL 0x20 (' ')
+45: SUCCESS
+'''
+ self.assertEqual(get_debug_out(pat), dump)
+ # Debug output is output again even a second time (bypassing
+ # the cache -- issue #20426).
+ self.assertEqual(get_debug_out(pat), dump)
+
+ def test_atomic_group(self):
+ self.assertEqual(get_debug_out(r'(?>ab?)'), '''\
+ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))]
+
+ 0. INFO 4 0b0 1 2 (to 5)
+ 5: ATOMIC_GROUP 11 (to 17)
+ 7. LITERAL 0x61 ('a')
+ 9. REPEAT_ONE 6 0 1 (to 16)
+13. LITERAL 0x62 ('b')
+15. SUCCESS
+16: SUCCESS
+17: SUCCESS
+''')
+
+ def test_possesive_repeat_one(self):
+ self.assertEqual(get_debug_out(r'a?+'), '''\
+POSSESSIVE_REPEAT 0 1
+ LITERAL 97
+
+ 0. INFO 4 0b0 0 1 (to 5)
+ 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12)
+ 9. LITERAL 0x61 ('a')
+11. SUCCESS
+12: SUCCESS
+''')
+
+ def test_possesive_repeat(self):
+ self.assertEqual(get_debug_out(r'(?:ab)?+'), '''\
+POSSESSIVE_REPEAT 0 1
+ LITERAL 97
+ LITERAL 98
+
+ 0. INFO 4 0b0 0 2 (to 5)
+ 5: POSSESSIVE_REPEAT 7 0 1 (to 13)
+ 9. LITERAL 0x61 ('a')
+11. LITERAL 0x62 ('b')
+13: SUCCESS
+14. SUCCESS
+''')
+
class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):
diff --git a/Misc/ACKS b/Misc/ACKS
index 895813e..cfc15be 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -824,6 +824,7 @@ Ben Jackson
Paul Jackson
Manuel Jacob
David Jacobs
+Jeffrey C. Jacobs
Kevin Jacobs
Kjetil Jacobsen
Shantanu Jain
diff --git a/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst
new file mode 100644
index 0000000..1afed73
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst
@@ -0,0 +1,2 @@
+Add support of atomic grouping (``(?>...)``) and possessive qualifiers
+(``*+``, ``++``, ``?+``, ``{m,n}+``) in :mod:`regular expressions <re>`.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index ab321ea..35bdb4f 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -1807,6 +1807,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_REPEAT_ONE:
case SRE_OP_MIN_REPEAT_ONE:
+ case SRE_OP_POSSESSIVE_REPEAT_ONE:
{
SRE_CODE min, max;
GET_SKIP;
@@ -1826,8 +1827,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
break;
case SRE_OP_REPEAT:
+ case SRE_OP_POSSESSIVE_REPEAT:
{
- SRE_CODE min, max;
+ SRE_CODE op1 = op, min, max;
GET_SKIP;
GET_ARG; min = arg;
GET_ARG; max = arg;
@@ -1839,7 +1841,25 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
FAIL;
code += skip-3;
GET_OP;
- if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
+ if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
+ if (op != SRE_OP_SUCCESS)
+ FAIL;
+ }
+ else {
+ if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
+ FAIL;
+ }
+ }
+ break;
+
+ case SRE_OP_ATOMIC_GROUP:
+ {
+ GET_SKIP;
+ if (!_validate_inner(code, code+skip-2, groups))
+ FAIL;
+ code += skip-2;
+ GET_OP;
+ if (op != SRE_OP_SUCCESS)
FAIL;
}
break;
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index c8ccb32..8b9125b 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20171005
+#define SRE_MAGIC 20220318
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -40,19 +40,22 @@
#define SRE_OP_REPEAT_ONE 25
#define SRE_OP_SUBPATTERN 26
#define SRE_OP_MIN_REPEAT_ONE 27
-#define SRE_OP_GROUPREF_IGNORE 28
-#define SRE_OP_IN_IGNORE 29
-#define SRE_OP_LITERAL_IGNORE 30
-#define SRE_OP_NOT_LITERAL_IGNORE 31
-#define SRE_OP_GROUPREF_LOC_IGNORE 32
-#define SRE_OP_IN_LOC_IGNORE 33
-#define SRE_OP_LITERAL_LOC_IGNORE 34
-#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35
-#define SRE_OP_GROUPREF_UNI_IGNORE 36
-#define SRE_OP_IN_UNI_IGNORE 37
-#define SRE_OP_LITERAL_UNI_IGNORE 38
-#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39
-#define SRE_OP_RANGE_UNI_IGNORE 40
+#define SRE_OP_ATOMIC_GROUP 28
+#define SRE_OP_POSSESSIVE_REPEAT 29
+#define SRE_OP_POSSESSIVE_REPEAT_ONE 30
+#define SRE_OP_GROUPREF_IGNORE 31
+#define SRE_OP_IN_IGNORE 32
+#define SRE_OP_LITERAL_IGNORE 33
+#define SRE_OP_NOT_LITERAL_IGNORE 34
+#define SRE_OP_GROUPREF_LOC_IGNORE 35
+#define SRE_OP_IN_LOC_IGNORE 36
+#define SRE_OP_LITERAL_LOC_IGNORE 37
+#define SRE_OP_NOT_LITERAL_LOC_IGNORE 38
+#define SRE_OP_GROUPREF_UNI_IGNORE 39
+#define SRE_OP_IN_UNI_IGNORE 40
+#define SRE_OP_LITERAL_UNI_IGNORE 41
+#define SRE_OP_NOT_LITERAL_UNI_IGNORE 42
+#define SRE_OP_RANGE_UNI_IGNORE 43
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 32469cd..956fd3f 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -480,6 +480,9 @@ do { \
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
+#define JUMP_POSS_REPEAT_1 14
+#define JUMP_POSS_REPEAT_2 15
+#define JUMP_ATOMIC_GROUP 16
#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
DATA_ALLOC(SRE(match_context), nextctx); \
@@ -950,6 +953,60 @@ entrance:
}
RETURN_FAILURE;
+ case SRE_OP_POSSESSIVE_REPEAT_ONE:
+ /* match repeated sequence (maximizing regexp) without
+ backtracking */
+
+ /* this operator only works if the repeated item is
+ exactly one character wide, and we're not already
+ collecting backtracking points. for other cases,
+ use the MAX_REPEAT operator */
+
+ /* <POSSESSIVE_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS>
+ tail */
+
+ TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+ if (ctx->ptr + ctx->pattern[1] > end) {
+ RETURN_FAILURE; /* cannot match */
+ }
+
+ state->ptr = ctx->ptr;
+
+ ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]);
+ RETURN_ON_ERROR(ret);
+ DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
+ ctx->count = ret;
+ ctx->ptr += ctx->count;
+
+ /* when we arrive here, count contains the number of
+ matches, and ctx->ptr points to the tail of the target
+ string. check if the rest of the pattern matches,
+ and fail if not. */
+
+ /* Test for not enough repetitions in match */
+ if (ctx->count < (Py_ssize_t) ctx->pattern[1]) {
+ RETURN_FAILURE;
+ }
+
+ /* Update the pattern to point to the next op code */
+ ctx->pattern += ctx->pattern[0];
+
+ /* Let the tail be evaluated separately and consider this
+ match successful. */
+ if (*ctx->pattern == SRE_OP_SUCCESS &&
+ ctx->ptr == state->end &&
+ !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
+ {
+ /* tail is empty. we're finished */
+ state->ptr = ctx->ptr;
+ RETURN_SUCCESS;
+ }
+
+ /* Attempt to match the rest of the string */
+ break;
+
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
@@ -1110,6 +1167,138 @@ entrance:
state->ptr = ctx->ptr;
RETURN_FAILURE;
+ case SRE_OP_POSSESSIVE_REPEAT:
+ /* create possessive repeat contexts. */
+ /* <POSSESSIVE_REPEAT> <skip> <1=min> <2=max> pattern
+ <SUCCESS> tail */
+ TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+ /* Set the global Input pointer to this context's Input
+ pointer */
+ state->ptr = ctx->ptr;
+
+ /* Initialize Count to 0 */
+ ctx->count = 0;
+
+ /* Check for minimum required matches. */
+ while (ctx->count < (Py_ssize_t)ctx->pattern[1]) {
+ /* not enough matches */
+ DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1,
+ &ctx->pattern[3]);
+ if (ret) {
+ RETURN_ON_ERROR(ret);
+ ctx->count++;
+ }
+ else {
+ state->ptr = ctx->ptr;
+ RETURN_FAILURE;
+ }
+ }
+
+ /* Clear the context's Input stream pointer so that it
+ doesn't match the global state so that the while loop can
+ be entered. */
+ ctx->ptr = NULL;
+
+ /* Keep trying to parse the <pattern> sub-pattern until the
+ end is reached, creating a new context each time. */
+ while ((ctx->count < (Py_ssize_t)ctx->pattern[2] ||
+ (Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT) &&
+ state->ptr != ctx->ptr) {
+ /* Save the Capture Group Marker state into the current
+ Context and back up the current highest number
+ Capture Group marker. */
+ LASTMARK_SAVE();
+ MARK_PUSH(ctx->lastmark);
+
+ /* zero-width match protection */
+ /* Set the context's Input Stream pointer to be the
+ current Input Stream pointer from the global
+ state. When the loop reaches the next iteration,
+ the context will then store the last known good
+ position with the global state holding the Input
+ Input Stream position that has been updated with
+ the most recent match. Thus, if state's Input
+ stream remains the same as the one stored in the
+ current Context, we know we have successfully
+ matched an empty string and that all subsequent
+ matches will also be the empty string until the
+ maximum number of matches are counted, and because
+ of this, we could immediately stop at that point and
+ consider this match successful. */
+ ctx->ptr = state->ptr;
+
+ /* We have not reached the maximin matches, so try to
+ match once more. */
+ DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2,
+ &ctx->pattern[3]);
+
+ /* Check to see if the last attempted match
+ succeeded. */
+ if (ret) {
+ /* Drop the saved highest number Capture Group
+ marker saved above and use the newly updated
+ value. */
+ MARK_POP_DISCARD(ctx->lastmark);
+ RETURN_ON_ERROR(ret);
+
+ /* Success, increment the count. */
+ ctx->count++;
+ }
+ /* Last attempted match failed. */
+ else {
+ /* Restore the previously saved highest number
+ Capture Group marker since the last iteration
+ did not match, then restore that to the global
+ state. */
+ MARK_POP(ctx->lastmark);
+ LASTMARK_RESTORE();
+
+ /* We have sufficient matches, so exit loop. */
+ break;
+ }
+ }
+
+ /* Evaluate Tail */
+ /* Jump to end of pattern indicated by skip, and then skip
+ the SUCCESS op code that follows it. */
+ ctx->pattern += ctx->pattern[0] + 1;
+ ctx->ptr = state->ptr;
+ break;
+
+ case SRE_OP_ATOMIC_GROUP:
+ /* Atomic Group Sub Pattern */
+ /* <ATOMIC_GROUP> <skip> pattern <SUCCESS> tail */
+ TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+
+ /* Set the global Input pointer to this context's Input
+ pointer */
+ state->ptr = ctx->ptr;
+
+ /* Evaluate the Atomic Group in a new context, terminating
+ when the end of the group, represented by a SUCCESS op
+ code, is reached. */
+ /* Group Pattern begins at an offset of 1 code. */
+ DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group,
+ &ctx->pattern[1]);
+
+ /* Test Exit Condition */
+ RETURN_ON_ERROR(ret);
+
+ if (ret == 0) {
+ /* Atomic Group failed to Match. */
+ state->ptr = ctx->ptr;
+ RETURN_FAILURE;
+ }
+
+ /* Evaluate Tail */
+ /* Jump to end of pattern indicated by skip, and then skip
+ the SUCCESS op code that follows it. */
+ ctx->pattern += ctx->pattern[0];
+ ctx->ptr = state->ptr;
+ break;
+
case SRE_OP_GROUPREF:
/* match backreference */
TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
@@ -1306,6 +1495,12 @@ exit:
case JUMP_MIN_UNTIL_1:
TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
goto jump_min_until_1;
+ case JUMP_POSS_REPEAT_1:
+ TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr));
+ goto jump_poss_repeat_1;
+ case JUMP_POSS_REPEAT_2:
+ TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr));
+ goto jump_poss_repeat_2;
case JUMP_REPEAT:
TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
goto jump_repeat;
@@ -1318,6 +1513,9 @@ exit:
case JUMP_MIN_REPEAT_ONE:
TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
goto jump_min_repeat_one;
+ case JUMP_ATOMIC_GROUP:
+ TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+ goto jump_atomic_group;
case JUMP_ASSERT:
TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
goto jump_assert;