bpo-433030: Add support of atomic grouping in regular expressions (GH-31982)

* Atomic grouping: (?>...). * Possessive quantifiers: x++, x*+, x?+, x{m,n}+. Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}). Co-authored-by: Jeffrey C. Jacobs <timehorse@users.sourceforge.net>
author: Serhiy Storchaka <storchaka@gmail.com> 2022-03-21 16:28:22 (GMT)
committer: GitHub <noreply@github.com> 2022-03-21 16:28:22 (GMT)
commit: 345b390ed69f36681dbc41187bc8f49cd9135b54 (patch)
tree: 31ce6451bed718405b29bdb32c7eb4ff96fe5697
parent: 2bde6827ea4f136297b2d882480b981ff26262b6 (diff)
download: cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.zip
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.gz
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.bz2
11 files changed, 593 insertions, 92 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 950a5b1..02c0a84 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -155,6 +155,30 @@ The special characters are:
    only ``'<a>'``.
 
 .. index::
+   single: *+; in regular expressions
+   single: ++; in regular expressions
+   single: ?+; in regular expressions
+
+``*+``, ``++``, ``?+``
+  Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is
+  appended also match as many times as possible.
+  However, unlike the true greedy qualifiers, these do not allow
+  back-tracking when the expression following it fails to match.
+  These are known as :dfn:`possessive` qualifiers.
+  For example, ``a*a`` will match ``'aaaa'`` because the ``a*`` will match
+  all 4 ``'a'``s, but, when the final ``'a'`` is encountered, the
+  expression is backtracked so that in the end the ``a*`` ends up matching
+  3 ``'a'``s total, and the fourth ``'a'`` is matched by the final ``'a'``.
+  However, when ``a*+a`` is used to match ``'aaaa'``, the ``a*+`` will
+  match all 4 ``'a'``, but when the final ``'a'`` fails to find any more
+  characters to match, the expression cannot be backtracked and will thus
+  fail to match.
+  ``x*+``, ``x++`` and ``x?+`` are equivalent to ``(?>x*)``, ``(?>x+)``
+  and ``(?>x?)`` correspondigly.
+
+   .. versionadded:: 3.11
+
+.. index::
    single: {} (curly brackets); in regular expressions
 
 ``{m}``
@@ -178,6 +202,21 @@ The special characters are:
    6-character string ``'aaaaaa'``, ``a{3,5}`` will match 5 ``'a'`` characters,
    while ``a{3,5}?`` will only match 3 characters.
 
+``{m,n}+``
+   Causes the resulting RE to match from *m* to *n* repetitions of the
+   preceding RE, attempting to match as many repetitions as possible
+   *without* establishing any backtracking points.
+   This is the possessive version of the qualifier above.
+   For example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa``
+   attempt to match 5 ``'a'`` characters, then, requiring 2 more ``'a'``s,
+   will need more characters than available and thus fail, while
+   ``a{3,5}aa`` will match with ``a{3,5}`` capturing 5, then 4 ``'a'``s
+   by backtracking and then the final 2 ``'a'``s are matched by the final
+   ``aa`` in the pattern.
+   ``x{m,n}+`` is equivalent to ``(?>x{m,n})``.
+
+   .. versionadded:: 3.11
+
 .. index:: single: \ (backslash); in regular expressions
 
 ``\``
@@ -336,6 +375,21 @@ The special characters are:
    .. versionchanged:: 3.7
       The letters ``'a'``, ``'L'`` and ``'u'`` also can be used in a group.
 
+``(?>...)``
+   Attempts to match ``...`` as if it was a separate regular expression, and
+   if successful, continues to match the rest of the pattern following it.
+   If the subsequent pattern fails to match, the stack can only be unwound
+   to a point *before* the ``(?>...)`` because once exited, the expression,
+   known as an :dfn:`atomic group`, has thrown away all stack points within
+   itself.
+   Thus, ``(?>.*).`` would never match anything because first the ``.*``
+   would match all characters possible, then, having nothing left to match,
+   the final ``.`` would fail to match.
+   Since there are no stack points saved in the Atomic Group, and there is
+   no stack point before it, the entire expression would thus fail to match.
+
+   .. versionadded:: 3.11
+
 .. index:: single: (?P<; in regular expressions
 
 ``(?P<name>...)``
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index b7e9dc6..ca7699d 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -295,6 +295,12 @@ os
   instead of ``CryptGenRandom()`` which is deprecated.
   (Contributed by Dong-hee Na in :issue:`44611`.)
 
+re
+--
+
+* Atomic grouping (``(?>...)``) and possessive qualifiers (``*+``, ``++``,
+  ``?+``, ``{m,n}+``) are now supported in regular expressions.
+  (Contributed by Jeffrey C. Jacobs and Serhiy Storchaka in :issue:`433030`.)
 
 shutil
 ------
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index c6398bf..0867200 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -17,11 +17,16 @@ from sre_constants import *
 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
 
 _LITERAL_CODES = {LITERAL, NOT_LITERAL}
-_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
 _SUCCESS_CODES = {SUCCESS, FAILURE}
 _ASSERT_CODES = {ASSERT, ASSERT_NOT}
 _UNIT_CODES = _LITERAL_CODES | {ANY, IN}
 
+_REPEATING_CODES = {
+    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
+    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
+    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
+}
+
 # Sets of lowercase characters which have the same uppercase.
 _equivalences = (
     # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
@@ -138,10 +143,7 @@ def _compile(code, pattern, flags):
             if flags & SRE_FLAG_TEMPLATE:
                 raise error("internal: unsupported template operator %r" % (op,))
             if _simple(av[2]):
-                if op is MAX_REPEAT:
-                    emit(REPEAT_ONE)
-                else:
-                    emit(MIN_REPEAT_ONE)
+                emit(REPEATING_CODES[op][2])
                 skip = _len(code); emit(0)
                 emit(av[0])
                 emit(av[1])
@@ -149,16 +151,13 @@ def _compile(code, pattern, flags):
                 emit(SUCCESS)
                 code[skip] = _len(code) - skip
             else:
-                emit(REPEAT)
+                emit(REPEATING_CODES[op][0])
                 skip = _len(code); emit(0)
                 emit(av[0])
                 emit(av[1])
                 _compile(code, av[2], flags)
                 code[skip] = _len(code) - skip
-                if op is MAX_REPEAT:
-                    emit(MAX_UNTIL)
-                else:
-                    emit(MIN_UNTIL)
+                emit(REPEATING_CODES[op][1])
         elif op is SUBPATTERN:
             group, add_flags, del_flags, p = av
             if group:
@@ -169,6 +168,17 @@ def _compile(code, pattern, flags):
             if group:
                 emit(MARK)
                 emit((group-1)*2+1)
+        elif op is ATOMIC_GROUP:
+            # Atomic Groups are handled by starting with an Atomic
+            # Group op code, then putting in the atomic group pattern
+            # and finally a success op code to tell any repeat
+            # operations within the Atomic Group to stop eating and
+            # pop their stack if they reach it
+            emit(ATOMIC_GROUP)
+            skip = _len(code); emit(0)
+            _compile(code, av, flags)
+            emit(SUCCESS)
+            code[skip] = _len(code) - skip
         elif op in SUCCESS_CODES:
             emit(op)
         elif op in ASSERT_CODES:
@@ -709,7 +719,8 @@ def dis(code):
                     else:
                         print_(FAILURE)
                 i += 1
-            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
+            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
+                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
                 skip, min, max = code[i: i+3]
                 if max == MAXREPEAT:
                     max = 'MAXREPEAT'
@@ -725,6 +736,11 @@ def dis(code):
                 print_(op, skip, arg, to=i+skip)
                 dis_(i+2, i+skip)
                 i += skip
+            elif op is ATOMIC_GROUP:
+                skip = code[i]
+                print_(op, skip, to=i+skip)
+                dis_(i+1, i+skip)
+                i += skip
             elif op is INFO:
                 skip, flags, min, max = code[i: i+4]
                 if max == MAXREPEAT:
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 8e613cb..a00b017 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20171005
+MAGIC = 20220318
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -97,6 +97,9 @@ OPCODES = _makecodes("""
     REPEAT_ONE
     SUBPATTERN
     MIN_REPEAT_ONE
+    ATOMIC_GROUP
+    POSSESSIVE_REPEAT
+    POSSESSIVE_REPEAT_ONE
 
     GROUPREF_IGNORE
     IN_IGNORE
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index bb95107..b91082e 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -25,7 +25,7 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 
 WHITESPACE = frozenset(" \t\n\r\v\f")
 
-_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
+_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
 
 ESCAPES = {
@@ -190,6 +190,10 @@ class SubPattern:
                 i, j = av.getwidth()
                 lo = lo + i
                 hi = hi + j
+            elif op is ATOMIC_GROUP:
+                i, j = av.getwidth()
+                lo = lo + i
+                hi = hi + j
             elif op is SUBPATTERN:
                 i, j = av[-1].getwidth()
                 lo = lo + i
@@ -675,8 +679,13 @@ def _parse(source, state, verbose, nested, first=False):
                 if group is None and not add_flags and not del_flags:
                     item = p
             if sourcematch("?"):
+                # Non-Greedy Match
                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
+            elif sourcematch("+"):
+                # Possessive Match (Always Greedy)
+                subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
             else:
+                # Greedy Match
                 subpattern[-1] = (MAX_REPEAT, (min, max, item))
 
         elif this == ".":
@@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False):
 
         elif this == "(":
             start = source.tell() - 1
-            group = True
+            capture = True
+            atomic = False
             name = None
             add_flags = 0
             del_flags = 0
@@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False):
                                            len(char) + 2)
                 elif char == ":":
                     # non-capturing group
-                    group = None
+                    capture = False
                 elif char == "#":
                     # comment
                     while True:
@@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False):
                     subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
                     continue
 
+                elif char == ">":
+                    # non-capturing, atomic group
+                    capture = False
+                    atomic = True
                 elif char in FLAGS or char == "-":
                     # flags
                     flags = _parse_flags(source, state, char)
@@ -813,17 +827,19 @@ def _parse(source, state, verbose, nested, first=False):
                         continue
 
                     add_flags, del_flags = flags
-                    group = None
+                    capture = False
                 else:
                     raise source.error("unknown extension ?" + char,
                                        len(char) + 1)
 
             # parse group contents
-            if group is not None:
+            if capture:
                 try:
                     group = state.opengroup(name)
                 except error as err:
                     raise source.error(err.msg, len(name) + 1) from None
+            else:
+                group = None
             sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                            not (del_flags & SRE_FLAG_VERBOSE))
             p = _parse_sub(source, state, sub_verbose, nested + 1)
@@ -832,7 +848,11 @@ def _parse(source, state, verbose, nested, first=False):
                                    source.tell() - start)
             if group is not None:
                 state.closegroup(group, p)
-            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
+            if atomic:
+                assert group is None
+                subpatternappend((ATOMIC_GROUP, p))
+            else:
+                subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
 
         elif this == "^":
             subpatternappend((AT, AT_BEGINNING))
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index f8bbe51..bde7509 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -83,6 +83,23 @@ class ReTests(unittest.TestCase):
         self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
         self.assertIsNone(re.match('a+', 'xxx'))
 
+    def test_branching(self):
+        """Test Branching
+        Test expressions using the OR ('|') operator."""
+        self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2))
+        self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2))
+        self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(),
+                         (0, 3))
+        self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(),
+                         (0, 3))
+        self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(),
+                         (0, 2))
+        self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(),
+                         (0, 2))
+        self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1))
+        self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1))
+        self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1))
+
     def bump_num(self, matchobj):
         int_value = int(matchobj.group(0))
         return str(int_value + 1)
@@ -1239,11 +1256,13 @@ class ReTests(unittest.TestCase):
                                        'nothing to repeat', 3)
 
     def test_multiple_repeat(self):
-        for outer_reps in '*', '+', '{1,2}':
-            for outer_mod in '', '?':
+        for outer_reps in '*', '+', '?', '{1,2}':
+            for outer_mod in '', '?', '+':
                 outer_op = outer_reps + outer_mod
                 for inner_reps in '*', '+', '?', '{1,2}':
-                    for inner_mod in '', '?':
+                    for inner_mod in '', '?', '+':
+                        if inner_mod + outer_reps in ('?', '+'):
+                            continue
                         inner_op = inner_reps + inner_mod
                         self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                 'multiple repeat', 1 + len(inner_op))
@@ -1458,7 +1477,8 @@ class ReTests(unittest.TestCase):
 
 
     def test_dollar_matches_twice(self):
-        "$ matches the end of string, and just before the terminating \n"
+        r"""Test that $ does not include \n
+        $ matches the end of string, and just before the terminating \n"""
         pattern = re.compile('$')
         self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
@@ -1774,60 +1794,6 @@ class ReTests(unittest.TestCase):
                 self.assertEqual(m.group(1), "")
                 self.assertEqual(m.group(2), "y")
 
-    @cpython_only
-    def test_debug_flag(self):
-        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
-        with captured_stdout() as out:
-            re.compile(pat, re.DEBUG)
-        self.maxDiff = None
-        dump = '''\
-SUBPATTERN 1 0 0
-  LITERAL 46
-BRANCH
-  IN
-    LITERAL 99
-    LITERAL 104
-OR
-  LITERAL 112
-  LITERAL 121
-GROUPREF_EXISTS 1
-  AT AT_END
-ELSE
-  LITERAL 58
-  LITERAL 32
-
- 0. INFO 8 0b1 2 5 (to 9)
-      prefix_skip 0
-      prefix [0x2e] ('.')
-      overlap [0]
- 9: MARK 0
-11. LITERAL 0x2e ('.')
-13. MARK 1
-15. BRANCH 10 (to 26)
-17.   IN 6 (to 24)
-19.     LITERAL 0x63 ('c')
-21.     LITERAL 0x68 ('h')
-23.     FAILURE
-24:   JUMP 9 (to 34)
-26: branch 7 (to 33)
-27.   LITERAL 0x70 ('p')
-29.   LITERAL 0x79 ('y')
-31.   JUMP 2 (to 34)
-33: FAILURE
-34: GROUPREF_EXISTS 0 6 (to 41)
-37. AT END
-39. JUMP 5 (to 45)
-41: LITERAL 0x3a (':')
-43. LITERAL 0x20 (' ')
-45: SUCCESS
-'''
-        self.assertEqual(out.getvalue(), dump)
-        # Debug output is output again even a second time (bypassing
-        # the cache -- issue #20426).
-        with captured_stdout() as out:
-            re.compile(pat, re.DEBUG)
-        self.assertEqual(out.getvalue(), dump)
-
     def test_keyword_parameters(self):
         # Issue #20283: Accepting the string keyword parameter.
         pat = re.compile(r'(ab)')
@@ -2072,6 +2038,218 @@ ELSE
         with self.assertRaisesRegex(TypeError, "got 'type'"):
             re.search("x*", type)
 
+    def test_possessive_qualifiers(self):
+        """Test Possessive Qualifiers
+        Test qualifiers of the form @+ for some repetition operator @,
+        e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed
+        without creating a stack frame for rolling the stack back and
+        trying 1 or more fewer matches."""
+        self.assertIsNone(re.match('e*+e', 'eeee'))
+        self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea')
+        self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea')
+        self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea')
+        self.assertIsNone(re.match('(.)++.', 'ee'))
+        self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',))
+        self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(),
+                         ('ae',))
+        self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(),
+                         ('',))
+        self.assertEqual(re.match('()*+a', 'a').groups(), ('',))
+        self.assertEqual(re.search('x*+', 'axx').span(), (0, 0))
+        self.assertEqual(re.search('x++', 'axx').span(), (1, 3))
+        self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0))
+        self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3))
+        self.assertIsNone(re.match('a++', 'xxx'))
+        self.assertIsNone(re.match(r"^(\w){1}+$", "abc"))
+        self.assertIsNone(re.match(r"^(\w){1,2}+$", "abc"))
+
+        self.assertEqual(re.match(r"^(\w){3}+$", "abc").group(1), "c")
+        self.assertEqual(re.match(r"^(\w){1,3}+$", "abc").group(1), "c")
+        self.assertEqual(re.match(r"^(\w){1,4}+$", "abc").group(1), "c")
+
+        self.assertIsNone(re.match("^x{1}+$", "xxx"))
+        self.assertIsNone(re.match("^x{1,2}+$", "xxx"))
+
+        self.assertTrue(re.match("^x{3}+$", "xxx"))
+        self.assertTrue(re.match("^x{1,3}+$", "xxx"))
+        self.assertTrue(re.match("^x{1,4}+$", "xxx"))
+
+        self.assertIsNone(re.match("^x{}+$", "xxx"))
+        self.assertTrue(re.match("^x{}+$", "x{}"))
+
+    def test_fullmatch_possessive_qualifiers(self):
+        self.assertTrue(re.fullmatch(r'a++', 'a'))
+        self.assertTrue(re.fullmatch(r'a*+', 'a'))
+        self.assertTrue(re.fullmatch(r'a?+', 'a'))
+        self.assertTrue(re.fullmatch(r'a{1,3}+', 'a'))
+        self.assertIsNone(re.fullmatch(r'a++', 'ab'))
+        self.assertIsNone(re.fullmatch(r'a*+', 'ab'))
+        self.assertIsNone(re.fullmatch(r'a?+', 'ab'))
+        self.assertIsNone(re.fullmatch(r'a{1,3}+', 'ab'))
+
+        self.assertTrue(re.fullmatch(r'(?:ab)++', 'ab'))
+        self.assertTrue(re.fullmatch(r'(?:ab)*+', 'ab'))
+        self.assertTrue(re.fullmatch(r'(?:ab)?+', 'ab'))
+        self.assertTrue(re.fullmatch(r'(?:ab){1,3}+', 'ab'))
+        self.assertIsNone(re.fullmatch(r'(?:ab)++', 'abc'))
+        self.assertIsNone(re.fullmatch(r'(?:ab)*+', 'abc'))
+        self.assertIsNone(re.fullmatch(r'(?:ab)?+', 'abc'))
+        self.assertIsNone(re.fullmatch(r'(?:ab){1,3}+', 'abc'))
+
+    def test_findall_possessive_qualifiers(self):
+        self.assertEqual(re.findall(r'a++', 'aab'), ['aa'])
+        self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', ''])
+        self.assertEqual(re.findall(r'a?+', 'aab'), ['a', 'a', '', ''])
+        self.assertEqual(re.findall(r'a{1,3}+', 'aab'), ['aa'])
+
+        self.assertEqual(re.findall(r'(?:ab)++', 'ababc'), ['abab'])
+        self.assertEqual(re.findall(r'(?:ab)*+', 'ababc'), ['abab', '', ''])
+        self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', ''])
+        self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab'])
+
+    def test_atomic_grouping(self):
+        """Test Atomic Grouping
+        Test non-capturing groups of the form (?>...), which does
+        not maintain any stack point created within the group once the
+        group is finished being evaluated."""
+        pattern1 = re.compile(r'a(?>bc|b)c')
+        self.assertIsNone(pattern1.match('abc'))
+        self.assertTrue(pattern1.match('abcc'))
+        self.assertIsNone(re.match(r'(?>.*).', 'abc'))
+        self.assertTrue(re.match(r'(?>x)++', 'xxx'))
+        self.assertTrue(re.match(r'(?>x++)', 'xxx'))
+        self.assertIsNone(re.match(r'(?>x)++x', 'xxx'))
+        self.assertIsNone(re.match(r'(?>x++)x', 'xxx'))
+
+    def test_fullmatch_atomic_grouping(self):
+        self.assertTrue(re.fullmatch(r'(?>a+)', 'a'))
+        self.assertTrue(re.fullmatch(r'(?>a*)', 'a'))
+        self.assertTrue(re.fullmatch(r'(?>a?)', 'a'))
+        self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a'))
+        self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab'))
+        self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab'))
+        self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab'))
+        self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab'))
+
+        self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab'))
+        self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab'))
+        self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab'))
+        self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab'))
+        self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc'))
+        self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc'))
+        self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc'))
+        self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc'))
+
+    def test_findall_atomic_grouping(self):
+        self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa'])
+        self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', ''])
+        self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 'a', '', ''])
+        self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa'])
+
+        self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab'])
+        self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', ''])
+        self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', ''])
+        self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab'])
+
+
+def get_debug_out(pat):
+    with captured_stdout() as out:
+        re.compile(pat, re.DEBUG)
+    return out.getvalue()
+
+
+@cpython_only
+class DebugTests(unittest.TestCase):
+    maxDiff = None
+
+    def test_debug_flag(self):
+        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
+        dump = '''\
+SUBPATTERN 1 0 0
+  LITERAL 46
+BRANCH
+  IN
+    LITERAL 99
+    LITERAL 104
+OR
+  LITERAL 112
+  LITERAL 121
+GROUPREF_EXISTS 1
+  AT AT_END
+ELSE
+  LITERAL 58
+  LITERAL 32
+
+ 0. INFO 8 0b1 2 5 (to 9)
+      prefix_skip 0
+      prefix [0x2e] ('.')
+      overlap [0]
+ 9: MARK 0
+11. LITERAL 0x2e ('.')
+13. MARK 1
+15. BRANCH 10 (to 26)
+17.   IN 6 (to 24)
+19.     LITERAL 0x63 ('c')
+21.     LITERAL 0x68 ('h')
+23.     FAILURE
+24:   JUMP 9 (to 34)
+26: branch 7 (to 33)
+27.   LITERAL 0x70 ('p')
+29.   LITERAL 0x79 ('y')
+31.   JUMP 2 (to 34)
+33: FAILURE
+34: GROUPREF_EXISTS 0 6 (to 41)
+37. AT END
+39. JUMP 5 (to 45)
+41: LITERAL 0x3a (':')
+43. LITERAL 0x20 (' ')
+45: SUCCESS
+'''
+        self.assertEqual(get_debug_out(pat), dump)
+        # Debug output is output again even a second time (bypassing
+        # the cache -- issue #20426).
+        self.assertEqual(get_debug_out(pat), dump)
+
+    def test_atomic_group(self):
+        self.assertEqual(get_debug_out(r'(?>ab?)'), '''\
+ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))]
+
+ 0. INFO 4 0b0 1 2 (to 5)
+ 5: ATOMIC_GROUP 11 (to 17)
+ 7.   LITERAL 0x61 ('a')
+ 9.   REPEAT_ONE 6 0 1 (to 16)
+13.     LITERAL 0x62 ('b')
+15.     SUCCESS
+16:   SUCCESS
+17: SUCCESS
+''')
+
+    def test_possesive_repeat_one(self):
+        self.assertEqual(get_debug_out(r'a?+'), '''\
+POSSESSIVE_REPEAT 0 1
+  LITERAL 97
+
+ 0. INFO 4 0b0 0 1 (to 5)
+ 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12)
+ 9.   LITERAL 0x61 ('a')
+11.   SUCCESS
+12: SUCCESS
+''')
+
+    def test_possesive_repeat(self):
+        self.assertEqual(get_debug_out(r'(?:ab)?+'), '''\
+POSSESSIVE_REPEAT 0 1
+  LITERAL 97
+  LITERAL 98
+
+ 0. INFO 4 0b0 0 2 (to 5)
+ 5: POSSESSIVE_REPEAT 7 0 1 (to 13)
+ 9.   LITERAL 0x61 ('a')
+11.   LITERAL 0x62 ('b')
+13: SUCCESS
+14. SUCCESS
+''')
+
 
 class PatternReprTests(unittest.TestCase):
     def check(self, pattern, expected):
diff --git a/Misc/ACKS b/Misc/ACKS
index 895813e..cfc15be 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -824,6 +824,7 @@ Ben Jackson
 Paul Jackson
 Manuel Jacob
 David Jacobs
+Jeffrey C. Jacobs
 Kevin Jacobs
 Kjetil Jacobsen
 Shantanu Jain
diff --git a/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst
new file mode 100644
index 0000000..1afed73
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst
@@ -0,0 +1,2 @@
+Add support of atomic grouping (``(?>...)``) and possessive qualifiers
+(``*+``, ``++``, ``?+``, ``{m,n}+``) in :mod:`regular expressions <re>`.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index ab321ea..35bdb4f 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -1807,6 +1807,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
 
         case SRE_OP_REPEAT_ONE:
         case SRE_OP_MIN_REPEAT_ONE:
+        case SRE_OP_POSSESSIVE_REPEAT_ONE:
             {
                 SRE_CODE min, max;
                 GET_SKIP;
@@ -1826,8 +1827,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
             break;
 
         case SRE_OP_REPEAT:
+        case SRE_OP_POSSESSIVE_REPEAT:
             {
-                SRE_CODE min, max;
+                SRE_CODE op1 = op, min, max;
                 GET_SKIP;
                 GET_ARG; min = arg;
                 GET_ARG; max = arg;
@@ -1839,7 +1841,25 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
                     FAIL;
                 code += skip-3;
                 GET_OP;
-                if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
+                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
+                    if (op != SRE_OP_SUCCESS)
+                        FAIL;
+                }
+                else {
+                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
+                        FAIL;
+                }
+            }
+            break;
+
+        case SRE_OP_ATOMIC_GROUP:
+            {
+                GET_SKIP;
+                if (!_validate_inner(code, code+skip-2, groups))
+                    FAIL;
+                code += skip-2;
+                GET_OP;
+                if (op != SRE_OP_SUCCESS)
                     FAIL;
             }
             break;
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index c8ccb32..8b9125b 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
  * See the _sre.c file for information on usage and redistribution.
  */
 
-#define SRE_MAGIC 20171005
+#define SRE_MAGIC 20220318
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
@@ -40,19 +40,22 @@
 #define SRE_OP_REPEAT_ONE 25
 #define SRE_OP_SUBPATTERN 26
 #define SRE_OP_MIN_REPEAT_ONE 27
-#define SRE_OP_GROUPREF_IGNORE 28
-#define SRE_OP_IN_IGNORE 29
-#define SRE_OP_LITERAL_IGNORE 30
-#define SRE_OP_NOT_LITERAL_IGNORE 31
-#define SRE_OP_GROUPREF_LOC_IGNORE 32
-#define SRE_OP_IN_LOC_IGNORE 33
-#define SRE_OP_LITERAL_LOC_IGNORE 34
-#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35
-#define SRE_OP_GROUPREF_UNI_IGNORE 36
-#define SRE_OP_IN_UNI_IGNORE 37
-#define SRE_OP_LITERAL_UNI_IGNORE 38
-#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39
-#define SRE_OP_RANGE_UNI_IGNORE 40
+#define SRE_OP_ATOMIC_GROUP 28
+#define SRE_OP_POSSESSIVE_REPEAT 29
+#define SRE_OP_POSSESSIVE_REPEAT_ONE 30
+#define SRE_OP_GROUPREF_IGNORE 31
+#define SRE_OP_IN_IGNORE 32
+#define SRE_OP_LITERAL_IGNORE 33
+#define SRE_OP_NOT_LITERAL_IGNORE 34
+#define SRE_OP_GROUPREF_LOC_IGNORE 35
+#define SRE_OP_IN_LOC_IGNORE 36
+#define SRE_OP_LITERAL_LOC_IGNORE 37
+#define SRE_OP_NOT_LITERAL_LOC_IGNORE 38
+#define SRE_OP_GROUPREF_UNI_IGNORE 39
+#define SRE_OP_IN_UNI_IGNORE 40
+#define SRE_OP_LITERAL_UNI_IGNORE 41
+#define SRE_OP_NOT_LITERAL_UNI_IGNORE 42
+#define SRE_OP_RANGE_UNI_IGNORE 43
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
 #define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 32469cd..956fd3f 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -480,6 +480,9 @@ do { \
 #define JUMP_BRANCH          11
 #define JUMP_ASSERT          12
 #define JUMP_ASSERT_NOT      13
+#define JUMP_POSS_REPEAT_1   14
+#define JUMP_POSS_REPEAT_2   15
+#define JUMP_ATOMIC_GROUP    16
 
 #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
     DATA_ALLOC(SRE(match_context), nextctx); \
@@ -950,6 +953,60 @@ entrance:
             }
             RETURN_FAILURE;
 
+        case SRE_OP_POSSESSIVE_REPEAT_ONE:
+            /* match repeated sequence (maximizing regexp) without
+               backtracking */
+
+            /* this operator only works if the repeated item is
+               exactly one character wide, and we're not already
+               collecting backtracking points.  for other cases,
+               use the MAX_REPEAT operator */
+
+            /* <POSSESSIVE_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS>
+               tail */
+
+            TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern,
+                   ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+            if (ctx->ptr + ctx->pattern[1] > end) {
+                RETURN_FAILURE; /* cannot match */
+            }
+
+            state->ptr = ctx->ptr;
+
+            ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]);
+            RETURN_ON_ERROR(ret);
+            DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
+            ctx->count = ret;
+            ctx->ptr += ctx->count;
+
+            /* when we arrive here, count contains the number of
+               matches, and ctx->ptr points to the tail of the target
+               string.  check if the rest of the pattern matches,
+               and fail if not. */
+
+            /* Test for not enough repetitions in match */
+            if (ctx->count < (Py_ssize_t) ctx->pattern[1]) {
+                RETURN_FAILURE;
+            }
+
+            /* Update the pattern to point to the next op code */
+            ctx->pattern += ctx->pattern[0];
+
+            /* Let the tail be evaluated separately and consider this
+               match successful. */
+            if (*ctx->pattern == SRE_OP_SUCCESS &&
+                ctx->ptr == state->end &&
+                !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
+            {
+                /* tail is empty.  we're finished */
+                state->ptr = ctx->ptr;
+                RETURN_SUCCESS;
+            }
+
+            /* Attempt to match the rest of the string */
+            break;
+
         case SRE_OP_REPEAT:
             /* create repeat context.  all the hard work is done
                by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
@@ -1110,6 +1167,138 @@ entrance:
             state->ptr = ctx->ptr;
             RETURN_FAILURE;
 
+        case SRE_OP_POSSESSIVE_REPEAT:
+            /* create possessive repeat contexts. */
+            /* <POSSESSIVE_REPEAT> <skip> <1=min> <2=max> pattern
+               <SUCCESS> tail */
+            TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern,
+                   ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+            /* Set the global Input pointer to this context's Input
+               pointer */
+            state->ptr = ctx->ptr;
+
+            /* Initialize Count to 0 */
+            ctx->count = 0;
+
+            /* Check for minimum required matches. */
+            while (ctx->count < (Py_ssize_t)ctx->pattern[1]) {
+                /* not enough matches */
+                DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1,
+                        &ctx->pattern[3]);
+                if (ret) {
+                    RETURN_ON_ERROR(ret);
+                    ctx->count++;
+                }
+                else {
+                    state->ptr = ctx->ptr;
+                    RETURN_FAILURE;
+                }
+            }
+
+            /* Clear the context's Input stream pointer so that it
+               doesn't match the global state so that the while loop can
+               be entered. */
+            ctx->ptr = NULL;
+
+            /* Keep trying to parse the <pattern> sub-pattern until the
+               end is reached, creating a new context each time. */
+            while ((ctx->count < (Py_ssize_t)ctx->pattern[2] ||
+                    (Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT) &&
+                   state->ptr != ctx->ptr) {
+                /* Save the Capture Group Marker state into the current
+                   Context and back up the current highest number
+                   Capture Group marker. */
+                LASTMARK_SAVE();
+                MARK_PUSH(ctx->lastmark);
+
+                /* zero-width match protection */
+                /* Set the context's Input Stream pointer to be the
+                   current Input Stream pointer from the global
+                   state.  When the loop reaches the next iteration,
+                   the context will then store the last known good
+                   position with the global state holding the Input
+                   Input Stream position that has been updated with
+                   the most recent match.  Thus, if state's Input
+                   stream remains the same as the one stored in the
+                   current Context, we know we have successfully
+                   matched an empty string and that all subsequent
+                   matches will also be the empty string until the
+                   maximum number of matches are counted, and because
+                   of this, we could immediately stop at that point and
+                   consider this match successful. */
+                ctx->ptr = state->ptr;
+
+                /* We have not reached the maximin matches, so try to
+                   match once more. */
+                DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2,
+                        &ctx->pattern[3]);
+
+                /* Check to see if the last attempted match
+                   succeeded. */
+                if (ret) {
+                    /* Drop the saved highest number Capture Group
+                       marker saved above and use the newly updated
+                       value. */
+                    MARK_POP_DISCARD(ctx->lastmark);
+                    RETURN_ON_ERROR(ret);
+
+                    /* Success, increment the count. */
+                    ctx->count++;
+                }
+                /* Last attempted match failed. */
+                else {
+                    /* Restore the previously saved highest number
+                       Capture Group marker since the last iteration
+                       did not match, then restore that to the global
+                       state. */
+                    MARK_POP(ctx->lastmark);
+                    LASTMARK_RESTORE();
+
+                    /* We have sufficient matches, so exit loop. */
+                    break;
+                }
+            }
+
+            /* Evaluate Tail */
+            /* Jump to end of pattern indicated by skip, and then skip
+               the SUCCESS op code that follows it. */
+            ctx->pattern += ctx->pattern[0] + 1;
+            ctx->ptr = state->ptr;
+            break;
+
+        case SRE_OP_ATOMIC_GROUP:
+            /* Atomic Group Sub Pattern */
+            /* <ATOMIC_GROUP> <skip> pattern <SUCCESS> tail */
+            TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+
+            /* Set the global Input pointer to this context's Input
+            pointer */
+            state->ptr = ctx->ptr;
+
+            /* Evaluate the Atomic Group in a new context, terminating
+               when the end of the group, represented by a SUCCESS op
+               code, is reached. */
+            /* Group Pattern begins at an offset of 1 code. */
+            DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group,
+                    &ctx->pattern[1]);
+
+            /* Test Exit Condition */
+            RETURN_ON_ERROR(ret);
+
+            if (ret == 0) {
+                /* Atomic Group failed to Match. */
+                state->ptr = ctx->ptr;
+                RETURN_FAILURE;
+            }
+
+            /* Evaluate Tail */
+            /* Jump to end of pattern indicated by skip, and then skip
+               the SUCCESS op code that follows it. */
+            ctx->pattern += ctx->pattern[0];
+            ctx->ptr = state->ptr;
+            break;
+
         case SRE_OP_GROUPREF:
             /* match backreference */
             TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
@@ -1306,6 +1495,12 @@ exit:
         case JUMP_MIN_UNTIL_1:
             TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
             goto jump_min_until_1;
+        case JUMP_POSS_REPEAT_1:
+            TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr));
+            goto jump_poss_repeat_1;
+        case JUMP_POSS_REPEAT_2:
+            TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr));
+            goto jump_poss_repeat_2;
         case JUMP_REPEAT:
             TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
             goto jump_repeat;
@@ -1318,6 +1513,9 @@ exit:
         case JUMP_MIN_REPEAT_ONE:
             TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
             goto jump_min_repeat_one;
+        case JUMP_ATOMIC_GROUP:
+            TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+            goto jump_atomic_group;
         case JUMP_ASSERT:
             TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
             goto jump_assert;
author	Serhiy Storchaka <storchaka@gmail.com>	2022-03-21 16:28:22 (GMT)
committer	GitHub <noreply@github.com>	2022-03-21 16:28:22 (GMT)
commit	345b390ed69f36681dbc41187bc8f49cd9135b54 (patch)
tree	31ce6451bed718405b29bdb32c7eb4ff96fe5697
parent	2bde6827ea4f136297b2d882480b981ff26262b6 (diff)
download	cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.zip cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.gz cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.bz2