summaryrefslogtreecommitdiffstats
path: root/Lib/sre_parse.py
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2022-03-21 16:28:22 (GMT)
committerGitHub <noreply@github.com>2022-03-21 16:28:22 (GMT)
commit345b390ed69f36681dbc41187bc8f49cd9135b54 (patch)
tree31ce6451bed718405b29bdb32c7eb4ff96fe5697 /Lib/sre_parse.py
parent2bde6827ea4f136297b2d882480b981ff26262b6 (diff)
downloadcpython-345b390ed69f36681dbc41187bc8f49cd9135b54.zip
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.gz
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.bz2
bpo-433030: Add support of atomic grouping in regular expressions (GH-31982)
* Atomic grouping: (?>...). * Possessive quantifiers: x++, x*+, x?+, x{m,n}+. Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}). Co-authored-by: Jeffrey C. Jacobs <timehorse@users.sourceforge.net>
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r--Lib/sre_parse.py32
1 files changed, 26 insertions, 6 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index bb95107..b91082e 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -25,7 +25,7 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = frozenset(" \t\n\r\v\f")
-_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
+_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
ESCAPES = {
@@ -190,6 +190,10 @@ class SubPattern:
i, j = av.getwidth()
lo = lo + i
hi = hi + j
+ elif op is ATOMIC_GROUP:
+ i, j = av.getwidth()
+ lo = lo + i
+ hi = hi + j
elif op is SUBPATTERN:
i, j = av[-1].getwidth()
lo = lo + i
@@ -675,8 +679,13 @@ def _parse(source, state, verbose, nested, first=False):
if group is None and not add_flags and not del_flags:
item = p
if sourcematch("?"):
+ # Non-Greedy Match
subpattern[-1] = (MIN_REPEAT, (min, max, item))
+ elif sourcematch("+"):
+ # Possessive Match (Always Greedy)
+ subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
else:
+ # Greedy Match
subpattern[-1] = (MAX_REPEAT, (min, max, item))
elif this == ".":
@@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False):
elif this == "(":
start = source.tell() - 1
- group = True
+ capture = True
+ atomic = False
name = None
add_flags = 0
del_flags = 0
@@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False):
len(char) + 2)
elif char == ":":
# non-capturing group
- group = None
+ capture = False
elif char == "#":
# comment
while True:
@@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False):
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
continue
+ elif char == ">":
+ # non-capturing, atomic group
+ capture = False
+ atomic = True
elif char in FLAGS or char == "-":
# flags
flags = _parse_flags(source, state, char)
@@ -813,17 +827,19 @@ def _parse(source, state, verbose, nested, first=False):
continue
add_flags, del_flags = flags
- group = None
+ capture = False
else:
raise source.error("unknown extension ?" + char,
len(char) + 1)
# parse group contents
- if group is not None:
+ if capture:
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1) from None
+ else:
+ group = None
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose, nested + 1)
@@ -832,7 +848,11 @@ def _parse(source, state, verbose, nested, first=False):
source.tell() - start)
if group is not None:
state.closegroup(group, p)
- subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
+ if atomic:
+ assert group is None
+ subpatternappend((ATOMIC_GROUP, p))
+ else:
+ subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))