summary | refs | log | tree | commit | diff | stats
path: root/Lib/re
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2022-05-08 16:19:29 (GMT)
committer: GitHub <noreply@github.com> 2022-05-08 16:19:29 (GMT)
commit: a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7 (patch)
tree: 5a129f41f7e8c49aa7ffa3f3d874ff9cd41751a8 /Lib/re
parent: 7b024e3a3f77027f747da7580ed0a3ed2dec276a (diff)
download: cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.zip
cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.tar.gz
cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.tar.bz2
gh-91760: More strict rules for numerical group references and group names in RE (GH-91792)
Only a sequence of ASCII digits is now accepted as a numerical group reference. Group names in bytes patterns and replacement strings can now contain only ASCII letters, digits, and underscores.
Diffstat (limited to 'Lib/re')
-rw-r--r-- Lib/re/_parser.py 40
1 file changed, 12 insertions, 28 deletions
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index a393c50..33b7097 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -291,17 +291,13 @@ class Tokenizer:
msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
return error(msg, self.string, self.tell() - offset)
- def checkgroupname(self, name, offset, nested):
+ def checkgroupname(self, name, offset):
+ if not (self.istext or name.isascii()):
+ msg = "bad character in group name %a" % name
+ raise self.error(msg, len(name) + offset)
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)
- if not (self.istext or name.isascii()):
- import warnings
- warnings.warn(
- "bad character in group name %a at position %d" %
- (name, self.tell() - len(name) - offset),
- DeprecationWarning, stacklevel=nested + 7
- )
def _class_escape(source, escape):
# handle escape code inside character class
@@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False):
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">", "group name")
- source.checkgroupname(name, 1, nested)
+ source.checkgroupname(name, 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")", "group name")
- source.checkgroupname(name, 1, nested)
+ source.checkgroupname(name, 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name %r" % name
@@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(":
# conditional backreference group
condname = source.getuntil(")", "group name")
- if condname.isidentifier():
- source.checkgroupname(condname, 1, nested)
+ if not (condname.isdecimal() and condname.isascii()):
+ source.checkgroupname(condname, 1)
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
else:
- try:
- condgroup = int(condname)
- if condgroup < 0:
- raise ValueError
- except ValueError:
- msg = "bad character in group name %r" % condname
- raise source.error(msg, len(condname) + 1) from None
+ condgroup = int(condname)
if not condgroup:
raise source.error("bad group number",
len(condname) + 1)
@@ -1022,20 +1012,14 @@ def parse_template(source, state):
if not s.match("<"):
raise s.error("missing <")
name = s.getuntil(">", "group name")
- if name.isidentifier():
- s.checkgroupname(name, 1, -1)
+ if not (name.isdecimal() and name.isascii()):
+ s.checkgroupname(name, 1)
try:
index = groupindex[name]
except KeyError:
raise IndexError("unknown group name %r" % name) from None
else:
- try:
- index = int(name)
- if index < 0:
- raise ValueError
- except ValueError:
- raise s.error("bad character in group name %r" % name,
- len(name) + 1) from None
+ index = int(name)
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)