diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-05-08 16:19:29 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-08 16:19:29 (GMT) |
commit | a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7 (patch) | |
tree | 5a129f41f7e8c49aa7ffa3f3d874ff9cd41751a8 /Lib/re | |
parent | 7b024e3a3f77027f747da7580ed0a3ed2dec276a (diff) | |
download | cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.zip cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.tar.gz cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.tar.bz2 |
gh-91760: More strict rules for numerical group references and group names in RE (GH-91792)
Only a sequence of ASCII digits is now accepted as a numerical reference.
Group names in bytes patterns and replacement strings can now contain only
ASCII letters, digits, and underscores.
Diffstat (limited to 'Lib/re')
-rw-r--r-- | Lib/re/_parser.py | 40 |
1 file changed, 12 insertions, 28 deletions
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index a393c50..33b7097 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -291,17 +291,13 @@ class Tokenizer:
         msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
         return error(msg, self.string, self.tell() - offset)
 
-    def checkgroupname(self, name, offset, nested):
+    def checkgroupname(self, name, offset):
+        if not (self.istext or name.isascii()):
+            msg = "bad character in group name %a" % name
+            raise self.error(msg, len(name) + offset)
         if not name.isidentifier():
             msg = "bad character in group name %r" % name
             raise self.error(msg, len(name) + offset)
-        if not (self.istext or name.isascii()):
-            import warnings
-            warnings.warn(
-                "bad character in group name %a at position %d" %
-                (name, self.tell() - len(name) - offset),
-                DeprecationWarning, stacklevel=nested + 7
-            )
 
 def _class_escape(source, escape):
     # handle escape code inside character class
@@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False):
                 if sourcematch("<"):
                     # named group: skip forward to end of name
                     name = source.getuntil(">", "group name")
-                    source.checkgroupname(name, 1, nested)
+                    source.checkgroupname(name, 1)
                 elif sourcematch("="):
                     # named backreference
                     name = source.getuntil(")", "group name")
-                    source.checkgroupname(name, 1, nested)
+                    source.checkgroupname(name, 1)
                     gid = state.groupdict.get(name)
                     if gid is None:
                         msg = "unknown group name %r" % name
@@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False):
                 elif char == "(":
                     # conditional backreference group
                     condname = source.getuntil(")", "group name")
-                    if condname.isidentifier():
-                        source.checkgroupname(condname, 1, nested)
+                    if not (condname.isdecimal() and condname.isascii()):
+                        source.checkgroupname(condname, 1)
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
                             msg = "unknown group name %r" % condname
                             raise source.error(msg, len(condname) + 1)
                     else:
-                        try:
-                            condgroup = int(condname)
-                            if condgroup < 0:
-                                raise ValueError
-                        except ValueError:
-                            msg = "bad character in group name %r" % condname
-                            raise source.error(msg, len(condname) + 1) from None
+                        condgroup = int(condname)
                         if not condgroup:
                             raise source.error("bad group number",
                                                len(condname) + 1)
@@ -1022,20 +1012,14 @@ def parse_template(source, state):
                 if not s.match("<"):
                     raise s.error("missing <")
                 name = s.getuntil(">", "group name")
-                if name.isidentifier():
-                    s.checkgroupname(name, 1, -1)
+                if not (name.isdecimal() and name.isascii()):
+                    s.checkgroupname(name, 1)
                     try:
                         index = groupindex[name]
                     except KeyError:
                         raise IndexError("unknown group name %r" % name) from None
                 else:
-                    try:
-                        index = int(name)
-                        if index < 0:
-                            raise ValueError
-                    except ValueError:
-                        raise s.error("bad character in group name %r" % name,
-                                      len(name) + 1) from None
+                    index = int(name)
                     if index >= MAXGROUPS:
                         raise s.error("invalid group reference %d" % index,
                                       len(name) + 1)