diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-04-30 10:13:46 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-30 10:13:46 (GMT) |
commit | 19dca041212f9f58ee11833bff3f8c157d4fd3e8 (patch) | |
tree | db3feb981ca27aabbf833b2a168e65ab3816b1b8 /Lib/re/_parser.py | |
parent | 6d0d547033e295f91f05030322acfbb0e280fc1f (diff) | |
download | cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.zip cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.tar.gz cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.tar.bz2 |
gh-91760: Deprecate group names and numbers which will be invalid in future (GH-91794)
Only a sequence of ASCII digits will be accepted as a numerical reference.
Group names in bytes patterns and replacement strings will only be allowed
to contain ASCII letters, digits, and underscores.
Diffstat (limited to 'Lib/re/_parser.py')
-rw-r--r-- | Lib/re/_parser.py | 41 |
1 files changed, 34 insertions, 7 deletions
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 933d515..a393c50 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -287,8 +287,22 @@ class Tokenizer: self.__next() def error(self, msg, offset=0): + if not self.istext: + msg = msg.encode('ascii', 'backslashreplace').decode('ascii') return error(msg, self.string, self.tell() - offset) + def checkgroupname(self, name, offset, nested): + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise self.error(msg, len(name) + offset) + if not (self.istext or name.isascii()): + import warnings + warnings.warn( + "bad character in group name %a at position %d" % + (name, self.tell() - len(name) - offset), + DeprecationWarning, stacklevel=nested + 7 + ) + def _class_escape(source, escape): # handle escape code inside character class code = ESCAPES.get(escape) @@ -703,15 +717,11 @@ def _parse(source, state, verbose, nested, first=False): if sourcematch("<"): # named group: skip forward to end of name name = source.getuntil(">", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) + source.checkgroupname(name, 1, nested) elif sourcematch("="): # named backreference name = source.getuntil(")", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) + source.checkgroupname(name, 1, nested) gid = state.groupdict.get(name) if gid is None: msg = "unknown group name %r" % name @@ -773,6 +783,7 @@ def _parse(source, state, verbose, nested, first=False): # conditional backreference group condname = source.getuntil(")", "group name") if condname.isidentifier(): + source.checkgroupname(condname, 1, nested) condgroup = state.groupdict.get(condname) if condgroup is None: msg = "unknown group name %r" % condname @@ -795,6 +806,14 @@ def _parse(source, state, verbose, nested, first=False): state.grouprefpos[condgroup] = ( source.tell() - 
len(condname) - 1 ) + if not (condname.isdecimal() and condname.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(condname) if source.istext else ascii(condname), + source.tell() - len(condname) - 1), + DeprecationWarning, stacklevel=nested + 6 + ) state.checklookbehindgroup(condgroup, source) item_yes = _parse(source, state, verbose, nested + 1) if source.match("|"): @@ -1000,11 +1019,11 @@ def parse_template(source, state): # group c = this[1] if c == "g": - name = "" if not s.match("<"): raise s.error("missing <") name = s.getuntil(">", "group name") if name.isidentifier(): + s.checkgroupname(name, 1, -1) try: index = groupindex[name] except KeyError: @@ -1020,6 +1039,14 @@ def parse_template(source, state): if index >= MAXGROUPS: raise s.error("invalid group reference %d" % index, len(name) + 1) + if not (name.isdecimal() and name.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(name) if s.istext else ascii(name), + s.tell() - len(name) - 1), + DeprecationWarning, stacklevel=5 + ) addgroup(index, len(name) + 1) elif c == "0": if s.next in OCTDIGITS: |