gh-91760: Deprecate group names and numbers which will be invalid in future (GH-91794)

In the future, only a sequence of ASCII digits will be accepted as a numerical group reference, and group names in bytes patterns and replacement strings will only be allowed to contain ASCII letters, digits, and underscores.
author: Serhiy Storchaka <storchaka@gmail.com> 2022-04-30 10:13:46 (GMT)
committer: GitHub <noreply@github.com> 2022-04-30 10:13:46 (GMT)
commit: 19dca041212f9f58ee11833bff3f8c157d4fd3e8 (patch)
tree: db3feb981ca27aabbf833b2a168e65ab3816b1b8 /Lib/re/_parser.py
parent: 6d0d547033e295f91f05030322acfbb0e280fc1f (diff)
download: cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.zip
cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.tar.gz
cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.tar.bz2
1 files changed, 34 insertions, 7 deletions
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 933d515..a393c50 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -287,8 +287,22 @@ class Tokenizer:
         self.__next()
 
     def error(self, msg, offset=0):
+        if not self.istext:
+            msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
         return error(msg, self.string, self.tell() - offset)
 
+    def checkgroupname(self, name, offset, nested):
+        if not name.isidentifier():
+            msg = "bad character in group name %r" % name
+            raise self.error(msg, len(name) + offset)
+        if not (self.istext or name.isascii()):
+            import warnings
+            warnings.warn(
+                "bad character in group name %a at position %d" %
+                (name, self.tell() - len(name) - offset),
+                DeprecationWarning, stacklevel=nested + 7
+            )
+
 def _class_escape(source, escape):
     # handle escape code inside character class
     code = ESCAPES.get(escape)
@@ -703,15 +717,11 @@ def _parse(source, state, verbose, nested, first=False):
                     if sourcematch("<"):
                         # named group: skip forward to end of name
                         name = source.getuntil(">", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1, nested)
                     elif sourcematch("="):
                         # named backreference
                         name = source.getuntil(")", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1, nested)
                         gid = state.groupdict.get(name)
                         if gid is None:
                             msg = "unknown group name %r" % name
@@ -773,6 +783,7 @@ def _parse(source, state, verbose, nested, first=False):
                     # conditional backreference group
                     condname = source.getuntil(")", "group name")
                     if condname.isidentifier():
+                        source.checkgroupname(condname, 1, nested)
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
                             msg = "unknown group name %r" % condname
@@ -795,6 +806,14 @@ def _parse(source, state, verbose, nested, first=False):
                             state.grouprefpos[condgroup] = (
                                 source.tell() - len(condname) - 1
                             )
+                        if not (condname.isdecimal() and condname.isascii()):
+                            import warnings
+                            warnings.warn(
+                                "bad character in group name %s at position %d" %
+                                (repr(condname) if source.istext else ascii(condname),
+                                 source.tell() - len(condname) - 1),
+                                DeprecationWarning, stacklevel=nested + 6
+                            )
                     state.checklookbehindgroup(condgroup, source)
                     item_yes = _parse(source, state, verbose, nested + 1)
                     if source.match("|"):
@@ -1000,11 +1019,11 @@ def parse_template(source, state):
             # group
             c = this[1]
             if c == "g":
-                name = ""
                 if not s.match("<"):
                     raise s.error("missing <")
                 name = s.getuntil(">", "group name")
                 if name.isidentifier():
+                    s.checkgroupname(name, 1, -1)
                     try:
                         index = groupindex[name]
                     except KeyError:
@@ -1020,6 +1039,14 @@ def parse_template(source, state):
                     if index >= MAXGROUPS:
                         raise s.error("invalid group reference %d" % index,
                                       len(name) + 1)
+                    if not (name.isdecimal() and name.isascii()):
+                        import warnings
+                        warnings.warn(
+                            "bad character in group name %s at position %d" %
+                            (repr(name) if s.istext else ascii(name),
+                             s.tell() - len(name) - 1),
+                            DeprecationWarning, stacklevel=5
+                        )
                 addgroup(index, len(name) + 1)
             elif c == "0":
                 if s.next in OCTDIGITS:
author	Serhiy Storchaka <storchaka@gmail.com>	2022-04-30 10:13:46 (GMT)
committer	GitHub <noreply@github.com>	2022-04-30 10:13:46 (GMT)
commit	19dca041212f9f58ee11833bff3f8c157d4fd3e8 (patch)
tree	db3feb981ca27aabbf833b2a168e65ab3816b1b8 /Lib/re/_parser.py
parent	6d0d547033e295f91f05030322acfbb0e280fc1f (diff)
download	cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.zip cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.tar.gz cpython-19dca041212f9f58ee11833bff3f8c157d4fd3e8.tar.bz2