diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-05-08 16:19:29 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-08 16:19:29 (GMT) |
commit | a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7 (patch) | |
tree | 5a129f41f7e8c49aa7ffa3f3d874ff9cd41751a8 | |
parent | 7b024e3a3f77027f747da7580ed0a3ed2dec276a (diff) | |
download | cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.zip cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.tar.gz cpython-a84a56d80fa3d9a5909d074bbcd2efff7ef8f1b7.tar.bz2 |
gh-91760: More strict rules for numerical group references and group names in RE (GH-91792)
Only a sequence of ASCII digits is now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits and underscores.
-rw-r--r-- | Doc/library/re.rst | 19 | ||||
-rw-r--r-- | Doc/whatsnew/3.12.rst | 10 | ||||
-rw-r--r-- | Lib/re/_parser.py | 40 | ||||
-rw-r--r-- | Lib/test/test_re.py | 79 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst | 5 |
5 files changed, 62 insertions, 91 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 3cd9f25..39e7d23 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -395,7 +395,8 @@ The special characters are: ``(?P<name>...)`` Similar to regular parentheses, but the substring matched by the group is accessible via the symbolic group name *name*. Group names must be valid - Python identifiers, and each group name must be defined only once within a + Python identifiers, and in bytes patterns they must contain only characters + in the ASCII range. Each group name must be defined only once within a regular expression. A symbolic group is also a numbered group, just as if the group were not named. @@ -417,8 +418,9 @@ The special characters are: | | * ``\1`` | +---------------------------------------+----------------------------------+ - .. deprecated:: 3.11 - Group names containing non-ASCII characters in bytes patterns. + .. versionchanged:: 3.12 + In bytes patterns group names must contain only characters in + the ASCII range. .. index:: single: (?P=; in regular expressions @@ -489,8 +491,8 @@ The special characters are: will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but not with ``'<user@host.com'`` nor ``'user@host.com>'``. - .. deprecated:: 3.11 - Group *id* containing anything except ASCII digits. + .. versionchanged:: 3.12 + Group *id* can only contain ASCII digits. The special sequences consist of ``'\'`` and a character from the list below. @@ -1001,9 +1003,10 @@ form. Empty matches for the pattern are replaced when adjacent to a previous non-empty match. - .. deprecated:: 3.11 - Group *id* containing anything except ASCII digits. - Group names containing non-ASCII characters in bytes replacement strings. + .. versionchanged:: 3.12 + Group *id* can only contain ASCII digits. + In bytes replacement strings group names must contain only characters + in the ASCII range. .. 
function:: subn(pattern, repl, string, count=0, flags=0) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index dacf041..b73c3db 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -114,3 +114,13 @@ Porting to Python 3.12 This section lists previously described changes and other bugfixes that may require changes to your code. + +Changes in the Python API +------------------------- + +* More strict rules are now applied for numerical group references and + group names in regular expressions. + Only sequence of ASCII digits is now accepted as a numerical reference. + The group name in bytes patterns and replacement strings can now only + contain ASCII letters and digits and underscore. + (Contributed by Serhiy Storchaka in :gh:`91760`.) diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index a393c50..33b7097 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -291,17 +291,13 @@ class Tokenizer: msg = msg.encode('ascii', 'backslashreplace').decode('ascii') return error(msg, self.string, self.tell() - offset) - def checkgroupname(self, name, offset, nested): + def checkgroupname(self, name, offset): + if not (self.istext or name.isascii()): + msg = "bad character in group name %a" % name + raise self.error(msg, len(name) + offset) if not name.isidentifier(): msg = "bad character in group name %r" % name raise self.error(msg, len(name) + offset) - if not (self.istext or name.isascii()): - import warnings - warnings.warn( - "bad character in group name %a at position %d" % - (name, self.tell() - len(name) - offset), - DeprecationWarning, stacklevel=nested + 7 - ) def _class_escape(source, escape): # handle escape code inside character class @@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False): if sourcematch("<"): # named group: skip forward to end of name name = source.getuntil(">", "group name") - source.checkgroupname(name, 1, nested) + source.checkgroupname(name, 1) elif sourcematch("="): # named backreference 
name = source.getuntil(")", "group name") - source.checkgroupname(name, 1, nested) + source.checkgroupname(name, 1) gid = state.groupdict.get(name) if gid is None: msg = "unknown group name %r" % name @@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False): elif char == "(": # conditional backreference group condname = source.getuntil(")", "group name") - if condname.isidentifier(): - source.checkgroupname(condname, 1, nested) + if not (condname.isdecimal() and condname.isascii()): + source.checkgroupname(condname, 1) condgroup = state.groupdict.get(condname) if condgroup is None: msg = "unknown group name %r" % condname raise source.error(msg, len(condname) + 1) else: - try: - condgroup = int(condname) - if condgroup < 0: - raise ValueError - except ValueError: - msg = "bad character in group name %r" % condname - raise source.error(msg, len(condname) + 1) from None + condgroup = int(condname) if not condgroup: raise source.error("bad group number", len(condname) + 1) @@ -1022,20 +1012,14 @@ def parse_template(source, state): if not s.match("<"): raise s.error("missing <") name = s.getuntil(">", "group name") - if name.isidentifier(): - s.checkgroupname(name, 1, -1) + if not (name.isdecimal() and name.isascii()): + s.checkgroupname(name, 1) try: index = groupindex[name] except KeyError: raise IndexError("unknown group name %r" % name) from None else: - try: - index = int(name) - if index < 0: - raise ValueError - except ValueError: - raise s.error("bad character in group name %r" % name, - len(name) + 1) from None + index = int(name) if index >= MAXGROUPS: raise s.error("invalid group reference %d" % index, len(name) + 1) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index c101475..ba70de4 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -275,21 +275,12 @@ class ReTests(unittest.TestCase): self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) self.checkPatternError('(?P=©)', "bad character in group 
name '©'", 4) self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '\\xc2\\xb5' " - r"at position 4") as w: - re.compile(b'(?P<\xc2\xb5>x)') - self.assertEqual(w.filename, __file__) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '\\xc2\\xb5' " - r"at position 4"): - self.checkPatternError(b'(?P=\xc2\xb5)', - r"unknown group name '\xc2\xb5'", 4) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '\\xc2\\xb5' " - r"at position 3"): - self.checkPatternError(b'(?(\xc2\xb5)y)', - r"unknown group name '\xc2\xb5'", 3) + self.checkPatternError(b'(?P<\xc2\xb5>x)', + r"bad character in group name '\xc2\xb5'", 4) + self.checkPatternError(b'(?P=\xc2\xb5)', + r"bad character in group name '\xc2\xb5'", 4) + self.checkPatternError(b'(?(\xc2\xb5)y)', + r"bad character in group name '\xc2\xb5'", 3) def test_symbolic_refs(self): self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') @@ -322,35 +313,22 @@ class ReTests(unittest.TestCase): re.sub('(?P<a>x)', r'\g<ab>', 'xx') self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx', "bad character in group name '-1'", 3) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '\+1' " - r"at position 3") as w: - re.sub('(?P<a>x)', r'\g<+1>', 'xx') - self.assertEqual(w.filename, __file__) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '1_0' " - r"at position 3"): - re.sub('()'*10, r'\g<1_0>', 'xx') - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name ' 1 ' " - r"at position 3"): - re.sub('(?P<a>x)', r'\g< 1 >', 'xx') + self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx', + "bad character in group name '+1'", 3) + self.checkTemplateError('()'*10, r'\g<1_0>', 'xx', + "bad character in group name '1_0'", 3) + self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx', + "bad character in group 
name ' 1 '", 3) self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx', "bad character in group name '©'", 3) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '\\xc2\\xb5' " - r"at position 3") as w: - with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"): - re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx') - self.assertEqual(w.filename, __file__) + self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx', + r"bad character in group name '\xc2\xb5'", 3) self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx', "bad character in group name '㊀'", 3) self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx', "bad character in group name '¹'", 3) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '१' " - r"at position 3"): - re.sub('(?P<a>x)', r'\g<१>', 'xx') + self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx', + "bad character in group name '१'", 3) def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) @@ -616,27 +594,18 @@ class ReTests(unittest.TestCase): self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10) self.checkPatternError(r'()(?(-1)a|b)', "bad character in group name '-1'", 5) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '\+1' " - r"at position 5") as w: - re.compile(r'()(?(+1)a|b)') - self.assertEqual(w.filename, __file__) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '1_0' " - r"at position 23"): - re.compile(r'()'*10 + r'(?(1_0)a|b)') - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name ' 1 ' " - r"at position 5"): - re.compile(r'()(?( 1 )a|b)') + self.checkPatternError(r'()(?(+1)a|b)', + "bad character in group name '+1'", 5) + self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)', + "bad character in group name '1_0'", 23) + self.checkPatternError(r'()(?( 1 )a|b)', + "bad character in group name ' 1 '", 5) self.checkPatternError(r'()(?(㊀)a|b)', 
"bad character in group name '㊀'", 5) self.checkPatternError(r'()(?(¹)a|b)', "bad character in group name '¹'", 5) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '१' " - r"at position 5"): - re.compile(r'()(?(१)a|b)') + self.checkPatternError(r'()(?(१)a|b)', + "bad character in group name '१'", 5) self.checkPatternError(r'()(?(1', "missing ), unterminated name", 5) self.checkPatternError(r'()(?(1)a', diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst new file mode 100644 index 0000000..ac3e7cd --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst @@ -0,0 +1,5 @@ +Apply more strict rules for numerical group references and group names in +regular expressions. Only sequence of ASCII digits is now accepted as +a numerical reference. The group name in +bytes patterns and replacement strings can now only contain ASCII letters +and digits and underscore. |