From fbb490fd2f38bd817d99c20c05121ad0168a38ee Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 4 Jan 2018 11:06:13 +0200 Subject: bpo-32308: Replace empty matches adjacent to a previous non-empty match in re.sub(). (#4846) --- Doc/howto/regex.rst | 4 ++-- Doc/library/re.rst | 14 +++++++++---- Doc/whatsnew/3.7.rst | 13 +++++++++--- Lib/test/test_re.py | 23 +++++++++------------- .../2017-12-13-20-31-30.bpo-32308.CUbsb2.rst | 2 ++ Modules/_sre.c | 4 ++-- 6 files changed, 35 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-12-13-20-31-30.bpo-32308.CUbsb2.rst diff --git a/Doc/howto/regex.rst b/Doc/howto/regex.rst index fa8c693..87a6b1a 100644 --- a/Doc/howto/regex.rst +++ b/Doc/howto/regex.rst @@ -1140,12 +1140,12 @@ new string value and the number of replacements that were performed:: >>> p.subn('colour', 'no colours at all') ('no colours at all', 0) -Empty matches are replaced only when they're not adjacent to a previous match. +Empty matches are replaced only when they're not adjacent to a previous empty match. :: >>> p = re.compile('x*') >>> p.sub('-', 'abxd') - '-a-b-d-' + '-a-b--d-' If *replacement* is a string, any backslash escapes in it are processed. That is, ``\n`` is converted to a single newline character, ``\r`` is converted to a diff --git a/Doc/library/re.rst b/Doc/library/re.rst index dae1d7e..9b175f4 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -708,12 +708,15 @@ form. That way, separator components are always found at the same relative indices within the result list. - The pattern can match empty strings. :: + Empty matches for the pattern split the string only when not adjacent + to a previous empty match. 
>>> re.split(r'\b', 'Words, words, words.') ['', 'Words', ', ', 'words', ', ', 'words', '.'] + >>> re.split(r'\W*', '...words...') + ['', '', 'w', 'o', 'r', 'd', 's', '', ''] >>> re.split(r'(\W*)', '...words...') - ['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', ''] + ['', '...', '', '', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '', '', ''] .. versionchanged:: 3.1 Added the optional flags argument. @@ -778,8 +781,8 @@ form. The optional argument *count* is the maximum number of pattern occurrences to be replaced; *count* must be a non-negative integer. If omitted or zero, all occurrences will be replaced. Empty matches for the pattern are replaced only - when not adjacent to a previous match, so ``sub('x*', '-', 'abc')`` returns - ``'-a-b-c-'``. + when not adjacent to a previous empty match, so ``sub('x*', '-', 'abxd')`` returns + ``'-a-b--d-'``. In string-type *repl* arguments, in addition to the character escapes and backreferences described above, @@ -805,6 +808,9 @@ form. Unknown escapes in *repl* consisting of ``'\'`` and an ASCII letter now are errors. + Empty matches for the pattern are replaced when adjacent to a previous + non-empty match. + .. function:: subn(pattern, repl, string, count=0, flags=0) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 1924881..1311e9e 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -881,8 +881,9 @@ Changes in the Python API * The result of splitting a string on a :mod:`regular expression <re>` that could match an empty string has been changed. For example splitting on ``r'\s*'`` will now split not only on whitespaces as it - did previously, but also between any pair of non-whitespace - characters. The previous behavior can be restored by changing the pattern + did previously, but also on empty strings before all non-whitespace + characters and just before the end of the string. + The previous behavior can be restored by changing the pattern to ``r'\s+'``. 
A :exc:`FutureWarning` was emitted for such patterns since Python 3.5. @@ -893,7 +894,13 @@ Changes in the Python API positions 2--3. To match only blank lines, the pattern should be rewritten as ``r'(?m)^[^\S\n]*$'``. - (Contributed by Serhiy Storchaka in :issue:`25054`.) + :func:`re.sub()` now replaces empty matches adjacent to a previous + non-empty match. For example ``re.sub('x*', '-', 'abxd')`` returns now + ``'-a-b--d-'`` instead of ``'-a-b-d-'`` (the first minus between 'b' and + 'd' replaces 'x', and the second minus replaces an empty string between + 'x' and 'd'). + + (Contributed by Serhiy Storchaka in :issue:`25054` and :issue:`32308`.) * :class:`tracemalloc.Traceback` frames are now sorted from oldest to most recent to be more consistent with :mod:`traceback`. diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index aaed3d8..9fed4be 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -213,11 +213,6 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 'hello there') - def test_bug_462270(self): - # Test for empty sub() behaviour, see SF bug #462270 - self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') - self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') - def test_symbolic_groups(self): re.compile(r'(?P<a>x)(?P=a)(?(a)y)') re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)') @@ -331,10 +326,10 @@ class ReTests(unittest.TestCase): ['', 'a', '', '', 'c']) for sep, expected in [ - (':*', ['', 'a', 'b', 'c', '']), - ('(?::*)', ['', 'a', 'b', 'c', '']), - ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']), - ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']), + (':*', ['', '', 'a', '', 'b', '', 'c', '']), + ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']), + ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']), + ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']), ]: with self.subTest(sep=sep): self.assertTypedEqual(re.split(sep, ':a:b::c'), 
expected) @@ -357,7 +352,7 @@ class ReTests(unittest.TestCase): self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c']) self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), - ['', ':', 'a', ':', 'b::c']) + ['', ':', '', '', 'a:b::c']) def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) @@ -1753,13 +1748,13 @@ class ReTests(unittest.TestCase): def test_zerowidth(self): # Issues 852532, 1647489, 3262, 25054. self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', '']) - self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', '']) - self.assertEqual(re.split(r"(?