summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2015-02-03 09:04:19 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2015-02-03 09:04:19 (GMT)
commit83e802796c80f46be616b48020356f7f51be533d (patch)
treee896b143abc3523f96e20d88ebcc22512af16aa7
parent32ca3dcb97a75c05dc2b90c88bbf82a541c57c61 (diff)
downloadcpython-83e802796c80f46be616b48020356f7f51be533d.zip
cpython-83e802796c80f46be616b48020356f7f51be533d.tar.gz
cpython-83e802796c80f46be616b48020356f7f51be533d.tar.bz2
Issue #22818: Splitting on a pattern that could match an empty string now
raises a warning. Patterns that can only match empty strings are now rejected.
-rw-r--r--Doc/library/re.rst32
-rw-r--r--Doc/whatsnew/3.5.rst7
-rw-r--r--Lib/sre_compile.py10
-rw-r--r--Lib/test/test_re.py39
-rw-r--r--Misc/NEWS4
-rw-r--r--Modules/_sre.c13
6 files changed, 85 insertions, 20 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 60ded8b..8e20496 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -626,17 +626,37 @@ form.
That way, separator components are always found at the same relative
indices within the result list.
- Note that *split* will never split a string on an empty pattern match.
- For example:
+ .. note::
+
+ :func:`split` doesn't currently split a string on an empty pattern match.
+ For example:
+
+ >>> re.split('x*', 'axbc')
+ ['a', 'bc']
- >>> re.split('x*', 'foo')
- ['foo']
- >>> re.split("(?m)^$", "foo\n\nbar\n")
- ['foo\n\nbar\n']
+ Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c',
+ and after 'c', currently these matches are ignored. The correct behavior
+ (i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c',
+ '']``) will be implemented in future versions of Python, but since this
+ is a backward incompatible change, a :exc:`FutureWarning` will be raised
+ in the meanwhile.
+
+ Patterns that can only match empty strings currently never split the
+ string. Since this doesn't match the expected behavior, a
+ :exc:`ValueError` will be raised starting from Python 3.5::
+
+ >>> re.split("^$", "foo\n\nbar\n", flags=re.M)
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+ ...
+ ValueError: split() requires a non-empty pattern match.
.. versionchanged:: 3.1
Added the optional flags argument.
+ .. versionchanged:: 3.5
+ Splitting on a pattern that could match an empty string now raises
+ a warning. Patterns that can only match empty strings are now rejected.
.. function:: findall(pattern, string, flags=0)
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
index c309aa8..f7b9a83 100644
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -482,6 +482,13 @@ Changes in the Python API
simply define :meth:`~importlib.machinery.Loader.create_module` to return
``None`` (:issue:`23014`).
+* :func:`re.split` always ignored empty pattern matches, so the ``'x*'``
+ pattern worked the same as ``'x+'``, and the ``'\b'`` pattern never worked.
+ Now :func:`re.split` raises a warning if the pattern could match
+ an empty string. For compatibility use patterns that never match an empty
+ string (e.g. ``'x+'`` instead of ``'x*'``). Patterns that could only match
+ an empty string (such as ``'\b'``) now raise an error.
+
Changes in the C API
--------------------
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 1241a01..30a5fae 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -414,8 +414,11 @@ def _compile_info(code, pattern, flags):
# this contains min/max pattern width, and an optional literal
# prefix or a character map
lo, hi = pattern.getwidth()
+ if hi > MAXCODE:
+ hi = MAXCODE
if lo == 0:
- return # not worth it
+ code.extend([INFO, 4, 0, lo, hi])
+ return
# look for a literal prefix
prefix = []
prefixappend = prefix.append
@@ -495,10 +498,7 @@ def _compile_info(code, pattern, flags):
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
- if hi < MAXCODE:
- emit(hi)
- else:
- emit(0)
+ emit(min(hi, MAXCODE))
# add literal prefix
if prefix:
emit(len(prefix)) # length
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 6e90b2f..2fb4764 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -251,28 +251,28 @@ class ReTests(unittest.TestCase):
for string in ":a:b::c", S(":a:b::c"):
self.assertTypedEqual(re.split(":", string),
['', 'a', 'b', '', 'c'])
- self.assertTypedEqual(re.split(":*", string),
+ self.assertTypedEqual(re.split(":+", string),
['', 'a', 'b', 'c'])
- self.assertTypedEqual(re.split("(:*)", string),
+ self.assertTypedEqual(re.split("(:+)", string),
['', ':', 'a', ':', 'b', '::', 'c'])
for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
memoryview(b":a:b::c")):
self.assertTypedEqual(re.split(b":", string),
[b'', b'a', b'b', b'', b'c'])
- self.assertTypedEqual(re.split(b":*", string),
+ self.assertTypedEqual(re.split(b":+", string),
[b'', b'a', b'b', b'c'])
- self.assertTypedEqual(re.split(b"(:*)", string),
+ self.assertTypedEqual(re.split(b"(:+)", string),
[b'', b':', b'a', b':', b'b', b'::', b'c'])
for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
"\U0001d49c\U0001d49e\U0001d4b5"):
string = ":%s:%s::%s" % (a, b, c)
self.assertEqual(re.split(":", string), ['', a, b, '', c])
- self.assertEqual(re.split(":*", string), ['', a, b, c])
- self.assertEqual(re.split("(:*)", string),
+ self.assertEqual(re.split(":+", string), ['', a, b, c])
+ self.assertEqual(re.split("(:+)", string),
['', ':', a, ':', b, '::', c])
- self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
- self.assertEqual(re.split("(:)*", ":a:b::c"),
+ self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
+ self.assertEqual(re.split("(:)+", ":a:b::c"),
['', ':', 'a', ':', 'b', ':', 'c'])
self.assertEqual(re.split("([b:]+)", ":a:b::c"),
['', ':', 'a', ':b::', 'c'])
@@ -282,13 +282,34 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
['', 'a', '', '', 'c'])
+ for sep, expected in [
+ (':*', ['', 'a', 'b', 'c']),
+ ('(?::*)', ['', 'a', 'b', 'c']),
+ ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
+ ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
+ ]:
+ with self.subTest(sep=sep), self.assertWarns(FutureWarning):
+ self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
+
+ for sep, expected in [
+ ('', [':a:b::c']),
+ (r'\b', [':a:b::c']),
+ (r'(?=:)', [':a:b::c']),
+ (r'(?<=:)', [':a:b::c']),
+ ]:
+ with self.subTest(sep=sep), self.assertRaises(ValueError):
+ self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
+
def test_qualified_re_split(self):
self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
- self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
+ self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
+ with self.assertWarns(FutureWarning):
+ self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
+ ['', ':', 'a', ':', 'b::c'])
def test_re_findall(self):
self.assertEqual(re.findall(":+", "abc"), [])
diff --git a/Misc/NEWS b/Misc/NEWS
index 7a3a5ae..a6cc4b5 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -232,6 +232,10 @@ Core and Builtins
Library
-------
+- Issue #22818: Splitting on a pattern that could match an empty string now
+ raises a warning. Patterns that can only match empty strings are now
+ rejected.
+
- Issue #23099: Closing io.BytesIO with exported buffer is rejected now to
prevent corrupting exported buffer.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 63778f4..9550d97 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -863,6 +863,19 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
if (!string)
return NULL;
+ assert(self->codesize != 0);
+ if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
+ if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "split() requires a non-empty pattern match.");
+ return NULL;
+ }
+ if (PyErr_WarnEx(PyExc_FutureWarning,
+ "split() requires a non-empty pattern match.",
+ 1) < 0)
+ return NULL;
+ }
+
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
if (!string)
return NULL;