Issue #22818: Splitting on a pattern that could match an empty string now
raises a warning (FutureWarning). Patterns that can only match empty strings
are now rejected (ValueError).
author: Serhiy Storchaka <storchaka@gmail.com> 2015-02-03 09:04:19 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2015-02-03 09:04:19 (GMT)
commit: 83e802796c80f46be616b48020356f7f51be533d (patch)
tree: e896b143abc3523f96e20d88ebcc22512af16aa7 /Lib
parent: 32ca3dcb97a75c05dc2b90c88bbf82a541c57c61 (diff)
download: cpython-83e802796c80f46be616b48020356f7f51be533d.zip
cpython-83e802796c80f46be616b48020356f7f51be533d.tar.gz
cpython-83e802796c80f46be616b48020356f7f51be533d.tar.bz2
2 files changed, 35 insertions, 14 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 1241a01..30a5fae 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -414,8 +414,11 @@ def _compile_info(code, pattern, flags):
     # this contains min/max pattern width, and an optional literal
     # prefix or a character map
     lo, hi = pattern.getwidth()
+    if hi > MAXCODE:
+        hi = MAXCODE
     if lo == 0:
-        return # not worth it
+        code.extend([INFO, 4, 0, lo, hi])
+        return
     # look for a literal prefix
     prefix = []
     prefixappend = prefix.append
@@ -495,10 +498,7 @@ def _compile_info(code, pattern, flags):
     else:
         emit(MAXCODE)
         prefix = prefix[:MAXCODE]
-    if hi < MAXCODE:
-        emit(hi)
-    else:
-        emit(0)
+    emit(min(hi, MAXCODE))
     # add literal prefix
     if prefix:
         emit(len(prefix)) # length
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 6e90b2f..2fb4764 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -251,28 +251,28 @@ class ReTests(unittest.TestCase):
         for string in ":a:b::c", S(":a:b::c"):
             self.assertTypedEqual(re.split(":", string),
                                   ['', 'a', 'b', '', 'c'])
-            self.assertTypedEqual(re.split(":*", string),
+            self.assertTypedEqual(re.split(":+", string),
                                   ['', 'a', 'b', 'c'])
-            self.assertTypedEqual(re.split("(:*)", string),
+            self.assertTypedEqual(re.split("(:+)", string),
                                   ['', ':', 'a', ':', 'b', '::', 'c'])
         for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                        memoryview(b":a:b::c")):
             self.assertTypedEqual(re.split(b":", string),
                                   [b'', b'a', b'b', b'', b'c'])
-            self.assertTypedEqual(re.split(b":*", string),
+            self.assertTypedEqual(re.split(b":+", string),
                                   [b'', b'a', b'b', b'c'])
-            self.assertTypedEqual(re.split(b"(:*)", string),
+            self.assertTypedEqual(re.split(b"(:+)", string),
                                   [b'', b':', b'a', b':', b'b', b'::', b'c'])
         for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                         "\U0001d49c\U0001d49e\U0001d4b5"):
             string = ":%s:%s::%s" % (a, b, c)
             self.assertEqual(re.split(":", string), ['', a, b, '', c])
-            self.assertEqual(re.split(":*", string), ['', a, b, c])
-            self.assertEqual(re.split("(:*)", string),
+            self.assertEqual(re.split(":+", string), ['', a, b, c])
+            self.assertEqual(re.split("(:+)", string),
                              ['', ':', a, ':', b, '::', c])
 
-        self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
-        self.assertEqual(re.split("(:)*", ":a:b::c"),
+        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
+        self.assertEqual(re.split("(:)+", ":a:b::c"),
                          ['', ':', 'a', ':', 'b', ':', 'c'])
         self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                          ['', ':', 'a', ':b::', 'c'])
@@ -282,13 +282,34 @@ class ReTests(unittest.TestCase):
         self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                          ['', 'a', '', '', 'c'])
 
+        for sep, expected in [
+            (':*', ['', 'a', 'b', 'c']),
+            ('(?::*)', ['', 'a', 'b', 'c']),
+            ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
+            ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
+        ]:
+            with self.subTest(sep=sep), self.assertWarns(FutureWarning):
+                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
+
+        for sep, expected in [
+            ('', [':a:b::c']),
+            (r'\b', [':a:b::c']),
+            (r'(?=:)', [':a:b::c']),
+            (r'(?<=:)', [':a:b::c']),
+        ]:
+            with self.subTest(sep=sep), self.assertRaises(ValueError):
+                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
+
     def test_qualified_re_split(self):
         self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
         self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
         self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
                          ['', ':', 'a', ':', 'b::c'])
-        self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
+        self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
                          ['', ':', 'a', ':', 'b::c'])
+        with self.assertWarns(FutureWarning):
+            self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
+                             ['', ':', 'a', ':', 'b::c'])
 
     def test_re_findall(self):
         self.assertEqual(re.findall(":+", "abc"), [])
author	Serhiy Storchaka <storchaka@gmail.com>	2015-02-03 09:04:19 (GMT)
committer	Serhiy Storchaka <storchaka@gmail.com>	2015-02-03 09:04:19 (GMT)
commit	83e802796c80f46be616b48020356f7f51be533d (patch)
tree	e896b143abc3523f96e20d88ebcc22512af16aa7 /Lib
parent	32ca3dcb97a75c05dc2b90c88bbf82a541c57c61 (diff)
download	cpython-83e802796c80f46be616b48020356f7f51be533d.zip cpython-83e802796c80f46be616b48020356f7f51be533d.tar.gz cpython-83e802796c80f46be616b48020356f7f51be533d.tar.bz2