#2834: Change re module semantics, so that str and bytes mixing is forbidden,

and str (unicode) patterns get full unicode matching by default. The re.ASCII flag is also introduced to ask for ASCII matching instead.
author: Antoine Pitrou <solipsis@pitrou.net> 2008-08-19 17:56:33 (GMT)
committer: Antoine Pitrou <solipsis@pitrou.net> 2008-08-19 17:56:33 (GMT)
commit: fd036451bf0e0ade8783e21df801abf7be96d020 (patch)
tree: e70ff65a9e641d8e790bc091f0dc2507baf344ca /Lib/test
parent: 3ad7ba10a20827b24d4b1aa9dd49474db8affbdd (diff)
download: cpython-fd036451bf0e0ade8783e21df801abf7be96d020.zip
cpython-fd036451bf0e0ade8783e21df801abf7be96d020.tar.gz
cpython-fd036451bf0e0ade8783e21df801abf7be96d020.tar.bz2
4 files changed, 48 insertions, 23 deletions
diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py
index 220301a..d314e20 100755
--- a/Lib/test/re_tests.py
+++ b/Lib/test/re_tests.py
@@ -667,4 +667,4 @@ tests.extend([
     (r'\b.\b', 'a', SUCCEED, 'found', 'a'),
     (r'(?u)\b.\b', u, SUCCEED, 'found', u),
     (r'(?u)\w', u, SUCCEED, 'found', u),
-    ])
+])
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index c64ac19..630f862 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -506,7 +506,7 @@ class ByteArrayTest(BaseBytesTest):
         def by(s):
             return bytearray(map(ord, s))
         b = by("Hello, world")
-        self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")])
+        self.assertEqual(re.findall(br"\w+", b), [by("Hello"), by("world")])
 
     def test_setitem(self):
         b = bytearray([1, 2, 3])
diff --git a/Lib/test/test_mmap.py b/Lib/test/test_mmap.py
index 0b5202a..9fe044f 100644
--- a/Lib/test/test_mmap.py
+++ b/Lib/test/test_mmap.py
@@ -54,7 +54,7 @@ class MmapTests(unittest.TestCase):
         m.flush()
 
         # Test doing a regular expression match in an mmap'ed file
-        match = re.search('[A-Za-z]+', m)
+        match = re.search(b'[A-Za-z]+', m)
         if match is None:
             self.fail('regex match on mmap failed!')
         else:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 60b816e..755cb00 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -83,23 +83,6 @@ class ReTests(unittest.TestCase):
         self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
                          'abc\ndef\n')
 
-    def test_bug_1140(self):
-        # re.sub(x, y, b'') should return b'', not '', and
-        # re.sub(x, y, '') should return '', not b''.
-        # Also:
-        # re.sub(x, y, str(x)) should return str(y), and
-        # re.sub(x, y, bytes(x)) should return
-        #     str(y) if isinstance(y, str) else unicode(y).
-        for x in 'x',  b'x':
-            for y in 'y', b'y':
-                z = re.sub(x, y, b'')
-                self.assertEqual(z, b'')
-                self.assertEqual(type(z), bytes)
-                #
-                z = re.sub(x, y, '')
-                self.assertEqual(z, '')
-                self.assertEqual(type(z), str)
-
     def test_bug_1661(self):
         # Verify that flags do not get silently ignored with compiled patterns
         pattern = re.compile('.')
@@ -327,7 +310,7 @@ class ReTests(unittest.TestCase):
 
     def test_getattr(self):
         self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
-        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I)
+        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
         self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
         self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
         self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
@@ -614,8 +597,8 @@ class ReTests(unittest.TestCase):
         import array
         for typecode in 'bBuhHiIlLfd':
             a = array.array(typecode)
-            self.assertEqual(re.compile("bla").match(a), None)
-            self.assertEqual(re.compile("").match(a).groups(), ())
+            self.assertEqual(re.compile(b"bla").match(a), None)
+            self.assertEqual(re.compile(b"").match(a).groups(), ())
 
     def test_inline_flags(self):
         # Bug #1700
@@ -658,6 +641,48 @@ class ReTests(unittest.TestCase):
         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
 
+    def test_bytes_str_mixing(self):
+        # Mixing str and bytes is disallowed
+        pat = re.compile('.')
+        bpat = re.compile(b'.')
+        self.assertRaises(TypeError, pat.match, b'b')
+        self.assertRaises(TypeError, bpat.match, 'b')
+        self.assertRaises(TypeError, pat.sub, b'b', 'c')
+        self.assertRaises(TypeError, pat.sub, 'b', b'c')
+        self.assertRaises(TypeError, pat.sub, b'b', b'c')
+        self.assertRaises(TypeError, bpat.sub, b'b', 'c')
+        self.assertRaises(TypeError, bpat.sub, 'b', b'c')
+        self.assertRaises(TypeError, bpat.sub, 'b', 'c')
+
+    def test_ascii_and_unicode_flag(self):
+        # String patterns
+        for flags in (0, re.UNICODE):
+            pat = re.compile('\xc0', flags | re.IGNORECASE)
+            self.assertNotEqual(pat.match('\xe0'), None)
+            pat = re.compile('\w', flags)
+            self.assertNotEqual(pat.match('\xe0'), None)
+        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = re.compile('(?a)\xc0', re.IGNORECASE)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = re.compile('\w', re.ASCII)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = re.compile('(?a)\w')
+        self.assertEqual(pat.match('\xe0'), None)
+        # Bytes patterns
+        for flags in (0, re.ASCII):
+            pat = re.compile(b'\xc0', re.IGNORECASE)
+            self.assertEqual(pat.match(b'\xe0'), None)
+            pat = re.compile(b'\w')
+            self.assertEqual(pat.match(b'\xe0'), None)
+        # Incompatibilities
+        self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
+        self.assertRaises(ValueError, re.compile, b'(?u)\w')
+        self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
+        self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
+        self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
+        self.assertRaises(ValueError, re.compile, '(?au)\w')
+
 
 def run_re_tests():
     from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
author	Antoine Pitrou <solipsis@pitrou.net>	2008-08-19 17:56:33 (GMT)
committer	Antoine Pitrou <solipsis@pitrou.net>	2008-08-19 17:56:33 (GMT)
commit	fd036451bf0e0ade8783e21df801abf7be96d020 (patch)
tree	e70ff65a9e641d8e790bc091f0dc2507baf344ca /Lib/test
parent	3ad7ba10a20827b24d4b1aa9dd49474db8affbdd (diff)
download	cpython-fd036451bf0e0ade8783e21df801abf7be96d020.zip cpython-fd036451bf0e0ade8783e21df801abf7be96d020.tar.gz cpython-fd036451bf0e0ade8783e21df801abf7be96d020.tar.bz2