From 5908300e4b0891fc5ab8bd24fba8fac72012eaa7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 13 Apr 2017 21:06:43 +0300 Subject: bpo-29995: re.escape() now escapes only special characters. (#1007) --- Doc/library/re.rst | 10 +++++++--- Doc/tools/susp-ignored.csv | 2 +- Lib/idlelib/idle_test/test_replace.py | 4 ++-- Lib/re.py | 36 +++++++++------------------------- Lib/test/test_re.py | 37 ++++++++++++++++++----------------- Misc/NEWS | 2 ++ 6 files changed, 40 insertions(+), 51 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 3213daf..ce90ec7 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -786,7 +786,7 @@ form. .. function:: escape(pattern) - Escape all the characters in *pattern* except ASCII letters, numbers and ``'_'``. + Escape special characters in *pattern*. This is useful if you want to match an arbitrary literal string that may have regular expression metacharacters in it. For example:: @@ -795,15 +795,19 @@ form. >>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:" >>> print('[%s]+' % re.escape(legal_chars)) - [abcdefghijklmnopqrstuvwxyz0123456789\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:]+ + [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+ >>> operators = ['+', '-', '*', '/', '**'] >>> print('|'.join(map(re.escape, sorted(operators, reverse=True)))) - \/|\-|\+|\*\*|\* + /|\-|\+|\*\*|\* .. versionchanged:: 3.3 The ``'_'`` character is no longer escaped. + .. versionchanged:: 3.7 + Only characters that can have special meaning in a regular expression + are escaped. + .. function:: purge() diff --git a/Doc/tools/susp-ignored.csv b/Doc/tools/susp-ignored.csv index df67f75..01b1d98 100644 --- a/Doc/tools/susp-ignored.csv +++ b/Doc/tools/susp-ignored.csv @@ -303,7 +303,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a whatsnew/3.2,,:location,zope9-location = ${zope9:location} whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf library/re,,`,!#$%&'*+-.^_`|~: -library/re,,`,\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\: +library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~: library/tarfile,,:xz,'x:xz' library/xml.etree.elementtree,,:sometag,prefix:sometag library/xml.etree.elementtree,,:fictional,"@_`~' + def test_re_escape(self): - alnum_chars = string.ascii_letters + string.digits + '_' p = ''.join(chr(i) for i in range(256)) for c in p: - if c in alnum_chars: - self.assertEqual(re.escape(c), c) - elif c == '\x00': - self.assertEqual(re.escape(c), '\\000') - else: - self.assertEqual(re.escape(c), '\\' + c) self.assertMatch(re.escape(c), c) + self.assertMatch('[' + re.escape(c) + ']', c) + self.assertMatch('(?x)' + re.escape(c), c) self.assertMatch(re.escape(p), p) + for c in '-.]{}': + self.assertEqual(re.escape(c)[:1], '\\') + literal_chars = self.LITERAL_CHARS + self.assertEqual(re.escape(literal_chars), literal_chars) - def test_re_escape_byte(self): - alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii') + def test_re_escape_bytes(self): p = bytes(range(256)) for i in p: b = bytes([i]) - if b in alnum_chars: - self.assertEqual(re.escape(b), b) - elif i == 0: - self.assertEqual(re.escape(b), b'\\000') - else: - self.assertEqual(re.escape(b), b'\\' + b) self.assertMatch(re.escape(b), b) + self.assertMatch(b'[' + re.escape(b) + b']', b) + self.assertMatch(b'(?x)' + re.escape(b), b) self.assertMatch(re.escape(p), p) + for i in b'-.]{}': + b = bytes([i]) + self.assertEqual(re.escape(b)[:1], b'\\') + literal_chars = self.LITERAL_CHARS.encode('ascii') + self.assertEqual(re.escape(literal_chars), literal_chars) def test_re_escape_non_ascii(self): s = 'xxx\u2620\u2620\u2620xxx' s_escaped = re.escape(s) - self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') + self.assertEqual(s_escaped, s) self.assertMatch(s_escaped, s) self.assertMatch('.%s+.' % re.escape('\u2620'), s, 'x\u2620\u2620\u2620x', (2, 7), re.search) @@ -955,7 +956,7 @@ class ReTests(unittest.TestCase): def test_re_escape_non_ascii_bytes(self): b = 'y\u2620y\u2620y'.encode('utf-8') b_escaped = re.escape(b) - self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') + self.assertEqual(b_escaped, b) self.assertMatch(b_escaped, b) res = re.findall(re.escape('\u2620'.encode('utf-8')), b) self.assertEqual(len(res), 2) diff --git a/Misc/NEWS b/Misc/NEWS index 440f0b2..ec85455 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -320,6 +320,8 @@ Library - bpo-29998: Pickling and copying ImportError now preserves name and path attributes. +- bpo-29995: re.escape() now escapes only regex special characters. + - bpo-29962: Add math.remainder operation, implementing remainder as specified in IEEE 754. -- cgit v0.12