diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2017-05-05 05:53:40 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-05-05 05:53:40 (GMT) |
commit | 898ff03e1e7925ecde3da66327d3cdc7e07625ba (patch) | |
tree | 977fc4b98c0e85816348cebd3b12026407c368b6 /Lib | |
parent | 647c3d381e67490e82cdbbe6c96e46d5e1628ce2 (diff) | |
download | cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.zip cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.gz cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.bz2 |
bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time. Only the locale at matching
time affects the result of matching.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/re.py | 12 | ||||
-rw-r--r-- | Lib/sre_compile.py | 24 | ||||
-rw-r--r-- | Lib/sre_constants.py | 10 | ||||
-rw-r--r-- | Lib/test/test_re.py | 32 |
4 files changed, 58 insertions, 20 deletions
```diff
@@ -268,9 +268,7 @@ _MAXCACHE = 512
 def _compile(pattern, flags):
     # internal: compile pattern
     try:
-        p, loc = _cache[type(pattern), pattern, flags]
-        if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
-            return p
+        return _cache[type(pattern), pattern, flags]
     except KeyError:
         pass
     if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@ def _compile(pattern, flags):
     if not (flags & DEBUG):
         if len(_cache) >= _MAXCACHE:
             _cache.clear()
-        if p.flags & LOCALE:
-            if not _locale:
-                return p
-            loc = _locale.setlocale(_locale.LC_CTYPE)
-        else:
-            loc = None
-        _cache[type(pattern), pattern, flags] = p, loc
+        _cache[type(pattern), pattern, flags] = p
     return p
 
 @functools.lru_cache(_MAXCACHE)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2cc3900..d7ee4e8 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
     fixes = None
     for op, av in pattern:
         if op in LITERAL_CODES:
-            if flags & SRE_FLAG_IGNORECASE:
+            if not flags & SRE_FLAG_IGNORECASE:
+                emit(op)
+                emit(av)
+            elif flags & SRE_FLAG_LOCALE:
+                emit(OP_LOC_IGNORE[op])
+                emit(av)
+            else:
                 lo = _sre.getlower(av, flags)
                 if fixes and lo in fixes:
                     emit(IN_IGNORE)
@@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
                 else:
                     emit(OP_IGNORE[op])
                     emit(lo)
-            else:
-                emit(op)
-                emit(av)
         elif op is IN:
-            if flags & SRE_FLAG_IGNORECASE:
-                emit(OP_IGNORE[op])
-                def fixup(literal, flags=flags):
-                    return _sre.getlower(literal, flags)
-            else:
+            if not flags & SRE_FLAG_IGNORECASE:
                 emit(op)
                 fixup = None
+            elif flags & SRE_FLAG_LOCALE:
+                emit(IN_LOC_IGNORE)
+                fixup = None
+            else:
+                emit(IN_IGNORE)
+                def fixup(literal, flags=flags):
+                    return _sre.getlower(literal, flags)
             skip = _len(code); emit(0)
             _compile_charset(av, flags, code, fixup, fixes)
             code[skip] = _len(code) - skip
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index fc684ae..b016431 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20140917
+MAGIC = 20170530
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -87,6 +87,9 @@ OPCODES = _makecodes("""
     SUBPATTERN
     MIN_REPEAT_ONE
     RANGE_IGNORE
+    LITERAL_LOC_IGNORE
+    NOT_LITERAL_LOC_IGNORE
+    IN_LOC_IGNORE
 
     MIN_REPEAT MAX_REPEAT
 """)
@@ -124,6 +127,11 @@ OP_IGNORE = {
     RANGE: RANGE_IGNORE,
 }
 
+OP_LOC_IGNORE = {
+    LITERAL: LITERAL_LOC_IGNORE,
+    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
+}
+
 AT_MULTILINE = {
     AT_BEGINNING: AT_BEGINNING_LINE,
     AT_END: AT_END_LINE
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index da5c953..7601dc8 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
         self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
         self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
 
+    def test_locale_compiled(self):
+        oldlocale = locale.setlocale(locale.LC_CTYPE)
+        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+        for loc in 'en_US.iso88591', 'en_US.utf8':
+            try:
+                locale.setlocale(locale.LC_CTYPE, loc)
+            except locale.Error:
+                # Unsupported locale on this system
+                self.skipTest('test needs %s locale' % loc)
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
+        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
+        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
+        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
+        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertTrue(p.match(b'\xe5\xe5'))
+            self.assertTrue(p.match(b'\xc5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertIsNone(p.match(b'\xe5\xe5'))
+            self.assertIsNone(p.match(b'\xc5\xc5'))
+        self.assertTrue(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
     def test_error(self):
         with self.assertRaises(re.error) as cm:
             re.compile('(\u20ac))')
```