summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2017-05-05 05:53:40 (GMT)
committer: GitHub <noreply@github.com> 2017-05-05 05:53:40 (GMT)
commit: 898ff03e1e7925ecde3da66327d3cdc7e07625ba (patch)
tree: 977fc4b98c0e85816348cebd3b12026407c368b6 /Lib
parent: 647c3d381e67490e82cdbbe6c96e46d5e1628ce2 (diff)
download: cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.zip
cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.gz
cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.bz2
bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer depend on the locale at compile time. Only the locale at matching time affects the result of matching.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/re.py 12
-rw-r--r-- Lib/sre_compile.py 24
-rw-r--r-- Lib/sre_constants.py 10
-rw-r--r-- Lib/test/test_re.py 32
4 files changed, 58 insertions, 20 deletions
diff --git a/Lib/re.py b/Lib/re.py
index 7053edd..d0ee5db 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -268,9 +268,7 @@ _MAXCACHE = 512
def _compile(pattern, flags):
# internal: compile pattern
try:
- p, loc = _cache[type(pattern), pattern, flags]
- if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
- return p
+ return _cache[type(pattern), pattern, flags]
except KeyError:
pass
if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@ def _compile(pattern, flags):
if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE:
_cache.clear()
- if p.flags & LOCALE:
- if not _locale:
- return p
- loc = _locale.setlocale(_locale.LC_CTYPE)
- else:
- loc = None
- _cache[type(pattern), pattern, flags] = p, loc
+ _cache[type(pattern), pattern, flags] = p
return p
@functools.lru_cache(_MAXCACHE)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2cc3900..d7ee4e8 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
fixes = None
for op, av in pattern:
if op in LITERAL_CODES:
- if flags & SRE_FLAG_IGNORECASE:
+ if not flags & SRE_FLAG_IGNORECASE:
+ emit(op)
+ emit(av)
+ elif flags & SRE_FLAG_LOCALE:
+ emit(OP_LOC_IGNORE[op])
+ emit(av)
+ else:
lo = _sre.getlower(av, flags)
if fixes and lo in fixes:
emit(IN_IGNORE)
@@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
else:
emit(OP_IGNORE[op])
emit(lo)
- else:
- emit(op)
- emit(av)
elif op is IN:
- if flags & SRE_FLAG_IGNORECASE:
- emit(OP_IGNORE[op])
- def fixup(literal, flags=flags):
- return _sre.getlower(literal, flags)
- else:
+ if not flags & SRE_FLAG_IGNORECASE:
emit(op)
fixup = None
+ elif flags & SRE_FLAG_LOCALE:
+ emit(IN_LOC_IGNORE)
+ fixup = None
+ else:
+ emit(IN_IGNORE)
+ def fixup(literal, flags=flags):
+ return _sre.getlower(literal, flags)
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup, fixes)
code[skip] = _len(code) - skip
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index fc684ae..b016431 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20140917
+MAGIC = 20170530
from _sre import MAXREPEAT, MAXGROUPS
@@ -87,6 +87,9 @@ OPCODES = _makecodes("""
SUBPATTERN
MIN_REPEAT_ONE
RANGE_IGNORE
+ LITERAL_LOC_IGNORE
+ NOT_LITERAL_LOC_IGNORE
+ IN_LOC_IGNORE
MIN_REPEAT MAX_REPEAT
""")
@@ -124,6 +127,11 @@ OP_IGNORE = {
RANGE: RANGE_IGNORE,
}
+OP_LOC_IGNORE = {
+ LITERAL: LITERAL_LOC_IGNORE,
+ NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
+}
+
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index da5c953..7601dc8 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
+ def test_locale_compiled(self):
+ oldlocale = locale.setlocale(locale.LC_CTYPE)
+ self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+ for loc in 'en_US.iso88591', 'en_US.utf8':
+ try:
+ locale.setlocale(locale.LC_CTYPE, loc)
+ except locale.Error:
+ # Unsupported locale on this system
+ self.skipTest('test needs %s locale' % loc)
+
+ locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
+ p1 = re.compile(b'\xc5\xe5', re.L|re.I)
+ p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
+ p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
+ p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
+ for p in p1, p2, p3:
+ self.assertTrue(p.match(b'\xc5\xe5'))
+ self.assertTrue(p.match(b'\xe5\xe5'))
+ self.assertTrue(p.match(b'\xc5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xe5'))
+ self.assertIsNone(p4.match(b'\xc5\xc5'))
+
+ locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
+ for p in p1, p2, p3:
+ self.assertTrue(p.match(b'\xc5\xe5'))
+ self.assertIsNone(p.match(b'\xe5\xe5'))
+ self.assertIsNone(p.match(b'\xc5\xc5'))
+ self.assertTrue(p4.match(b'\xe5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xe5'))
+ self.assertIsNone(p4.match(b'\xc5\xc5'))
+
def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')