From 898ff03e1e7925ecde3da66327d3cdc7e07625ba Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 5 May 2017 08:53:40 +0300 Subject: bpo-30215: Make re.compile() locale agnostic. (#1361) Compiled regular expression objects with the re.LOCALE flag no longer depend on the locale at compile time. Only the locale at matching time affects the result of matching. --- Doc/library/re.rst | 5 ++++ Lib/re.py | 12 ++------- Lib/sre_compile.py | 24 ++++++++++------- Lib/sre_constants.py | 10 ++++++- Lib/test/test_re.py | 32 +++++++++++++++++++++++ Misc/NEWS | 4 +++ Modules/_sre.c | 3 +++ Modules/sre_constants.h | 5 +++- Modules/sre_lib.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++-- 9 files changed, 141 insertions(+), 23 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 0fa7196..131f372 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -559,6 +559,11 @@ form. :const:`re.LOCALE` can be used only with bytes patterns and is not compatible with :const:`re.ASCII`. + .. versionchanged:: 3.7 + Compiled regular expression objects with the :const:`re.LOCALE` flag no + longer depend on the locale at compile time. Only the locale at + matching time affects the result of matching. + .. data:: M MULTILINE diff --git a/Lib/re.py b/Lib/re.py index 7053edd..d0ee5db 100644 --- a/Lib/re.py +++ b/Lib/re.py @@ -268,9 +268,7 @@ _MAXCACHE = 512 def _compile(pattern, flags): # internal: compile pattern try: - p, loc = _cache[type(pattern), pattern, flags] - if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE): - return p + return _cache[type(pattern), pattern, flags] except KeyError: pass if isinstance(pattern, _pattern_type): @@ -284,13 +282,7 @@ def _compile(pattern, flags): if not (flags & DEBUG): if len(_cache) >= _MAXCACHE: _cache.clear() - if p.flags & LOCALE: - if not _locale: - return p - loc = _locale.setlocale(_locale.LC_CTYPE) - else: - loc = None - _cache[type(pattern), pattern, flags] = p, loc + _cache[type(pattern), pattern, flags] = p return p @functools.lru_cache(_MAXCACHE) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 2cc3900..d7ee4e8 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -78,7 +78,13 @@ def _compile(code, pattern, flags): fixes = None for op, av in pattern: if op in LITERAL_CODES: - if flags & SRE_FLAG_IGNORECASE: + if not flags & SRE_FLAG_IGNORECASE: + emit(op) + emit(av) + elif flags & SRE_FLAG_LOCALE: + emit(OP_LOC_IGNORE[op]) + emit(av) + else: lo = _sre.getlower(av, flags) if fixes and lo in fixes: emit(IN_IGNORE) @@ -93,17 +99,17 @@ def _compile(code, pattern, flags): else: emit(OP_IGNORE[op]) emit(lo) - else: - emit(op) - emit(av) elif op is IN: - if flags & SRE_FLAG_IGNORECASE: - emit(OP_IGNORE[op]) - def fixup(literal, flags=flags): - return _sre.getlower(literal, flags) - else: + if not flags & SRE_FLAG_IGNORECASE: emit(op) fixup = None + elif flags & SRE_FLAG_LOCALE: + emit(IN_LOC_IGNORE) + fixup = None + else: + emit(IN_IGNORE) + def fixup(literal, flags=flags): + return _sre.getlower(literal, flags) skip = _len(code); emit(0) _compile_charset(av, flags, code, fixup, fixes) code[skip] = _len(code) - skip diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index fc684ae..b016431 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20140917 +MAGIC = 20170530 from _sre import MAXREPEAT, MAXGROUPS @@ -87,6 +87,9 @@ OPCODES = _makecodes(""" SUBPATTERN MIN_REPEAT_ONE RANGE_IGNORE + LITERAL_LOC_IGNORE + NOT_LITERAL_LOC_IGNORE + IN_LOC_IGNORE MIN_REPEAT MAX_REPEAT """) @@ -124,6 +127,11 @@ OP_IGNORE = { RANGE: RANGE_IGNORE, } +OP_LOC_IGNORE = { + LITERAL: LITERAL_LOC_IGNORE, + NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, +} + AT_MULTILINE = { AT_BEGINNING: AT_BEGINNING_LINE, AT_END: AT_END_LINE diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index da5c953..7601dc8 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) + def test_locale_compiled(self): + oldlocale = locale.setlocale(locale.LC_CTYPE) + self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) + for loc in 'en_US.iso88591', 'en_US.utf8': + try: + locale.setlocale(locale.LC_CTYPE, loc) + except locale.Error: + # Unsupported locale on this system + self.skipTest('test needs %s locale' % loc) + + locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') + p1 = re.compile(b'\xc5\xe5', re.L|re.I) + p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I) + p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I) + p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I) + for p in p1, p2, p3: + self.assertTrue(p.match(b'\xc5\xe5')) + self.assertTrue(p.match(b'\xe5\xe5')) + self.assertTrue(p.match(b'\xc5\xc5')) + self.assertIsNone(p4.match(b'\xe5\xc5')) + self.assertIsNone(p4.match(b'\xe5\xe5')) + self.assertIsNone(p4.match(b'\xc5\xc5')) + + locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') + for p in p1, p2, p3: + self.assertTrue(p.match(b'\xc5\xe5')) + self.assertIsNone(p.match(b'\xe5\xe5')) + self.assertIsNone(p.match(b'\xc5\xc5')) + self.assertTrue(p4.match(b'\xe5\xc5')) + self.assertIsNone(p4.match(b'\xe5\xe5')) + self.assertIsNone(p4.match(b'\xc5\xc5')) + def test_error(self): with self.assertRaises(re.error) as cm: re.compile('(\u20ac))') diff --git a/Misc/NEWS b/Misc/NEWS index d76c76b..f2c1994 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -317,6 +317,10 @@ Extension Modules Library ------- +- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no + longer depend on the locale at compile time. Only the locale at matching + time affects the result of matching. + - bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process when Ctrl-C is received. diff --git a/Modules/_sre.c b/Modules/_sre.c index 03a138e..afb2bce 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_NOT_LITERAL: case SRE_OP_LITERAL_IGNORE: case SRE_OP_NOT_LITERAL_IGNORE: + case SRE_OP_LITERAL_LOC_IGNORE: + case SRE_OP_NOT_LITERAL_LOC_IGNORE: GET_ARG; /* The arg is just a character, nothing to check */ break; @@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_IN: case SRE_OP_IN_IGNORE: + case SRE_OP_IN_LOC_IGNORE: GET_SKIP; /* Stop 1 before the end; we check the FAILURE below */ if (!_validate_charset(code, code+skip-2)) diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 6632442..6d6d21e 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20140917 +#define SRE_MAGIC 20170530 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -45,6 +45,9 @@ #define SRE_OP_SUBPATTERN 30 #define SRE_OP_MIN_REPEAT_ONE 31 #define SRE_OP_RANGE_IGNORE 32 +#define SRE_OP_LITERAL_LOC_IGNORE 33 +#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34 +#define SRE_OP_IN_LOC_IGNORE 35 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 0865fc6..b540d21 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -101,6 +101,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) } LOCAL(int) +SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch) +{ + return ch == pattern + || (SRE_CODE) state->lower(ch) == pattern + || (SRE_CODE) state->upper(ch) == pattern; +} + +LOCAL(int) SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) { /* check if character is a member of the given set */ @@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) } } +LOCAL(int) +SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) +{ + SRE_CODE lo, up; + lo = state->lower(ch); + if (SRE(charset)(state, set, lo)) + return 1; + + up = state->upper(ch); + return up != lo && SRE(charset)(state, set, up); +} + LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all); LOCAL(Py_ssize_t) @@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) ptr++; break; + case SRE_OP_LITERAL_LOC_IGNORE: + /* repeated literal */ + chr = pattern[1]; + TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); + while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr)) + ptr++; + break; + case SRE_OP_NOT_LITERAL: /* repeated non-literal */ chr = pattern[1]; @@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) ptr++; break; + case SRE_OP_NOT_LITERAL_LOC_IGNORE: + /* repeated non-literal */ + chr = pattern[1]; + TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); + while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr)) + ptr++; + break; + default: /* repeated single character pattern */ TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); @@ -651,7 +687,17 @@ entrance: TRACE(("|%p|%p|LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*ctx->pattern)) + state->lower(*ctx->ptr) != *ctx->pattern) + RETURN_FAILURE; + ctx->pattern++; + ctx->ptr++; + break; + + case SRE_OP_LITERAL_LOC_IGNORE: + TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n", + ctx->pattern, ctx->ptr, ctx->pattern[0])); + if (ctx->ptr >= end + || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -661,7 +707,17 @@ entrance: TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end || - state->lower(*ctx->ptr) == state->lower(*ctx->pattern)) + state->lower(*ctx->ptr) == *ctx->pattern) + RETURN_FAILURE; + ctx->pattern++; + ctx->ptr++; + break; + + case SRE_OP_NOT_LITERAL_LOC_IGNORE: + TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n", + ctx->pattern, ctx->ptr, *ctx->pattern)); + if (ctx->ptr >= end + || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -677,6 +733,15 @@ entrance: ctx->ptr++; break; + case SRE_OP_IN_LOC_IGNORE: + TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr)); + if (ctx->ptr >= end + || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr)) + RETURN_FAILURE; + ctx->pattern += ctx->pattern[0]; + ctx->ptr++; + break; + case SRE_OP_JUMP: case SRE_OP_INFO: /* jump forward */ -- cgit v0.12