From 4b8f8949b43715f1b0f0ef77e15e19c180ccc195 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 31 Oct 2014 12:36:56 +0200 Subject: Issue #17381: Fixed handling of case-insensitive ranges in regular expressions. Added new opcode RANGE_IGNORE. --- Lib/sre_compile.py | 35 +++++++++++++++++++++-------------- Lib/sre_constants.py | 9 ++++++--- Lib/test/test_re.py | 19 +++++++++++++++++++ Misc/NEWS | 3 +++ Modules/_sre.c | 28 +++++++++++++++++++++++++--- Modules/sre.h | 2 +- Modules/sre_constants.h | 3 ++- Modules/sre_lib.h | 28 ++++++++++++++++++++++------ 8 files changed, 99 insertions(+), 28 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index d4d129b..1b3e9f8 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -22,9 +22,6 @@ if _sre.CODESIZE == 2: else: MAXCODE = 0xFFFFFFFF -def _identityfunction(x): - return x - _LITERAL_CODES = set([LITERAL, NOT_LITERAL]) _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) _SUCCESS_CODES = set([SUCCESS, FAILURE]) @@ -53,7 +50,7 @@ def _compile(code, pattern, flags): return _sre.getlower(literal, flags) else: emit(OPCODES[op]) - fixup = _identityfunction + fixup = None skip = _len(code); emit(0) _compile_charset(av, flags, code, fixup) code[skip] = _len(code) - skip @@ -172,17 +169,15 @@ def _compile(code, pattern, flags): def _compile_charset(charset, flags, code, fixup=None): # compile charset subprogram emit = code.append - if fixup is None: - fixup = _identityfunction for op, av in _optimize_charset(charset, fixup): emit(OPCODES[op]) if op is NEGATE: pass elif op is LITERAL: - emit(fixup(av)) - elif op is RANGE: - emit(fixup(av[0])) - emit(fixup(av[1])) + emit(av) + elif op is RANGE or op is RANGE_IGNORE: + emit(av[0]) + emit(av[1]) elif op is CHARSET: code.extend(av) elif op is BIGCHARSET: @@ -207,9 +202,14 @@ def _optimize_charset(charset, fixup): while True: try: if op is LITERAL: - charmap[fixup(av)] = 1 + if fixup: + av = fixup(av) + charmap[av] = 1 elif op is RANGE: - for i in range(fixup(av[0]), fixup(av[1])+1): + r = range(av[0], av[1]+1) + if fixup: + r = map(fixup, r) + for i in r: charmap[i] = 1 elif op is NEGATE: out.append((op, av)) @@ -220,7 +220,12 @@ def _optimize_charset(charset, fixup): # character set contains non-UCS1 character codes charmap += b'\0' * 0xff00 continue - # character set contains non-BMP character codes + # Character set contains non-BMP character codes. + # There are only two ranges of cased non-BMP characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), + # and for both ranges RANGE_IGNORE works. + if fixup and op is RANGE: + op = RANGE_IGNORE tail.append((op, av)) break @@ -247,8 +252,10 @@ def _optimize_charset(charset, fixup): else: out.append((RANGE, (p, q - 1))) out += tail - if len(out) < len(charset): + # if the case was changed or new representation is more compact + if fixup or len(out) < len(charset): return out + # else original character set is good enough return charset # use bitmap diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 8815d1d..8296ecd 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20031017 +MAGIC = 20140917 from _sre import MAXREPEAT, MAXGROUPS @@ -56,6 +56,7 @@ NEGATE = "negate" NOT_LITERAL = "not_literal" NOT_LITERAL_IGNORE = "not_literal_ignore" RANGE = "range" +RANGE_IGNORE = "range_ignore" REPEAT = "repeat" REPEAT_ONE = "repeat_one" SUBPATTERN = "subpattern" @@ -121,7 +122,8 @@ OPCODES = [ REPEAT, REPEAT_ONE, SUBPATTERN, - MIN_REPEAT_ONE + MIN_REPEAT_ONE, + RANGE_IGNORE, ] @@ -159,7 +161,8 @@ OP_IGNORE = { GROUPREF: GROUPREF_IGNORE, IN: IN_IGNORE, LITERAL: LITERAL_IGNORE, - NOT_LITERAL: NOT_LITERAL_IGNORE + NOT_LITERAL: NOT_LITERAL_IGNORE, + RANGE: RANGE_IGNORE, } AT_MULTILINE = { diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 4029561..e09aa2b 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -601,6 +601,25 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") + def test_ignore_case_range(self): + # Issues #3511, #17381. + self.assertTrue(re.match(r'[9-a]', '_', re.I)) + self.assertIsNone(re.match(r'[9-A]', '_', re.I)) + self.assertTrue(re.match(br'[9-a]', b'_', re.I)) + self.assertIsNone(re.match(br'[9-A]', b'_', re.I)) + self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) + self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) + self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I)) + self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) + self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I)) + self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I)) + self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I)) + self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I)) + self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I)) + self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I)) + self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) + self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) + def test_category(self): self.assertEqual(re.match(r"(\s)", " ").group(1), " ") diff --git a/Misc/NEWS b/Misc/NEWS index dd2aeb6..cef3b92 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -180,6 +180,9 @@ Core and Builtins Library ------- +- Issue #17381: Fixed handling of case-insensitive ranges in regular + expressions. + - Issue #22410: Module level functions in the re module now cache compiled locale-dependent regular expressions taking into account the locale. diff --git a/Modules/_sre.c b/Modules/_sre.c index 0dc5212..63778f4 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -113,6 +113,11 @@ static unsigned int sre_lower(unsigned int ch) return ((ch) < 128 ? Py_TOLOWER(ch) : ch); } +static unsigned int sre_upper(unsigned int ch) +{ + return ((ch) < 128 ? Py_TOUPPER(ch) : ch); +} + /* locale-specific character predicates */ /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids * warnings when c's type supports only numbers < N+1 */ @@ -124,6 +129,11 @@ static unsigned int sre_lower_locale(unsigned int ch) return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch); } +static unsigned int sre_upper_locale(unsigned int ch) +{ + return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch); +} + /* unicode-specific character predicates */ #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch) @@ -137,6 +147,11 @@ static unsigned int sre_lower_unicode(unsigned int ch) return (unsigned int) Py_UNICODE_TOLOWER(ch); } +static unsigned int sre_upper_unicode(unsigned int ch) +{ + return (unsigned int) Py_UNICODE_TOUPPER(ch); +} + LOCAL(int) sre_category(SRE_CODE category, unsigned int ch) { @@ -377,12 +392,18 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->pos = start; state->endpos = end; - if (pattern->flags & SRE_FLAG_LOCALE) + if (pattern->flags & SRE_FLAG_LOCALE) { state->lower = sre_lower_locale; - else if (pattern->flags & SRE_FLAG_UNICODE) + state->upper = sre_upper_locale; + } + else if (pattern->flags & SRE_FLAG_UNICODE) { state->lower = sre_lower_unicode; - else + state->upper = sre_upper_unicode; + } + else { state->lower = sre_lower; + state->upper = sre_upper; + } return string; err: @@ -1567,6 +1588,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end) break; case SRE_OP_RANGE: + case SRE_OP_RANGE_IGNORE: GET_ARG; GET_ARG; break; diff --git a/Modules/sre.h b/Modules/sre.h index 35d198f..b632165 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -84,7 +84,7 @@ typedef struct { /* current repeat context */ SRE_REPEAT *repeat; /* hooks */ - SRE_TOLOWER_HOOK lower; + SRE_TOLOWER_HOOK lower, upper; } SRE_STATE; typedef struct { diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 5940d5a..6632442 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20031017 +#define SRE_MAGIC 20140917 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -44,6 +44,7 @@ #define SRE_OP_REPEAT_ONE 29 #define SRE_OP_SUBPATTERN 30 #define SRE_OP_MIN_REPEAT_ONE 31 +#define SRE_OP_RANGE_IGNORE 32 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 5c6c5a5..463a908 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -101,7 +101,7 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) } LOCAL(int) -SRE(charset)(SRE_CODE* set, SRE_CODE ch) +SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) { /* check if character is a member of the given set */ @@ -142,6 +142,20 @@ SRE(charset)(SRE_CODE* set, SRE_CODE ch) set += 2; break; + case SRE_OP_RANGE_IGNORE: + /* */ + { + SRE_CODE uch; + /* ch is already lower cased */ + if (set[0] <= ch && ch <= set[1]) + return ok; + uch = state->upper(ch); + if (set[0] <= uch && uch <= set[1]) + return ok; + set += 2; + break; + } + case SRE_OP_NEGATE: ok = !ok; break; @@ -193,7 +207,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) case SRE_OP_IN: /* repeated set */ TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); - while (ptr < end && SRE(charset)(pattern + 2, *ptr)) + while (ptr < end && SRE(charset)(state, pattern + 2, *ptr)) ptr++; break; @@ -628,7 +642,8 @@ entrance: /* match set member (or non_member) */ /* */ TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end || !SRE(charset)(ctx->pattern + 1, *ctx->ptr)) + if (ctx->ptr >= end || + !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr)) RETURN_FAILURE; ctx->pattern += ctx->pattern[0]; ctx->ptr++; @@ -657,7 +672,7 @@ entrance: case SRE_OP_IN_IGNORE: TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end - || !SRE(charset)(ctx->pattern+1, + || !SRE(charset)(state, ctx->pattern+1, (SRE_CODE)state->lower(*ctx->ptr))) RETURN_FAILURE; ctx->pattern += ctx->pattern[0]; @@ -688,7 +703,8 @@ entrance: continue; if (ctx->pattern[1] == SRE_OP_IN && (ctx->ptr >= end || - !SRE(charset)(ctx->pattern + 3, (SRE_CODE) *ctx->ptr))) + !SRE(charset)(state, ctx->pattern + 3, + (SRE_CODE) *ctx->ptr))) continue; state->ptr = ctx->ptr; DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); @@ -1310,7 +1326,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) /* pattern starts with a character from a known set */ end = (SRE_CHAR *)state->end; for (;;) { - while (ptr < end && !SRE(charset)(charset, *ptr)) + while (ptr < end && !SRE(charset)(state, charset, *ptr)) ptr++; if (ptr >= end) return 0; -- cgit v0.12