From e9e54ae2228e195e046e6325de018374b3247d2d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 31 Oct 2014 13:53:21 +0200 Subject: Issue #17381: Fixed ranges handling in case-insensitive regular expressions. --- Lib/sre_compile.py | 64 ++++++++++++++++++++++++++++++++++++++++------------- Lib/test/test_re.py | 37 +++++++++++++++++++++++++++++++ Misc/NEWS | 2 ++ 3 files changed, 88 insertions(+), 15 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index e121845..febbde1 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -21,9 +21,6 @@ if _sre.CODESIZE == 2: else: MAXCODE = 0xFFFFFFFFL -def _identityfunction(x): - return x - _LITERAL_CODES = set([LITERAL, NOT_LITERAL]) _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) _SUCCESS_CODES = set([SUCCESS, FAILURE]) @@ -52,7 +49,7 @@ def _compile(code, pattern, flags): return _sre.getlower(literal, flags) else: emit(OPCODES[op]) - fixup = _identityfunction + fixup = None skip = _len(code); emit(0) _compile_charset(av, flags, code, fixup) code[skip] = _len(code) - skip @@ -178,17 +175,15 @@ def _compile(code, pattern, flags): def _compile_charset(charset, flags, code, fixup=None): # compile charset subprogram emit = code.append - if fixup is None: - fixup = _identityfunction - for op, av in _optimize_charset(charset, fixup): + for op, av in _optimize_charset(charset, fixup, flags & SRE_FLAG_UNICODE): emit(OPCODES[op]) if op is NEGATE: pass elif op is LITERAL: - emit(fixup(av)) + emit(av) elif op is RANGE: - emit(fixup(av[0])) - emit(fixup(av[1])) + emit(av[0]) + emit(av[1]) elif op is CHARSET: code.extend(av) elif op is BIGCHARSET: @@ -204,7 +199,7 @@ def _compile_charset(charset, flags, code, fixup=None): raise error, "internal: unsupported set operator" emit(OPCODES[FAILURE]) -def _optimize_charset(charset, fixup): +def _optimize_charset(charset, fixup, isunicode): # internal: optimize character set out = [] tail = [] @@ -213,9 +208,15 @@ def _optimize_charset(charset, fixup): while True: try: if op is LITERAL: - charmap[fixup(av)] = 1 + i = av + if fixup: + i = fixup(i) + charmap[i] = 1 elif op is RANGE: - for i in range(fixup(av[0]), fixup(av[1])+1): + r = range(av[0], av[1]+1) + if fixup: + r = map(fixup, r) + for i in r: charmap[i] = 1 elif op is NEGATE: out.append((op, av)) @@ -227,7 +228,20 @@ def _optimize_charset(charset, fixup): charmap += b'\0' * 0xff00 continue # character set contains non-BMP character codes - tail.append((op, av)) + if fixup and isunicode and op is RANGE: + lo, hi = av + ranges = [av] + # There are only two ranges of cased astral characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi). + _fixup_range(max(0x10000, lo), min(0x11fff, hi), + ranges, fixup) + for lo, hi in ranges: + if lo == hi: + tail.append((LITERAL, hi)) + else: + tail.append((RANGE, (lo, hi))) + else: + tail.append((op, av)) break # compress character map @@ -253,8 +267,10 @@ def _optimize_charset(charset, fixup): else: out.append((RANGE, (p, q - 1))) out += tail - if len(out) < len(charset): + # if the case was changed or new representation is more compact + if fixup or len(out) < len(charset): return out + # else original character set is good enough return charset # use bitmap @@ -307,6 +323,24 @@ def _optimize_charset(charset, fixup): out += tail return out +def _fixup_range(lo, hi, ranges, fixup): + for i in map(fixup, range(lo, hi+1)): + for k, (lo, hi) in enumerate(ranges): + if i < lo: + if l == lo - 1: + ranges[k] = (i, hi) + else: + ranges.insert(k, (i, i)) + break + elif i > hi: + if i == hi + 1: + ranges[k] = (lo, i) + break + else: + break + else: + ranges.append((i, i)) + _CODEBITS = _sre.CODESIZE * 8 _BITS_TRANS = b'0' + b'1' * 255 def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 7921c4a..7a25dbe 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -474,6 +474,43 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") + def test_ignore_case_range(self): + # Issues #3511, #17381. + self.assertTrue(re.match(r'[9-a]', '_', re.I)) + self.assertIsNone(re.match(r'[9-A]', '_', re.I)) + self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) + self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) + self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I)) + self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) + if have_unicode: + self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I)) + self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\xc0-\xde]'), + u(r'\xd7'), re.U | re.I)) + self.assertIsNone(re.match(u(r'[\xc0-\xde]'), + u(r'\xf7'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\xe0-\xfe]'), + u(r'\xf7'), re.U | re.I)) + self.assertIsNone(re.match(u(r'[\xe0-\xfe]'), + u(r'\xd7'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\u0430-\u045f]'), + u(r'\u0450'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\u0430-\u045f]'), + u(r'\u0400'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\u0400-\u042f]'), + u(r'\u0450'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\u0400-\u042f]'), + u(r'\u0400'), re.U | re.I)) + if sys.maxunicode > 0xffff: + self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'), + u(r'\U00010428'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'), + u(r'\U00010400'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'), + u(r'\U00010428'), re.U | re.I)) + self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'), + u(r'\U00010400'), re.U | re.I)) + def test_category(self): self.assertEqual(re.match(r"(\s)", " ").group(1), " ") diff --git a/Misc/NEWS b/Misc/NEWS index ce0ed51..5c03887 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -37,6 +37,8 @@ Core and Builtins Library ------- +- Issue #17381: Fixed ranges handling in case-insensitive regular expressions. + - Issue #19329: Optimized compiling charsets in regular expressions. - Issue #22410: Module level functions in the re module now cache compiled -- cgit v0.12