diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2014-10-31 10:36:56 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2014-10-31 10:36:56 (GMT) |
commit | 4b8f8949b43715f1b0f0ef77e15e19c180ccc195 (patch) | |
tree | bebc1eda94d11692278f03c41c683b5b8ca815dd /Lib/sre_compile.py | |
parent | 455de40a6e99ad7548e6061733f9c5dae2327e83 (diff) | |
download | cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.zip cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.gz cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.bz2 |
Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- | Lib/sre_compile.py | 35 |
1 file changed, 21 insertions, 14 deletions
```diff
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index d4d129b..1b3e9f8 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -22,9 +22,6 @@ if _sre.CODESIZE == 2:
 else:
     MAXCODE = 0xFFFFFFFF
 
-def _identityfunction(x):
-    return x
-
 _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
 _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
 _SUCCESS_CODES = set([SUCCESS, FAILURE])
@@ -53,7 +50,7 @@ def _compile(code, pattern, flags):
                     return _sre.getlower(literal, flags)
             else:
                 emit(OPCODES[op])
-                fixup = _identityfunction
+                fixup = None
             skip = _len(code); emit(0)
             _compile_charset(av, flags, code, fixup)
             code[skip] = _len(code) - skip
@@ -172,17 +169,15 @@ def _compile(code, pattern, flags):
 def _compile_charset(charset, flags, code, fixup=None):
     # compile charset subprogram
     emit = code.append
-    if fixup is None:
-        fixup = _identityfunction
     for op, av in _optimize_charset(charset, fixup):
         emit(OPCODES[op])
         if op is NEGATE:
             pass
         elif op is LITERAL:
-            emit(fixup(av))
-        elif op is RANGE:
-            emit(fixup(av[0]))
-            emit(fixup(av[1]))
+            emit(av)
+        elif op is RANGE or op is RANGE_IGNORE:
+            emit(av[0])
+            emit(av[1])
         elif op is CHARSET:
             code.extend(av)
         elif op is BIGCHARSET:
@@ -207,9 +202,14 @@ def _optimize_charset(charset, fixup):
         while True:
             try:
                 if op is LITERAL:
-                    charmap[fixup(av)] = 1
+                    if fixup:
+                        av = fixup(av)
+                    charmap[av] = 1
                 elif op is RANGE:
-                    for i in range(fixup(av[0]), fixup(av[1])+1):
+                    r = range(av[0], av[1]+1)
+                    if fixup:
+                        r = map(fixup, r)
+                    for i in r:
                         charmap[i] = 1
                 elif op is NEGATE:
                     out.append((op, av))
@@ -220,7 +220,12 @@ def _optimize_charset(charset, fixup):
                     # character set contains non-UCS1 character codes
                     charmap += b'\0' * 0xff00
                     continue
-                # character set contains non-BMP character codes
+                # Character set contains non-BMP character codes.
+                # There are only two ranges of cased non-BMP characters:
+                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+                # and for both ranges RANGE_IGNORE works.
+                if fixup and op is RANGE:
+                    op = RANGE_IGNORE
                 tail.append((op, av))
             break
 
@@ -247,8 +252,10 @@ def _optimize_charset(charset, fixup):
             else:
                 out.append((RANGE, (p, q - 1)))
         out += tail
-        if len(out) < len(charset):
+        # if the case was changed or new representation is more compact
+        if fixup or len(out) < len(charset):
             return out
+        # else original character set is good enough
         return charset
 
     # use bitmap
```