diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2017-05-09 20:37:14 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-05-09 20:37:14 (GMT) |
commit | 6d336a027913327fc042b0d758a16724fea27b9c (patch) | |
tree | ca511a6c75e340ef3493674b791f05a692e0c9e2 /Lib/sre_compile.py | |
parent | f93234bb8a87855f295d441524e519481ce6ab13 (diff) | |
download | cpython-6d336a027913327fc042b0d758a16724fea27b9c.zip cpython-6d336a027913327fc042b0d758a16724fea27b9c.tar.gz cpython-6d336a027913327fc042b0d758a16724fea27b9c.tar.bz2 |
bpo-30285: Optimize case-insensitive matching and searching (#1482)
of regular expressions.
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- | Lib/sre_compile.py | 171 |
1 files changed, 102 insertions, 69 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index db8b8a2..cebecb9 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -69,13 +69,16 @@ def _compile(code, pattern, flags): REPEATING_CODES = _REPEATING_CODES SUCCESS_CODES = _SUCCESS_CODES ASSERT_CODES = _ASSERT_CODES + iscased = None tolower = None fixes = None if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + iscased = _sre.unicode_iscased tolower = _sre.unicode_tolower fixes = _ignorecase_fixes else: + iscased = _sre.ascii_iscased tolower = _sre.ascii_tolower for op, av in pattern: if op in LITERAL_CODES: @@ -85,6 +88,9 @@ def _compile(code, pattern, flags): elif flags & SRE_FLAG_LOCALE: emit(OP_LOC_IGNORE[op]) emit(av) + elif not iscased(av): + emit(op) + emit(av) else: lo = tolower(av) if fixes and lo in fixes: @@ -101,14 +107,15 @@ def _compile(code, pattern, flags): emit(OP_IGNORE[op]) emit(lo) elif op is IN: - if not flags & SRE_FLAG_IGNORECASE: - emit(op) - elif flags & SRE_FLAG_LOCALE: + charset, hascased = _optimize_charset(av, iscased, tolower, fixes) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: emit(IN_LOC_IGNORE) - else: + elif hascased: emit(IN_IGNORE) + else: + emit(IN) skip = _len(code); emit(0) - _compile_charset(av, flags, code, tolower, fixes) + _compile_charset(charset, flags, code) code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: @@ -223,10 +230,10 @@ def _compile(code, pattern, flags): else: raise error("internal: unsupported operand type %r" % (op,)) -def _compile_charset(charset, flags, code, fixup=None, fixes=None): +def _compile_charset(charset, flags, code): # compile charset subprogram emit = code.append - for op, av in _optimize_charset(charset, fixup, fixes): + for op, av in charset: emit(op) if op is NEGATE: pass @@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None): raise error("internal: unsupported set operator %r" % (op,)) emit(FAILURE) -def _optimize_charset(charset, fixup, fixes): +def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): # internal: optimize character set out = [] tail = [] charmap = bytearray(256) + hascased = False for op, av in charset: while True: try: @@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes): if fixes and lo in fixes: for k in fixes[lo]: charmap[k] = 1 + if not hascased and iscased(av): + hascased = True else: charmap[av] = 1 elif op is RANGE: r = range(av[0], av[1]+1) if fixup: - r = map(fixup, r) - if fixup and fixes: - for i in r: - charmap[i] = 1 - if i in fixes: - for k in fixes[i]: - charmap[k] = 1 + if fixes: + for i in map(fixup, r): + charmap[i] = 1 + if i in fixes: + for k in fixes[i]: + charmap[k] = 1 + else: + for i in map(fixup, r): + charmap[i] = 1 + if not hascased: + hascased = any(map(iscased, r)) else: for i in r: charmap[i] = 1 @@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes): charmap += b'\0' * 0xff00 continue # Character set contains non-BMP character codes. - # There are only two ranges of cased non-BMP characters: - # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), - # and for both ranges RANGE_IGNORE works. - if fixup and op is RANGE: - op = RANGE_IGNORE + if fixup: + hascased = True + # There are only two ranges of cased non-BMP characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), + # and for both ranges RANGE_IGNORE works. + if op is RANGE: + op = RANGE_IGNORE tail.append((op, av)) break @@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes): out.append((RANGE, (p, q - 1))) out += tail # if the case was changed or new representation is more compact - if fixup or len(out) < len(charset): - return out + if hascased or len(out) < len(charset): + return out, hascased # else original character set is good enough - return charset + return charset, hascased # use bitmap if len(charmap) == 256: data = _mk_bitmap(charmap) out.append((CHARSET, data)) out += tail - return out + return out, hascased # To represent a big charset, first a bitmap of all characters in the # set is constructed. Then, this bitmap is sliced into chunks of 256 @@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes): data[0:0] = [block] + _bytes_to_codes(mapping) out.append((BIGCHARSET, data)) out += tail - return out + return out, hascased _CODEBITS = _sre.CODESIZE * 8 MAXCODE = (1 << _CODEBITS) - 1 @@ -414,19 +430,31 @@ def _generate_overlap_table(prefix): table[i] = idx + 1 return table -def _get_literal_prefix(pattern): +def _get_iscased(flags): + if not flags & SRE_FLAG_IGNORECASE: + return None + elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + return _sre.unicode_iscased + else: + return _sre.ascii_iscased + +def _get_literal_prefix(pattern, flags): # look for literal prefix prefix = [] prefixappend = prefix.append prefix_skip = None + iscased = _get_iscased(flags) for op, av in pattern.data: if op is LITERAL: + if iscased and iscased(av): + break prefixappend(av) elif op is SUBPATTERN: group, add_flags, del_flags, p = av - if add_flags & SRE_FLAG_IGNORECASE: + flags1 = (flags | add_flags) & ~del_flags + if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: break - prefix1, prefix_skip1, got_all = _get_literal_prefix(p) + prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) if prefix_skip is None: if group is not None: prefix_skip = len(prefix) @@ -441,46 +469,49 @@ def _get_literal_prefix(pattern): return prefix, prefix_skip, True return prefix, prefix_skip, False -def _get_charset_prefix(pattern): - charset = [] # not used - charsetappend = charset.append - if pattern.data: +def _get_charset_prefix(pattern, flags): + while True: + if not pattern.data: + return None op, av = pattern.data[0] - if op is SUBPATTERN: - group, add_flags, del_flags, p = av - if p and not (add_flags & SRE_FLAG_IGNORECASE): - op, av = p[0] - if op is LITERAL: - charsetappend((op, av)) - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break + if op is not SUBPATTERN: + break + group, add_flags, del_flags, pattern = av + flags = (flags | add_flags) & ~del_flags + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + return None + + iscased = _get_iscased(flags) + if op is LITERAL: + if iscased and iscased(av): + return None + return [(op, av)] + elif op is BRANCH: + charset = [] + charsetappend = charset.append + for p in av[1]: + if not p: + return None + op, av = p[0] + if op is LITERAL and not (iscased and iscased(av)): + charsetappend((op, av)) else: - charset = c - elif op is IN: - charset = av - return charset + return None + return charset + elif op is IN: + charset = av + if iscased: + for op, av in charset: + if op is LITERAL: + if iscased(av): + return None + elif op is RANGE: + if av[1] > 0xffff: + return None + if any(map(iscased, range(av[0], av[1]+1))): + return None + return charset + return None def _compile_info(code, pattern, flags): # internal: compile an info block. in the current version, @@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags): prefix = [] prefix_skip = 0 charset = [] # not used - if not (flags & SRE_FLAG_IGNORECASE): + if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): # look for literal prefix - prefix, prefix_skip, got_all = _get_literal_prefix(pattern) + prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) # if no prefix, look for charset prefix if not prefix: - charset = _get_charset_prefix(pattern) + charset = _get_charset_prefix(pattern, flags) ## if prefix: ## print("*** PREFIX", prefix, prefix_skip) ## if charset: @@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags): # generate overlap table code.extend(_generate_overlap_table(prefix)) elif charset: + charset, hascased = _optimize_charset(charset) + assert not hascased _compile_charset(charset, flags, code) code[skip] = len(code) - skip |