diff options
-rw-r--r-- | Lib/sre_compile.py | 117 | ||||
-rw-r--r-- | Misc/NEWS | 4 | ||||
-rw-r--r-- | Modules/_sre.c | 3 | ||||
-rw-r--r-- | Modules/sre_lib.h | 53 |
4 files changed, 99 insertions, 78 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 502b061..4edb03f 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -409,57 +409,39 @@ def _generate_overlap_table(prefix): table[i] = idx + 1 return table -def _compile_info(code, pattern, flags): - # internal: compile an info block. in the current version, - # this contains min/max pattern width, and an optional literal - # prefix or a character map - lo, hi = pattern.getwidth() - if hi > MAXCODE: - hi = MAXCODE - if lo == 0: - code.extend([INFO, 4, 0, lo, hi]) - return - # look for a literal prefix +def _get_literal_prefix(pattern): + # look for literal prefix prefix = [] prefixappend = prefix.append - prefix_skip = 0 + prefix_skip = None + got_all = True + for op, av in pattern.data: + if op is LITERAL: + prefixappend(av) + elif op is SUBPATTERN: + prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1]) + if prefix_skip is None: + if av[0] is not None: + prefix_skip = len(prefix) + elif prefix_skip1 is not None: + prefix_skip = len(prefix) + prefix_skip1 + prefix.extend(prefix1) + if not got_all: + break + else: + got_all = False + break + return prefix, prefix_skip, got_all + +def _get_charset_prefix(pattern): charset = [] # not used charsetappend = charset.append - if not (flags & SRE_FLAG_IGNORECASE): - # look for literal prefix - for op, av in pattern.data: + if pattern.data: + op, av = pattern.data[0] + if op is SUBPATTERN and av[1]: + op, av = av[1][0] if op is LITERAL: - if len(prefix) == prefix_skip: - prefix_skip = prefix_skip + 1 - prefixappend(av) - elif op is SUBPATTERN and len(av[1]) == 1: - op, av = av[1][0] - if op is LITERAL: - prefixappend(av) - else: - break - else: - break - # if no prefix, look for charset prefix - if not prefix and pattern.data: - op, av = pattern.data[0] - if op is SUBPATTERN and av[1]: - op, av = av[1][0] - if op is LITERAL: - charsetappend((op, av)) - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c + charsetappend((op, av)) elif op is BRANCH: c = [] cappend = c.append @@ -473,8 +455,43 @@ def _compile_info(code, pattern, flags): break else: charset = c - elif op is IN: - charset = av + elif op is BRANCH: + c = [] + cappend = c.append + for p in av[1]: + if not p: + break + op, av = p[0] + if op is LITERAL: + cappend((op, av)) + else: + break + else: + charset = c + elif op is IN: + charset = av + return charset + +def _compile_info(code, pattern, flags): + # internal: compile an info block. in the current version, + # this contains min/max pattern width, and an optional literal + # prefix or a character map + lo, hi = pattern.getwidth() + if hi > MAXCODE: + hi = MAXCODE + if lo == 0: + code.extend([INFO, 4, 0, lo, hi]) + return + # look for a literal prefix + prefix = [] + prefix_skip = 0 + charset = [] # not used + if not (flags & SRE_FLAG_IGNORECASE): + # look for literal prefix + prefix, prefix_skip, got_all = _get_literal_prefix(pattern) + # if no prefix, look for charset prefix + if not prefix: + charset = _get_charset_prefix(pattern) ## if prefix: ## print("*** PREFIX", prefix, prefix_skip) ## if charset: @@ -487,7 +504,7 @@ def _compile_info(code, pattern, flags): mask = 0 if prefix: mask = SRE_INFO_PREFIX - if len(prefix) == prefix_skip == len(pattern.data): + if prefix_skip is None and got_all: mask = mask | SRE_INFO_LITERAL elif charset: mask = mask | SRE_INFO_CHARSET @@ -502,6 +519,8 @@ def _compile_info(code, pattern, flags): # add literal prefix if prefix: emit(len(prefix)) # length + if prefix_skip is None: + prefix_skip = len(prefix) emit(prefix_skip) # skip code.extend(prefix) # generate overlap table @@ -13,6 +13,10 @@ Core and Builtins Library ------- +- Issue #24426: Fast searching optimization in regular expressions now works + for patterns that starts with capturing groups. Fast searching optimization + now can't be disabled at compile time. + Documentation ------------- diff --git a/Modules/_sre.c b/Modules/_sre.c index 4016a45..1d90281 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -62,9 +62,6 @@ static char copyright[] = /* -------------------------------------------------------------------- */ /* optional features */ -/* enables fast searching */ -#define USE_FAST_SEARCH - /* enables copy/deepcopy handling (work in progress) */ #undef USE_BUILTIN_COPY diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 463a908..422f168 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1248,7 +1248,32 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) prefix, prefix_len, prefix_skip)); TRACE(("charset = %p\n", charset)); -#if defined(USE_FAST_SEARCH) + if (prefix_len == 1) { + /* pattern starts with a literal character */ + SRE_CHAR c = (SRE_CHAR) prefix[0]; +#if SIZEOF_SRE_CHAR < 4 + if ((SRE_CODE) c != prefix[0]) + return 0; /* literal can't match: doesn't fit in char width */ +#endif + end = (SRE_CHAR *)state->end; + while (ptr < end) { + while (*ptr != c) { + if (++ptr >= end) + return 0; + } + TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); + state->start = ptr; + state->ptr = ptr + prefix_skip; + if (flags & SRE_INFO_LITERAL) + return 1; /* we got all of it */ + status = SRE(match)(state, pattern + 2*prefix_skip, 0); + if (status != 0) + return status; + ++ptr; + } + return 0; + } + if (prefix_len > 1) { /* pattern starts with a known prefix. use the overlap table to skip forward as fast as we possibly can */ @@ -1297,32 +1322,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) } return 0; } -#endif - if (pattern[0] == SRE_OP_LITERAL) { - /* pattern starts with a literal character. this is used - for short prefixes, and if fast search is disabled */ - SRE_CHAR c = (SRE_CHAR) pattern[1]; -#if SIZEOF_SRE_CHAR < 4 - if ((SRE_CODE) c != pattern[1]) - return 0; /* literal can't match: doesn't fit in char width */ -#endif - end = (SRE_CHAR *)state->end; - while (ptr < end) { - while (*ptr != c) { - if (++ptr >= end) - return 0; - } - TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); - state->start = ptr; - state->ptr = ++ptr; - if (flags & SRE_INFO_LITERAL) - return 1; /* we got all of it */ - status = SRE(match)(state, pattern + 2, 0); - if (status != 0) - break; - } - } else if (charset) { + if (charset) { /* pattern starts with a character from a known set */ end = (SRE_CHAR *)state->end; for (;;) { |