From 29c08beab08ae3e8b9686a119f5cf0afe99ed918 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Thu, 29 Jun 2000 23:33:12 +0000 Subject: still trying to figure out how to fix the remaining group reset problem. in the meantime, I added some optimizations: - added "inline" directive to LOCAL (this assumes that AC_C_INLINE does what it's supposed to do). to compile SRE on a non-unix platform that doesn't support inline, you have to add a "#define inline" somewhere... - added code to generate a SRE_OP_INFO primitive - added code to do fast prefix search (enabled by the USE_FAST_SEARCH define; default is on, in this release) --- Lib/sre_compile.py | 57 +++++++++++++++++++++++++++++++++- Modules/_sre.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 134 insertions(+), 13 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c042375..344dc29 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -23,6 +23,7 @@ else: raise RuntimeError, "cannot find a useable array type" def _compile(code, pattern, flags): + # internal: compile a (sub)pattern emit = code.append for op, av in pattern: if op is ANY: @@ -152,21 +153,75 @@ def _compile(code, pattern, flags): else: raise ValueError, ("unsupported operand type", op) +def _compile_info(code, pattern, flags): + # internal: compile an info block. in the current version, + # this contains min/max pattern width and a literal prefix, + # if any + lo, hi = pattern.getwidth() + if lo == 0: + return # not worth it + # look for a literal prefix + prefix = [] + if not (flags & SRE_FLAG_IGNORECASE): + for op, av in pattern.data: + if op is LITERAL: + prefix.append(ord(av)) + else: + break + # add an info block + emit = code.append + emit(OPCODES[INFO]) + skip = len(code); emit(0) + # literal flag + mask = 0 + if len(prefix) == len(pattern.data): + mask = 1 + emit(mask) + # pattern length + emit(lo) + if hi < 32768: + emit(hi) + else: + emit(0) + # add literal prefix + emit(len(prefix)) + if prefix: + code.extend(prefix) + # generate overlap table + table = [-1] + ([0]*len(prefix)) + for i in range(len(prefix)): + table[i+1] = table[i]+1 + while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: + table[i+1] = table[table[i+1]-1]+1 + code.extend(table[1:]) # don't store first entry + code[skip] = len(code) - skip + def compile(p, flags=0): # internal: convert pattern list to internal format + + # compile, as necessary if type(p) in (type(""), type(u"")): import sre_parse pattern = p p = sre_parse.parse(p) else: pattern = None + flags = p.pattern.flags | flags code = [] + + # compile info block + _compile_info(code, p, flags) + + # compile the pattern _compile(code, p.data, flags) + code.append(OPCODES[SUCCESS]) - # FIXME: get rid of this limitation + + # FIXME: get rid of this limitation! assert p.pattern.groups <= 100,\ "sorry, but this version only supports 100 named groups" + return _sre.compile( pattern, flags, array.array(WORDSIZE, code).tostring(), diff --git a/Modules/_sre.c b/Modules/_sre.c index 206e8d0..6b0fa61 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -19,6 +19,7 @@ * 00-06-25 fl major changes to better deal with nested repeats (0.9) * 00-06-28 fl fixed findall (0.9.1) * 00-06-29 fl fixed split, added more scanner features (0.9.2) + * 00-06-30 fl tuning, fast search (0.9.3) * * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * @@ -29,8 +30,7 @@ #ifndef SRE_RECURSIVE -static char -copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB "; +char copyright[] = " SRE 0.9.3 Copyright (c) 1997-2000 by Secret Labs AB "; #include "Python.h" @@ -55,12 +55,15 @@ copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB "; #define HAVE_UNICODE #endif +/* optional features */ +#define USE_FAST_SEARCH + #if defined(_MSC_VER) #pragma optimize("agtw", on) /* doesn't seem to make much difference... */ /* fastest possible local call under MSVC */ #define LOCAL(type) static __inline type __fastcall #else -#define LOCAL(type) static type +#define LOCAL(type) static inline type #endif /* error codes */ @@ -396,6 +399,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) TRACE(("%8d: enter\n", PTR(ptr))); + if (pattern[0] == SRE_OP_INFO) { + /* optimization info block */ + /* args: <1=skip> <2=flags> <3=min> ... */ + if (pattern[3] && (end - ptr) < pattern[3]) { + TRACE(("reject (got %d chars, need %d)\n", + (end - ptr), pattern[3])); + return 0; + } + pattern += pattern[1] + 1; + } + stackbase = stack = state->stackbase; lastmark = state->lastmark; @@ -917,20 +931,72 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) SRE_CHAR* end = state->end; int status = 0; int prefix_len = 0; - SRE_CODE* prefix = NULL; + SRE_CODE* prefix; + SRE_CODE* overlap; + int literal = 0; if (pattern[0] == SRE_OP_INFO) { - /* args: */ - end -= pattern[2]; - prefix_len = pattern[4]; - prefix = pattern + 5; - pattern += pattern[1]; + /* optimization info block */ + /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */ + + if (pattern[3] > 0) { + /* adjust end point (but make sure we leave at least one + character in there) */ + end -= pattern[3]-1; + if (end <= ptr) + end = ptr+1; + } + + literal = pattern[2]; + + prefix = pattern + 6; + prefix_len = pattern[5]; + + overlap = prefix + prefix_len - 1; + + pattern += 1 + pattern[1]; } - /* if (prefix_len > 0) ... */ +#if defined(USE_FAST_SEARCH) + if (prefix_len > 1) { + /* pattern starts with a known prefix. use the overlap + table to skip forward as fast as we possibly can */ + int i = 0; + end = state->end; + while (ptr < end) { + for (;;) { + if (*ptr != (SRE_CHAR) prefix[i]) { + if (!i) + break; + else + i = overlap[i]; + } else { + if (++i == prefix_len) { + /* found a potential match */ + TRACE(("%8d: === SEARCH === hit\n", PTR(ptr))); + state->start = ptr - prefix_len + 1; + state->ptr = ptr + 1; + if (literal) + return 1; /* all of it */ + status = SRE_MATCH(state, pattern + 2*prefix_len); + if (status != 0) + return status; + /* close but no cigar -- try again */ + i = overlap[i]; + } + break; + } + + } + ptr++; + } + return 0; + } +#endif if (pattern[0] == SRE_OP_LITERAL) { - /* pattern starts with a literal */ + /* pattern starts with a literal character. this is used for + short prefixes, and if fast search is disabled*/ SRE_CHAR chr = (SRE_CHAR) pattern[1]; for (;;) { while (ptr < end && *ptr != chr) @@ -944,8 +1010,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (status != 0) break; } - } else + /* general case */ while (ptr <= end) { TRACE(("%8d: === SEARCH ===\n", PTR(ptr))); state->start = state->ptr = ptr++; -- cgit v0.12