summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFredrik Lundh <fredrik@pythonware.com>2000-06-29 23:33:12 (GMT)
committerFredrik Lundh <fredrik@pythonware.com>2000-06-29 23:33:12 (GMT)
commit29c08beab08ae3e8b9686a119f5cf0afe99ed918 (patch)
treeaf7731824bc150a290f02095bdaecf37edaaf605
parent22e1bf7da556de6c14c1e3531db23ca2ff6d8fbb (diff)
downloadcpython-29c08beab08ae3e8b9686a119f5cf0afe99ed918.zip
cpython-29c08beab08ae3e8b9686a119f5cf0afe99ed918.tar.gz
cpython-29c08beab08ae3e8b9686a119f5cf0afe99ed918.tar.bz2
still trying to figure out how to fix the remaining
group reset problem. in the meantime, I added some optimizations: - added "inline" directive to LOCAL (this assumes that AC_C_INLINE does what it's supposed to do). to compile SRE on a non-unix platform that doesn't support inline, you have to add a "#define inline" somewhere... - added code to generate a SRE_OP_INFO primitive - added code to do fast prefix search (enabled by the USE_FAST_SEARCH define; default is on, in this release)
-rw-r--r--Lib/sre_compile.py57
-rw-r--r--Modules/_sre.c90
2 files changed, 134 insertions, 13 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index c042375..344dc29 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -23,6 +23,7 @@ else:
raise RuntimeError, "cannot find a useable array type"
def _compile(code, pattern, flags):
+ # internal: compile a (sub)pattern
emit = code.append
for op, av in pattern:
if op is ANY:
@@ -152,21 +153,75 @@ def _compile(code, pattern, flags):
else:
raise ValueError, ("unsupported operand type", op)
+def _compile_info(code, pattern, flags):
+ # internal: compile an info block. in the current version,
+ # this contains min/max pattern width and a literal prefix,
+ # if any
+ lo, hi = pattern.getwidth()
+ if lo == 0:
+ return # not worth it
+ # look for a literal prefix
+ prefix = []
+ if not (flags & SRE_FLAG_IGNORECASE):
+ for op, av in pattern.data:
+ if op is LITERAL:
+ prefix.append(ord(av))
+ else:
+ break
+ # add an info block
+ emit = code.append
+ emit(OPCODES[INFO])
+ skip = len(code); emit(0)
+ # literal flag
+ mask = 0
+ if len(prefix) == len(pattern.data):
+ mask = 1
+ emit(mask)
+ # pattern length
+ emit(lo)
+ if hi < 32768:
+ emit(hi)
+ else:
+ emit(0)
+ # add literal prefix
+ emit(len(prefix))
+ if prefix:
+ code.extend(prefix)
+ # generate overlap table
+ table = [-1] + ([0]*len(prefix))
+ for i in range(len(prefix)):
+ table[i+1] = table[i]+1
+ while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
+ table[i+1] = table[table[i+1]-1]+1
+ code.extend(table[1:]) # don't store first entry
+ code[skip] = len(code) - skip
+
def compile(p, flags=0):
# internal: convert pattern list to internal format
+
+ # compile, as necessary
if type(p) in (type(""), type(u"")):
import sre_parse
pattern = p
p = sre_parse.parse(p)
else:
pattern = None
+
flags = p.pattern.flags | flags
code = []
+
+ # compile info block
+ _compile_info(code, p, flags)
+
+ # compile the pattern
_compile(code, p.data, flags)
+
code.append(OPCODES[SUCCESS])
- # FIXME: <fl> get rid of this limitation
+
+ # FIXME: <fl> get rid of this limitation!
assert p.pattern.groups <= 100,\
"sorry, but this version only supports 100 named groups"
+
return _sre.compile(
pattern, flags,
array.array(WORDSIZE, code).tostring(),
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 206e8d0..6b0fa61 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -19,6 +19,7 @@
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
+ * 00-06-30 fl tuning, fast search (0.9.3)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@@ -29,8 +30,7 @@
#ifndef SRE_RECURSIVE
-static char
-copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB ";
+char copyright[] = " SRE 0.9.3 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
@@ -55,12 +55,15 @@ copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB ";
#define HAVE_UNICODE
#endif
+/* optional features */
+#define USE_FAST_SEARCH
+
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
-#define LOCAL(type) static type
+#define LOCAL(type) static inline type
#endif
/* error codes */
@@ -396,6 +399,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: enter\n", PTR(ptr)));
+ if (pattern[0] == SRE_OP_INFO) {
+ /* optimization info block */
+ /* args: <1=skip> <2=flags> <3=min> ... */
+ if (pattern[3] && (end - ptr) < pattern[3]) {
+ TRACE(("reject (got %d chars, need %d)\n",
+ (end - ptr), pattern[3]));
+ return 0;
+ }
+ pattern += pattern[1] + 1;
+ }
+
stackbase = stack = state->stackbase;
lastmark = state->lastmark;
@@ -917,20 +931,72 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* end = state->end;
int status = 0;
int prefix_len = 0;
- SRE_CODE* prefix = NULL;
+ SRE_CODE* prefix;
+ SRE_CODE* overlap;
+ int literal = 0;
if (pattern[0] == SRE_OP_INFO) {
- /* args: <skip> <min> <max> <prefix> <prefix data...> */
- end -= pattern[2];
- prefix_len = pattern[4];
- prefix = pattern + 5;
- pattern += pattern[1];
+ /* optimization info block */
+ /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */
+
+ if (pattern[3] > 0) {
+ /* adjust end point (but make sure we leave at least one
+ character in there) */
+ end -= pattern[3]-1;
+ if (end <= ptr)
+ end = ptr+1;
+ }
+
+ literal = pattern[2];
+
+ prefix = pattern + 6;
+ prefix_len = pattern[5];
+
+ overlap = prefix + prefix_len - 1;
+
+ pattern += 1 + pattern[1];
}
- /* if (prefix_len > 0) ... */
+#if defined(USE_FAST_SEARCH)
+ if (prefix_len > 1) {
+ /* pattern starts with a known prefix. use the overlap
+ table to skip forward as fast as we possibly can */
+ int i = 0;
+ end = state->end;
+ while (ptr < end) {
+ for (;;) {
+ if (*ptr != (SRE_CHAR) prefix[i]) {
+ if (!i)
+ break;
+ else
+ i = overlap[i];
+ } else {
+ if (++i == prefix_len) {
+ /* found a potential match */
+ TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
+ state->start = ptr - prefix_len + 1;
+ state->ptr = ptr + 1;
+ if (literal)
+ return 1; /* all of it */
+ status = SRE_MATCH(state, pattern + 2*prefix_len);
+ if (status != 0)
+ return status;
+ /* close but no cigar -- try again */
+ i = overlap[i];
+ }
+ break;
+ }
+
+ }
+ ptr++;
+ }
+ return 0;
+ }
+#endif
if (pattern[0] == SRE_OP_LITERAL) {
- /* pattern starts with a literal */
+ /* pattern starts with a literal character. this is used for
+ short prefixes, and if fast search is disabled*/
SRE_CHAR chr = (SRE_CHAR) pattern[1];
for (;;) {
while (ptr < end && *ptr != chr)
@@ -944,8 +1010,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
break;
}
-
} else
+ /* general case */
while (ptr <= end) {
TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
state->start = state->ptr = ptr++;