summaryrefslogtreecommitdiffstats
path: root/Modules
diff options
context:
space:
mode:
Diffstat (limited to 'Modules')
-rw-r--r--Modules/_sre.c90
1 files changed, 78 insertions, 12 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 206e8d0..6b0fa61 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -19,6 +19,7 @@
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
+ * 00-06-30 fl tuning, fast search (0.9.3)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@@ -29,8 +30,7 @@
#ifndef SRE_RECURSIVE
-static char
-copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB ";
+char copyright[] = " SRE 0.9.3 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
@@ -55,12 +55,15 @@ copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB ";
#define HAVE_UNICODE
#endif
+/* optional features */
+#define USE_FAST_SEARCH
+
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
-#define LOCAL(type) static type
+#define LOCAL(type) static inline type
#endif
/* error codes */
@@ -396,6 +399,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: enter\n", PTR(ptr)));
+ if (pattern[0] == SRE_OP_INFO) {
+ /* optimization info block */
+ /* args: <1=skip> <2=flags> <3=min> ... */
+ if (pattern[3] && (end - ptr) < pattern[3]) {
+ TRACE(("reject (got %d chars, need %d)\n",
+ (end - ptr), pattern[3]));
+ return 0;
+ }
+ pattern += pattern[1] + 1;
+ }
+
stackbase = stack = state->stackbase;
lastmark = state->lastmark;
@@ -917,20 +931,72 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* end = state->end;
int status = 0;
int prefix_len = 0;
- SRE_CODE* prefix = NULL;
+ SRE_CODE* prefix;
+ SRE_CODE* overlap;
+ int literal = 0;
if (pattern[0] == SRE_OP_INFO) {
- /* args: <skip> <min> <max> <prefix> <prefix data...> */
- end -= pattern[2];
- prefix_len = pattern[4];
- prefix = pattern + 5;
- pattern += pattern[1];
+ /* optimization info block */
+ /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */
+
+ if (pattern[3] > 0) {
+ /* adjust end point (but make sure we leave at least one
+ character in there) */
+ end -= pattern[3]-1;
+ if (end <= ptr)
+ end = ptr+1;
+ }
+
+ literal = pattern[2];
+
+ prefix = pattern + 6;
+ prefix_len = pattern[5];
+
+ overlap = prefix + prefix_len - 1;
+
+ pattern += 1 + pattern[1];
}
- /* if (prefix_len > 0) ... */
+#if defined(USE_FAST_SEARCH)
+ if (prefix_len > 1) {
+ /* pattern starts with a known prefix. use the overlap
+ table to skip forward as fast as we possibly can */
+ int i = 0;
+ end = state->end;
+ while (ptr < end) {
+ for (;;) {
+ if (*ptr != (SRE_CHAR) prefix[i]) {
+ if (!i)
+ break;
+ else
+ i = overlap[i];
+ } else {
+ if (++i == prefix_len) {
+ /* found a potential match */
+ TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
+ state->start = ptr - prefix_len + 1;
+ state->ptr = ptr + 1;
+ if (literal)
+ return 1; /* all of it */
+ status = SRE_MATCH(state, pattern + 2*prefix_len);
+ if (status != 0)
+ return status;
+ /* close but no cigar -- try again */
+ i = overlap[i];
+ }
+ break;
+ }
+
+ }
+ ptr++;
+ }
+ return 0;
+ }
+#endif
if (pattern[0] == SRE_OP_LITERAL) {
- /* pattern starts with a literal */
+ /* pattern starts with a literal character. this is used for
+ short prefixes, and if fast search is disabled*/
SRE_CHAR chr = (SRE_CHAR) pattern[1];
for (;;) {
while (ptr < end && *ptr != chr)
@@ -944,8 +1010,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
break;
}
-
} else
+ /* general case */
while (ptr <= end) {
TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
state->start = state->ptr = ptr++;