diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-02 12:00:07 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-02 12:00:07 (GMT) |
commit | 3562f1176403653ebfbef6275d449ad42d1b843a (patch) | |
tree | 371b851c1c6bd77c5149a4c553c5ca94824632b4 /Modules | |
parent | c13222cdff4373a9763b9c7df4b2e12e7e3b776f (diff) | |
download | cpython-3562f1176403653ebfbef6275d449ad42d1b843a.zip cpython-3562f1176403653ebfbef6275d449ad42d1b843a.tar.gz cpython-3562f1176403653ebfbef6275d449ad42d1b843a.tar.bz2 |
-- use charset bitmaps where appropriate. this gives a 5-10%
speedup for some tests, including the python tokenizer.
-- added support for an optional charset anchor to the engine
(currently unused by the code generator).
-- removed workaround for array module bug.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_sre.c | 63 | ||||
-rw-r--r-- | Modules/sre_constants.h | 38 |
2 files changed, 66 insertions, 35 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c index 7206b95..3bc0237 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -378,6 +378,13 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch) set += 2; break; + case SRE_OP_CHARSET: + /* args: <bitmap> (16 bits per code word) */ + if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) + return ok; + set += 16; + break; + case SRE_OP_CATEGORY: /* args: <category> */ if (sre_category(set[0], (int) ch)) @@ -952,35 +959,38 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) SRE_CHAR* ptr = state->start; SRE_CHAR* end = state->end; int status = 0; - int prefix_len = 0; - SRE_CODE* prefix; - SRE_CODE* overlap; - int literal = 0; + int prefix_len; + SRE_CODE* prefix = NULL; + SRE_CODE* charset = NULL; + SRE_CODE* overlap = NULL; + int flags = 0; if (pattern[0] == SRE_OP_INFO) { /* optimization info block */ - /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */ + /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ + + flags = pattern[2]; if (pattern[3] > 0) { /* adjust end point (but make sure we leave at least one - character in there) */ + character in there, so literal search will work) */ end -= pattern[3]-1; if (end <= ptr) end = ptr+1; } - literal = pattern[2]; - - prefix = pattern + 6; - prefix_len = pattern[5]; - - overlap = prefix + prefix_len - 1; + if (flags & SRE_INFO_PREFIX) { + prefix_len = pattern[5]; + prefix = pattern + 6; + overlap = prefix + prefix_len - 1; + } else if (flags & SRE_INFO_CHARSET) + charset = pattern + 5; pattern += 1 + pattern[1]; } #if defined(USE_FAST_SEARCH) - if (prefix_len > 1) { + if (prefix && overlap && prefix_len > 1) { /* pattern starts with a known prefix. use the overlap table to skip forward as fast as we possibly can */ int i = 0; @@ -998,8 +1008,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) TRACE(("%8d: === SEARCH === hit\n", PTR(ptr))); state->start = ptr - prefix_len + 1; state->ptr = ptr + 1; - if (literal) - return 1; /* all of it */ + if (flags & SRE_INFO_LITERAL) + return 1; /* we got all of it */ status = SRE_MATCH(state, pattern + 2*prefix_len); if (status != 0) return status; @@ -1016,9 +1026,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) } #endif - if (pattern[0] == SRE_OP_LITERAL) { - /* pattern starts with a literal character. this is used for - short prefixes, and if fast search is disabled*/ + if (pattern[0] == SRE_OP_LITERAL) { + /* pattern starts with a literal character. this is used + for short prefixes, and if fast search is disabled */ SRE_CODE chr = pattern[1]; for (;;) { while (ptr < end && (SRE_CODE) ptr[0] != chr) @@ -1032,6 +1042,22 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (status != 0) break; } +#if 0 + } else if (charset) { + /* pattern starts with a character from a known set */ + for (;;) { + while (ptr < end && !SRE_MEMBER(charset, ptr[0])) + ptr++; + if (ptr == end) + return 0; + TRACE(("%8d: === SEARCH === charset\n", PTR(ptr))); + state->start = ptr; + state->ptr = ptr; + status = SRE_MATCH(state, pattern); + if (status != 0) + break; + } +#endif } else /* general case */ while (ptr <= end) { @@ -1044,6 +1070,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) return status; } + #if !defined(SRE_RECURSIVE) diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 2ec00ba..da25ec4 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -20,23 +20,24 @@ #define SRE_OP_BRANCH 6 #define SRE_OP_CALL 7 #define SRE_OP_CATEGORY 8 -#define SRE_OP_GROUP 9 -#define SRE_OP_GROUP_IGNORE 10 -#define SRE_OP_IN 11 -#define SRE_OP_IN_IGNORE 12 -#define SRE_OP_INFO 13 -#define SRE_OP_JUMP 14 -#define SRE_OP_LITERAL 15 -#define SRE_OP_LITERAL_IGNORE 16 -#define SRE_OP_MARK 17 -#define SRE_OP_MAX_REPEAT 18 -#define SRE_OP_MAX_REPEAT_ONE 19 -#define SRE_OP_MIN_REPEAT 20 -#define SRE_OP_NOT_LITERAL 21 -#define SRE_OP_NOT_LITERAL_IGNORE 22 -#define SRE_OP_NEGATE 23 -#define SRE_OP_RANGE 24 -#define SRE_OP_REPEAT 25 +#define SRE_OP_CHARSET 9 +#define SRE_OP_GROUP 10 +#define SRE_OP_GROUP_IGNORE 11 +#define SRE_OP_IN 12 +#define SRE_OP_IN_IGNORE 13 +#define SRE_OP_INFO 14 +#define SRE_OP_JUMP 15 +#define SRE_OP_LITERAL 16 +#define SRE_OP_LITERAL_IGNORE 17 +#define SRE_OP_MARK 18 +#define SRE_OP_MAX_REPEAT 19 +#define SRE_OP_MAX_REPEAT_ONE 20 +#define SRE_OP_MIN_REPEAT 21 +#define SRE_OP_NOT_LITERAL 22 +#define SRE_OP_NOT_LITERAL_IGNORE 23 +#define SRE_OP_NEGATE 24 +#define SRE_OP_RANGE 25 +#define SRE_OP_REPEAT 26 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BOUNDARY 2 @@ -68,3 +69,6 @@ #define SRE_FLAG_DOTALL 16 #define SRE_FLAG_UNICODE 32 #define SRE_FLAG_VERBOSE 64 +#define SRE_INFO_PREFIX 1 +#define SRE_INFO_LITERAL 2 +#define SRE_INFO_CHARSET 4 |