summaryrefslogtreecommitdiffstats
path: root/Modules
diff options
context:
space:
mode:
authorFredrik Lundh <fredrik@pythonware.com>2000-07-02 12:00:07 (GMT)
committerFredrik Lundh <fredrik@pythonware.com>2000-07-02 12:00:07 (GMT)
commit3562f1176403653ebfbef6275d449ad42d1b843a (patch)
tree371b851c1c6bd77c5149a4c553c5ca94824632b4 /Modules
parentc13222cdff4373a9763b9c7df4b2e12e7e3b776f (diff)
downloadcpython-3562f1176403653ebfbef6275d449ad42d1b843a.zip
cpython-3562f1176403653ebfbef6275d449ad42d1b843a.tar.gz
cpython-3562f1176403653ebfbef6275d449ad42d1b843a.tar.bz2
-- use charset bitmaps where appropriate. this gives a 5-10%
speedup for some tests, including the python tokenizer. -- added support for an optional charset anchor to the engine (currently unused by the code generator). -- removed workaround for array module bug.
Diffstat (limited to 'Modules')
-rw-r--r--Modules/_sre.c63
-rw-r--r--Modules/sre_constants.h38
2 files changed, 66 insertions, 35 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 7206b95..3bc0237 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -378,6 +378,13 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
set += 2;
break;
+ case SRE_OP_CHARSET:
+ /* args: <bitmap> (16 bits per code word) */
+ if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
+ return ok;
+ set += 16;
+ break;
+
case SRE_OP_CATEGORY:
/* args: <category> */
if (sre_category(set[0], (int) ch))
@@ -952,35 +959,38 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* ptr = state->start;
SRE_CHAR* end = state->end;
int status = 0;
- int prefix_len = 0;
- SRE_CODE* prefix;
- SRE_CODE* overlap;
- int literal = 0;
+ int prefix_len;
+ SRE_CODE* prefix = NULL;
+ SRE_CODE* charset = NULL;
+ SRE_CODE* overlap = NULL;
+ int flags = 0;
if (pattern[0] == SRE_OP_INFO) {
/* optimization info block */
- /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */
+ /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
+
+ flags = pattern[2];
if (pattern[3] > 0) {
/* adjust end point (but make sure we leave at least one
- character in there) */
+ character in there, so literal search will work) */
end -= pattern[3]-1;
if (end <= ptr)
end = ptr+1;
}
- literal = pattern[2];
-
- prefix = pattern + 6;
- prefix_len = pattern[5];
-
- overlap = prefix + prefix_len - 1;
+ if (flags & SRE_INFO_PREFIX) {
+ prefix_len = pattern[5];
+ prefix = pattern + 6;
+ overlap = prefix + prefix_len - 1;
+ } else if (flags & SRE_INFO_CHARSET)
+ charset = pattern + 5;
pattern += 1 + pattern[1];
}
#if defined(USE_FAST_SEARCH)
- if (prefix_len > 1) {
+ if (prefix && overlap && prefix_len > 1) {
/* pattern starts with a known prefix. use the overlap
table to skip forward as fast as we possibly can */
int i = 0;
@@ -998,8 +1008,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
state->start = ptr - prefix_len + 1;
state->ptr = ptr + 1;
- if (literal)
- return 1; /* all of it */
+ if (flags & SRE_INFO_LITERAL)
+ return 1; /* we got all of it */
status = SRE_MATCH(state, pattern + 2*prefix_len);
if (status != 0)
return status;
@@ -1016,9 +1026,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
}
#endif
- if (pattern[0] == SRE_OP_LITERAL) {
- /* pattern starts with a literal character. this is used for
- short prefixes, and if fast search is disabled*/
+ if (pattern[0] == SRE_OP_LITERAL) {
+ /* pattern starts with a literal character. this is used
+ for short prefixes, and if fast search is disabled */
SRE_CODE chr = pattern[1];
for (;;) {
while (ptr < end && (SRE_CODE) ptr[0] != chr)
@@ -1032,6 +1042,22 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
break;
}
+#if 0
+ } else if (charset) {
+ /* pattern starts with a character from a known set */
+ for (;;) {
+ while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
+ ptr++;
+ if (ptr == end)
+ return 0;
+ TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
+ state->start = ptr;
+ state->ptr = ptr;
+ status = SRE_MATCH(state, pattern);
+ if (status != 0)
+ break;
+ }
+#endif
} else
/* general case */
while (ptr <= end) {
@@ -1044,6 +1070,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
return status;
}
+
#if !defined(SRE_RECURSIVE)
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 2ec00ba..da25ec4 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -20,23 +20,24 @@
#define SRE_OP_BRANCH 6
#define SRE_OP_CALL 7
#define SRE_OP_CATEGORY 8
-#define SRE_OP_GROUP 9
-#define SRE_OP_GROUP_IGNORE 10
-#define SRE_OP_IN 11
-#define SRE_OP_IN_IGNORE 12
-#define SRE_OP_INFO 13
-#define SRE_OP_JUMP 14
-#define SRE_OP_LITERAL 15
-#define SRE_OP_LITERAL_IGNORE 16
-#define SRE_OP_MARK 17
-#define SRE_OP_MAX_REPEAT 18
-#define SRE_OP_MAX_REPEAT_ONE 19
-#define SRE_OP_MIN_REPEAT 20
-#define SRE_OP_NOT_LITERAL 21
-#define SRE_OP_NOT_LITERAL_IGNORE 22
-#define SRE_OP_NEGATE 23
-#define SRE_OP_RANGE 24
-#define SRE_OP_REPEAT 25
+#define SRE_OP_CHARSET 9
+#define SRE_OP_GROUP 10
+#define SRE_OP_GROUP_IGNORE 11
+#define SRE_OP_IN 12
+#define SRE_OP_IN_IGNORE 13
+#define SRE_OP_INFO 14
+#define SRE_OP_JUMP 15
+#define SRE_OP_LITERAL 16
+#define SRE_OP_LITERAL_IGNORE 17
+#define SRE_OP_MARK 18
+#define SRE_OP_MAX_REPEAT 19
+#define SRE_OP_MAX_REPEAT_ONE 20
+#define SRE_OP_MIN_REPEAT 21
+#define SRE_OP_NOT_LITERAL 22
+#define SRE_OP_NOT_LITERAL_IGNORE 23
+#define SRE_OP_NEGATE 24
+#define SRE_OP_RANGE 25
+#define SRE_OP_REPEAT 26
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2
@@ -68,3 +69,6 @@
#define SRE_FLAG_DOTALL 16
#define SRE_FLAG_UNICODE 32
#define SRE_FLAG_VERBOSE 64
+#define SRE_INFO_PREFIX 1
+#define SRE_INFO_LITERAL 2
+#define SRE_INFO_CHARSET 4