summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFredrik Lundh <fredrik@pythonware.com>2000-08-01 21:05:41 (GMT)
committerFredrik Lundh <fredrik@pythonware.com>2000-08-01 21:05:41 (GMT)
commit2f2c67d7e5934bdf96835f3c4774388b3e654314 (patch)
tree1d573a6163fe77af982c4637f809b1e468873a74
parent329e29198dccbbb7f0e7e84c026196dbfc47befa (diff)
downloadcpython-2f2c67d7e5934bdf96835f3c4774388b3e654314.zip
cpython-2f2c67d7e5934bdf96835f3c4774388b3e654314.tar.gz
cpython-2f2c67d7e5934bdf96835f3c4774388b3e654314.tar.bz2
-- fixed width calculations for alternations
-- fixed literal check in branch operator (this broke test_tokenize, as reported by Mark Favas) -- added REPEAT_ONE operator (still not enabled, though) -- added some debugging stuff (maxlevel)
-rw-r--r--Lib/sre_compile.py5
-rw-r--r--Lib/sre_parse.py10
-rw-r--r--Modules/_sre.c217
-rw-r--r--Modules/sre.h2
4 files changed, 199 insertions, 35 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2d1cbb1..8fdcecf 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -270,6 +270,7 @@ def _compile_info(code, pattern, flags):
table[i+1] = table[table[i+1]-1]+1
code.extend(table[1:]) # don't store first entry
elif charset:
+ # FIXME: use charset optimizer!
for char in charset:
emit(OPCODES[LITERAL])
emit(char)
@@ -283,7 +284,7 @@ try:
except NameError:
pass
-def _compile1(p, flags):
+def _code(p, flags):
flags = p.pattern.flags | flags
code = []
@@ -308,7 +309,7 @@ def compile(p, flags=0):
else:
pattern = None
- code = _compile1(p, flags)
+ code = _code(p, flags)
# print code
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 299aa0e..1eec3d3 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -137,12 +137,12 @@ class SubPattern:
lo = hi = 0L
for op, av in self.data:
if op is BRANCH:
- l = sys.maxint
- h = 0
+ i = sys.maxint
+ j = 0
for av in av[1]:
- i, j = av.getwidth()
- l = min(l, i)
- h = min(h, j)
+ l, h = av.getwidth()
+ i = min(i, l)
+ j = min(j, h)
lo = lo + i
hi = hi + j
elif op is CALL:
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 0b208e6..69bc171 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -219,6 +219,14 @@ mark_init(SRE_STATE* state)
static void
mark_fini(SRE_STATE* state)
{
+#if 0
+ /* FIXME: debugging */
+ if (state->maxlevel > 0)
+ printf("max %d\n", state->maxlevel);
+ if (state->mark_stack_base > 0)
+ printf("mark stack %d\n", state->mark_stack_base);
+#endif
+
if (state->mark_stack)
free(state->mark_stack);
mark_init(state);
@@ -430,7 +438,7 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
}
LOCAL(int)
-SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
+SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
{
/* check if string matches the given pattern. returns -1 for
error, 0 for failure, and 1 for success */
@@ -443,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
- TRACE(("%8d: enter\n", PTR(ptr)));
+ TRACE(("%8d: enter %d\n", PTR(ptr), level));
if (pattern[0] == SRE_OP_INFO) {
/* optimization info block */
@@ -456,6 +464,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
pattern += pattern[1] + 1;
}
+ /* FIXME: debugging */
+ if (level > state->maxlevel)
+ state->maxlevel = level;
+
for (;;) {
switch (*pattern++) {
@@ -623,7 +635,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->ptr = ptr - pattern[1];
if (state->ptr < state->beginning)
return 0;
- i = SRE_MATCH(state, pattern + 2);
+ i = SRE_MATCH(state, pattern + 2, level + 1);
if (i <= 0)
return i;
if (pattern[1] > 0 && state->ptr != ptr)
@@ -638,7 +650,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->ptr = ptr - pattern[1];
if (state->ptr < state->beginning)
return 0;
- i = SRE_MATCH(state, pattern + 2);
+ i = SRE_MATCH(state, pattern + 2, level + 1);
if (i < 0)
return i;
if (i)
@@ -656,10 +668,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
lastmark = state->lastmark;
while (pattern[0]) {
TRACE(("%8d: try branch\n", PTR(ptr)));
- if (pattern[2] != SRE_OP_LITERAL ||
- (ptr < end && (SRE_CODE) ptr[0] == pattern[3])) {
+ if (pattern[1] != SRE_OP_LITERAL ||
+ (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
state->ptr = ptr;
- i = SRE_MATCH(state, pattern + 1);
+ i = SRE_MATCH(state, pattern + 1, level + 1);
if (i)
return i;
}
@@ -670,6 +682,155 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
}
return 0;
+ case SRE_OP_REPEAT_ONE:
+ /* match repeated sequence (maximizing regexp) */
+
+ /* this operator only works if the repeated item is
+ exactly one character wide, and we're not already
+ collecting backtracking points. for other cases,
+ use the MAX_REPEAT operator instead */
+
+ /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
+
+ TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
+ pattern[1], pattern[2]));
+
+ count = 0;
+
+ if (pattern[3] == SRE_OP_ANY) {
+ /* repeated wildcard. skip to the end of the target
+ string, and backtrack from there */
+ /* FIXME: must look for line endings */
+ if (ptr + pattern[1] > end)
+ return 0; /* cannot match */
+ count = pattern[2];
+ if (count > end - ptr)
+ count = end - ptr;
+ ptr += count;
+
+ } else if (pattern[3] == SRE_OP_LITERAL) {
+ /* repeated literal */
+ SRE_CODE chr = pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || (SRE_CODE) ptr[0] != chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
+ /* repeated literal */
+ SRE_CODE chr = pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
+ /* repeated non-literal */
+ SRE_CODE chr = pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || (SRE_CODE) ptr[0] == chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
+ /* repeated non-literal */
+ SRE_CODE chr = pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_IN) {
+ /* repeated set */
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else {
+ /* repeated single character pattern */
+ state->ptr = ptr;
+ while (count < (int) pattern[2]) {
+ i = SRE_MATCH(state, pattern + 3, level + 1);
+ if (i < 0)
+ return i;
+ if (!i)
+ break;
+ count++;
+ }
+ state->ptr = ptr;
+ ptr += count;
+ }
+
+ /* when we arrive here, count contains the number of
+ matches, and ptr points to the tail of the target
+ string. check if the rest of the pattern matches,
+ and backtrack if not. */
+
+ TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
+
+ if (count < (int) pattern[1])
+ return 0;
+
+ if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
+ /* tail is empty. we're finished */
+ TRACE(("%8d: tail is empty\n", PTR(ptr)));
+ state->ptr = ptr;
+ return 1;
+
+ } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
+ /* tail starts with a literal. skip positions where
+ the rest of the pattern cannot possibly match */
+ SRE_CODE chr = pattern[pattern[0]+1];
+ TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
+ for (;;) {
+ TRACE(("%8d: scan for tail match\n", PTR(ptr)));
+ while (count >= (int) pattern[1] &&
+ (ptr >= end || *ptr != chr)) {
+ ptr--;
+ count--;
+ }
+ TRACE(("%8d: check tail\n", PTR(ptr)));
+ if (count < (int) pattern[1])
+ break;
+ state->ptr = ptr;
+ i = SRE_MATCH(state, pattern + pattern[0], level + 1);
+ if (i > 0) {
+ TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+ return 1;
+ }
+ ptr--;
+ count--;
+ }
+
+ } else {
+ /* general case */
+ TRACE(("%8d: tail is pattern\n", PTR(ptr)));
+ while (count >= (int) pattern[1]) {
+ state->ptr = ptr;
+ i = SRE_MATCH(state, pattern + pattern[0], level + 1);
+ if (i < 0)
+ return i;
+ if (i) {
+ TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+ return 1;
+ }
+ ptr--;
+ count--;
+ }
+ }
+ return 0;
+
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator */
@@ -677,8 +838,6 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: repeat {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2]));
- state->ptr = ptr;
-
rep.count = -1;
rep.pattern = pattern;
@@ -686,10 +845,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
rep.prev = state->repeat;
state->repeat = &rep;
- i = SRE_MATCH(state, pattern + pattern[0]);
+ state->ptr = ptr;
+ i = SRE_MATCH(state, pattern + pattern[0], level + 1);
state->repeat = rep.prev;
- /* free(rp); */
return i;
@@ -714,7 +873,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* not enough matches */
TRACE(("%8d: match item (required)\n", PTR(ptr)));
rp->count = count;
- i = SRE_MATCH(state, rp->pattern + 3);
+ i = SRE_MATCH(state, rp->pattern + 3, level + 1);
if (i)
return i;
rp->count = count - 1;
@@ -729,7 +888,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
rp->count = count;
lastmark = state->lastmark;
mark_save(state, 0, lastmark);
- i = SRE_MATCH(state, rp->pattern + 3);
+ i = SRE_MATCH(state, rp->pattern + 3, level + 1);
if (i)
return i;
mark_restore(state, 0, lastmark);
@@ -741,11 +900,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
tail matches */
TRACE(("%8d: match tail\n", PTR(ptr)));
state->repeat = rp->prev;
- i = SRE_MATCH(state, pattern);
- if (i) {
- /* free(rp); */
+ i = SRE_MATCH(state, pattern, level + 1);
+ if (i)
return i;
- }
state->repeat = rp;
return 0;
@@ -767,7 +924,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* not enough matches */
TRACE(("%8d: match item (required)\n", PTR(ptr)));
rp->count = count;
- i = SRE_MATCH(state, rp->pattern + 3);
+ i = SRE_MATCH(state, rp->pattern + 3, level + 1);
if (i)
return i;
rp->count = count-1;
@@ -778,7 +935,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* see if the tail matches */
TRACE(("%8d: match tail\n", PTR(ptr)));
state->repeat = rp->prev;
- i = SRE_MATCH(state, pattern);
+ i = SRE_MATCH(state, pattern, level + 1);
if (i) {
/* free(rp); */
return i;
@@ -790,7 +947,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: match item (optional)\n", PTR(ptr)));
rp->count = count;
- i = SRE_MATCH(state, rp->pattern + 3);
+ i = SRE_MATCH(state, rp->pattern + 3, level + 1);
if (i)
return i;
rp->count = count - 1;
@@ -865,7 +1022,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
state->ptr = ptr + 1;
if (flags & SRE_INFO_LITERAL)
return 1; /* we got all of it */
- status = SRE_MATCH(state, pattern + 2*prefix_len);
+ status = SRE_MATCH(state, pattern + 2*prefix_len, 1);
if (status != 0)
return status;
/* close but no cigar -- try again */
@@ -893,7 +1050,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
state->start = ptr;
state->ptr = ++ptr;
- status = SRE_MATCH(state, pattern + 2);
+ status = SRE_MATCH(state, pattern + 2, 1);
if (status != 0)
break;
}
@@ -907,7 +1064,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
state->start = ptr;
state->ptr = ptr;
- status = SRE_MATCH(state, pattern);
+ status = SRE_MATCH(state, pattern, 1);
if (status != 0)
break;
}
@@ -916,7 +1073,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
while (ptr <= end) {
TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
state->start = state->ptr = ptr++;
- status = SRE_MATCH(state, pattern);
+ status = SRE_MATCH(state, pattern, 1);
if (status != 0)
break;
}
@@ -1032,6 +1189,9 @@ state_reset(SRE_STATE* state)
state->repeat = NULL;
+ /* FIXME: debugging */
+ state->maxlevel = 0;
+
mark_fini(state);
}
@@ -1110,6 +1270,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->lower = sre_lower;
state->mark_stack = NULL;
+ state->mark_stack_base = 0;
state_reset(state);
@@ -1262,10 +1423,10 @@ pattern_match(PatternObject* self, PyObject* args)
state.ptr = state.start;
if (state.charsize == 1) {
- status = sre_match(&state, PatternObject_GetCode(self));
+ status = sre_match(&state, PatternObject_GetCode(self), 1);
} else {
#if defined(HAVE_UNICODE)
- status = sre_umatch(&state, PatternObject_GetCode(self));
+ status = sre_umatch(&state, PatternObject_GetCode(self), 1);
#endif
}
@@ -1941,10 +2102,10 @@ scanner_match(ScannerObject* self, PyObject* args)
state->ptr = state->start;
if (state->charsize == 1) {
- status = sre_match(state, PatternObject_GetCode(self->pattern));
+ status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
} else {
#if defined(HAVE_UNICODE)
- status = sre_umatch(state, PatternObject_GetCode(self->pattern));
+ status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
#endif
}
diff --git a/Modules/sre.h b/Modules/sre.h
index bf58eb5..553e456 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -74,6 +74,8 @@ typedef struct {
SRE_REPEAT *repeat; /* current repeat context */
/* hooks */
SRE_TOLOWER_HOOK lower;
+ /* debugging */
+ int maxlevel;
} SRE_STATE;
typedef struct {