summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Fredrik Lundh <fredrik@pythonware.com> 2000-08-01 22:47:49 (GMT)
committer Fredrik Lundh <fredrik@pythonware.com> 2000-08-01 22:47:49 (GMT)
commit e186983842f0b27606b141010513fa8e3d0cc5db (patch)
tree 8160cdbd00dc449a79a25cfaa6a16069b4bd74b3
parent fb06539e999271ea9b07b754d461f2172d65978b (diff)
download cpython-e186983842f0b27606b141010513fa8e3d0cc5db.zip
download cpython-e186983842f0b27606b141010513fa8e3d0cc5db.tar.gz
download cpython-e186983842f0b27606b141010513fa8e3d0cc5db.tar.bz2
final 0.9.8 updates:
-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")
-rw-r--r-- Lib/sre.py 5
-rw-r--r-- Lib/sre_compile.py 51
-rw-r--r-- Lib/sre_constants.py 3
-rw-r--r-- Lib/sre_parse.py 2
-rw-r--r-- Modules/_sre.c 61
-rw-r--r-- Modules/sre_constants.h 51
6 files changed, 105 insertions, 68 deletions
diff --git a/Lib/sre.py b/Lib/sre.py
index 3e125a7..edfefc1 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -98,7 +98,10 @@ def _compile(pattern, flags=0):
return _cache[key]
except KeyError:
pass
- p = sre_compile.compile(pattern, flags)
+ try:
+ p = sre_compile.compile(pattern, flags)
+ except error, v:
+ raise error, v # invalid expression
if len(_cache) >= _MAXCACHE:
_cache.clear()
_cache[key] = p
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 8fdcecf..abd619e 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -73,6 +73,13 @@ def _charset(charset, fixup=None):
return out
return charset
+def _simple(av):
+ # check if av is a "simple" operator
+ lo, hi = av[2].getwidth()
+ if lo == 0:
+ raise error, "nothing to repeat"
+ return lo == hi == 1 and av[2][0][0] != SUBPATTERN
+
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
emit = code.append
@@ -116,10 +123,9 @@ def _compile(code, pattern, flags):
code[skip] = len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
- emit(OPCODES[op])
+ emit(OPCODES[ANY_ALL])
else:
- emit(OPCODES[CATEGORY])
- emit(CHCODES[CATEGORY_NOT_LINEBREAK])
+ emit(OPCODES[ANY])
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE:
raise error, "internal: unsupported template operator"
@@ -130,30 +136,25 @@ def _compile(code, pattern, flags):
_compile(code, av[2], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
+ elif _simple(av) and op == MAX_REPEAT:
+ emit(OPCODES[REPEAT_ONE])
+ skip = len(code); emit(0)
+ emit(av[0])
+ emit(av[1])
+ _compile(code, av[2], flags)
+ emit(OPCODES[SUCCESS])
+ code[skip] = len(code) - skip
else:
- lo, hi = av[2].getwidth()
- if lo == 0:
- raise error, "nothing to repeat"
- if 0 and lo == hi == 1 and op is MAX_REPEAT:
- # FIXME: <fl> fast and wrong (but we'll fix that)
- emit(OPCODES[REPEAT_ONE])
- skip = len(code); emit(0)
- emit(av[0])
- emit(av[1])
- _compile(code, av[2], flags)
- emit(OPCODES[SUCCESS])
- code[skip] = len(code) - skip
+ emit(OPCODES[REPEAT])
+ skip = len(code); emit(0)
+ emit(av[0])
+ emit(av[1])
+ _compile(code, av[2], flags)
+ code[skip] = len(code) - skip
+ if op == MAX_REPEAT:
+ emit(OPCODES[MAX_UNTIL])
else:
- emit(OPCODES[REPEAT])
- skip = len(code); emit(0)
- emit(av[0])
- emit(av[1])
- _compile(code, av[2], flags)
- code[skip] = len(code) - skip
- if op == MAX_REPEAT:
- emit(OPCODES[MAX_UNTIL])
- else:
- emit(OPCODES[MIN_UNTIL])
+ emit(OPCODES[MIN_UNTIL])
elif op is SUBPATTERN:
if av[0]:
emit(OPCODES[MARK])
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index e595915..5a20930 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -20,6 +20,7 @@ FAILURE = "failure"
SUCCESS = "success"
ANY = "any"
+ANY_ALL = "any_all"
ASSERT = "assert"
ASSERT_NOT = "assert_not"
AT = "at"
@@ -81,7 +82,7 @@ OPCODES = [
# failure=0 success=1 (just because it looks better that way :-)
FAILURE, SUCCESS,
- ANY,
+ ANY, ANY_ALL,
ASSERT, ASSERT_NOT,
AT,
BRANCH,
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 1eec3d3..1c1d0d5 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -142,7 +142,7 @@ class SubPattern:
for av in av[1]:
l, h = av.getwidth()
i = min(i, l)
- j = min(j, h)
+ j = max(j, h)
lo = lo + i
hi = hi + j
elif op is CALL:
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 69bc171..677edb8 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -448,6 +448,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
int i, count;
SRE_REPEAT* rp;
int lastmark;
+ SRE_CODE chr;
SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
@@ -525,8 +526,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
break;
case SRE_OP_ANY:
- /* match anything */
+ /* match anything (except a newline) */
/* <ANY> */
+ TRACE(("%8d: anything (except newline)\n", PTR(ptr)));
+ if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
+ return 0;
+ ptr++;
+ break;
+
+ case SRE_OP_ANY_ALL:
+ /* match anything */
+ /* <ANY_ALL> */
TRACE(("%8d: anything\n", PTR(ptr)));
if (ptr >= end)
return 0;
@@ -695,60 +705,79 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2]));
+ if (ptr + pattern[1] > end)
+ return 0; /* cannot match */
+
count = 0;
- if (pattern[3] == SRE_OP_ANY) {
+ switch (pattern[3]) {
+
+ case SRE_OP_ANY:
+ /* repeated wildcard. */
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
+ break;
+ ptr++;
+ count++;
+ }
+ break;
+
+ case SRE_OP_ANY_ALL:
/* repeated wildcard. skip to the end of the target
string, and backtrack from there */
- /* FIXME: must look for line endings */
if (ptr + pattern[1] > end)
return 0; /* cannot match */
count = pattern[2];
if (count > end - ptr)
count = end - ptr;
ptr += count;
+ break;
- } else if (pattern[3] == SRE_OP_LITERAL) {
+ case SRE_OP_LITERAL:
/* repeated literal */
- SRE_CODE chr = pattern[4];
+ chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) ptr[0] != chr)
break;
ptr++;
count++;
}
+ break;
- } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
+ case SRE_OP_LITERAL_IGNORE:
/* repeated literal */
- SRE_CODE chr = pattern[4];
+ chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
break;
ptr++;
count++;
}
+ break;
- } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
+ case SRE_OP_NOT_LITERAL:
/* repeated non-literal */
- SRE_CODE chr = pattern[4];
+ chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) ptr[0] == chr)
break;
ptr++;
count++;
}
-
- } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
+ break;
+
+ case SRE_OP_NOT_LITERAL_IGNORE:
/* repeated non-literal */
- SRE_CODE chr = pattern[4];
+ chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
break;
ptr++;
count++;
}
+ break;
- } else if (pattern[3] == SRE_OP_IN) {
+ case SRE_OP_IN:
/* repeated set */
while (count < (int) pattern[2]) {
if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
@@ -756,8 +785,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
ptr++;
count++;
}
+ break;
- } else {
+ default:
/* repeated single character pattern */
state->ptr = ptr;
while (count < (int) pattern[2]) {
@@ -770,6 +800,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
}
state->ptr = ptr;
ptr += count;
+ break;
}
/* when we arrive here, count contains the number of
@@ -791,7 +822,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */
- SRE_CODE chr = pattern[pattern[0]+1];
+ chr = pattern[pattern[0]+1];
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
for (;;) {
TRACE(("%8d: scan for tail match\n", PTR(ptr)));
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 5cfe495..5c55c3d 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -14,31 +14,32 @@
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
-#define SRE_OP_ASSERT 3
-#define SRE_OP_ASSERT_NOT 4
-#define SRE_OP_AT 5
-#define SRE_OP_BRANCH 6
-#define SRE_OP_CALL 7
-#define SRE_OP_CATEGORY 8
-#define SRE_OP_CHARSET 9
-#define SRE_OP_GROUPREF 10
-#define SRE_OP_GROUPREF_IGNORE 11
-#define SRE_OP_IN 12
-#define SRE_OP_IN_IGNORE 13
-#define SRE_OP_INFO 14
-#define SRE_OP_JUMP 15
-#define SRE_OP_LITERAL 16
-#define SRE_OP_LITERAL_IGNORE 17
-#define SRE_OP_MARK 18
-#define SRE_OP_MAX_UNTIL 19
-#define SRE_OP_MIN_UNTIL 20
-#define SRE_OP_NOT_LITERAL 21
-#define SRE_OP_NOT_LITERAL_IGNORE 22
-#define SRE_OP_NEGATE 23
-#define SRE_OP_RANGE 24
-#define SRE_OP_REPEAT 25
-#define SRE_OP_REPEAT_ONE 26
-#define SRE_OP_SUBPATTERN 27
+#define SRE_OP_ANY_ALL 3
+#define SRE_OP_ASSERT 4
+#define SRE_OP_ASSERT_NOT 5
+#define SRE_OP_AT 6
+#define SRE_OP_BRANCH 7
+#define SRE_OP_CALL 8
+#define SRE_OP_CATEGORY 9
+#define SRE_OP_CHARSET 10
+#define SRE_OP_GROUPREF 11
+#define SRE_OP_GROUPREF_IGNORE 12
+#define SRE_OP_IN 13
+#define SRE_OP_IN_IGNORE 14
+#define SRE_OP_INFO 15
+#define SRE_OP_JUMP 16
+#define SRE_OP_LITERAL 17
+#define SRE_OP_LITERAL_IGNORE 18
+#define SRE_OP_MARK 19
+#define SRE_OP_MAX_UNTIL 20
+#define SRE_OP_MIN_UNTIL 21
+#define SRE_OP_NOT_LITERAL 22
+#define SRE_OP_NOT_LITERAL_IGNORE 23
+#define SRE_OP_NEGATE 24
+#define SRE_OP_RANGE 25
+#define SRE_OP_REPEAT 26
+#define SRE_OP_REPEAT_ONE 27
+#define SRE_OP_SUBPATTERN 28
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2