summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/sre.py 31
-rw-r--r-- Lib/sre_compile.py 2
-rw-r--r-- Lib/sre_constants.py 2
-rw-r--r-- Lib/sre_parse.py 17
-rw-r--r-- Modules/_sre.c 30
-rw-r--r-- Modules/sre.h 2
-rw-r--r-- Modules/sre_constants.h 31
7 files changed, 95 insertions, 20 deletions
diff --git a/Lib/sre.py b/Lib/sre.py
index a09184b..79f12a1 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -155,3 +155,34 @@ def _pickle(p):
return _compile, (p.pattern, p.flags)
copy_reg.pickle(type(_compile("")), _pickle, _compile)
+
+# --------------------------------------------------------------------
+# experimental stuff (see python-dev discussions for details)
+
+class Scanner:
+ def __init__(self, lexicon):
+ self.lexicon = lexicon
+ p = []
+ for phrase, action in lexicon:
+ p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
+ self.scanner = sre.compile("|".join(p))
+ def scan(self, string):
+ result = []
+ append = result.append
+ match = self.scanner.match
+ i = 0
+ while 1:
+ m = match(string, i)
+ if not m:
+ break
+ j = m.end()
+ if i == j:
+ break
+ action = self.lexicon[m.index][1]
+ if callable(action):
+ self.match = match
+ action = action(self, m.group())
+ if action is not None:
+ append(action)
+ i = j
+ return result, string[i:]
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index a593ee7..e5c501e 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -208,7 +208,7 @@ def _compile(code, pattern, flags):
else:
emit(OPCODES[op])
emit(av-1)
- elif op is MARK:
+ elif op in (MARK, INDEX):
emit(OPCODES[op])
emit(av)
else:
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index f0e45ea..076637d 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -33,6 +33,7 @@ GROUP = "group"
GROUP_IGNORE = "group_ignore"
IN = "in"
IN_IGNORE = "in_ignore"
+INDEX = "index"
INFO = "info"
JUMP = "jump"
LITERAL = "literal"
@@ -90,6 +91,7 @@ OPCODES = [
CATEGORY,
CHARSET,
GROUP, GROUP_IGNORE,
+ INDEX,
IN, IN_IGNORE,
INFO,
JUMP,
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index b263256..81ca217 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -451,6 +451,23 @@ def _parse(source, state):
if gid is None:
raise error, "unknown group name"
subpattern.append((GROUP, gid))
+ elif source.match("#"):
+ index = ""
+ while 1:
+ char = source.get()
+ if char is None:
+ raise error, "unterminated index"
+ if char == ")":
+ break
+ index = index + char
+ try:
+ index = int(index)
+ if index < 0 or index > MAXREPEAT:
+ raise ValueError
+ except ValueError:
+ raise error, "illegal index"
+ subpattern.append((INDEX, index))
+ continue
else:
char = source.get()
if char is None:
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 3bc0237..e11a892 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -21,6 +21,7 @@
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
* 00-06-30 fl added fast search optimization (0.9.3)
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
+ * 00-07-02 fl added charset optimizations, etc (0.9.5)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@@ -31,7 +32,7 @@
#ifndef SRE_RECURSIVE
-char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB ";
+char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
@@ -587,6 +588,14 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
pattern++;
break;
+ case SRE_OP_INDEX:
+ /* set index */
+ /* args: <index> */
+ TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
+ state->index = pattern[0];
+ pattern++;
+ break;
+
case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */
@@ -810,7 +819,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* match maximum number of items, pushing alternate end
points to the stack */
- while (pattern[2] == 32767 || count < (int) pattern[2]) {
+ while (pattern[2] == 65535 || count < (int) pattern[2]) {
state->stackbase = stack;
i = SRE_MATCH(state, pattern + 3);
state->stackbase = stackbase; /* rewind */
@@ -980,10 +989,12 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
}
if (flags & SRE_INFO_PREFIX) {
+ /* pattern starts with a known prefix */
prefix_len = pattern[5];
prefix = pattern + 6;
overlap = prefix + prefix_len - 1;
} else if (flags & SRE_INFO_CHARSET)
+ /* pattern starts with a character from a known set */
charset = pattern + 5;
pattern += 1 + pattern[1];
@@ -1042,7 +1053,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
break;
}
-#if 0
} else if (charset) {
/* pattern starts with a character from a known set */
for (;;) {
@@ -1057,7 +1067,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
break;
}
-#endif
} else
/* general case */
while (ptr <= end) {
@@ -1204,6 +1213,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
for (i = 0; i < SRE_MARK_SIZE; i++)
state->mark[i] = NULL;
+ state->index = -1;
+
state->stack = NULL;
state->stackbase = 0;
state->stacksize = 0;
@@ -1286,6 +1297,8 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
} else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
+ match->index = state->index;
+
return (PyObject*) match;
} else if (status < 0) {
@@ -1887,6 +1900,15 @@ match_getattr(MatchObject* self, char* name)
if (!strcmp(name, "endpos"))
return Py_BuildValue("i", 0); /* FIXME */
+ if (!strcmp(name, "index")) {
+ /* experimental */
+ if (self->index < 0) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ } else
+ return Py_BuildValue("i", self->index);
+ }
+
PyErr_SetString(PyExc_AttributeError, name);
return NULL;
}
diff --git a/Modules/sre.h b/Modules/sre.h
index 274f085..7e7d835 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -33,6 +33,7 @@ typedef struct {
PyObject_HEAD
PyObject* string; /* link to the target string */
PatternObject* pattern; /* link to the regex (pattern) object */
+ int index; /* last index marker seen by the engine (-1 if none) */
int groups; /* number of groups (start/end marks) */
int mark[2];
} MatchObject;
@@ -57,6 +58,7 @@ typedef struct {
/* character size */
int charsize;
/* registers */
+ int index;
int lastmark;
void* mark[SRE_MARK_SIZE];
/* backtracking stack */
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index da25ec4..bffcdde 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -23,21 +23,22 @@
#define SRE_OP_CHARSET 9
#define SRE_OP_GROUP 10
#define SRE_OP_GROUP_IGNORE 11
-#define SRE_OP_IN 12
-#define SRE_OP_IN_IGNORE 13
-#define SRE_OP_INFO 14
-#define SRE_OP_JUMP 15
-#define SRE_OP_LITERAL 16
-#define SRE_OP_LITERAL_IGNORE 17
-#define SRE_OP_MARK 18
-#define SRE_OP_MAX_REPEAT 19
-#define SRE_OP_MAX_REPEAT_ONE 20
-#define SRE_OP_MIN_REPEAT 21
-#define SRE_OP_NOT_LITERAL 22
-#define SRE_OP_NOT_LITERAL_IGNORE 23
-#define SRE_OP_NEGATE 24
-#define SRE_OP_RANGE 25
-#define SRE_OP_REPEAT 26
+#define SRE_OP_INDEX 12
+#define SRE_OP_IN 13
+#define SRE_OP_IN_IGNORE 14
+#define SRE_OP_INFO 15
+#define SRE_OP_JUMP 16
+#define SRE_OP_LITERAL 17
+#define SRE_OP_LITERAL_IGNORE 18
+#define SRE_OP_MARK 19
+#define SRE_OP_MAX_REPEAT 20
+#define SRE_OP_MAX_REPEAT_ONE 21
+#define SRE_OP_MIN_REPEAT 22
+#define SRE_OP_NOT_LITERAL 23
+#define SRE_OP_NOT_LITERAL_IGNORE 24
+#define SRE_OP_NEGATE 25
+#define SRE_OP_RANGE 26
+#define SRE_OP_REPEAT 27
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2