From 7cafe4d7e466996d5fc32e871fe834e0e0c94282 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Sun, 2 Jul 2000 17:33:27 +0000 Subject: - actually enabled charset anchors in the engine (still not used by the code generator) - changed max repeat value in engine (to match earlier array fix) - added experimental "which part matched?" mechanism to sre; see http://hem.passagen.se/eff/2000_07_01_bot-archive.htm#416954 or python-dev for details. --- Lib/sre.py | 31 +++++++++++++++++++++++++++++++ Lib/sre_compile.py | 2 +- Lib/sre_constants.py | 2 ++ Lib/sre_parse.py | 17 +++++++++++++++++ Modules/_sre.c | 30 ++++++++++++++++++++++++++---- Modules/sre.h | 2 ++ Modules/sre_constants.h | 31 ++++++++++++++++--------------- 7 files changed, 95 insertions(+), 20 deletions(-) diff --git a/Lib/sre.py b/Lib/sre.py index a09184b..79f12a1 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -155,3 +155,34 @@ def _pickle(p): return _compile, (p.pattern, p.flags) copy_reg.pickle(type(_compile("")), _pickle, _compile) + +# -------------------------------------------------------------------- +# experimental stuff (see python-dev discussions for details) + +class Scanner: + def __init__(self, lexicon): + self.lexicon = lexicon + p = [] + for phrase, action in lexicon: + p.append("(?:%s)(?P#%d)" % (phrase, len(p))) + self.scanner = sre.compile("|".join(p)) + def scan(self, string): + result = [] + append = result.append + match = self.scanner.match + i = 0 + while 1: + m = match(string, i) + if not m: + break + j = m.end() + if i == j: + break + action = self.lexicon[m.index][1] + if callable(action): + self.match = match + action = action(self, m.group()) + if action is not None: + append(action) + i = j + return result, string[i:] diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index a593ee7..e5c501e 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -208,7 +208,7 @@ def _compile(code, pattern, flags): else: emit(OPCODES[op]) emit(av-1) - elif op is MARK: + elif op in (MARK, INDEX): emit(OPCODES[op]) emit(av) else: diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index f0e45ea..076637d 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -33,6 +33,7 @@ GROUP = "group" GROUP_IGNORE = "group_ignore" IN = "in" IN_IGNORE = "in_ignore" +INDEX = "index" INFO = "info" JUMP = "jump" LITERAL = "literal" @@ -90,6 +91,7 @@ OPCODES = [ CATEGORY, CHARSET, GROUP, GROUP_IGNORE, + INDEX, IN, IN_IGNORE, INFO, JUMP, diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index b263256..81ca217 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -451,6 +451,23 @@ def _parse(source, state): if gid is None: raise error, "unknown group name" subpattern.append((GROUP, gid)) + elif source.match("#"): + index = "" + while 1: + char = source.get() + if char is None: + raise error, "unterminated index" + if char == ")": + break + index = index + char + try: + index = int(index) + if index < 0 or index > MAXREPEAT: + raise ValueError + except ValueError: + raise error, "illegal index" + subpattern.append((INDEX, index)) + continue else: char = source.get() if char is None: diff --git a/Modules/_sre.c b/Modules/_sre.c index 3bc0237..e11a892 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -21,6 +21,7 @@ * 00-06-29 fl fixed split, added more scanner features (0.9.2) * 00-06-30 fl added fast search optimization (0.9.3) * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4) + * 00-07-02 fl added charset optimizations, etc (0.9.5) * * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * @@ -31,7 +32,7 @@ #ifndef SRE_RECURSIVE -char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB "; +char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB "; #include "Python.h" @@ -587,6 +588,14 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) pattern++; break; + case SRE_OP_INDEX: + /* set index */ + /* args: */ + TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0])); + state->index = pattern[0]; + pattern++; + break; + case SRE_OP_JUMP: case SRE_OP_INFO: /* jump forward */ @@ -810,7 +819,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) /* match maximum number of items, pushing alternate end points to the stack */ - while (pattern[2] == 32767 || count < (int) pattern[2]) { + while (pattern[2] == 65535 || count < (int) pattern[2]) { state->stackbase = stack; i = SRE_MATCH(state, pattern + 3); state->stackbase = stackbase; /* rewind */ @@ -980,10 +989,12 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) } if (flags & SRE_INFO_PREFIX) { + /* pattern starts with a known prefix */ prefix_len = pattern[5]; prefix = pattern + 6; overlap = prefix + prefix_len - 1; } else if (flags & SRE_INFO_CHARSET) + /* pattern starts with a character from a known set */ charset = pattern + 5; pattern += 1 + pattern[1]; @@ -1042,7 +1053,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (status != 0) break; } -#if 0 } else if (charset) { /* pattern starts with a character from a known set */ for (;;) { @@ -1057,7 +1067,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (status != 0) break; } -#endif } else /* general case */ while (ptr <= end) { @@ -1204,6 +1213,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args) for (i = 0; i < SRE_MARK_SIZE; i++) state->mark[i] = NULL; + state->index = -1; + state->stack = NULL; state->stackbase = 0; state->stacksize = 0; @@ -1286,6 +1297,8 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, } else match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ + match->index = state->index; + return (PyObject*) match; } else if (status < 0) { @@ -1887,6 +1900,15 @@ match_getattr(MatchObject* self, char* name) if (!strcmp(name, "endpos")) return Py_BuildValue("i", 0); /* FIXME */ + if (!strcmp(name, "index")) { + /* experimental */ + if (self->index < 0) { + Py_INCREF(Py_None); + return Py_None; + } else + return Py_BuildValue("i", self->index); + } + PyErr_SetString(PyExc_AttributeError, name); return NULL; } diff --git a/Modules/sre.h b/Modules/sre.h index 274f085..7e7d835 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -33,6 +33,7 @@ typedef struct { PyObject_HEAD PyObject* string; /* link to the target string */ PatternObject* pattern; /* link to the regex (pattern) object */ + int index; /* last index marker seen by the engine (-1 if none) */ int groups; /* number of groups (start/end marks) */ int mark[2]; } MatchObject; @@ -57,6 +58,7 @@ typedef struct { /* character size */ int charsize; /* registers */ + int index; int lastmark; void* mark[SRE_MARK_SIZE]; /* backtracking stack */ diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index da25ec4..bffcdde 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -23,21 +23,22 @@ #define SRE_OP_CHARSET 9 #define SRE_OP_GROUP 10 #define SRE_OP_GROUP_IGNORE 11 -#define SRE_OP_IN 12 -#define SRE_OP_IN_IGNORE 13 -#define SRE_OP_INFO 14 -#define SRE_OP_JUMP 15 -#define SRE_OP_LITERAL 16 -#define SRE_OP_LITERAL_IGNORE 17 -#define SRE_OP_MARK 18 -#define SRE_OP_MAX_REPEAT 19 -#define SRE_OP_MAX_REPEAT_ONE 20 -#define SRE_OP_MIN_REPEAT 21 -#define SRE_OP_NOT_LITERAL 22 -#define SRE_OP_NOT_LITERAL_IGNORE 23 -#define SRE_OP_NEGATE 24 -#define SRE_OP_RANGE 25 -#define SRE_OP_REPEAT 26 +#define SRE_OP_INDEX 12 +#define SRE_OP_IN 13 +#define SRE_OP_IN_IGNORE 14 +#define SRE_OP_INFO 15 +#define SRE_OP_JUMP 16 +#define SRE_OP_LITERAL 17 +#define SRE_OP_LITERAL_IGNORE 18 +#define SRE_OP_MARK 19 +#define SRE_OP_MAX_REPEAT 20 +#define SRE_OP_MAX_REPEAT_ONE 21 +#define SRE_OP_MIN_REPEAT 22 +#define SRE_OP_NOT_LITERAL 23 +#define SRE_OP_NOT_LITERAL_IGNORE 24 +#define SRE_OP_NEGATE 25 +#define SRE_OP_RANGE 26 +#define SRE_OP_REPEAT 27 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BOUNDARY 2 -- cgit v0.12