diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-03 18:44:21 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-03 18:44:21 (GMT) |
commit | 6f013982366154ce570f69b6117dbcc6b1d5d89a (patch) | |
tree | 00f3bcae833f7bbcb15ba1a22ef2bdac3c148033 | |
parent | 40c48685a2b16dc7fdccd82fe1d927e52ed5e3db (diff) | |
download | cpython-6f013982366154ce570f69b6117dbcc6b1d5d89a.zip cpython-6f013982366154ce570f69b6117dbcc6b1d5d89a.tar.gz cpython-6f013982366154ce570f69b6117dbcc6b1d5d89a.tar.bz2 |
- added lookbehind support (?<=pattern), (?<!pattern).
the pattern must have a fixed width.
- got rid of array-module dependencies; the match program
is now stored inside the pattern object, rather than in
an extra string buffer.
- cleaned up a variety of potential leaks, API abuses,
and other minor issues in the engine module.
- use mal's new isalnum macro, rather than my own
workaround.
- untabified test_sre.py. seems like I removed a couple
of trailing spaces in the process...
-rw-r--r-- | Lib/sre_compile.py | 29 | ||||
-rw-r--r-- | Lib/sre_parse.py | 12 | ||||
-rw-r--r-- | Lib/test/test_sre.py | 50 | ||||
-rw-r--r-- | Modules/_sre.c | 132 | ||||
-rw-r--r-- | Modules/sre.h | 17 |
5 files changed, 137 insertions, 103 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 36986eb..701b267 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -10,18 +10,10 @@ # other compatibility work. # -import array import _sre from sre_constants import * -# find an array type code that matches the engine's code size -for WORDSIZE in "Hil": - if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): - break -else: - raise RuntimeError, "cannot find a useable array type" - MAXCODE = 65535 def _charset(charset, fixup): @@ -170,7 +162,20 @@ def _compile(code, pattern, flags): emit((group-1)*2+1) elif op in (SUCCESS, FAILURE): emit(OPCODES[op]) - elif op in (ASSERT, ASSERT_NOT, CALL): + elif op in (ASSERT, ASSERT_NOT): + emit(OPCODES[op]) + skip = len(code); emit(0) + if av[0] >= 0: + emit(0) # look ahead + else: + lo, hi = av[1].getwidth() + if lo != hi: + raise error, "look-behind requires fixed-width pattern" + emit(lo) # look behind + _compile(code, av[1], flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip + elif op is CALL: emit(OPCODES[op]) skip = len(code); emit(0) _compile(code, av, flags) @@ -305,7 +310,7 @@ def compile(p, flags=0): indexgroup[i] = k return _sre.compile( - pattern, flags, - array.array(WORDSIZE, code).tostring(), - p.pattern.groups-1, groupindex, indexgroup + pattern, flags, code, + p.pattern.groups-1, + groupindex, indexgroup ) diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index d78737f..07ab782 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -482,9 +482,15 @@ def _parse(source, state): if source.next is None or source.next == ")": break source.get() - elif source.next in ("=", "!"): + elif source.next in ("=", "!", "<"): # lookahead assertions char = source.get() + dir = 1 + if char == "<": + if source.next not in ("=", "!"): + raise error, "syntax error" + dir = -1 # lookbehind + char = source.get() b = [] while 1: p = _parse(source, state) @@ -493,9 +499,9 @@ def _parse(source, state): b.append(p) p = _branch(state, b) if 
char == "=": - subpattern.append((ASSERT, p)) + subpattern.append((ASSERT, (dir, p))) else: - subpattern.append((ASSERT_NOT, p)) + subpattern.append((ASSERT_NOT, (dir, p))) break elif source.match("|"): b.append(p) diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py index dc42ed1..a22c51a 100644 --- a/Lib/test/test_sre.py +++ b/Lib/test/test_sre.py @@ -35,20 +35,20 @@ if verbose: try: assert sre.sub("(?i)b+", "x", "bbbb BBBB") == 'x x' - + def bump_num(matchobj): int_value = int(matchobj.group(0)) return str(int_value + 1) assert sre.sub(r'\d+', bump_num, '08.2 -2 23x99y') == '9.3 -3 24x100y' assert sre.sub(r'\d+', bump_num, '08.2 -2 23x99y', 3) == '9.3 -3 23x99y' - + assert sre.sub('.', lambda m: r"\n", 'x') == '\\n' assert sre.sub('.', r"\n", 'x') == '\n' s = r"\1\1" assert sre.sub('(.)', s, 'x') == 'xx' - assert sre.sub('(.)', sre.escape(s), 'x') == s + assert sre.sub('(.)', sre.escape(s), 'x') == s assert sre.sub('(.)', lambda m: s, 'x') == s assert sre.sub('(?P<a>x)', '\g<a>\g<a>', 'xx') == 'xxxx' @@ -144,7 +144,7 @@ except AssertionError: if verbose: print 'Running tests on sre.split' - + try: assert sre.split(":", ":a:b::c") == ['', 'a', 'b', '', 'c'] assert sre.split(":*", ":a:b::c") == ['', 'a', 'b', 'c'] @@ -164,7 +164,7 @@ try: assert sre.split(':', 'a:b:c:d', 2) == ['a', 'b', 'c:d'] assert sre.split("(:)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c'] - assert sre.split("(:*)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c'] + assert sre.split("(:*)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c'] except AssertionError: raise TestFailed, "qualified sre.split" @@ -186,29 +186,29 @@ if verbose: try: # No groups at all - m = sre.match('a', 'a') ; assert m.groups() == () + m = sre.match('a', 'a') ; assert m.groups() == () # A single group - m = sre.match('(a)', 'a') ; assert m.groups() == ('a',) + m = sre.match('(a)', 'a') ; assert m.groups() == ('a',) pat = sre.compile('((a)|(b))(c)?') - assert pat.match('a').groups() == ('a', 'a', None, None) - assert 
pat.match('b').groups() == ('b', None, 'b', None) - assert pat.match('ac').groups() == ('a', 'a', None, 'c') - assert pat.match('bc').groups() == ('b', None, 'b', 'c') - assert pat.match('bc').groups("") == ('b', "", 'b', 'c') + assert pat.match('a').groups() == ('a', 'a', None, None) + assert pat.match('b').groups() == ('b', None, 'b', None) + assert pat.match('ac').groups() == ('a', 'a', None, 'c') + assert pat.match('bc').groups() == ('b', None, 'b', 'c') + assert pat.match('bc').groups("") == ('b', "", 'b', 'c') except AssertionError: raise TestFailed, "match .groups() method" try: # A single group - m = sre.match('(a)', 'a') - assert m.group(0) == 'a' ; assert m.group(0) == 'a' + m = sre.match('(a)', 'a') + assert m.group(0) == 'a' ; assert m.group(0) == 'a' assert m.group(1) == 'a' ; assert m.group(1, 1) == ('a', 'a') pat = sre.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') - assert pat.match('a').group(1, 2, 3) == ('a', None, None) - assert pat.match('b').group('a1', 'b2', 'c3') == (None, 'b', None) - assert pat.match('ac').group(1, 'b2', 3) == ('a', None, 'c') + assert pat.match('a').group(1, 2, 3) == ('a', None, None) + assert pat.match('b').group('a1', 'b2', 'c3') == (None, 'b', None) + assert pat.match('ac').group(1, 'b2', 3) == ('a', None, 'c') except AssertionError: raise TestFailed, "match .group() method" @@ -252,10 +252,10 @@ try: assert sre.I == sre.IGNORECASE assert sre.L == sre.LOCALE assert sre.M == sre.MULTILINE - assert sre.S == sre.DOTALL - assert sre.X == sre.VERBOSE - assert sre.T == sre.TEMPLATE - assert sre.U == sre.UNICODE + assert sre.S == sre.DOTALL + assert sre.X == sre.VERBOSE + assert sre.T == sre.TEMPLATE + assert sre.U == sre.UNICODE except AssertionError: raise TestFailed, 're module constants' @@ -272,7 +272,7 @@ if verbose: else: # To save time, only run the first and last 10 tests #tests = tests[:10] + tests[-10:] - pass + pass for t in tests: sys.stdout.flush() @@ -280,7 +280,7 @@ for t in tests: if len(t)==5: pattern, s, 
outcome, repl, expected = t elif len(t)==3: - pattern, s, outcome = t + pattern, s, outcome = t else: raise ValueError, ('Test tuples should have 3 or 5 fields',t) @@ -288,7 +288,7 @@ for t in tests: obj=sre.compile(pattern) except sre.error: if outcome==SYNTAX_ERROR: pass # Expected a syntax error - else: + else: print '=== Syntax error:', t except KeyboardInterrupt: raise KeyboardInterrupt except: @@ -356,7 +356,7 @@ for t in tests: # of the match and see if it still succeeds. \B will # break (because it won't match at the end or start of a # string), so we'll ignore patterns that feature it. - + if pattern[:2]!='\\B' and pattern[-2:]!='\\B': obj=sre.compile(pattern) result=obj.search(s, result.start(0), result.end(0)+1) diff --git a/Modules/_sre.c b/Modules/_sre.c index d6f050e..764e155 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -22,6 +22,7 @@ * 00-06-30 fl added fast search optimization (0.9.3) * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4) * 00-07-02 fl added charset optimizations, etc (0.9.5) + * 00-07-03 fl store code in pattern object, lookbehind, etc * * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * @@ -144,14 +145,6 @@ static unsigned int sre_lower_unicode(unsigned int ch) { return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); } - -#if !defined(Py_UNICODE_ISALNUM) -/* FIXME: workaround. 
should be fixed in unicodectype.c */ -#define Py_UNICODE_ISALNUM(ch)\ - (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) ||\ - Py_UNICODE_ISTITLE(ch) || Py_UNICODE_ISDIGIT(ch)) -#endif - #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) @@ -592,7 +585,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) /* set index */ /* args: <index> */ TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0])); - state->index = pattern[0]; + state->lastindex = pattern[0]; pattern++; break; @@ -606,10 +599,12 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) case SRE_OP_ASSERT: /* assert subpattern */ - /* args: <skip> <pattern> */ - TRACE(("%8d: assert subpattern\n", PTR(ptr))); - state->ptr = ptr; - i = SRE_MATCH(state, pattern + 1); + /* args: <skip> <back> <pattern> */ + TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1])); + state->ptr = ptr - pattern[1]; + if (state->ptr < state->beginning) + goto failure; + i = SRE_MATCH(state, pattern + 2); if (i < 0) return i; if (!i) @@ -620,9 +615,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) case SRE_OP_ASSERT_NOT: /* assert not subpattern */ /* args: <skip> <pattern> */ - TRACE(("%8d: assert not subpattern\n", PTR(ptr))); - state->ptr = ptr; - i = SRE_MATCH(state, pattern + 1); + TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1])); + state->ptr = ptr - pattern[1]; + if (state->ptr < state->beginning) + goto failure; + i = SRE_MATCH(state, pattern + 2); if (i < 0) return i; if (i) @@ -1098,6 +1095,7 @@ _compile(PyObject* self_, PyObject* args) /* "compile" pattern descriptor to pattern object */ PatternObject* self; + int i, n; PyObject* pattern; int flags = 0; @@ -1105,24 +1103,37 @@ _compile(PyObject* self_, PyObject* args) int groups = 0; PyObject* groupindex = NULL; PyObject* indexgroup = NULL; - if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags, - 
&PyString_Type, &code, + if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code, &groups, &groupindex, &indexgroup)) return NULL; - self = PyObject_NEW(PatternObject, &Pattern_Type); - if (self == NULL) + code = PySequence_Fast(code, "code argument must be a sequence"); + if (!code) + return NULL; + n = PySequence_Length(code); + + self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n); + if (!self) { + Py_DECREF(code); return NULL; + } + + for (i = 0; i < n; i++) { + PyObject *o = PySequence_Fast_GET_ITEM(code, i); + self->code[i] = (SRE_CODE) PyInt_AsLong(o); + } + + Py_DECREF(code); + + if (PyErr_Occurred()) + return NULL; Py_INCREF(pattern); self->pattern = pattern; self->flags = flags; - Py_INCREF(code); - self->code = code; - self->groups = groups; Py_XINCREF(groupindex); @@ -1217,7 +1228,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args) for (i = 0; i < SRE_MARK_SIZE; i++) state->mark[i] = NULL; - state->index = -1; + state->lastindex = -1; state->stack = NULL; state->stackbase = 0; @@ -1274,8 +1285,9 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, if (status > 0) { /* create match object (with room for extra group marks) */ - match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups); - if (match == NULL) + match = PyObject_NEW_VAR(MatchObject, &Match_Type, + 2*(pattern->groups+1)); + if (!match) return NULL; Py_INCREF(pattern); @@ -1301,7 +1313,10 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, } else match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ - match->index = state->index; + match->lastindex = state->lastindex; + + match->pos = ((char*) state->start - base) / n; + match->endpos = ((char*) state->end - base) / n; return (PyObject*) match; @@ -1329,12 +1344,12 @@ pattern_scanner(PatternObject* pattern, PyObject* args) /* create match object (with room for extra group marks) */ self = PyObject_NEW(ScannerObject, &Scanner_Type); - if (self == NULL) + if (!self) return 
NULL; string = state_init(&self->state, pattern, args); if (!string) { - PyObject_DEL(self); + PyObject_Del(self); return NULL; } @@ -1350,10 +1365,9 @@ pattern_scanner(PatternObject* pattern, PyObject* args) static void pattern_dealloc(PatternObject* self) { - Py_XDECREF(self->code); Py_XDECREF(self->pattern); Py_XDECREF(self->groupindex); - PyMem_DEL(self); + PyObject_DEL(self); } static PyObject* @@ -1614,10 +1628,11 @@ pattern_getattr(PatternObject* self, char* name) statichere PyTypeObject Pattern_Type = { PyObject_HEAD_INIT(NULL) - 0, "SRE_Pattern", sizeof(PatternObject), 0, + 0, "SRE_Pattern", + sizeof(PatternObject), sizeof(SRE_CODE), (destructor)pattern_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ - (getattrfunc)pattern_getattr, /*tp_getattr*/ + (getattrfunc)pattern_getattr /*tp_getattr*/ }; /* -------------------------------------------------------------------- */ @@ -1628,7 +1643,7 @@ match_dealloc(MatchObject* self) { Py_XDECREF(self->string); Py_DECREF(self->pattern); - PyMem_DEL(self); + PyObject_DEL(self); } static PyObject* @@ -1643,31 +1658,40 @@ match_getslice_by_index(MatchObject* self, int index, PyObject* def) return NULL; } - if (self->string == Py_None || self->mark[index+index] < 0) { + index *= 2; + + if (self->string == Py_None || self->mark[index] < 0) { /* return default value if the string or group is undefined */ Py_INCREF(def); return def; } return PySequence_GetSlice( - self->string, self->mark[index+index], self->mark[index+index+1] + self->string, self->mark[index], self->mark[index+1] ); } static int match_getindex(MatchObject* self, PyObject* index) { - if (!PyInt_Check(index) && self->pattern->groupindex != NULL) { - /* FIXME: resource leak? 
*/ - index = PyObject_GetItem(self->pattern->groupindex, index); - if (!index) - return -1; - } + int i; if (PyInt_Check(index)) return (int) PyInt_AS_LONG(index); - return -1; + i = -1; + + if (self->pattern->groupindex) { + index = PyObject_GetItem(self->pattern->groupindex, index); + if (index) { + if (PyInt_Check(index)) + i = (int) PyInt_AS_LONG(index); + Py_DECREF(index); + } else + PyErr_Clear(); + } + + return i; } static PyObject* @@ -1889,17 +1913,17 @@ match_getattr(MatchObject* self, char* name) if (!strcmp(name, "lastindex")) { /* experimental */ - if (self->index >= 0) - return Py_BuildValue("i", self->index); + if (self->lastindex >= 0) + return Py_BuildValue("i", self->lastindex); Py_INCREF(Py_None); return Py_None; } if (!strcmp(name, "lastgroup")) { /* experimental */ - if (self->pattern->indexgroup) { + if (self->pattern->indexgroup && self->lastindex >= 0) { PyObject* result = PySequence_GetItem( - self->pattern->indexgroup, self->index + self->pattern->indexgroup, self->lastindex ); if (result) return result; @@ -1920,10 +1944,10 @@ match_getattr(MatchObject* self, char* name) } if (!strcmp(name, "pos")) - return Py_BuildValue("i", 0); /* FIXME */ + return Py_BuildValue("i", self->pos); if (!strcmp(name, "endpos")) - return Py_BuildValue("i", 0); /* FIXME */ + return Py_BuildValue("i", self->endpos); PyErr_SetString(PyExc_AttributeError, name); return NULL; @@ -1935,11 +1959,10 @@ match_getattr(MatchObject* self, char* name) statichere PyTypeObject Match_Type = { PyObject_HEAD_INIT(NULL) 0, "SRE_Match", - sizeof(MatchObject), /* size of basic object */ - sizeof(int), /* space for group item */ + sizeof(MatchObject), sizeof(int), (destructor)match_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ - (getattrfunc)match_getattr, /*tp_getattr*/ + (getattrfunc)match_getattr /*tp_getattr*/ }; /* -------------------------------------------------------------------- */ @@ -1951,7 +1974,7 @@ scanner_dealloc(ScannerObject* self) state_fini(&self->state); 
Py_DECREF(self->string); Py_DECREF(self->pattern); - PyMem_DEL(self); + PyObject_DEL(self); } static PyObject* @@ -2041,8 +2064,7 @@ scanner_getattr(ScannerObject* self, char* name) statichere PyTypeObject Scanner_Type = { PyObject_HEAD_INIT(NULL) 0, "SRE_Scanner", - sizeof(ScannerObject), /* size of basic object */ - 0, + sizeof(ScannerObject), 0, (destructor)scanner_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ (getattrfunc)scanner_getattr, /*tp_getattr*/ diff --git a/Modules/sre.h b/Modules/sre.h index f66d608..d4e93da 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -17,26 +17,27 @@ #define SRE_CODE unsigned short typedef struct { - PyObject_HEAD - PyObject* code; /* link to the code string object */ + PyObject_VAR_HEAD int groups; PyObject* groupindex; PyObject* indexgroup; /* compatibility */ PyObject* pattern; /* pattern source (or None) */ int flags; /* flags used when compiling pattern source */ + /* pattern code */ + SRE_CODE code[1]; } PatternObject; -#define PatternObject_GetCode(o)\ - ((void*) PyString_AS_STRING(((PatternObject*)(o))->code)) +#define PatternObject_GetCode(o) (((PatternObject*)(o))->code) typedef struct { - PyObject_HEAD + PyObject_VAR_HEAD PyObject* string; /* link to the target string */ PatternObject* pattern; /* link to the regex (pattern) object */ - int index; /* last index marker seen by the engine (-1 if none) */ + int pos, endpos; /* current target slice */ + int lastindex; /* last index marker seen by the engine (-1 if none) */ int groups; /* number of groups (start/end marks) */ - int mark[2]; + int mark[1]; } MatchObject; typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); @@ -59,7 +60,7 @@ typedef struct { /* character size */ int charsize; /* registers */ - int index; + int lastindex; int lastmark; void* mark[SRE_MARK_SIZE]; /* backtracking stack */ |