From be2211e9401a0be96915c473ef99041beb5a4992 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Thu, 29 Jun 2000 16:57:40 +0000 Subject: - fixed split (test_sre still complains about split, but that's caused by the group reset bug, not split itself) - added more mark slots (should be dynamically allocated, but 100 is better than 32. and checking for the upper limit is better than overwriting the memory ;-) - internal: renamed the cursor helper class - internal: removed some bloat from sre_compile --- Lib/sre.py | 20 ++++--- Lib/sre_compile.py | 159 ++++++++++++++++++++++------------------------------- Modules/_sre.c | 58 ++++++++++--------- Modules/sre.h | 9 ++- 4 files changed, 116 insertions(+), 130 deletions(-) diff --git a/Lib/sre.py b/Lib/sre.py index 32b3e8f..e0a51e3 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -26,7 +26,7 @@ T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE U = UNICODE = sre_compile.SRE_FLAG_UNICODE # sre exception -error = sre_parse.error +error = sre_compile.error # -------------------------------------------------------------------- # public interface @@ -105,7 +105,7 @@ def _subn(pattern, template, string, count=0): n = i = 0 s = [] append = s.append - c = pattern.cursor(string) + c = pattern.scanner(string) while not count or n < count: m = c.search() if not m: @@ -127,16 +127,20 @@ def _split(pattern, string, maxsplit=0): n = i = 0 s = [] append = s.append - c = pattern.cursor(string) + extend = s.extend + c = pattern.scanner(string) + g = c.groups while not maxsplit or n < maxsplit: m = c.search() if not m: break - j = m.start() - append(string[i:j]) - i = m.end() - if i <= j: - break + b, e = m.span() + if e == i: + continue + append(string[i:b]) + if g and b != e: + extend(m.groups()) + i = e n = n + 1 if i < len(string): append(string[i:]) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 2d7c021..a51531b 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -11,8 +11,7 @@ # other compatibility work. # -import array, string, sys - +import array import _sre from sre_constants import * @@ -24,123 +23,101 @@ for WORDSIZE in "BHil": else: raise RuntimeError, "cannot find a useable array type" -# FIXME: should move some optimizations from the parser to here! - -class Code: - def __init__(self): - self.data = [] - def __len__(self): - return len(self.data) - def __getitem__(self, index): - return self.data[index] - def __setitem__(self, index, code): - self.data[index] = code - def append(self, code): - self.data.append(code) - def todata(self): - # print self.data - try: - return array.array(WORDSIZE, self.data).tostring() - except OverflowError: - print self.data - raise - def _compile(code, pattern, flags): - append = code.append + emit = code.append for op, av in pattern: if op is ANY: if flags & SRE_FLAG_DOTALL: - append(OPCODES[op]) # any character at all! + emit(OPCODES[op]) else: - append(OPCODES[CATEGORY]) - append(CHCODES[CATEGORY_NOT_LINEBREAK]) + emit(OPCODES[CATEGORY]) + emit(CHCODES[CATEGORY_NOT_LINEBREAK]) elif op in (SUCCESS, FAILURE): - append(OPCODES[op]) + emit(OPCODES[op]) elif op is AT: - append(OPCODES[op]) + emit(OPCODES[op]) if flags & SRE_FLAG_MULTILINE: - append(ATCODES[AT_MULTILINE[av]]) + emit(ATCODES[AT_MULTILINE[av]]) else: - append(ATCODES[av]) + emit(ATCODES[av]) elif op is BRANCH: - append(OPCODES[op]) + emit(OPCODES[op]) tail = [] for av in av[1]: - skip = len(code); append(0) + skip = len(code); emit(0) _compile(code, av, flags) -## append(OPCODES[SUCCESS]) - append(OPCODES[JUMP]) - tail.append(len(code)); append(0) + emit(OPCODES[JUMP]) + tail.append(len(code)); emit(0) code[skip] = len(code) - skip - append(0) # end of branch + emit(0) # end of branch for tail in tail: code[tail] = len(code) - tail elif op is CALL: - append(OPCODES[op]) - skip = len(code); append(0) + emit(OPCODES[op]) + skip = len(code); emit(0) _compile(code, av, flags) - append(OPCODES[SUCCESS]) + emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip elif op is CATEGORY: - append(OPCODES[op]) + emit(OPCODES[op]) if flags & SRE_FLAG_LOCALE: - append(CH_LOCALE[CHCODES[av]]) + emit(CH_LOCALE[CHCODES[av]]) elif flags & SRE_FLAG_UNICODE: - append(CH_UNICODE[CHCODES[av]]) + emit(CH_UNICODE[CHCODES[av]]) else: - append(CHCODES[av]) + emit(CHCODES[av]) elif op is GROUP: if flags & SRE_FLAG_IGNORECASE: - append(OPCODES[OP_IGNORE[op]]) + emit(OPCODES[OP_IGNORE[op]]) else: - append(OPCODES[op]) - append(av-1) + emit(OPCODES[op]) + emit(av-1) elif op is IN: if flags & SRE_FLAG_IGNORECASE: - append(OPCODES[OP_IGNORE[op]]) + emit(OPCODES[OP_IGNORE[op]]) def fixup(literal, flags=flags): return _sre.getlower(ord(literal), flags) else: - append(OPCODES[op]) + emit(OPCODES[op]) fixup = ord - skip = len(code); append(0) + skip = len(code); emit(0) for op, av in av: - append(OPCODES[op]) + emit(OPCODES[op]) if op is NEGATE: pass elif op is LITERAL: - append(fixup(av)) + emit(fixup(av)) elif op is RANGE: - append(fixup(av[0])) - append(fixup(av[1])) + emit(fixup(av[0])) + emit(fixup(av[1])) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: - append(CH_LOCALE[CHCODES[av]]) + emit(CH_LOCALE[CHCODES[av]]) elif flags & SRE_FLAG_UNICODE: - append(CH_UNICODE[CHCODES[av]]) + emit(CH_UNICODE[CHCODES[av]]) else: - append(CHCODES[av]) + emit(CHCODES[av]) else: - raise ValueError, "unsupported set operator" - append(OPCODES[FAILURE]) + raise error, "internal: unsupported set operator" + emit(OPCODES[FAILURE]) code[skip] = len(code) - skip elif op in (LITERAL, NOT_LITERAL): if flags & SRE_FLAG_IGNORECASE: - append(OPCODES[OP_IGNORE[op]]) + emit(OPCODES[OP_IGNORE[op]]) else: - append(OPCODES[op]) - append(ord(av)) + emit(OPCODES[op]) + emit(ord(av)) elif op is MARK: - append(OPCODES[op]) - append(av) + emit(OPCODES[op]) + emit(av) elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): if flags & SRE_FLAG_TEMPLATE: - append(OPCODES[REPEAT]) - skip = len(code); append(0) - append(av[0]) - append(av[1]) + emit(OPCODES[REPEAT]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) _compile(code, av[2], flags) - append(OPCODES[SUCCESS]) + emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip else: lo, hi = av[2].getwidth() @@ -149,35 +126,35 @@ def _compile(code, pattern, flags): if 0 and lo == hi == 1 and op is MAX_REPEAT: # FIXME: need a better way to figure out when # it's safe to use this one (in the parser, probably) - append(OPCODES[MAX_REPEAT_ONE]) - skip = len(code); append(0) - append(av[0]) - append(av[1]) + emit(OPCODES[MAX_REPEAT_ONE]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) _compile(code, av[2], flags) - append(OPCODES[SUCCESS]) + emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip else: - append(OPCODES[op]) - skip = len(code); append(0) - append(av[0]) - append(av[1]) + emit(OPCODES[op]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) _compile(code, av[2], flags) - append(OPCODES[SUCCESS]) + emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip elif op is SUBPATTERN: group = av[0] if group: - append(OPCODES[MARK]) - append((group-1)*2) + emit(OPCODES[MARK]) + emit((group-1)*2) _compile(code, av[1], flags) if group: - append(OPCODES[MARK]) - append((group-1)*2+1) + emit(OPCODES[MARK]) + emit((group-1)*2+1) else: raise ValueError, ("unsupported operand type", op) def compile(p, flags=0): - # convert pattern list to internal format + # internal: convert pattern list to internal format if type(p) in (type(""), type(u"")): import sre_parse pattern = p @@ -185,18 +162,14 @@ def compile(p, flags=0): else: pattern = None flags = p.pattern.flags | flags - code = Code() + code = [] _compile(code, p.data, flags) code.append(OPCODES[SUCCESS]) - data = code.todata() - if 0: # debugging - print - print "-" * 68 - import sre_disasm - sre_disasm.disasm(data) - print "-" * 68 + # FIXME: get rid of this limitation + assert p.pattern.groups <= 100,\ + "sorry, but this version only supports 100 named groups" return _sre.compile( pattern, flags, - data, + array.array(WORDSIZE, code).tostring(), p.pattern.groups-1, p.pattern.groupdict ) diff --git a/Modules/_sre.c b/Modules/_sre.c index 21214ed..dba2afd 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -14,11 +14,12 @@ * 00-03-06 fl first alpha, sort of (0.5) * 00-03-14 fl removed most compatibility stuff (0.6) * 00-05-10 fl towards third alpha (0.8.2) - * 00-05-13 fl added experimental cursor stuff (0.8.3) + * 00-05-13 fl added experimental scanner stuff (0.8.3) * 00-05-27 fl final bug hunt (0.8.4) * 00-06-21 fl less bugs, more taste (0.8.5) * 00-06-25 fl major changes to better deal with nested repeats (0.9) * 00-06-28 fl fixed findall (0.9.1) + * 00-06-29 fl fixed split, added more scanner features (0.9.2) * * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * @@ -384,7 +385,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) int i, count; /* FIXME: this is a hack! */ - void* mark_copy[64]; + void* mark_copy[SRE_MARK_SIZE]; void* mark = NULL; TRACE(("%8d: enter\n", PTR(ptr))); @@ -954,7 +955,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) staticforward PyTypeObject Pattern_Type; staticforward PyTypeObject Match_Type; -staticforward PyTypeObject Cursor_Type; +staticforward PyTypeObject Scanner_Type; static PyObject * _compile(PyObject* self_, PyObject* args) @@ -1074,7 +1075,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args) state->lastmark = 0; /* FIXME: dynamic! */ - for (i = 0; i < 64; i++) + for (i = 0; i < SRE_MARK_SIZE; i++) state->mark[i] = NULL; state->stack = NULL; @@ -1176,15 +1177,15 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, } static PyObject* -pattern_cursor(PatternObject* pattern, PyObject* args) +pattern_scanner(PatternObject* pattern, PyObject* args) { /* create search state object */ - CursorObject* self; + ScannerObject* self; PyObject* string; /* create match object (with room for extra group marks) */ - self = PyObject_NEW(CursorObject, &Cursor_Type); + self = PyObject_NEW(ScannerObject, &Scanner_Type); if (self == NULL) return NULL; @@ -1431,7 +1432,7 @@ static PyMethodDef pattern_methods[] = { {"split", (PyCFunction) pattern_split, 1}, {"findall", (PyCFunction) pattern_findall, 1}, /* experimental */ - {"cursor", (PyCFunction) pattern_cursor, 1}, + {"scanner", (PyCFunction) pattern_scanner, 1}, {NULL, NULL} }; @@ -1467,7 +1468,7 @@ pattern_getattr(PatternObject* self, char* name) statichere PyTypeObject Pattern_Type = { PyObject_HEAD_INIT(NULL) - 0, "Pattern", sizeof(PatternObject), 0, + 0, "SRE_Pattern", sizeof(PatternObject), 0, (destructor)pattern_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ (getattrfunc)pattern_getattr, /*tp_getattr*/ @@ -1761,7 +1762,7 @@ match_getattr(MatchObject* self, char* name) statichere PyTypeObject Match_Type = { PyObject_HEAD_INIT(NULL) - 0, "Match", + 0, "SRE_Match", sizeof(MatchObject), /* size of basic object */ sizeof(int), /* space for group item */ (destructor)match_dealloc, /*tp_dealloc*/ @@ -1770,10 +1771,10 @@ statichere PyTypeObject Match_Type = { }; /* -------------------------------------------------------------------- */ -/* cursor methods (experimental) */ +/* scanner methods (experimental) */ static void -cursor_dealloc(CursorObject* self) +scanner_dealloc(ScannerObject* self) { state_fini(&self->state); Py_DECREF(self->string); @@ -1782,7 +1783,7 @@ cursor_dealloc(CursorObject* self) } static PyObject* -cursor_match(CursorObject* self, PyObject* args) +scanner_match(ScannerObject* self, PyObject* args) { SRE_STATE* state = &self->state; PyObject* match; @@ -1811,7 +1812,7 @@ cursor_match(CursorObject* self, PyObject* args) static PyObject* -cursor_search(CursorObject* self, PyObject* args) +scanner_search(ScannerObject* self, PyObject* args) { SRE_STATE* state = &self->state; PyObject* match; @@ -1830,24 +1831,26 @@ cursor_search(CursorObject* self, PyObject* args) match = pattern_new_match((PatternObject*) self->pattern, state, self->string, status); - if (status >= 0) + if (status == 0 || state->ptr == state->start) + state->start = (void*) ((char*) state->ptr + state->charsize); + else state->start = state->ptr; return match; } -static PyMethodDef cursor_methods[] = { - {"match", (PyCFunction) cursor_match, 0}, - {"search", (PyCFunction) cursor_search, 0}, +static PyMethodDef scanner_methods[] = { + {"match", (PyCFunction) scanner_match, 0}, + {"search", (PyCFunction) scanner_search, 0}, {NULL, NULL} }; static PyObject* -cursor_getattr(CursorObject* self, char* name) +scanner_getattr(ScannerObject* self, char* name) { PyObject* res; - res = Py_FindMethod(cursor_methods, (PyObject*) self, name); + res = Py_FindMethod(scanner_methods, (PyObject*) self, name); if (res) return res; @@ -1859,18 +1862,21 @@ cursor_getattr(CursorObject* self, char* name) return self->pattern; } + if (!strcmp(name, "groups")) + return Py_BuildValue("i", ((PatternObject*) self->pattern)->groups); + PyErr_SetString(PyExc_AttributeError, name); return NULL; } -statichere PyTypeObject Cursor_Type = { +statichere PyTypeObject Scanner_Type = { PyObject_HEAD_INIT(NULL) - 0, "Cursor", - sizeof(CursorObject), /* size of basic object */ + 0, "SRE_Scanner", + sizeof(ScannerObject), /* size of basic object */ 0, - (destructor)cursor_dealloc, /*tp_dealloc*/ + (destructor)scanner_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ - (getattrfunc)cursor_getattr, /*tp_getattr*/ + (getattrfunc)scanner_getattr, /*tp_getattr*/ }; static PyMethodDef _functions[] = { @@ -1888,7 +1894,7 @@ init_sre() { /* Patch object types */ Pattern_Type.ob_type = Match_Type.ob_type = - Cursor_Type.ob_type = &PyType_Type; + Scanner_Type.ob_type = &PyType_Type; Py_InitModule("_" MODULE, _functions); } diff --git a/Modules/sre.h b/Modules/sre.h index a7786ed..722f890 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -46,6 +46,9 @@ typedef struct { void* ptr; } SRE_STACK; +/* FIXME: shouldn't be a constant, really... */ +#define SRE_MARK_SIZE 200 + typedef struct { /* string pointers */ void* ptr; /* current position (also end of current slice) */ @@ -56,7 +59,7 @@ typedef struct { int charsize; /* registers */ int lastmark; - void* mark[64]; /* FIXME: should be dynamically allocated! */ + void* mark[SRE_MARK_SIZE]; /* backtracking stack */ SRE_STACK* stack; int stacksize; @@ -66,11 +69,11 @@ typedef struct { } SRE_STATE; typedef struct { - /* search helper */ + /* scanner (internal helper object) */ PyObject_HEAD PyObject* pattern; PyObject* string; SRE_STATE state; -} CursorObject; +} ScannerObject; #endif -- cgit v0.12