From b700df9824a768893fb83dec41779ac89035313e Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 31 Mar 2000 14:59:30 +0000 Subject: Adding Fredrik Lundh's _sre.c module and its header files. NOTE: THIS IS VERY ROUGH ALPHA CODE! --- Modules/_sre.c | 1453 +++++++++++++++++++++++++++++++++++++++++++++++ Modules/sre.h | 38 ++ Modules/sre_constants.h | 27 + 3 files changed, 1518 insertions(+) create mode 100644 Modules/_sre.c create mode 100644 Modules/sre.h create mode 100644 Modules/sre_constants.h diff --git a/Modules/_sre.c b/Modules/_sre.c new file mode 100644 index 0000000..8af876f --- /dev/null +++ b/Modules/_sre.c @@ -0,0 +1,1453 @@ +/* -*- Mode: C; tab-width: 4 -*- + * + * Secret Labs' Regular Expression Engine + * $Id$ + * + * simple regular expression matching engine + * + * partial history: + * 99-10-24 fl created (bits and pieces from the template matcher) + * 99-11-13 fl added categories, branching, and more (0.2) + * 99-11-16 fl some tweaks to compile on non-Windows platforms + * 99-12-18 fl non-literals, generic maximizing repeat (0.3) + * 99-02-28 fl tons of changes (not all to the better ;-) (0.4) + * 99-03-06 fl first alpha, sort of (0.5) + * 99-03-14 fl removed most compatibility stuff (0.6) + * + * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * + * This code can only be used for 1.6 alpha testing. All other use + * require explicit permission from Secret Labs AB. + * + * Portions of this engine have been developed in cooperation with + * CNRI. Hewlett-Packard provided funding for 1.6 integration and + * other compatibility work. + */ + +#ifndef SRE_RECURSIVE + +char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB "; + +#include "Python.h" + +#include "sre.h" + +#include "unicodeobject.h" + +#if defined(HAVE_LIMITS_H) +#include +#else +#define INT_MAX 2147483647 +#endif + +#include /* temporary hack */ + +/* defining this one enables tracing */ +#undef DEBUG + +#ifdef WIN32 /* FIXME: don't assume Windows == MSVC */ +#pragma optimize("agtw", on) /* doesn't seem to make much difference... */ +/* fastest possible local call under MSVC */ +#define LOCAL(type) static __inline type __fastcall +#else +#define LOCAL(type) static type +#endif + +/* error codes */ +#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ +#define SRE_ERROR_MEMORY -9 /* out of memory */ + +#ifdef DEBUG +#define TRACE(v) printf v +#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning) +#else +#define TRACE(v) +#endif + +#define SRE_CODE unsigned short /* unsigned short or larger */ + +typedef struct { + /* string pointers */ + void* ptr; /* current position (also end of current slice) */ + void* beginning; /* start of original string */ + void* start; /* start of current slice */ + void* end; /* end of original string */ + /* character size */ + int charsize; + /* registers */ + int marks; + void* mark[64]; /* FIXME: should be dynamically allocated! */ + /* FIXME */ + /* backtracking stack */ + void** stack; + int stacksize; + int stackbase; +} SRE_STATE; + +#if 1 /* FIXME: fix this one! */ +#define SRE_TO_LOWER Py_UNICODE_TOLOWER +#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT +#define SRE_IS_SPACE Py_UNICODE_ISSPACE +#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0) +#else +#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch) +#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0) +#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0) +#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0) +#endif + +#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_') + +LOCAL(int) +sre_category(SRE_CODE category, unsigned int ch) +{ + switch (category) { + case 'd': + return SRE_IS_DIGIT(ch); + case 'D': + return !SRE_IS_DIGIT(ch); + case 's': + return SRE_IS_SPACE(ch); + case 'S': + return !SRE_IS_SPACE(ch); + case 'w': + return SRE_IS_WORD(ch); + case 'W': + return !SRE_IS_WORD(ch); + } + return 0; +} + +/* helpers */ + +LOCAL(int) +_stack_free(SRE_STATE* state) +{ + if (state->stack) { + TRACE(("release stack\n")); + free(state->stack); + state->stack = NULL; + } + state->stacksize = 0; + return 0; +} + +static int /* shouldn't be LOCAL */ +_stack_extend(SRE_STATE* state, int lo, int hi) +{ + void** stack; + int stacksize; + + /* grow the stack to a suitable size; we need at least lo entries, + at most hi entries. if for some reason hi is lower than lo, lo + wins */ + + stacksize = state->stacksize; + + if (stacksize == 0) { + /* create new stack */ + stacksize = 512; + if (stacksize < lo) + stacksize = lo; + else if (stacksize > hi) + stacksize = hi; + TRACE(("allocate stack %d\n", stacksize)); + stack = malloc(sizeof(void*) * stacksize); + } else { + /* grow the stack (typically by a factor of two) */ + while (stacksize < lo) + stacksize = 2 * stacksize; + /* FIXME: could trim size if it's larger than lo, and + much larger than hi */ + TRACE(("grow stack to %d\n", stacksize)); + stack = realloc(state->stack, sizeof(void*) * stacksize); + } + + if (!stack) { + _stack_free(state); + return SRE_ERROR_MEMORY; + } + + state->stack = stack; + state->stacksize = stacksize; + + return 0; +} + +/* set things up for the 8-bit version */ + +#define SRE_CHAR unsigned char +#define SRE_AT sre_at +#define SRE_MEMBER sre_member +#define SRE_MATCH sre_match +#define SRE_SEARCH sre_search +#define SRE_RECURSIVE + +#include "_sre.c" + +#undef SRE_RECURSIVE +#undef SRE_SEARCH +#undef SRE_MATCH +#undef SRE_MEMBER +#undef SRE_AT +#undef SRE_CHAR + +/* set things up for the 16-bit unicode version */ + +#define SRE_CHAR Py_UNICODE +#define SRE_AT sre_uat +#define SRE_MEMBER sre_umember +#define SRE_MATCH sre_umatch +#define SRE_SEARCH sre_usearch + +#endif /* SRE_RECURSIVE */ + +/* -------------------------------------------------------------------- */ +/* String matching engine */ + +/* the following section is compiled twice, with different character + settings */ + +LOCAL(int) +SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) +{ + /* check if pointer is at given position. return 1 if so, 0 + otherwise */ + + int this, that; + + switch (at) { + case 'a': + /* beginning */ + return (ptr == state->beginning); + case 'z': + /* end */ + return (ptr == state->end); + case 'b': + /* word boundary */ + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_IS_WORD((int) ptr[0]) : 0; + return this != that; + case 'B': + /* word non-boundary */ + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_IS_WORD((int) ptr[0]) : 0; + return this == that; + } + + return 0; +} + +LOCAL(int) +SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch) +{ + /* check if character is a member of the given set. return 1 if + so, 0 otherwise */ + + int ok = 1; + + for (;;) { + switch (*set++) { + + case SRE_OP_NEGATE: + ok = !ok; + break; + + case SRE_OP_FAILURE: + return !ok; + + case SRE_OP_LITERAL: + if (ch == (SRE_CHAR) set[0]) + return ok; + set++; + break; + + case SRE_OP_RANGE: + if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1]) + return ok; + set += 2; + break; + + case SRE_OP_CATEGORY: + if (sre_category(set[0], (int) ch)) + return ok; + set += 1; + break; + + default: + /* FIXME: internal error */ + return 0; + } + } +} + +LOCAL(int) +SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) +{ + /* check if string matches the given pattern. returns -1 for + error, 0 for failure, and 1 for success */ + + SRE_CHAR* end = state->end; + SRE_CHAR* ptr = state->ptr; + int stacksize; + int stackbase; + int i, count; + + for (;;) { + + TRACE(("[%p]\n", pattern)); + + switch (*pattern++) { + + case SRE_OP_FAILURE: + /* immediate failure */ + TRACE(("%8d: failure\n", PTR(ptr))); + return 0; + + case SRE_OP_SUCCESS: + /* end of pattern */ + TRACE(("%8d: success\n", PTR(ptr))); + state->ptr = ptr; + return 1; + + case SRE_OP_AT: + /* match at given position */ + TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern)); + if (!SRE_AT(state, ptr, *pattern)) + return 0; + pattern++; + break; + + case SRE_OP_LITERAL: + /* match literal character */ + /* args: */ + TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern)); + if (ptr >= end || *ptr != (SRE_CHAR) *pattern) + return 0; + pattern++; + ptr++; + break; + + case SRE_OP_NOT_LITERAL: + /* match anything that is not literal character */ + /* args: */ + TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern)); + if (ptr >= end || *ptr == (SRE_CHAR) *pattern) + return 0; + pattern++; + ptr++; + break; + + case SRE_OP_ANY: + /* match anything */ + TRACE(("%8d: any\n", PTR(ptr))); + if (ptr >= end) + return 0; + ptr++; + break; + + case SRE_OP_IN: + /* match set member (or non_member) */ + /* args: */ + TRACE(("%8d: set %c\n", PTR(ptr), *ptr)); + if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr)) + return 0; + pattern += pattern[0]; + ptr++; + break; + + case SRE_OP_GROUP: + /* match backreference */ + i = pattern[0]; + { + /* FIXME: optimize size! */ + SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1]; + if (!p || !e || e < p) + return 0; + while (p < e) { + if (ptr >= end || *ptr != *p) + return 0; + p++; ptr++; + } + } + pattern++; + break; + + case SRE_OP_LITERAL_IGNORE: + TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern)); + if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern) + return 0; + pattern++; + ptr++; + break; + + case SRE_OP_NOT_LITERAL_IGNORE: + TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), + (SRE_CHAR) *pattern)); + if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern) + return 0; + pattern++; + ptr++; + break; + + case SRE_OP_IN_IGNORE: + TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr)); + if (ptr >= end + || !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr))) + return 0; + pattern += pattern[0]; + ptr++; + break; + + case SRE_OP_MARK: + /* set mark */ + /* args: */ + TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0])); + state->mark[pattern[0]] = ptr; + pattern++; + break; + + case SRE_OP_JUMP: + /* jump forward */ + /* args: */ + TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0])); + pattern += pattern[0]; + break; + + case SRE_OP_CALL: + /* match subpattern, without backtracking */ + /* args: */ + TRACE(("%8d: match subpattern\n", PTR(ptr))); + state->ptr = ptr; + if (!SRE_MATCH(state, pattern + 1)) + return 0; + pattern += pattern[0]; + ptr = state->ptr; + break; + + case SRE_OP_MAX_REPEAT_ONE: + + /* match repeated sequence (maximizing regexp). this + variant only works if the repeated item is exactly one + character wide, and we're not already collecting + backtracking points. for other cases, use the + MAX_REPEAT operator instead */ + + /* args: */ + + TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr), + pattern[1], pattern[2])); + + count = 0; + + if (pattern[3] == SRE_OP_ANY) { + /* repeated wildcard. skip to the end of the target + string, and backtrack from there */ + /* FIXME: must look for line endings */ + if (ptr + pattern[1] > end) + return 0; /* cannot match */ + count = pattern[2]; + if (count > end - ptr) + count = end - ptr; + ptr += count; + + } else if (pattern[3] == SRE_OP_LITERAL) { + /* repeated literal */ + SRE_CHAR chr = (SRE_CHAR) pattern[4]; + while (count < (int) pattern[2]) { + if (ptr >= end || *ptr != chr) + break; + ptr++; + count++; + } + + } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) { + /* repeated literal */ + SRE_CHAR chr = (SRE_CHAR) pattern[4]; + while (count < (int) pattern[2]) { + if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) != chr) + break; + ptr++; + count++; + } + + } else if (pattern[3] == SRE_OP_NOT_LITERAL) { + /* repeated non-literal */ + SRE_CHAR chr = (SRE_CHAR) pattern[4]; + while (count < (int) pattern[2]) { + if (ptr >= end || *ptr == chr) + break; + ptr++; + count++; + } + + } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) { + /* repeated non-literal */ + SRE_CHAR chr = (SRE_CHAR) pattern[4]; + while (count < (int) pattern[2]) { + if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) == chr) + break; + ptr++; + count++; + } + + } else if (pattern[3] == SRE_OP_IN) { + /* repeated set */ + while (count < (int) pattern[2]) { + if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr)) + break; + ptr++; + count++; + } + + } else { + /* repeated single character pattern */ + state->ptr = ptr; + while (count < (int) pattern[2]) { + i = SRE_MATCH(state, pattern + 3); + if (i < 0) + return i; + if (i == 0) + break; + count++; + } + state->ptr = ptr; + ptr += count; + } + + /* when we arrive here, count contains the number of + matches, and ptr points to the tail of the target + string. check if the rest of the pattern matches, and + backtrack if not. */ + + /* FIXME: this is a mess. fix it! */ + + TRACE(("%8d: repeat %d found\n", PTR(ptr), count)); + + if (count < (int) pattern[1]) + return 0; + + if (pattern[pattern[0]] == SRE_OP_SUCCESS) { + /* tail is empty. we're finished */ + TRACE(("%8d: tail is empty\n", PTR(ptr))); + state->ptr = ptr; + return 1; + + } else if (pattern[pattern[0]] == SRE_OP_LITERAL) { + /* tail starts with a literal. we can speed things up + by skipping positions where the rest of the pattern + cannot possibly match */ + SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1]; + TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr)); + for (;;) { + TRACE(("%8d: scan for tail match\n", PTR(ptr))); + while (count >= (int) pattern[1] && + (ptr >= end || *ptr != chr)) { + ptr--; + count--; + } + TRACE(("%8d: check tail\n", PTR(ptr))); + if (count < (int) pattern[1]) + break; + state->ptr = ptr; + i = SRE_MATCH(state, pattern + pattern[0]); + if (i > 0) { + TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); + return 1; + } + TRACE(("%8d: BACKTRACK\n", PTR(ptr))); + ptr--; + count--; + } + + } else { + TRACE(("%8d: tail is pattern\n", PTR(ptr))); + while (count >= (int) pattern[1]) { + state->ptr = ptr; + i = SRE_MATCH(state, pattern + pattern[0]); + if (i > 0) { + TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); + return 1; + } + TRACE(("%8d: BACKTRACK\n", PTR(ptr))); + ptr--; + count--; + } + } + return 0; /* failure! */ + +/* ----------------------------------------------------------------------- */ +/* FIXME: the following section is just plain broken */ + + case SRE_OP_MAX_REPEAT: + /* match repeated sequence (maximizing regexp). repeated + group should end with a MAX_UNTIL code */ + + TRACE(("%8d: max repeat %d %d\n", PTR(ptr), + pattern[1], pattern[2])); + + count = 0; + state->ptr = ptr; + + /* FIXME: umm. what about matching the minimum + number of items before starting to collect backtracking + positions? */ + + stackbase = state->stackbase; + + while (count < (int) pattern[2]) { + /* store current position on the stack */ + TRACE(("%8d: push mark at index %d\n", PTR(ptr), count)); + if (stackbase + count >= state->stacksize) { + i = _stack_extend(state, stackbase + count + 1, + stackbase + pattern[2]); + if (i < 0) + return i; + } + state->stack[stackbase + count] = ptr; + /* check if we can match another item */ + state->stackbase += count + 1; + i = SRE_MATCH(state, pattern + 3); + state->stackbase = stackbase; /* rewind */ + if (i != 2) + break; + if (state->ptr == ptr) { + /* if the match was successful but empty, set the + count to max and terminate the scanning loop */ + stacksize = count; /* actual size of stack */ + count = (int) pattern[2]; + goto check_tail; /* FIXME: eliminate goto */ + } + count++; + ptr = state->ptr; + + } + + stacksize = count; /* actual number of entries on the stack */ + + check_tail: + + /* when we get here, count is the number of matches, + stacksize is the number of match points on the stack + (usually same as count, but it might be smaller) and + ptr points to the tail. */ + + if (count < (int) pattern[1]) + return 0; + + /* make sure that rest of the expression matches. if it + doesn't, backtrack */ + + TRACE(("%8d: repeat %d found (stack size = %d)\n", PTR(ptr), + count, stacksize + 1)); + + TRACE(("%8d: tail is pattern\n", PTR(ptr))); + + /* hope for the best */ + state->ptr = ptr; + state->stackbase += stacksize + 1; + i = SRE_MATCH(state, pattern + pattern[0]); + state->stackbase = stackbase; + if (i > 0) { + TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); + return 1; + } + + /* backtrack! */ + while (count >= (int) pattern[1]) { + ptr = state->stack[stackbase + (count < stacksize ? count : stacksize)]; + state->ptr = ptr; + count--; + TRACE(("%8d: BACKTRACK\n", PTR(ptr))); + state->stackbase += stacksize + 1; + i = SRE_MATCH(state, pattern + pattern[0]); + state->stackbase = stackbase; + if (i > 0) { + TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); + return 1; + } + } + return 0; /* failure! */ + + case SRE_OP_MAX_UNTIL: + /* match repeated sequence (maximizing regexp). repeated + group should end with a MAX_UNTIL code */ + + TRACE(("%8d: max until\n", PTR(ptr))); + state->ptr = ptr; + return 2; /* always succeeds, for now... */ + +/* end of totally broken section */ +/* ----------------------------------------------------------------------- */ + + case SRE_OP_MIN_REPEAT: + /* match repeated sequence (minimizing regexp) */ + TRACE(("%8d: min repeat %d %d\n", PTR(ptr), + pattern[1], pattern[2])); + count = 0; + state->ptr = ptr; + /* match minimum number of items */ + while (count < (int) pattern[1]) { + i = SRE_MATCH(state, pattern + 3); + if (i <= 0) + return i; + count++; + } + /* move forward until the tail matches. */ + while (count <= (int) pattern[2]) { + ptr = state->ptr; + i = SRE_MATCH(state, pattern + pattern[0]); + if (i > 0) { + TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); + return 1; + } + TRACE(("%8d: BACKTRACK\n", PTR(ptr))); + state->ptr = ptr; /* backtrack */ + i = SRE_MATCH(state, pattern + 3); + if (i <= 0) + return i; + count++; + } + return 0; /* failure! */ + + case SRE_OP_MIN_UNTIL: + /* end of repeat group */ + TRACE(("%8d: min until\n", PTR(ptr))); + state->ptr = ptr; + return 2; /* always succeeds, for now... */ + + case SRE_OP_BRANCH: + /* match one of several subpatterns */ + /* format: ... */ + TRACE(("%8d: branch\n", PTR(ptr))); + while (*pattern) { + if (pattern[1] != SRE_OP_LITERAL || + (ptr < end && *ptr == (SRE_CHAR) pattern[2])) { + TRACE(("%8d: branch check\n", PTR(ptr))); + state->ptr = ptr; + i = SRE_MATCH(state, pattern + 1); + if (i > 0) { + TRACE(("%8d: branch succeeded\n", PTR(ptr))); + return 1; + } + } + pattern += *pattern; + } + TRACE(("%8d: branch failed\n", PTR(ptr))); + return 0; /* failure! */ + + case SRE_OP_REPEAT: + /* TEMPLATE: match repeated sequence (no backtracking) */ + /* format: */ + TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2])); + count = 0; + state->ptr = ptr; + while (count < (int) pattern[2]) { + i = SRE_MATCH(state, pattern + 3); + if (i <= 0) + break; + count++; + } + if (count <= (int) pattern[1]) + return 0; + TRACE(("%8d: repeat %d matches\n", PTR(ptr), count)); + pattern += pattern[0]; + ptr = state->ptr; + break; + + default: + return SRE_ERROR_ILLEGAL; + } + } +} + +LOCAL(int) +SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) +{ + SRE_CHAR* ptr = state->start; + SRE_CHAR* end = state->end; + int status = 0; + + /* FIXME: add IGNORE cases (or implement full ASSERT support? */ + + if (pattern[0] == SRE_OP_LITERAL) { + /* pattern starts with a literal */ + SRE_CHAR chr = (SRE_CHAR) pattern[1]; + for (;;) { + while (ptr < end && *ptr != chr) + ptr++; + if (ptr == end) + return 0; + TRACE(("%8d: search found literal\n", PTR(ptr))); + state->start = ptr; + state->ptr = ++ptr; + status = SRE_MATCH(state, pattern + 2); + if (status != 0) + break; + } + + } else if (pattern[0] == SRE_OP_IN) { + /* pattern starts with a set */ + for (;;) { + /* format: */ + while (ptr < end && !SRE_MEMBER(pattern + 2, *ptr)) + ptr++; + if (ptr == end) + return 0; + TRACE(("%8d: search found set\n", PTR(ptr))); + state->start = ptr; + state->ptr = ++ptr; + status = SRE_MATCH(state, pattern + pattern[1] + 1); + if (status != 0) + break; + } + + } else + while (ptr <= end) { + TRACE(("%8d: search\n", PTR(ptr))); + state->start = state->ptr = ptr++; + status = SRE_MATCH(state, pattern); + if (status != 0) + break; + } + + return status; +} + +#ifndef SRE_RECURSIVE + +/* -------------------------------------------------------------------- */ +/* factories and destructors */ + +/* see sre.h for object declarations */ + +staticforward PyTypeObject Pattern_Type; +staticforward PyTypeObject Match_Type; + +static PyObject * +_compile(PyObject* self_, PyObject* args) +{ + /* "compile" pattern descriptor to pattern object */ + + PatternObject* self; + + PyObject* pattern; + PyObject* code; + int groups = 0; + PyObject* groupindex = NULL; + if (!PyArg_ParseTuple(args, "OO!|iO", &pattern, + &PyString_Type, &code, &groups, &groupindex)) + return NULL; + + self = PyObject_NEW(PatternObject, &Pattern_Type); + if (self == NULL) + return NULL; + + Py_INCREF(pattern); + self->pattern = pattern; + + Py_INCREF(code); + self->code = code; + + self->groups = groups; + + Py_XINCREF(groupindex); + self->groupindex = groupindex; + + return (PyObject*) self; +} + +static PyObject * +_getcodesize(PyObject* self_, PyObject* args) +{ + return Py_BuildValue("i", sizeof(SRE_CODE)); +} + +static PyObject* +_pattern_new_match(PatternObject* pattern, SRE_STATE* state, + PyObject* string, int status) +{ + /* create match object (from state object) */ + + MatchObject* match; + int i, j; + + TRACE(("status = %d\n", status)); + + if (status > 0) { + + /* create match object (with room for extra group marks) */ + match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups); + if (match == NULL) + return NULL; + + Py_INCREF(pattern); + match->pattern = pattern; + + Py_INCREF(string); + match->string = string; + + match->groups = pattern->groups+1; + + /* group zero */ + match->mark[0] = ((char*) state->start - + (char*) state->beginning) / state->charsize; + match->mark[1] = ((char*) state->ptr - + (char*) state->beginning) / state->charsize; + + /* fill in the rest of the groups */ + for (i = j = 0; i < pattern->groups; i++, j+=2) + if (state->mark[j] != NULL && state->mark[j+1] != NULL) { + match->mark[j+2] = ((char*) state->mark[j] - + (char*) state->beginning) / state->charsize; + match->mark[j+3] = ((char*) state->mark[j+1] - + (char*) state->beginning) / state->charsize; + } else + match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ + + return (PyObject*) match; + + } else if (status < 0) { + + /* internal error */ + PyErr_SetString( + PyExc_RuntimeError, "internal error in regular expression engine" + ); + return NULL; + + } + + Py_INCREF(Py_None); + return Py_None; +} + +/* -------------------------------------------------------------------- */ +/* pattern methods */ + +LOCAL(PyObject*) +_setup(SRE_STATE* state, PyObject* args) +{ + /* prepare state object */ + + PyBufferProcs *buffer; + int i, count; + void* ptr; + + PyObject* string; + int start = 0; + int end = INT_MAX; + if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end)) + return NULL; + + /* get pointer to string buffer */ + buffer = string->ob_type->tp_as_buffer; + if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount || + buffer->bf_getsegcount(string, NULL) != 1) { + PyErr_SetString(PyExc_TypeError, "expected read-only buffer"); + return NULL; + } + + /* determine buffer size */ + count = buffer->bf_getreadbuffer(string, 0, &ptr); + if (count < 0) { + /* sanity check */ + PyErr_SetString(PyExc_TypeError, "buffer has negative size"); + return NULL; + } + + /* determine character size */ + state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1); + + count /= state->charsize; + + /* adjust boundaries */ + if (start < 0) + start = 0; + else if (start > count) + start = count; + + if (end < 0) + end = 0; + else if (end > count) + end = count; + + state->beginning = ptr; + + state->start = (void*) ((char*) ptr + start * state->charsize); + state->end = (void*) ((char*) ptr + end * state->charsize); + + /* FIXME: dynamic! */ + for (i = 0; i < 64; i++) + state->mark[i] = NULL; + + state->stack = NULL; + state->stackbase = 0; + state->stacksize = 0; + + return string; +} + +static void +_pattern_dealloc(PatternObject* self) +{ + Py_XDECREF(self->code); + Py_XDECREF(self->pattern); + Py_XDECREF(self->groupindex); + PyMem_DEL(self); +} + +static PyObject* +_pattern_match(PatternObject* self, PyObject* args) +{ + SRE_STATE state; + PyObject* string; + int status; + + string = _setup(&state, args); + if (!string) + return NULL; + + state.ptr = state.start; + + if (state.charsize == 1) { + status = sre_match(&state, PatternObject_GetCode(self)); + } else { + status = sre_umatch(&state, PatternObject_GetCode(self)); + } + + _stack_free(&state); + + return _pattern_new_match(self, &state, string, status); +} + +static PyObject* +_pattern_search(PatternObject* self, PyObject* args) +{ + SRE_STATE state; + PyObject* string; + int status; + + string = _setup(&state, args); + if (!string) + return NULL; + + if (state.charsize == 1) { + status = sre_search(&state, PatternObject_GetCode(self)); + } else { + status = sre_usearch(&state, PatternObject_GetCode(self)); + } + + _stack_free(&state); + + return _pattern_new_match(self, &state, string, status); +} + +static PyObject* +_pattern_findall(PatternObject* self, PyObject* args) +{ + /* FIXME: not sure about the semantics here. this is good enough + for SXP, though... */ + + SRE_STATE state; + PyObject* string; + PyObject* list; + int status; + + string = _setup(&state, args); + if (!string) + return NULL; + + list = PyList_New(0); + + while (state.start < state.end) { + + PyObject* item; + + state.ptr = state.start; + + if (state.charsize == 1) { + status = sre_match(&state, PatternObject_GetCode(self)); + } else { + status = sre_umatch(&state, PatternObject_GetCode(self)); + } + + if (status >= 0) { + + if (status == 0) + state.ptr = (void*) ((char*) state.start + 1); + + item = PySequence_GetSlice( + string, + ((char*) state.start - (char*) state.beginning), + ((char*) state.ptr - (char*) state.beginning)); + if (!item) + goto error; + if (PyList_Append(list, item) < 0) + goto error; + + state.start = state.ptr; + + } else { + + /* internal error */ + PyErr_SetString( + PyExc_RuntimeError, + "internal error in regular expression engine" + ); + goto error; + + } + } + + _stack_free(&state); + + return list; + +error: + _stack_free(&state); + return NULL; + +} + +static PyMethodDef _pattern_methods[] = { + {"match", (PyCFunction) _pattern_match, 1}, + {"search", (PyCFunction) _pattern_search, 1}, + {"findall", (PyCFunction) _pattern_findall, 1}, + {NULL, NULL} +}; + +static PyObject* +_pattern_getattr(PatternObject* self, char* name) +{ + PyObject* res; + + res = Py_FindMethod(_pattern_methods, (PyObject*) self, name); + + if (res) + return res; + + PyErr_Clear(); + + /* attributes */ + if (!strcmp(name, "pattern")) { + Py_INCREF(self->pattern); + return self->pattern; + } + + PyErr_SetString(PyExc_AttributeError, name); + return NULL; +} + +statichere PyTypeObject Pattern_Type = { + PyObject_HEAD_INIT(NULL) + 0, "Pattern", sizeof(PatternObject), 0, + (destructor)_pattern_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + (getattrfunc)_pattern_getattr, /*tp_getattr*/ +}; + +/* -------------------------------------------------------------------- */ +/* match methods */ + +static void +_match_dealloc(MatchObject* self) +{ + Py_XDECREF(self->string); + Py_DECREF(self->pattern); + PyMem_DEL(self); +} + +static PyObject* +getslice_by_index(MatchObject* self, int index) +{ + if (index < 0 || index >= self->groups) { + /* raise IndexError if we were given a bad group number */ + PyErr_SetString( + PyExc_IndexError, + "no such group" + ); + return NULL; + } + + if (self->string == Py_None || self->mark[index+index] < 0) { + /* return None if the string or group is undefined */ + Py_INCREF(Py_None); + return Py_None; + } + + return PySequence_GetSlice( + self->string, self->mark[index+index], self->mark[index+index+1] + ); +} + +static PyObject* +getslice(MatchObject* self, PyObject* index) +{ + if (!PyInt_Check(index) && self->pattern->groupindex != NULL) { + /* FIXME: resource leak? */ + index = PyObject_GetItem(self->pattern->groupindex, index); + if (!index) + return NULL; + } + + if (PyInt_Check(index)) + return getslice_by_index(self, (int) PyInt_AS_LONG(index)); + + return getslice_by_index(self, -1); /* signal error */ +} + +static PyObject* +_match_group(MatchObject* self, PyObject* args) +{ + PyObject* result; + int i, size; + + size = PyTuple_GET_SIZE(args); + + switch (size) { + case 0: + result = getslice(self, Py_False); /* force error */ + break; + case 1: + result = getslice(self, PyTuple_GET_ITEM(args, 0)); + break; + default: + /* fetch multiple items */ + result = PyTuple_New(size); + if (!result) + return NULL; + for (i = 0; i < size; i++) { + PyObject* item = getslice(self, PyTuple_GET_ITEM(args, i)); + if (!item) { + Py_DECREF(result); + return NULL; + } + PyTuple_SET_ITEM(result, i, item); + } + break; + } + return result; +} + +static PyObject* +_match_groups(MatchObject* self, PyObject* args) +{ + PyObject* result; + int index; + + result = PyTuple_New(self->groups-1); + if (!result) + return NULL; + + for (index = 1; index < self->groups; index++) { + PyObject* item; + /* FIXME: handle default! */ + item = getslice_by_index(self, index); + if (!item) { + Py_DECREF(result); + return NULL; + } + PyTuple_SET_ITEM(result, index-1, item); + } + + return result; +} + +static PyObject* +_match_groupdict(MatchObject* self, PyObject* args) +{ + PyObject* result; + PyObject* keys; + int index; + + result = PyDict_New(); + if (!result) + return NULL; + if (!self->pattern->groupindex) + return result; + + keys = PyMapping_Keys(self->pattern->groupindex); + if (!keys) + return NULL; + + for (index = 0; index < PySequence_Length(keys); index++) { + PyObject* key; + PyObject* item; + key = PySequence_GetItem(keys, index); + if (!key) { + Py_DECREF(keys); + Py_DECREF(result); + return NULL; + } + item = getslice(self, key); + if (!item) { + Py_DECREF(key); + Py_DECREF(keys); + Py_DECREF(result); + return NULL; + } + /* FIXME: this can fail, right? */ + PyDict_SetItem(result, key, item); + } + + Py_DECREF(keys); + + return result; +} + +static PyObject* +_match_start(MatchObject* self, PyObject* args) +{ + int index = 0; + if (!PyArg_ParseTuple(args, "|i", &index)) + return NULL; + + if (index < 0 || index >= self->groups) { + PyErr_SetString( + PyExc_IndexError, + "no such group" + ); + return NULL; + } + + if (self->mark[index*2] < 0) { + Py_INCREF(Py_None); + return Py_None; + } + + return Py_BuildValue("i", self->mark[index*2]); +} + +static PyObject* +_match_end(MatchObject* self, PyObject* args) +{ + int index = 0; + if (!PyArg_ParseTuple(args, "|i", &index)) + return NULL; + + if (index < 0 || index >= self->groups) { + PyErr_SetString( + PyExc_IndexError, + "no such group" + ); + return NULL; + } + + if (self->mark[index*2] < 0) { + Py_INCREF(Py_None); + return Py_None; + } + + return Py_BuildValue("i", self->mark[index*2+1]); +} + +static PyObject* +_match_span(MatchObject* self, PyObject* args) +{ + int index = 0; + if (!PyArg_ParseTuple(args, "|i", &index)) + return NULL; + + if (index < 0 || index >= self->groups) { + PyErr_SetString( + PyExc_IndexError, + "no such group" + ); + return NULL; + } + + if (self->mark[index*2] < 0) { + Py_INCREF(Py_None); + return Py_None; + } + + return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]); +} + +static PyMethodDef _match_methods[] = { + {"group", (PyCFunction) _match_group, 1}, + {"start", (PyCFunction) _match_start, 1}, + {"end", (PyCFunction) _match_end, 1}, + {"span", (PyCFunction) _match_span, 1}, + {"groups", (PyCFunction) _match_groups, 1}, + {"groupdict", (PyCFunction) _match_groupdict, 1}, + {NULL, NULL} +}; + +static PyObject* +_match_getattr(MatchObject* self, char* name) +{ + PyObject* res; + + res = Py_FindMethod(_match_methods, (PyObject*) self, name); + if (res) + return res; + + PyErr_Clear(); + + /* attributes! */ + if (!strcmp(name, "string")) { + Py_INCREF(self->string); + return self->string; + } + if (!strcmp(name, "regs")) + /* FIXME: should return the whole list! */ + return Py_BuildValue("((i,i))", self->mark[0], self->mark[1]); + if (!strcmp(name, "re")) { + Py_INCREF(self->pattern); + return (PyObject*) self->pattern; + } + if (!strcmp(name, "groupindex") && self->pattern->groupindex) { + Py_INCREF(self->pattern->groupindex); + return self->pattern->groupindex; + } + if (!strcmp(name, "pos")) + return Py_BuildValue("i", 0); /* FIXME */ + if (!strcmp(name, "endpos")) + return Py_BuildValue("i", 0); /* FIXME */ + + PyErr_SetString(PyExc_AttributeError, name); + return NULL; +} + +/* FIXME: implement setattr("string", None) as a special case (to + detach the associated string, if any */ + +statichere PyTypeObject Match_Type = { + PyObject_HEAD_INIT(NULL) + 0, "Match", + sizeof(MatchObject), /* size of basic object */ + sizeof(int), /* space for group item */ + (destructor)_match_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + (getattrfunc)_match_getattr, /*tp_getattr*/ +}; + +static PyMethodDef _functions[] = { + {"compile", _compile, 1}, + {"getcodesize", _getcodesize, 1}, + {NULL, NULL} +}; + +void +#ifdef WIN32 +__declspec(dllexport) +#endif +init_sre() +{ + /* Patch object types */ + Pattern_Type.ob_type = Match_Type.ob_type = &PyType_Type; + + Py_InitModule("_sre", _functions); +} + +#endif diff --git a/Modules/sre.h b/Modules/sre.h new file mode 100644 index 0000000..2936b05 --- /dev/null +++ b/Modules/sre.h @@ -0,0 +1,38 @@ +/* + * Secret Labs' Regular Expression Engine + * $Id$ + * + * simple regular expression matching engine + * + * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +#ifndef SRE_INCLUDED +#define SRE_INCLUDED + +#include "sre_constants.h" + +/* Python objects */ + +typedef struct { + PyObject_HEAD + PyObject* code; /* link to the code string object */ + PyObject* pattern; /* link to the pattern source (or None) */ + int groups; + PyObject* groupindex; +} PatternObject; + +#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code)) + +typedef struct { + PyObject_HEAD + PyObject* string; /* link to the target string */ + PatternObject* pattern; /* link to the regex (pattern) object */ + int groups; /* number of groups (start/end marks) */ + int mark[2]; +} MatchObject; + +#endif + diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h new file mode 100644 index 0000000..6b940d3 --- /dev/null +++ b/Modules/sre_constants.h @@ -0,0 +1,27 @@ +/* generated by sre_constants.py */ +#define SRE_OP_FAILURE 0 +#define SRE_OP_SUCCESS 1 +#define SRE_OP_ANY 2 +#define SRE_OP_ASSERT 3 +#define SRE_OP_AT 4 +#define SRE_OP_BRANCH 5 +#define SRE_OP_CALL 6 +#define SRE_OP_CATEGORY 7 +#define SRE_OP_GROUP 8 +#define SRE_OP_GROUP_IGNORE 9 +#define SRE_OP_IN 10 +#define SRE_OP_IN_IGNORE 11 +#define SRE_OP_JUMP 12 +#define SRE_OP_LITERAL 13 +#define SRE_OP_LITERAL_IGNORE 14 +#define SRE_OP_MARK 15 +#define SRE_OP_MAX_REPEAT 16 +#define SRE_OP_MAX_UNTIL 17 +#define SRE_OP_MAX_REPEAT_ONE 18 +#define SRE_OP_MIN_REPEAT 19 +#define SRE_OP_MIN_UNTIL 20 +#define SRE_OP_NOT_LITERAL 21 +#define SRE_OP_NOT_LITERAL_IGNORE 22 +#define SRE_OP_NEGATE 23 +#define SRE_OP_RANGE 24 +#define SRE_OP_REPEAT 25 -- cgit v0.12