summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>2000-03-31 14:59:30 (GMT)
committerGuido van Rossum <guido@python.org>2000-03-31 14:59:30 (GMT)
commitb700df9824a768893fb83dec41779ac89035313e (patch)
tree2d871f39d80632d370b0b671a46b1d133a71c38a
parent7627c0de6968471996ce05aab200115d56efa1d5 (diff)
downloadcpython-b700df9824a768893fb83dec41779ac89035313e.zip
cpython-b700df9824a768893fb83dec41779ac89035313e.tar.gz
cpython-b700df9824a768893fb83dec41779ac89035313e.tar.bz2
Adding Fredrik Lundh's _sre.c module and its header files.
NOTE: THIS IS VERY ROUGH ALPHA CODE!
-rw-r--r--Modules/_sre.c1453
-rw-r--r--Modules/sre.h38
-rw-r--r--Modules/sre_constants.h27
3 files changed, 1518 insertions, 0 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
new file mode 100644
index 0000000..8af876f
--- /dev/null
+++ b/Modules/_sre.c
@@ -0,0 +1,1453 @@
+/* -*- Mode: C; tab-width: 4 -*-
+ *
+ * Secret Labs' Regular Expression Engine
+ * $Id$
+ *
+ * simple regular expression matching engine
+ *
+ * partial history:
+ * 99-10-24 fl created (bits and pieces from the template matcher)
+ * 99-11-13 fl added categories, branching, and more (0.2)
+ * 99-11-16 fl some tweaks to compile on non-Windows platforms
+ * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
+ * 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
+ * 99-03-06 fl first alpha, sort of (0.5)
+ * 99-03-14 fl removed most compatibility stuff (0.6)
+ *
+ * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
+ *
+ * This code can only be used for 1.6 alpha testing. All other use
+ * require explicit permission from Secret Labs AB.
+ *
+ * Portions of this engine have been developed in cooperation with
+ * CNRI. Hewlett-Packard provided funding for 1.6 integration and
+ * other compatibility work.
+ */
+
+#ifndef SRE_RECURSIVE
+
+char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
+
+#include "Python.h"
+
+#include "sre.h"
+
+#include "unicodeobject.h"
+
+#if defined(HAVE_LIMITS_H)
+#include <limits.h>
+#else
+#define INT_MAX 2147483647
+#endif
+
+#include <ctype.h> /* temporary hack */
+
+/* defining this one enables tracing */
+#undef DEBUG
+
+#ifdef WIN32 /* FIXME: <fl> don't assume Windows == MSVC */
+#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
+/* fastest possible local call under MSVC */
+#define LOCAL(type) static __inline type __fastcall
+#else
+#define LOCAL(type) static type
+#endif
+
+/* error codes */
+#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
+#define SRE_ERROR_MEMORY -9 /* out of memory */
+
+#ifdef DEBUG
+#define TRACE(v) printf v
+#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
+#else
+#define TRACE(v)
+#endif
+
+#define SRE_CODE unsigned short /* unsigned short or larger */
+
+typedef struct {
+ /* string pointers */
+ void* ptr; /* current position (also end of current slice) */
+ void* beginning; /* start of original string */
+ void* start; /* start of current slice */
+ void* end; /* end of original string */
+ /* character size */
+ int charsize;
+ /* registers */
+ int marks;
+ void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
+ /* FIXME */
+ /* backtracking stack */
+ void** stack;
+ int stacksize;
+ int stackbase;
+} SRE_STATE;
+
+#if 1 /* FIXME: <fl> fix this one! */
+#define SRE_TO_LOWER Py_UNICODE_TOLOWER
+#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT
+#define SRE_IS_SPACE Py_UNICODE_ISSPACE
+#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
+#else
+#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
+#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
+#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
+#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
+#endif
+
+#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
+
+LOCAL(int)
+sre_category(SRE_CODE category, unsigned int ch)
+{
+ switch (category) {
+ case 'd':
+ return SRE_IS_DIGIT(ch);
+ case 'D':
+ return !SRE_IS_DIGIT(ch);
+ case 's':
+ return SRE_IS_SPACE(ch);
+ case 'S':
+ return !SRE_IS_SPACE(ch);
+ case 'w':
+ return SRE_IS_WORD(ch);
+ case 'W':
+ return !SRE_IS_WORD(ch);
+ }
+ return 0;
+}
+
+/* helpers */
+
+LOCAL(int)
+_stack_free(SRE_STATE* state)
+{
+ if (state->stack) {
+ TRACE(("release stack\n"));
+ free(state->stack);
+ state->stack = NULL;
+ }
+ state->stacksize = 0;
+ return 0;
+}
+
+static int /* shouldn't be LOCAL */
+_stack_extend(SRE_STATE* state, int lo, int hi)
+{
+ void** stack;
+ int stacksize;
+
+ /* grow the stack to a suitable size; we need at least lo entries,
+ at most hi entries. if for some reason hi is lower than lo, lo
+ wins */
+
+ stacksize = state->stacksize;
+
+ if (stacksize == 0) {
+ /* create new stack */
+ stacksize = 512;
+ if (stacksize < lo)
+ stacksize = lo;
+ else if (stacksize > hi)
+ stacksize = hi;
+ TRACE(("allocate stack %d\n", stacksize));
+ stack = malloc(sizeof(void*) * stacksize);
+ } else {
+ /* grow the stack (typically by a factor of two) */
+ while (stacksize < lo)
+ stacksize = 2 * stacksize;
+ /* FIXME: <fl> could trim size if it's larger than lo, and
+ much larger than hi */
+ TRACE(("grow stack to %d\n", stacksize));
+ stack = realloc(state->stack, sizeof(void*) * stacksize);
+ }
+
+ if (!stack) {
+ _stack_free(state);
+ return SRE_ERROR_MEMORY;
+ }
+
+ state->stack = stack;
+ state->stacksize = stacksize;
+
+ return 0;
+}
+
+/* set things up for the 8-bit version */
+
+#define SRE_CHAR unsigned char
+#define SRE_AT sre_at
+#define SRE_MEMBER sre_member
+#define SRE_MATCH sre_match
+#define SRE_SEARCH sre_search
+#define SRE_RECURSIVE
+
+#include "_sre.c"
+
+#undef SRE_RECURSIVE
+#undef SRE_SEARCH
+#undef SRE_MATCH
+#undef SRE_MEMBER
+#undef SRE_AT
+#undef SRE_CHAR
+
+/* set things up for the 16-bit unicode version */
+
+#define SRE_CHAR Py_UNICODE
+#define SRE_AT sre_uat
+#define SRE_MEMBER sre_umember
+#define SRE_MATCH sre_umatch
+#define SRE_SEARCH sre_usearch
+
+#endif /* SRE_RECURSIVE */
+
+/* -------------------------------------------------------------------- */
+/* String matching engine */
+
+/* the following section is compiled twice, with different character
+ settings */
+
+LOCAL(int)
+SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
+{
+ /* check if pointer is at given position. return 1 if so, 0
+ otherwise */
+
+ int this, that;
+
+ switch (at) {
+ case 'a':
+ /* beginning */
+ return (ptr == state->beginning);
+ case 'z':
+ /* end */
+ return (ptr == state->end);
+ case 'b':
+ /* word boundary */
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_IS_WORD((int) ptr[0]) : 0;
+ return this != that;
+ case 'B':
+ /* word non-boundary */
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_IS_WORD((int) ptr[0]) : 0;
+ return this == that;
+ }
+
+ return 0;
+}
+
+LOCAL(int)
+SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
+{
+ /* check if character is a member of the given set. return 1 if
+ so, 0 otherwise */
+
+ int ok = 1;
+
+ for (;;) {
+ switch (*set++) {
+
+ case SRE_OP_NEGATE:
+ ok = !ok;
+ break;
+
+ case SRE_OP_FAILURE:
+ return !ok;
+
+ case SRE_OP_LITERAL:
+ if (ch == (SRE_CHAR) set[0])
+ return ok;
+ set++;
+ break;
+
+ case SRE_OP_RANGE:
+ if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1])
+ return ok;
+ set += 2;
+ break;
+
+ case SRE_OP_CATEGORY:
+ if (sre_category(set[0], (int) ch))
+ return ok;
+ set += 1;
+ break;
+
+ default:
+ /* FIXME: internal error */
+ return 0;
+ }
+ }
+}
+
+LOCAL(int)
+SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
+{
+ /* check if string matches the given pattern. returns -1 for
+ error, 0 for failure, and 1 for success */
+
+ SRE_CHAR* end = state->end;
+ SRE_CHAR* ptr = state->ptr;
+ int stacksize;
+ int stackbase;
+ int i, count;
+
+ for (;;) {
+
+ TRACE(("[%p]\n", pattern));
+
+ switch (*pattern++) {
+
+ case SRE_OP_FAILURE:
+ /* immediate failure */
+ TRACE(("%8d: failure\n", PTR(ptr)));
+ return 0;
+
+ case SRE_OP_SUCCESS:
+ /* end of pattern */
+ TRACE(("%8d: success\n", PTR(ptr)));
+ state->ptr = ptr;
+ return 1;
+
+ case SRE_OP_AT:
+ /* match at given position */
+ TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern));
+ if (!SRE_AT(state, ptr, *pattern))
+ return 0;
+ pattern++;
+ break;
+
+ case SRE_OP_LITERAL:
+ /* match literal character */
+ /* args: <code> */
+ TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern));
+ if (ptr >= end || *ptr != (SRE_CHAR) *pattern)
+ return 0;
+ pattern++;
+ ptr++;
+ break;
+
+ case SRE_OP_NOT_LITERAL:
+ /* match anything that is not literal character */
+ /* args: <code> */
+ TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern));
+ if (ptr >= end || *ptr == (SRE_CHAR) *pattern)
+ return 0;
+ pattern++;
+ ptr++;
+ break;
+
+ case SRE_OP_ANY:
+ /* match anything */
+ TRACE(("%8d: any\n", PTR(ptr)));
+ if (ptr >= end)
+ return 0;
+ ptr++;
+ break;
+
+ case SRE_OP_IN:
+ /* match set member (or non_member) */
+ /* args: <skip> <set> */
+ TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
+ if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
+ return 0;
+ pattern += pattern[0];
+ ptr++;
+ break;
+
+ case SRE_OP_GROUP:
+ /* match backreference */
+ i = pattern[0];
+ {
+ /* FIXME: optimize size! */
+ SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
+ SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
+ if (!p || !e || e < p)
+ return 0;
+ while (p < e) {
+ if (ptr >= end || *ptr != *p)
+ return 0;
+ p++; ptr++;
+ }
+ }
+ pattern++;
+ break;
+
+ case SRE_OP_LITERAL_IGNORE:
+ TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
+ if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern)
+ return 0;
+ pattern++;
+ ptr++;
+ break;
+
+ case SRE_OP_NOT_LITERAL_IGNORE:
+ TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
+ (SRE_CHAR) *pattern));
+ if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern)
+ return 0;
+ pattern++;
+ ptr++;
+ break;
+
+ case SRE_OP_IN_IGNORE:
+ TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
+ if (ptr >= end
+ || !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr)))
+ return 0;
+ pattern += pattern[0];
+ ptr++;
+ break;
+
+ case SRE_OP_MARK:
+ /* set mark */
+ /* args: <mark> */
+ TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0]));
+ state->mark[pattern[0]] = ptr;
+ pattern++;
+ break;
+
+ case SRE_OP_JUMP:
+ /* jump forward */
+ /* args: <skip> */
+ TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
+ pattern += pattern[0];
+ break;
+
+ case SRE_OP_CALL:
+ /* match subpattern, without backtracking */
+ /* args: <skip> <pattern> */
+ TRACE(("%8d: match subpattern\n", PTR(ptr)));
+ state->ptr = ptr;
+ if (!SRE_MATCH(state, pattern + 1))
+ return 0;
+ pattern += pattern[0];
+ ptr = state->ptr;
+ break;
+
+ case SRE_OP_MAX_REPEAT_ONE:
+
+ /* match repeated sequence (maximizing regexp). this
+ variant only works if the repeated item is exactly one
+ character wide, and we're not already collecting
+ backtracking points. for other cases, use the
+ MAX_REPEAT operator instead */
+
+ /* args: <skip> <min> <max> <step> */
+
+ TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
+ pattern[1], pattern[2]));
+
+ count = 0;
+
+ if (pattern[3] == SRE_OP_ANY) {
+ /* repeated wildcard. skip to the end of the target
+ string, and backtrack from there */
+ /* FIXME: must look for line endings */
+ if (ptr + pattern[1] > end)
+ return 0; /* cannot match */
+ count = pattern[2];
+ if (count > end - ptr)
+ count = end - ptr;
+ ptr += count;
+
+ } else if (pattern[3] == SRE_OP_LITERAL) {
+ /* repeated literal */
+ SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || *ptr != chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
+ /* repeated literal */
+ SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) != chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
+ /* repeated non-literal */
+ SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || *ptr == chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
+ /* repeated non-literal */
+ SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) == chr)
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else if (pattern[3] == SRE_OP_IN) {
+ /* repeated set */
+ while (count < (int) pattern[2]) {
+ if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
+ break;
+ ptr++;
+ count++;
+ }
+
+ } else {
+ /* repeated single character pattern */
+ state->ptr = ptr;
+ while (count < (int) pattern[2]) {
+ i = SRE_MATCH(state, pattern + 3);
+ if (i < 0)
+ return i;
+ if (i == 0)
+ break;
+ count++;
+ }
+ state->ptr = ptr;
+ ptr += count;
+ }
+
+ /* when we arrive here, count contains the number of
+ matches, and ptr points to the tail of the target
+ string. check if the rest of the pattern matches, and
+ backtrack if not. */
+
+ /* FIXME: <fl> this is a mess. fix it! */
+
+ TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
+
+ if (count < (int) pattern[1])
+ return 0;
+
+ if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
+ /* tail is empty. we're finished */
+ TRACE(("%8d: tail is empty\n", PTR(ptr)));
+ state->ptr = ptr;
+ return 1;
+
+ } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
+ /* tail starts with a literal. we can speed things up
+ by skipping positions where the rest of the pattern
+ cannot possibly match */
+ SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
+ TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
+ for (;;) {
+ TRACE(("%8d: scan for tail match\n", PTR(ptr)));
+ while (count >= (int) pattern[1] &&
+ (ptr >= end || *ptr != chr)) {
+ ptr--;
+ count--;
+ }
+ TRACE(("%8d: check tail\n", PTR(ptr)));
+ if (count < (int) pattern[1])
+ break;
+ state->ptr = ptr;
+ i = SRE_MATCH(state, pattern + pattern[0]);
+ if (i > 0) {
+ TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+ return 1;
+ }
+ TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+ ptr--;
+ count--;
+ }
+
+ } else {
+ TRACE(("%8d: tail is pattern\n", PTR(ptr)));
+ while (count >= (int) pattern[1]) {
+ state->ptr = ptr;
+ i = SRE_MATCH(state, pattern + pattern[0]);
+ if (i > 0) {
+ TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+ return 1;
+ }
+ TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+ ptr--;
+ count--;
+ }
+ }
+ return 0; /* failure! */
+
+/* ----------------------------------------------------------------------- */
+/* FIXME: the following section is just plain broken */
+
+ case SRE_OP_MAX_REPEAT:
+ /* match repeated sequence (maximizing regexp). repeated
+ group should end with a MAX_UNTIL code */
+
+ TRACE(("%8d: max repeat %d %d\n", PTR(ptr),
+ pattern[1], pattern[2]));
+
+ count = 0;
+ state->ptr = ptr;
+
+ /* FIXME: <fl> umm. what about matching the minimum
+ number of items before starting to collect backtracking
+ positions? */
+
+ stackbase = state->stackbase;
+
+ while (count < (int) pattern[2]) {
+ /* store current position on the stack */
+ TRACE(("%8d: push mark at index %d\n", PTR(ptr), count));
+ if (stackbase + count >= state->stacksize) {
+ i = _stack_extend(state, stackbase + count + 1,
+ stackbase + pattern[2]);
+ if (i < 0)
+ return i;
+ }
+ state->stack[stackbase + count] = ptr;
+ /* check if we can match another item */
+ state->stackbase += count + 1;
+ i = SRE_MATCH(state, pattern + 3);
+ state->stackbase = stackbase; /* rewind */
+ if (i != 2)
+ break;
+ if (state->ptr == ptr) {
+ /* if the match was successful but empty, set the
+ count to max and terminate the scanning loop */
+ stacksize = count; /* actual size of stack */
+ count = (int) pattern[2];
+ goto check_tail; /* FIXME: <fl> eliminate goto */
+ }
+ count++;
+ ptr = state->ptr;
+
+ }
+
+ stacksize = count; /* actual number of entries on the stack */
+
+ check_tail:
+
+ /* when we get here, count is the number of matches,
+ stacksize is the number of match points on the stack
+ (usually same as count, but it might be smaller) and
+ ptr points to the tail. */
+
+ if (count < (int) pattern[1])
+ return 0;
+
+ /* make sure that rest of the expression matches. if it
+ doesn't, backtrack */
+
+ TRACE(("%8d: repeat %d found (stack size = %d)\n", PTR(ptr),
+ count, stacksize + 1));
+
+ TRACE(("%8d: tail is pattern\n", PTR(ptr)));
+
+ /* hope for the best */
+ state->ptr = ptr;
+ state->stackbase += stacksize + 1;
+ i = SRE_MATCH(state, pattern + pattern[0]);
+ state->stackbase = stackbase;
+ if (i > 0) {
+ TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+ return 1;
+ }
+
+ /* backtrack! */
+ while (count >= (int) pattern[1]) {
+ ptr = state->stack[stackbase + (count < stacksize ? count : stacksize)];
+ state->ptr = ptr;
+ count--;
+ TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+ state->stackbase += stacksize + 1;
+ i = SRE_MATCH(state, pattern + pattern[0]);
+ state->stackbase = stackbase;
+ if (i > 0) {
+ TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+ return 1;
+ }
+ }
+ return 0; /* failure! */
+
+ case SRE_OP_MAX_UNTIL:
+ /* match repeated sequence (maximizing regexp). repeated
+ group should end with a MAX_UNTIL code */
+
+ TRACE(("%8d: max until\n", PTR(ptr)));
+ state->ptr = ptr;
+ return 2; /* always succeeds, for now... */
+
+/* end of totally broken section */
+/* ----------------------------------------------------------------------- */
+
+ case SRE_OP_MIN_REPEAT:
+ /* match repeated sequence (minimizing regexp) */
+ TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
+ pattern[1], pattern[2]));
+ count = 0;
+ state->ptr = ptr;
+ /* match minimum number of items */
+ while (count < (int) pattern[1]) {
+ i = SRE_MATCH(state, pattern + 3);
+ if (i <= 0)
+ return i;
+ count++;
+ }
+ /* move forward until the tail matches. */
+ while (count <= (int) pattern[2]) {
+ ptr = state->ptr;
+ i = SRE_MATCH(state, pattern + pattern[0]);
+ if (i > 0) {
+ TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+ return 1;
+ }
+ TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+ state->ptr = ptr; /* backtrack */
+ i = SRE_MATCH(state, pattern + 3);
+ if (i <= 0)
+ return i;
+ count++;
+ }
+ return 0; /* failure! */
+
+ case SRE_OP_MIN_UNTIL:
+ /* end of repeat group */
+ TRACE(("%8d: min until\n", PTR(ptr)));
+ state->ptr = ptr;
+ return 2; /* always succeeds, for now... */
+
+ case SRE_OP_BRANCH:
+ /* match one of several subpatterns */
+ /* format: <branch> <size> <head> ... <null> <tail> */
+ TRACE(("%8d: branch\n", PTR(ptr)));
+ while (*pattern) {
+ if (pattern[1] != SRE_OP_LITERAL ||
+ (ptr < end && *ptr == (SRE_CHAR) pattern[2])) {
+ TRACE(("%8d: branch check\n", PTR(ptr)));
+ state->ptr = ptr;
+ i = SRE_MATCH(state, pattern + 1);
+ if (i > 0) {
+ TRACE(("%8d: branch succeeded\n", PTR(ptr)));
+ return 1;
+ }
+ }
+ pattern += *pattern;
+ }
+ TRACE(("%8d: branch failed\n", PTR(ptr)));
+ return 0; /* failure! */
+
+ case SRE_OP_REPEAT:
+ /* TEMPLATE: match repeated sequence (no backtracking) */
+ /* format: <repeat> <skip> <min> <max> */
+ TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
+ count = 0;
+ state->ptr = ptr;
+ while (count < (int) pattern[2]) {
+ i = SRE_MATCH(state, pattern + 3);
+ if (i <= 0)
+ break;
+ count++;
+ }
+ if (count <= (int) pattern[1])
+ return 0;
+ TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
+ pattern += pattern[0];
+ ptr = state->ptr;
+ break;
+
+ default:
+ return SRE_ERROR_ILLEGAL;
+ }
+ }
+}
+
+LOCAL(int)
+SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
+{
+ SRE_CHAR* ptr = state->start;
+ SRE_CHAR* end = state->end;
+ int status = 0;
+
+ /* FIXME: <fl> add IGNORE cases (or implement full ASSERT support? */
+
+ if (pattern[0] == SRE_OP_LITERAL) {
+ /* pattern starts with a literal */
+ SRE_CHAR chr = (SRE_CHAR) pattern[1];
+ for (;;) {
+ while (ptr < end && *ptr != chr)
+ ptr++;
+ if (ptr == end)
+ return 0;
+ TRACE(("%8d: search found literal\n", PTR(ptr)));
+ state->start = ptr;
+ state->ptr = ++ptr;
+ status = SRE_MATCH(state, pattern + 2);
+ if (status != 0)
+ break;
+ }
+
+ } else if (pattern[0] == SRE_OP_IN) {
+ /* pattern starts with a set */
+ for (;;) {
+ /* format: <in> <skip> <data> */
+ while (ptr < end && !SRE_MEMBER(pattern + 2, *ptr))
+ ptr++;
+ if (ptr == end)
+ return 0;
+ TRACE(("%8d: search found set\n", PTR(ptr)));
+ state->start = ptr;
+ state->ptr = ++ptr;
+ status = SRE_MATCH(state, pattern + pattern[1] + 1);
+ if (status != 0)
+ break;
+ }
+
+ } else
+ while (ptr <= end) {
+ TRACE(("%8d: search\n", PTR(ptr)));
+ state->start = state->ptr = ptr++;
+ status = SRE_MATCH(state, pattern);
+ if (status != 0)
+ break;
+ }
+
+ return status;
+}
+
+#ifndef SRE_RECURSIVE
+
+/* -------------------------------------------------------------------- */
+/* factories and destructors */
+
+/* see sre.h for object declarations */
+
+staticforward PyTypeObject Pattern_Type;
+staticforward PyTypeObject Match_Type;
+
+static PyObject *
+_compile(PyObject* self_, PyObject* args)
+{
+ /* "compile" pattern descriptor to pattern object */
+
+ PatternObject* self;
+
+ PyObject* pattern;
+ PyObject* code;
+ int groups = 0;
+ PyObject* groupindex = NULL;
+ if (!PyArg_ParseTuple(args, "OO!|iO", &pattern,
+ &PyString_Type, &code, &groups, &groupindex))
+ return NULL;
+
+ self = PyObject_NEW(PatternObject, &Pattern_Type);
+ if (self == NULL)
+ return NULL;
+
+ Py_INCREF(pattern);
+ self->pattern = pattern;
+
+ Py_INCREF(code);
+ self->code = code;
+
+ self->groups = groups;
+
+ Py_XINCREF(groupindex);
+ self->groupindex = groupindex;
+
+ return (PyObject*) self;
+}
+
+static PyObject *
+_getcodesize(PyObject* self_, PyObject* args)
+{
+ return Py_BuildValue("i", sizeof(SRE_CODE));
+}
+
+static PyObject*
+_pattern_new_match(PatternObject* pattern, SRE_STATE* state,
+ PyObject* string, int status)
+{
+ /* create match object (from state object) */
+
+ MatchObject* match;
+ int i, j;
+
+ TRACE(("status = %d\n", status));
+
+ if (status > 0) {
+
+ /* create match object (with room for extra group marks) */
+ match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
+ if (match == NULL)
+ return NULL;
+
+ Py_INCREF(pattern);
+ match->pattern = pattern;
+
+ Py_INCREF(string);
+ match->string = string;
+
+ match->groups = pattern->groups+1;
+
+ /* group zero */
+ match->mark[0] = ((char*) state->start -
+ (char*) state->beginning) / state->charsize;
+ match->mark[1] = ((char*) state->ptr -
+ (char*) state->beginning) / state->charsize;
+
+ /* fill in the rest of the groups */
+ for (i = j = 0; i < pattern->groups; i++, j+=2)
+ if (state->mark[j] != NULL && state->mark[j+1] != NULL) {
+ match->mark[j+2] = ((char*) state->mark[j] -
+ (char*) state->beginning) / state->charsize;
+ match->mark[j+3] = ((char*) state->mark[j+1] -
+ (char*) state->beginning) / state->charsize;
+ } else
+ match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
+
+ return (PyObject*) match;
+
+ } else if (status < 0) {
+
+ /* internal error */
+ PyErr_SetString(
+ PyExc_RuntimeError, "internal error in regular expression engine"
+ );
+ return NULL;
+
+ }
+
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+/* -------------------------------------------------------------------- */
+/* pattern methods */
+
+LOCAL(PyObject*)
+_setup(SRE_STATE* state, PyObject* args)
+{
+ /* prepare state object */
+
+ PyBufferProcs *buffer;
+ int i, count;
+ void* ptr;
+
+ PyObject* string;
+ int start = 0;
+ int end = INT_MAX;
+ if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
+ return NULL;
+
+ /* get pointer to string buffer */
+ buffer = string->ob_type->tp_as_buffer;
+ if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
+ buffer->bf_getsegcount(string, NULL) != 1) {
+ PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
+ return NULL;
+ }
+
+ /* determine buffer size */
+ count = buffer->bf_getreadbuffer(string, 0, &ptr);
+ if (count < 0) {
+ /* sanity check */
+ PyErr_SetString(PyExc_TypeError, "buffer has negative size");
+ return NULL;
+ }
+
+ /* determine character size */
+ state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
+
+ count /= state->charsize;
+
+ /* adjust boundaries */
+ if (start < 0)
+ start = 0;
+ else if (start > count)
+ start = count;
+
+ if (end < 0)
+ end = 0;
+ else if (end > count)
+ end = count;
+
+ state->beginning = ptr;
+
+ state->start = (void*) ((char*) ptr + start * state->charsize);
+ state->end = (void*) ((char*) ptr + end * state->charsize);
+
+ /* FIXME: dynamic! */
+ for (i = 0; i < 64; i++)
+ state->mark[i] = NULL;
+
+ state->stack = NULL;
+ state->stackbase = 0;
+ state->stacksize = 0;
+
+ return string;
+}
+
+static void
+_pattern_dealloc(PatternObject* self)
+{
+ Py_XDECREF(self->code);
+ Py_XDECREF(self->pattern);
+ Py_XDECREF(self->groupindex);
+ PyMem_DEL(self);
+}
+
+static PyObject*
+_pattern_match(PatternObject* self, PyObject* args)
+{
+ SRE_STATE state;
+ PyObject* string;
+ int status;
+
+ string = _setup(&state, args);
+ if (!string)
+ return NULL;
+
+ state.ptr = state.start;
+
+ if (state.charsize == 1) {
+ status = sre_match(&state, PatternObject_GetCode(self));
+ } else {
+ status = sre_umatch(&state, PatternObject_GetCode(self));
+ }
+
+ _stack_free(&state);
+
+ return _pattern_new_match(self, &state, string, status);
+}
+
+static PyObject*
+_pattern_search(PatternObject* self, PyObject* args)
+{
+ SRE_STATE state;
+ PyObject* string;
+ int status;
+
+ string = _setup(&state, args);
+ if (!string)
+ return NULL;
+
+ if (state.charsize == 1) {
+ status = sre_search(&state, PatternObject_GetCode(self));
+ } else {
+ status = sre_usearch(&state, PatternObject_GetCode(self));
+ }
+
+ _stack_free(&state);
+
+ return _pattern_new_match(self, &state, string, status);
+}
+
+static PyObject*
+_pattern_findall(PatternObject* self, PyObject* args)
+{
+ /* FIXME: not sure about the semantics here. this is good enough
+ for SXP, though... */
+
+ SRE_STATE state;
+ PyObject* string;
+ PyObject* list;
+ int status;
+
+ string = _setup(&state, args);
+ if (!string)
+ return NULL;
+
+ list = PyList_New(0);
+
+ while (state.start < state.end) {
+
+ PyObject* item;
+
+ state.ptr = state.start;
+
+ if (state.charsize == 1) {
+ status = sre_match(&state, PatternObject_GetCode(self));
+ } else {
+ status = sre_umatch(&state, PatternObject_GetCode(self));
+ }
+
+ if (status >= 0) {
+
+ if (status == 0)
+ state.ptr = (void*) ((char*) state.start + 1);
+
+ item = PySequence_GetSlice(
+ string,
+ ((char*) state.start - (char*) state.beginning),
+ ((char*) state.ptr - (char*) state.beginning));
+ if (!item)
+ goto error;
+ if (PyList_Append(list, item) < 0)
+ goto error;
+
+ state.start = state.ptr;
+
+ } else {
+
+ /* internal error */
+ PyErr_SetString(
+ PyExc_RuntimeError,
+ "internal error in regular expression engine"
+ );
+ goto error;
+
+ }
+ }
+
+ _stack_free(&state);
+
+ return list;
+
+error:
+ _stack_free(&state);
+ return NULL;
+
+}
+
+static PyMethodDef _pattern_methods[] = {
+ {"match", (PyCFunction) _pattern_match, 1},
+ {"search", (PyCFunction) _pattern_search, 1},
+ {"findall", (PyCFunction) _pattern_findall, 1},
+ {NULL, NULL}
+};
+
+static PyObject*
+_pattern_getattr(PatternObject* self, char* name)
+{
+ PyObject* res;
+
+ res = Py_FindMethod(_pattern_methods, (PyObject*) self, name);
+
+ if (res)
+ return res;
+
+ PyErr_Clear();
+
+ /* attributes */
+ if (!strcmp(name, "pattern")) {
+ Py_INCREF(self->pattern);
+ return self->pattern;
+ }
+
+ PyErr_SetString(PyExc_AttributeError, name);
+ return NULL;
+}
+
+statichere PyTypeObject Pattern_Type = {
+ PyObject_HEAD_INIT(NULL)
+ 0, "Pattern", sizeof(PatternObject), 0,
+ (destructor)_pattern_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ (getattrfunc)_pattern_getattr, /*tp_getattr*/
+};
+
+/* -------------------------------------------------------------------- */
+/* match methods */
+
+static void
+_match_dealloc(MatchObject* self)
+{
+ Py_XDECREF(self->string);
+ Py_DECREF(self->pattern);
+ PyMem_DEL(self);
+}
+
+static PyObject*
+getslice_by_index(MatchObject* self, int index)
+{
+ if (index < 0 || index >= self->groups) {
+ /* raise IndexError if we were given a bad group number */
+ PyErr_SetString(
+ PyExc_IndexError,
+ "no such group"
+ );
+ return NULL;
+ }
+
+ if (self->string == Py_None || self->mark[index+index] < 0) {
+ /* return None if the string or group is undefined */
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+
+ return PySequence_GetSlice(
+ self->string, self->mark[index+index], self->mark[index+index+1]
+ );
+}
+
+static PyObject*
+getslice(MatchObject* self, PyObject* index)
+{
+ if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
+ /* FIXME: resource leak? */
+ index = PyObject_GetItem(self->pattern->groupindex, index);
+ if (!index)
+ return NULL;
+ }
+
+ if (PyInt_Check(index))
+ return getslice_by_index(self, (int) PyInt_AS_LONG(index));
+
+ return getslice_by_index(self, -1); /* signal error */
+}
+
+static PyObject*
+_match_group(MatchObject* self, PyObject* args)
+{
+ PyObject* result;
+ int i, size;
+
+ size = PyTuple_GET_SIZE(args);
+
+ switch (size) {
+ case 0:
+ result = getslice(self, Py_False); /* force error */
+ break;
+ case 1:
+ result = getslice(self, PyTuple_GET_ITEM(args, 0));
+ break;
+ default:
+ /* fetch multiple items */
+ result = PyTuple_New(size);
+ if (!result)
+ return NULL;
+ for (i = 0; i < size; i++) {
+ PyObject* item = getslice(self, PyTuple_GET_ITEM(args, i));
+ if (!item) {
+ Py_DECREF(result);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(result, i, item);
+ }
+ break;
+ }
+ return result;
+}
+
+static PyObject*
+_match_groups(MatchObject* self, PyObject* args)
+{
+ PyObject* result;
+ int index;
+
+ result = PyTuple_New(self->groups-1);
+ if (!result)
+ return NULL;
+
+ for (index = 1; index < self->groups; index++) {
+ PyObject* item;
+ /* FIXME: <fl> handle default! */
+ item = getslice_by_index(self, index);
+ if (!item) {
+ Py_DECREF(result);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(result, index-1, item);
+ }
+
+ return result;
+}
+
+static PyObject*
+_match_groupdict(MatchObject* self, PyObject* args)
+{
+ PyObject* result;
+ PyObject* keys;
+ int index;
+
+ result = PyDict_New();
+ if (!result)
+ return NULL;
+ if (!self->pattern->groupindex)
+ return result;
+
+ keys = PyMapping_Keys(self->pattern->groupindex);
+ if (!keys)
+ return NULL;
+
+ for (index = 0; index < PySequence_Length(keys); index++) {
+ PyObject* key;
+ PyObject* item;
+ key = PySequence_GetItem(keys, index);
+ if (!key) {
+ Py_DECREF(keys);
+ Py_DECREF(result);
+ return NULL;
+ }
+ item = getslice(self, key);
+ if (!item) {
+ Py_DECREF(key);
+ Py_DECREF(keys);
+ Py_DECREF(result);
+ return NULL;
+ }
+ /* FIXME: <fl> this can fail, right? */
+ PyDict_SetItem(result, key, item);
+ }
+
+ Py_DECREF(keys);
+
+ return result;
+}
+
+static PyObject*
+_match_start(MatchObject* self, PyObject* args)
+{
+ int index = 0;
+ if (!PyArg_ParseTuple(args, "|i", &index))
+ return NULL;
+
+ if (index < 0 || index >= self->groups) {
+ PyErr_SetString(
+ PyExc_IndexError,
+ "no such group"
+ );
+ return NULL;
+ }
+
+ if (self->mark[index*2] < 0) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+
+ return Py_BuildValue("i", self->mark[index*2]);
+}
+
+static PyObject*
+_match_end(MatchObject* self, PyObject* args)
+{
+ int index = 0;
+ if (!PyArg_ParseTuple(args, "|i", &index))
+ return NULL;
+
+ if (index < 0 || index >= self->groups) {
+ PyErr_SetString(
+ PyExc_IndexError,
+ "no such group"
+ );
+ return NULL;
+ }
+
+ if (self->mark[index*2] < 0) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+
+ return Py_BuildValue("i", self->mark[index*2+1]);
+}
+
+static PyObject*
+_match_span(MatchObject* self, PyObject* args)
+{
+ int index = 0;
+ if (!PyArg_ParseTuple(args, "|i", &index))
+ return NULL;
+
+ if (index < 0 || index >= self->groups) {
+ PyErr_SetString(
+ PyExc_IndexError,
+ "no such group"
+ );
+ return NULL;
+ }
+
+ if (self->mark[index*2] < 0) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+
+ return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
+}
+
+static PyMethodDef _match_methods[] = {
+ {"group", (PyCFunction) _match_group, 1},
+ {"start", (PyCFunction) _match_start, 1},
+ {"end", (PyCFunction) _match_end, 1},
+ {"span", (PyCFunction) _match_span, 1},
+ {"groups", (PyCFunction) _match_groups, 1},
+ {"groupdict", (PyCFunction) _match_groupdict, 1},
+ {NULL, NULL}
+};
+
+static PyObject*
+_match_getattr(MatchObject* self, char* name)
+{
+ PyObject* res;
+
+ res = Py_FindMethod(_match_methods, (PyObject*) self, name);
+ if (res)
+ return res;
+
+ PyErr_Clear();
+
+ /* attributes! */
+ if (!strcmp(name, "string")) {
+ Py_INCREF(self->string);
+ return self->string;
+ }
+ if (!strcmp(name, "regs"))
+ /* FIXME: should return the whole list! */
+ return Py_BuildValue("((i,i))", self->mark[0], self->mark[1]);
+ if (!strcmp(name, "re")) {
+ Py_INCREF(self->pattern);
+ return (PyObject*) self->pattern;
+ }
+ if (!strcmp(name, "groupindex") && self->pattern->groupindex) {
+ Py_INCREF(self->pattern->groupindex);
+ return self->pattern->groupindex;
+ }
+ if (!strcmp(name, "pos"))
+ return Py_BuildValue("i", 0); /* FIXME */
+ if (!strcmp(name, "endpos"))
+ return Py_BuildValue("i", 0); /* FIXME */
+
+ PyErr_SetString(PyExc_AttributeError, name);
+ return NULL;
+}
+
+/* FIXME: implement setattr("string", None) as a special case (to
+ detach the associated string, if any */
+
+statichere PyTypeObject Match_Type = {
+ PyObject_HEAD_INIT(NULL)
+ 0, "Match",
+ sizeof(MatchObject), /* size of basic object */
+ sizeof(int), /* space for group item */
+ (destructor)_match_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ (getattrfunc)_match_getattr, /*tp_getattr*/
+};
+
+static PyMethodDef _functions[] = {
+ {"compile", _compile, 1},
+ {"getcodesize", _getcodesize, 1},
+ {NULL, NULL}
+};
+
+void
+#ifdef WIN32
+__declspec(dllexport)
+#endif
+init_sre()
+{
+ /* Patch object types */
+ Pattern_Type.ob_type = Match_Type.ob_type = &PyType_Type;
+
+ Py_InitModule("_sre", _functions);
+}
+
+#endif
diff --git a/Modules/sre.h b/Modules/sre.h
new file mode 100644
index 0000000..2936b05
--- /dev/null
+++ b/Modules/sre.h
@@ -0,0 +1,38 @@
+/*
+ * Secret Labs' Regular Expression Engine
+ * $Id$
+ *
+ * simple regular expression matching engine
+ *
+ * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
+ *
+ * See the _sre.c file for information on usage and redistribution.
+ */
+
+#ifndef SRE_INCLUDED
+#define SRE_INCLUDED
+
+#include "sre_constants.h"
+
+/* Python objects */
+
+typedef struct {
+ PyObject_HEAD
+ PyObject* code; /* link to the code string object */
+ PyObject* pattern; /* link to the pattern source (or None) */
+ int groups;
+ PyObject* groupindex;
+} PatternObject;
+
+#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code))
+
+typedef struct {
+ PyObject_HEAD
+ PyObject* string; /* link to the target string */
+ PatternObject* pattern; /* link to the regex (pattern) object */
+ int groups; /* number of groups (start/end marks) */
+ int mark[2];
+} MatchObject;
+
+#endif
+
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
new file mode 100644
index 0000000..6b940d3
--- /dev/null
+++ b/Modules/sre_constants.h
@@ -0,0 +1,27 @@
+/* generated by sre_constants.py */
+#define SRE_OP_FAILURE 0
+#define SRE_OP_SUCCESS 1
+#define SRE_OP_ANY 2
+#define SRE_OP_ASSERT 3
+#define SRE_OP_AT 4
+#define SRE_OP_BRANCH 5
+#define SRE_OP_CALL 6
+#define SRE_OP_CATEGORY 7
+#define SRE_OP_GROUP 8
+#define SRE_OP_GROUP_IGNORE 9
+#define SRE_OP_IN 10
+#define SRE_OP_IN_IGNORE 11
+#define SRE_OP_JUMP 12
+#define SRE_OP_LITERAL 13
+#define SRE_OP_LITERAL_IGNORE 14
+#define SRE_OP_MARK 15
+#define SRE_OP_MAX_REPEAT 16
+#define SRE_OP_MAX_UNTIL 17
+#define SRE_OP_MAX_REPEAT_ONE 18
+#define SRE_OP_MIN_REPEAT 19
+#define SRE_OP_MIN_UNTIL 20
+#define SRE_OP_NOT_LITERAL 21
+#define SRE_OP_NOT_LITERAL_IGNORE 22
+#define SRE_OP_NEGATE 23
+#define SRE_OP_RANGE 24
+#define SRE_OP_REPEAT 25