summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Modules/regexpr.c384
-rw-r--r--Modules/regexpr.h110
-rw-r--r--Modules/reopmodule.c375
3 files changed, 700 insertions, 169 deletions
diff --git a/Modules/regexpr.c b/Modules/regexpr.c
index 87f747f..0b6ec8b 100644
--- a/Modules/regexpr.c
+++ b/Modules/regexpr.c
@@ -1,3 +1,7 @@
+/*
+ * -*- mode: c-mode; c-file-style: python -*-
+ */
+
/* regexpr.c
*
* Author: Tatu Ylonen <ylo@ngs.fi>
@@ -57,6 +61,12 @@ char *realloc();
#endif /* __STDC__ */
#endif /* THINK_C */
+/* The original code blithely assumed that sizeof(short) == 2. Not
+ * always true. Original instances of "(short)x" were replaced by
+ * SHORT(x), where SHORT is #defined below. */
+
+#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
+
/* The stack implementation is taken from an idea by Andrew Kuchling.
* It's a doubly linked list of arrays. The advantages of this over a
* simple linked list are that the number of mallocs required are
@@ -75,27 +85,27 @@ char *realloc();
typedef union item_t
{
- struct
- {
- int num;
- int level;
- char *start;
- char *end;
- } reg;
- struct
- {
- int count;
- int level;
- int phantom;
- char *code;
- char *text;
- } fail;
- struct
- {
- int num;
- int level;
- int count;
- } cntr;
+ struct
+ {
+ int num;
+ int level;
+ char *start;
+ char *end;
+ } reg;
+ struct
+ {
+ int count;
+ int level;
+ int phantom;
+ char *code;
+ char *text;
+ } fail;
+ struct
+ {
+ int num;
+ int level;
+ int count;
+ } cntr;
} item_t;
#define STACK_PAGE_SIZE 256
@@ -105,43 +115,98 @@ typedef union item_t
typedef struct item_page_t
{
- item_t items[STACK_PAGE_SIZE];
- struct item_page_t *prev;
- struct item_page_t *next;
+ item_t items[STACK_PAGE_SIZE];
+ struct item_page_t *prev;
+ struct item_page_t *next;
} item_page_t;
typedef struct match_state
{
- /* Structure to encapsulate the stack. */
- struct
- {
- /* index into the curent page. If index == 0 and you need
- * to pop and item, move to the previous page and set
- * index = STACK_PAGE_SIZE - 1. Otherwise decrement index
- * to push a page. If index == STACK_PAGE_SIZE and you
- * need to push a page move to the next page and set index
- * = 0. If there is no new next page, allocate a new page
- * and link it in. Otherwise, increment index to push a
- * page. */
- int index;
- item_page_t *current; /* Pointer to the current page. */
- item_page_t first; /* First page is statically allocated. */
- } stack;
- char *start[NUM_REGISTERS];
- char *end[NUM_REGISTERS];
-
- int changed[NUM_REGISTERS];
- /* The number of registers that have been pushed onto the stack
- * since the last failure point. */
- int count;
- /* Used to control when registers need to be pushed onto the
- * stack. */
- int level;
- /* The number of failure points on the stack. */
- int point;
+ /* The number of registers that have been pushed onto the stack
+ * since the last failure point. */
+
+ int count;
+
+ /* Used to control when registers need to be pushed onto the
+ * stack. */
+
+ int level;
+
+ /* The number of failure points on the stack. */
+
+ int point;
+
+ /* Storage for the registers. Each register consists of two
+ * pointers to characters. So register N is represented as
+ * start[N] and end[N]. The pointers must be converted to
+ * offsets from the beginning of the string before returning the
+ * registers to the calling program. */
+
+ char *start[NUM_REGISTERS];
+ char *end[NUM_REGISTERS];
+
+ /* Keeps track of whether a register has changed recently. */
+
+ int changed[NUM_REGISTERS];
+
+ /* Structure to encapsulate the stack. */
+ struct
+ {
+ /* index into the curent page. If index == 0 and you need
+ * to pop an item, move to the previous page and set index
+ * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
+ * push a page. If index == STACK_PAGE_SIZE and you need
+ * to push a page move to the next page and set index =
+ * 0. If there is no new next page, allocate a new page
+ * and link it in. Otherwise, increment index to push a
+ * page. */
+
+ int index;
+ item_page_t *current; /* Pointer to the current page. */
+ item_page_t first; /* First page is statically allocated. */
+ } stack;
} match_state;
+/* Initialize a state object */
+
+/* #define NEW_STATE(state) \ */
+/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
+/* state.stack.current = &state.stack.first; \ */
+/* state.stack.first.prev = NULL; \ */
+/* state.stack.first.next = NULL; \ */
+/* state.stack.index = 0; \ */
+/* state.level = 1 */
+
+#define NEW_STATE(state, nregs) \
+{ \
+ int i; \
+ for (i = 0; i < nregs; i++) \
+ { \
+ state.start[i] = NULL; \
+ state.end[i] = NULL; \
+ state.changed[i] = 0; \
+ } \
+ state.stack.current = &state.stack.first; \
+ state.stack.first.prev = NULL; \
+ state.stack.first.next = NULL; \
+ state.stack.index = 0; \
+ state.level = 1; \
+ state.count = 0; \
+ state.level = 0; \
+ state.point = 0; \
+}
+
+/* Free any memory that might have been malloc'd */
+
+#define FREE_STATE(state) \
+while(state.stack.first.next != NULL) \
+{ \
+ state.stack.current = state.stack.first.next; \
+ state.stack.first.next = state.stack.current->next; \
+ free(state.stack.current); \
+}
+
/* Discard the top 'count' stack items. */
#define STACK_DISCARD(stack, count, on_error) \
@@ -226,24 +291,6 @@ else \
#define STACK_EMPTY(stack) ((stack.index == 0) && \
(stack.current->prev == NULL))
-
-/* Initialize a state object */
-
-#define NEW_STATE(state) \
-memset(&state, 0, sizeof(match_state)); \
-state.stack.current = &state.stack.first; \
-state.level = 1
-
-/* Free any memory that might have been malloc'd */
-
-#define FREE_STATE(state) \
-while(state.stack.first.next != NULL) \
-{ \
- state.stack.current = state.stack.first.next; \
- state.stack.first.next = state.stack.current->next; \
- free(state.stack.current); \
-}
-
/* Return the start of register 'reg' */
#define GET_REG_START(state, reg) (state.start[reg])
@@ -302,22 +349,6 @@ state.end[reg] = text
/* Update the last failure point with a new position in the text. */
-/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
-/* { \ */
-/* item_t *item; \ */
-/* STACK_DISCARD(state.stack, state.count, on_error); \ */
-/* STACK_TOP(state.stack, item, on_error); \ */
-/* item->fail.text = xtext; \ */
-/* state.count = 0; \ */
-/* } */
-
-/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
-/* { \ */
-/* item_t *item; \ */
-/* STACK_BACK(state.stack, item, state.count + 1, on_error); \ */
-/* item->fail.text = xtext; \ */
-/* } */
-
#define UPDATE_FAILURE(state, xtext, on_error) \
{ \
item_t *item; \
@@ -391,7 +422,8 @@ enum regexp_compiled_ops /* opcodes for compiled regexp */
Cwordbound, /* match if at word boundary */
Cnotwordbound, /* match if not at word boundary */
Csyntaxspec, /* matches syntax code (1 byte follows) */
- Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
+ Cnotsyntaxspec, /* matches if syntax code does not match (1 byte foll)*/
+ Crepeat1
};
enum regexp_syntax_op /* syntax codes for plain and quoted characters */
@@ -581,6 +613,8 @@ static void re_compile_fastmap_aux(char *code,
case Cwordbound:
case Cnotwordbound:
{
+ for (a = 0; a < 256; a++)
+ fastmap[a] = 1;
break;
}
case Csyntaxspec:
@@ -648,7 +682,7 @@ static void re_compile_fastmap_aux(char *code,
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
- pos += (int)(short)a;
+ pos += (int)SHORT(a);
if (visited[pos])
{
/* argh... the regexp contains empty loops. This is not
@@ -664,10 +698,15 @@ static void re_compile_fastmap_aux(char *code,
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
- a = pos + (int)(short)a;
+ a = pos + (int)SHORT(a);
re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
break;
}
+ case Crepeat1:
+ {
+ pos += 2;
+ break;
+ }
default:
{
abort(); /* probably some opcode is missing from this switch */
@@ -754,10 +793,11 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
char ch;
int a;
int b;
+ int num_instructions = 0;
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
- a = (int)(short)a;
+ a = (int)SHORT(a);
p1 = code + a + 3; /* skip the failure_jump */
assert(p1[-3] == Cfailure_jump);
@@ -775,6 +815,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
/* loop until we find something that consumes a character */
loop_p1:
+ num_instructions++;
switch (*p1++)
{
case Cbol:
@@ -824,6 +865,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
/* now we know that we can't backtrack. */
while (p1 != p2 - 3)
{
+ num_instructions++;
switch (*p1++)
{
case Cend:
@@ -873,11 +915,22 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
}
}
+ make_update_jump:
code -= 3;
a += 3; /* jump to after the Cfailure_jump */
code[0] = Cupdate_failure_jump;
code[1] = a & 0xff;
code[2] = a >> 8;
+ if (num_instructions > 1)
+ return 1;
+ assert(num_instructions == 1);
+ /* if the only instruction matches a single character, we can do
+ * better
+ */
+ p1 = code + 3 + a; /* start of sole instruction */
+ if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
+ *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
+ code[0] = Crepeat1;
return 1;
make_normal_jump:
@@ -939,6 +992,7 @@ static int re_optimize(regexp_t bufp)
case Cjump:
case Cdummy_failure_jump:
case Cfailure_jump:
+ case Crepeat1:
{
code += 2;
break;
@@ -1111,7 +1165,8 @@ char *re_compile_pattern(char *regex, int size, regexp_t bufp)
re_compile_initialize();
bufp->used = 0;
bufp->fastmap_accurate = 0;
- bufp->uses_registers = 0;
+ bufp->uses_registers = 1;
+ bufp->num_registers = 1;
translate = bufp->translate;
pattern = bufp->buffer;
alloc = bufp->allocated;
@@ -1289,6 +1344,7 @@ char *re_compile_pattern(char *regex, int size, regexp_t bufp)
STORE(Cstart_memory);
STORE(next_register);
open_registers[num_open_registers++] = next_register;
+ bufp->num_registers++;
next_register++;
}
paren_depth++;
@@ -1545,27 +1601,8 @@ int re_match(regexp_t bufp,
code = bufp->buffer;
translate = bufp->translate;
-/* translated = NULL; */
-/* if (bufp->translate) */
-/* { */
-/* char *t1; */
-/* char *t2; */
-
-/* translated = malloc(size); */
-/* if (translated == NULL) */
-/* goto error; */
-
-/* t1 = string; */
-/* t2 = translated; */
-/* while(t1 < textend) */
-/* *t2++ = bufp->translate[*t1++]; */
-
-/* text = translated + pos; */
-/* textstart = translated; */
-/* textend = translated + size; */
-/* } */
- NEW_STATE(state);
+ NEW_STATE(state, bufp->num_registers);
continue_matching:
switch (*code++)
@@ -1587,7 +1624,7 @@ int re_match(regexp_t bufp,
}
else
{
- for (a = 1; a < RE_NREGS; a++)
+ for (a = 1; a < bufp->num_registers; a++)
{
if ((GET_REG_START(state, a) == NULL) ||
(GET_REG_END(state, a) == NULL))
@@ -1599,10 +1636,13 @@ int re_match(regexp_t bufp,
old_regs->start[a] = GET_REG_START(state, a) - textstart;
old_regs->end[a] = GET_REG_END(state, a) - textstart;
}
+ for (; a < RE_NREGS; a++)
+ {
+ old_regs->start[a] = -1;
+ old_regs->end[a] = -1;
+ }
}
}
-/* if(translated) */
-/* free(translated); */
FREE_STATE(state);
return match_end - pos;
}
@@ -1703,18 +1743,18 @@ int re_match(regexp_t bufp,
{
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
- code += (int)(short)a;
+ code += (int)SHORT(a);
goto continue_matching;
}
case Cdummy_failure_jump:
{
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
- a = (int)(short)a;
+ a = (int)SHORT(a);
assert(*code == Cfailure_jump);
b = (unsigned char)code[1];
b |= (unsigned char)code[2] << 8;
- PUSH_FAILURE(state, code + (int)(short)b + 3, NULL, goto error);
+ PUSH_FAILURE(state, code + (int)SHORT(b) + 3, NULL, goto error);
code += a;
goto continue_matching;
}
@@ -1722,10 +1762,120 @@ int re_match(regexp_t bufp,
{
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
- a = (int)(short)a;
+ a = (int)SHORT(a);
PUSH_FAILURE(state, code + a, text, goto error);
goto continue_matching;
}
+ case Crepeat1:
+ {
+ char *pinst;
+ a = (unsigned char)*code++;
+ a |= (unsigned char)*code++ << 8;
+ a = (int)SHORT(a);
+ pinst = code + a;
+ /* pinst is sole instruction in loop, and it matches a
+ * single character. Since Crepeat1 was originally a
+ * Cupdate_failure_jump, we also know that backtracking is
+ * useless: so long as the single-character expression matches,
+ * it must be used. Also, in the case of +, we've already
+ * matched one character, so + can't fail: nothing here can
+ * cause a failure.
+ */
+ switch (*pinst++)
+ {
+ case Cset:
+ {
+ if (translate)
+ {
+ while (text < textend)
+ {
+ ch = translate[(unsigned char)*text];
+ if (pinst[ch/8] & (1<<(ch & 7)))
+ text++;
+ else
+ break;
+ }
+ }
+ else
+ {
+ while (text < textend)
+ {
+ ch = (unsigned char)*text;
+ if (pinst[ch/8] & (1<<(ch & 7)))
+ text++;
+ else
+ break;
+ }
+ }
+ break;
+ }
+ case Cexact:
+ {
+ ch = (unsigned char)*pinst;
+ if (translate)
+ {
+ while (text < textend &&
+ translate[(unsigned char)*text] == ch)
+ text++;
+ }
+ else
+ {
+ while (text < textend && (unsigned char)*text == ch)
+ text++;
+ }
+ break;
+ }
+ case Canychar:
+ {
+ while (text < textend && (unsigned char)*text != '\n')
+ text++;
+ break;
+ }
+ case Csyntaxspec:
+ {
+ a = (unsigned char)*pinst;
+ if (translate)
+ {
+ while (text < textend &&
+ translate[SYNTAX(*text)] == a)
+ text++;
+ }
+ else
+ {
+ while (text < textend && SYNTAX(*text) == a)
+ text++;
+ }
+ break;
+ }
+ case Cnotsyntaxspec:
+ {
+ a = (unsigned char)*pinst;
+ if (translate)
+ {
+ while (text < textend &&
+ translate[SYNTAX(*text)] != a)
+ text++;
+ }
+ else
+ {
+ while (text < textend && SYNTAX(*text) != a)
+ text++;
+ }
+ break;
+ }
+ default:
+ {
+ abort();
+ /*NOTREACHED*/
+ }
+ }
+ /* due to the funky way + and * are compiled, the top failure-
+ * stack entry at this point is actually a success entry --
+ * update it & pop it
+ */
+ UPDATE_FAILURE(state, text, goto error);
+ goto fail; /* i.e., succeed <wink/sigh> */
+ }
case Cbegbuf:
{
if (text == textstart)
diff --git a/Modules/regexpr.h b/Modules/regexpr.h
index e623362..1221802 100644
--- a/Modules/regexpr.h
+++ b/Modules/regexpr.h
@@ -1,3 +1,7 @@
+/*
+ * -*- mode: c-mode; c-file-style: python -*-
+ */
+
#ifndef Py_REGEXPR_H
#define Py_REGEXPR_H
#ifdef __cplusplus
@@ -5,22 +9,22 @@ extern "C" {
#endif
/*
-
-regexpr.h
-
-Author: Tatu Ylonen <ylo@ngs.fi>
-
-Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
-
-Permission to use, copy, modify, distribute, and sell this software
-and its documentation for any purpose is hereby granted without fee,
-provided that the above copyright notice appear in all copies. This
-software is provided "as is" without express or implied warranty.
-
-Created: Thu Sep 26 17:15:36 1991 ylo
-Last modified: Mon Nov 4 15:49:46 1991 ylo
-
-*/
+ * regexpr.h
+ *
+ * Author: Tatu Ylonen <ylo@ngs.fi>
+ *
+ * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appear in all copies. This
+ * software is provided "as is" without express or implied warranty.
+ *
+ * Created: Thu Sep 26 17:15:36 1991 ylo
+ * Last modified: Mon Nov 4 15:49:46 1991 ylo
+ */
+
+/* $Id$ */
#ifndef REGEXPR_H
#define REGEXPR_H
@@ -29,21 +33,22 @@ Last modified: Mon Nov 4 15:49:46 1991 ylo
typedef struct re_pattern_buffer
{
- char *buffer; /* compiled pattern */
- int allocated; /* allocated size of compiled pattern */
- int used; /* actual length of compiled pattern */
- char *fastmap; /* fastmap[ch] is true if ch can start pattern */
- char *translate; /* translation to apply during compilation/matching */
- char fastmap_accurate; /* true if fastmap is valid */
- char can_be_null; /* true if can match empty string */
- char uses_registers; /* registers are used and need to be initialized */
- char anchor; /* anchor: 0=none 1=begline 2=begbuf */
+ char *buffer; /* compiled pattern */
+ int allocated; /* allocated size of compiled pattern */
+ int used; /* actual length of compiled pattern */
+ char *fastmap; /* fastmap[ch] is true if ch can start pattern */
+ char *translate; /* translation to apply during compilation/matching */
+ char fastmap_accurate; /* true if fastmap is valid */
+ char can_be_null; /* true if can match empty string */
+ char uses_registers; /* registers are used and need to be initialized */
+ int num_registers; /* number of registers used */
+ char anchor; /* anchor: 0=none 1=begline 2=begbuf */
} *regexp_t;
typedef struct re_registers
{
- int start[RE_NREGS]; /* start offset of region */
- int end[RE_NREGS]; /* end offset of region */
+ int start[RE_NREGS]; /* start offset of region */
+ int end[RE_NREGS]; /* end offset of region */
} *regexp_registers_t;
/* bit definitions for syntax */
@@ -77,52 +82,53 @@ typedef struct re_registers
#ifdef HAVE_PROTOTYPES
extern int re_syntax;
-/* This is the actual syntax mask. It was added so that Python
- could do syntax-dependent munging of patterns before compilation. */
+/* This is the actual syntax mask. It was added so that Python could do
+ * syntax-dependent munging of patterns before compilation. */
int re_set_syntax(int syntax);
/* This sets the syntax to use and returns the previous syntax. The
- syntax is specified by a bit mask of the above defined bits. */
+ * syntax is specified by a bit mask of the above defined bits. */
char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
/* This compiles the regexp (given in regex and length in regex_size).
- This returns NULL if the regexp compiled successfully, and an error
- message if an error was encountered. The buffer field must be
- initialized to a memory area allocated by malloc (or to NULL) before
- use, and the allocated field must be set to its length (or 0 if buffer is
- NULL). Also, the translate field must be set to point to a valid
- translation table, or NULL if it is not used. */
+ * This returns NULL if the regexp compiled successfully, and an error
+ * message if an error was encountered. The buffer field must be
+ * initialized to a memory area allocated by malloc (or to NULL) before
+ * use, and the allocated field must be set to its length (or 0 if
+ * buffer is NULL). Also, the translate field must be set to point to a
+ * valid translation table, or NULL if it is not used. */
int re_match(regexp_t compiled, char *string, int size, int pos,
regexp_registers_t old_regs);
/* This tries to match the regexp against the string. This returns the
- length of the matched portion, or -1 if the pattern could not be
- matched and -2 if an error (such as failure stack overflow) is
- encountered. */
+ * length of the matched portion, or -1 if the pattern could not be
+ * matched and -2 if an error (such as failure stack overflow) is
+ * encountered. */
int re_search(regexp_t compiled, char *string, int size, int startpos,
int range, regexp_registers_t regs);
-/* This rearches for a substring matching the regexp. This returns the first
- index at which a match is found. range specifies at how many positions to
- try matching; positive values indicate searching forwards, and negative
- values indicate searching backwards. mstop specifies the offset beyond
- which a match must not go. This returns -1 if no match is found, and
- -2 if an error (such as failure stack overflow) is encountered. */
+/* This rearches for a substring matching the regexp. This returns the
+ * first index at which a match is found. range specifies at how many
+ * positions to try matching; positive values indicate searching
+ * forwards, and negative values indicate searching backwards. mstop
+ * specifies the offset beyond which a match must not go. This returns
+ * -1 if no match is found, and -2 if an error (such as failure stack
+ * overflow) is encountered. */
void re_compile_fastmap(regexp_t compiled);
/* This computes the fastmap for the regexp. For this to have any effect,
- the calling program must have initialized the fastmap field to point
- to an array of 256 characters. */
+ * the calling program must have initialized the fastmap field to point
+ * to an array of 256 characters. */
char *re_comp(char *s);
/* BSD 4.2 regex library routine re_comp. This compiles the regexp into
- an internal buffer. This returns NULL if the regexp was compiled
- successfully, and an error message if there was an error. */
+ * an internal buffer. This returns NULL if the regexp was compiled
+ * successfully, and an error message if there was an error. */
int re_exec(char *s);
-/* BSD 4.2 regexp library routine re_exec. This returns true if the string
- matches the regular expression (that is, a matching part is found
- anywhere in the string). */
+/* BSD 4.2 regexp library routine re_exec. This returns true if the
+ * string matches the regular expression (that is, a matching part is
+ * found anywhere in the string). */
#else /* HAVE_PROTOTYPES */
diff --git a/Modules/reopmodule.c b/Modules/reopmodule.c
new file mode 100644
index 0000000..0d12210
--- /dev/null
+++ b/Modules/reopmodule.c
@@ -0,0 +1,375 @@
+/***********************************************************
+Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
+The Netherlands.
+
+ All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the names of Stichting Mathematisch
+Centrum or CWI or Corporation for National Research Initiatives or
+CNRI not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+While CWI is the initial source for this software, a modified version
+is made available by the Corporation for National Research Initiatives
+(CNRI) at the Internet address ftp://ftp.python.org.
+
+STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
+CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
+DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+
+******************************************************************/
+
+/* $Id$ */
+
+/* Regular expression objects */
+/* This uses Tatu Ylonen's copyleft-free reimplementation of
+ GNU regular expressions */
+
+#include "Python.h"
+
+#include <ctype.h>
+
+#include "regexpr.h"
+
+static PyObject *ReopError; /* Exception */
+
+static PyObject *
+makeresult(regs, num_regs)
+ struct re_registers *regs;
+ int num_regs;
+{
+ PyObject *v;
+ int i;
+ static PyObject *filler = NULL;
+
+ if (filler == NULL) {
+ filler = Py_BuildValue("(ii)", -1, -1);
+ if (filler == NULL)
+ return NULL;
+ }
+ v = PyTuple_New(num_regs);
+ if (v == NULL)
+ return NULL;
+
+ for (i = 0; i < num_regs; i++) {
+ int lo = regs->start[i];
+ int hi = regs->end[i];
+ PyObject *w;
+ if (lo == -1 && hi == -1) {
+ w = filler;
+ Py_INCREF(w);
+ }
+ else
+ w = Py_BuildValue("(ii)", lo, hi);
+ if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
+ Py_DECREF(v);
+ return NULL;
+ }
+ }
+ return v;
+}
+
+static PyObject *
+reop_match(self, args)
+ PyObject *self;
+ PyObject *args;
+{
+ char *string;
+ int fastmaplen, stringlen;
+ int can_be_null, anchor, i;
+ int num_regs, flags, pos, result;
+ struct re_pattern_buffer bufp;
+ struct re_registers re_regs;
+
+ if (!PyArg_Parse(args, "(s#iiis#is#i)",
+ &(bufp.buffer), &(bufp.allocated),
+ &num_regs, &flags, &can_be_null,
+ &(bufp.fastmap), &fastmaplen,
+ &anchor,
+ &string, &stringlen,
+ &pos))
+ return NULL;
+
+ /* XXX sanity-check the input data */
+ bufp.used=bufp.allocated;
+ bufp.translate=NULL;
+ bufp.fastmap_accurate=1;
+ bufp.can_be_null=can_be_null;
+ bufp.uses_registers=1;
+ bufp.num_registers=num_regs;
+ bufp.anchor=anchor;
+
+ for(i=0; i<num_regs; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;}
+
+ result = re_match(&bufp,
+ string, stringlen, pos,
+ &re_regs);
+ if (result < -1) {
+ /* Failure like stack overflow */
+ PyErr_SetString(ReopError, "match failure");
+ return NULL;
+ }
+ return makeresult(&re_regs, num_regs);
+}
+
+static PyObject *
+reop_search(self, args)
+ PyObject *self;
+ PyObject *args;
+{
+ char *string;
+ int fastmaplen, stringlen;
+ int can_be_null, anchor, i;
+ int num_regs, flags, pos, result;
+ struct re_pattern_buffer bufp;
+ struct re_registers re_regs;
+
+ if (!PyArg_Parse(args, "(s#iiis#is#i)",
+ &(bufp.buffer), &(bufp.allocated),
+ &num_regs, &flags, &can_be_null,
+ &(bufp.fastmap), &fastmaplen,
+ &anchor,
+ &string, &stringlen,
+ &pos))
+ return NULL;
+
+ /* XXX sanity-check the input data */
+ bufp.used=bufp.allocated;
+ bufp.translate=NULL;
+ bufp.fastmap_accurate=1;
+ bufp.can_be_null=can_be_null;
+ bufp.uses_registers=1;
+ bufp.num_registers=1;
+ bufp.anchor=anchor;
+
+ for(i=0; i<num_regs; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;}
+
+ result = re_search(&bufp,
+ string, stringlen, pos, stringlen-pos,
+ &re_regs);
+ if (result < -1) {
+ /* Failure like stack overflow */
+ PyErr_SetString(ReopError, "match failure");
+ return NULL;
+ }
+ return makeresult(&re_regs, num_regs);
+}
+
+#if 0
+/* Functions originally in the regsub module.
+ Added June 1, 1997.
+ */
+
+/* A cache of previously used patterns is maintained. Notice that if
+ you change the reop syntax flag, entries in the cache are
+ invalidated.
+ XXX Solution: use (syntax flag, pattern) as keys? Clear the cache
+ every so often, or once it gets past a certain size?
+*/
+
+static PyObject *cache_dict=NULL;
+
+/* Accept an object; if it's a reop pattern, Py_INCREF it and return
+ it. If it's a string, a reop object is compiled and cached.
+*/
+
+static reopobject *
+cached_compile(pattern)
+ PyObject *pattern;
+{
+ reopobject *p2;
+
+ if (!PyString_Check(pattern))
+ {
+ /* It's not a string, so assume it's a compiled reop object */
+ /* XXX check that! */
+ Py_INCREF(pattern);
+ return (reopobject*)pattern;
+ }
+ if (cache_dict==NULL)
+ {
+ cache_dict=PyDict_New();
+ if (cache_dict==NULL)
+ {
+ return (reopobject*)NULL;
+ }
+ }
+
+ /* See if the pattern has already been cached; if so, return that
+ reop object */
+ p2=(reopobject*)PyDict_GetItem(cache_dict, pattern);
+ if (p2)
+ {
+ Py_INCREF(p2);
+ return (reopobject*)p2;
+ }
+
+ /* Compile the pattern and cache it */
+ p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL);
+ if (!p2) return p2;
+ PyDict_SetItem(cache_dict, pattern, (PyObject*)p2);
+ return p2;
+}
+
+
+static PyObject *
+internal_split(args, retain)
+ PyObject *args;
+ int retain;
+{
+ PyObject *newlist, *s;
+ reopobject *pattern;
+ int maxsplit=0, count=0, length, next=0, result;
+ int match_end=0; /* match_start is defined below */
+ char *start;
+
+ if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern,
+ &maxsplit))
+ {
+ PyErr_Clear();
+ if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern))
+ return NULL;
+ }
+ pattern=cached_compile((PyObject *)pattern);
+ if (!pattern) return NULL;
+
+ newlist=PyList_New(0);
+ if (!newlist) return NULL;
+
+ do
+ {
+ result = re_search(&pattern->re_patbuf,
+ start, length, next, length-next,
+ &pattern->re_regs);
+ if (result < -1)
+ { /* Erk... an error happened during the reop search */
+ Py_DECREF(newlist);
+ PyErr_SetString(ReopError, "match failure");
+ return NULL;
+ }
+ if (next<=result)
+ {
+ int match_start=pattern->re_regs.start[0];
+ int oldmatch_end=match_end;
+ match_end=pattern->re_regs.end[0];
+
+ if (match_start==match_end)
+ { /* A zero-length match; increment to the next position */
+ next=result+1;
+ match_end=oldmatch_end;
+ continue;
+ }
+
+ /* Append the string up to the start of the match */
+ s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end);
+ if (!s)
+ {
+ Py_DECREF(newlist);
+ return NULL;
+ }
+ PyList_Append(newlist, s);
+ Py_DECREF(s);
+
+ if (retain)
+ {
+ /* Append a string containing whatever matched */
+ s=PyString_FromStringAndSize(start+match_start, match_end-match_start);
+ if (!s)
+ {
+ Py_DECREF(newlist);
+ return NULL;
+ }
+ PyList_Append(newlist, s);
+ Py_DECREF(s);
+ }
+ /* Update the pointer, and increment the count of splits */
+ next=match_end; count++;
+ }
+ } while (result!=-1 && !(maxsplit && maxsplit==count) &&
+ next<length);
+ s=PyString_FromStringAndSize(start+match_end, length-match_end);
+ if (!s)
+ {
+ Py_DECREF(newlist);
+ return NULL;
+ }
+ PyList_Append(newlist, s);
+ Py_DECREF(s);
+ Py_DECREF(pattern);
+ return newlist;
+}
+
+static PyObject *
+reop_split(self, args)
+ PyObject *self;
+ PyObject *args;
+{
+ return internal_split(args, 0);
+}
+
+static PyObject *
+reop_splitx(self, args)
+ PyObject *self;
+ PyObject *args;
+{
+ return internal_split(args, 1);
+}
+#endif
+
+static struct PyMethodDef reop_global_methods[] = {
+ {"match", reop_match, 0},
+ {"search", reop_search, 0},
+#if 0
+ {"split", reop_split, 0},
+ {"splitx", reop_splitx, 0},
+#endif
+ {NULL, NULL} /* sentinel */
+};
+
+void
+initreop()
+{
+ PyObject *m, *d, *v;
+ int i;
+ char *s;
+
+ m = Py_InitModule("reop", reop_global_methods);
+ d = PyModule_GetDict(m);
+
+ /* Initialize reop.error exception */
+ v = ReopError = PyString_FromString("reop.error");
+ if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
+ goto finally;
+
+ /* Initialize reop.casefold constant */
+ if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
+ goto finally;
+
+ if (!(s = PyString_AsString(v)))
+ goto finally;
+
+ for (i = 0; i < 256; i++) {
+ if (isupper(i))
+ s[i] = tolower(i);
+ else
+ s[i] = i;
+ }
+ if (PyDict_SetItemString(d, "casefold", v) < 0)
+ goto finally;
+ Py_DECREF(v);
+
+ if (!PyErr_Occurred())
+ return;
+ finally:
+ Py_FatalError("can't initialize reop module");
+}