diff options
-rw-r--r-- | Modules/regexpr.c | 384 | ||||
-rw-r--r-- | Modules/regexpr.h | 110 | ||||
-rw-r--r-- | Modules/reopmodule.c | 375 |
3 files changed, 700 insertions, 169 deletions
diff --git a/Modules/regexpr.c b/Modules/regexpr.c index 87f747f..0b6ec8b 100644 --- a/Modules/regexpr.c +++ b/Modules/regexpr.c @@ -1,3 +1,7 @@ +/* + * -*- mode: c-mode; c-file-style: python -*- + */ + /* regexpr.c * * Author: Tatu Ylonen <ylo@ngs.fi> @@ -57,6 +61,12 @@ char *realloc(); #endif /* __STDC__ */ #endif /* THINK_C */ +/* The original code blithely assumed that sizeof(short) == 2. Not + * always true. Original instances of "(short)x" were replaced by + * SHORT(x), where SHORT is #defined below. */ + +#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x)) + /* The stack implementation is taken from an idea by Andrew Kuchling. * It's a doubly linked list of arrays. The advantages of this over a * simple linked list are that the number of mallocs required are @@ -75,27 +85,27 @@ char *realloc(); typedef union item_t { - struct - { - int num; - int level; - char *start; - char *end; - } reg; - struct - { - int count; - int level; - int phantom; - char *code; - char *text; - } fail; - struct - { - int num; - int level; - int count; - } cntr; + struct + { + int num; + int level; + char *start; + char *end; + } reg; + struct + { + int count; + int level; + int phantom; + char *code; + char *text; + } fail; + struct + { + int num; + int level; + int count; + } cntr; } item_t; #define STACK_PAGE_SIZE 256 @@ -105,43 +115,98 @@ typedef union item_t typedef struct item_page_t { - item_t items[STACK_PAGE_SIZE]; - struct item_page_t *prev; - struct item_page_t *next; + item_t items[STACK_PAGE_SIZE]; + struct item_page_t *prev; + struct item_page_t *next; } item_page_t; typedef struct match_state { - /* Structure to encapsulate the stack. */ - struct - { - /* index into the curent page. If index == 0 and you need - * to pop and item, move to the previous page and set - * index = STACK_PAGE_SIZE - 1. Otherwise decrement index - * to push a page. If index == STACK_PAGE_SIZE and you - * need to push a page move to the next page and set index - * = 0. If there is no new next page, allocate a new page - * and link it in. Otherwise, increment index to push a - * page. */ - int index; - item_page_t *current; /* Pointer to the current page. */ - item_page_t first; /* First page is statically allocated. */ - } stack; - char *start[NUM_REGISTERS]; - char *end[NUM_REGISTERS]; - - int changed[NUM_REGISTERS]; - /* The number of registers that have been pushed onto the stack - * since the last failure point. */ - int count; - /* Used to control when registers need to be pushed onto the - * stack. */ - int level; - /* The number of failure points on the stack. */ - int point; + /* The number of registers that have been pushed onto the stack + * since the last failure point. */ + + int count; + + /* Used to control when registers need to be pushed onto the + * stack. */ + + int level; + + /* The number of failure points on the stack. */ + + int point; + + /* Storage for the registers. Each register consists of two + * pointers to characters. So register N is represented as + * start[N] and end[N]. The pointers must be converted to + * offsets from the beginning of the string before returning the + * registers to the calling program. */ + + char *start[NUM_REGISTERS]; + char *end[NUM_REGISTERS]; + + /* Keeps track of whether a register has changed recently. */ + + int changed[NUM_REGISTERS]; + + /* Structure to encapsulate the stack. */ + struct + { + /* index into the curent page. If index == 0 and you need + * to pop an item, move to the previous page and set index + * = STACK_PAGE_SIZE - 1. Otherwise decrement index to + * push a page. If index == STACK_PAGE_SIZE and you need + * to push a page move to the next page and set index = + * 0. If there is no new next page, allocate a new page + * and link it in. Otherwise, increment index to push a + * page. */ + + int index; + item_page_t *current; /* Pointer to the current page. */ + item_page_t first; /* First page is statically allocated. */ + } stack; } match_state; +/* Initialize a state object */ + +/* #define NEW_STATE(state) \ */ +/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */ +/* state.stack.current = &state.stack.first; \ */ +/* state.stack.first.prev = NULL; \ */ +/* state.stack.first.next = NULL; \ */ +/* state.stack.index = 0; \ */ +/* state.level = 1 */ + +#define NEW_STATE(state, nregs) \ +{ \ + int i; \ + for (i = 0; i < nregs; i++) \ + { \ + state.start[i] = NULL; \ + state.end[i] = NULL; \ + state.changed[i] = 0; \ + } \ + state.stack.current = &state.stack.first; \ + state.stack.first.prev = NULL; \ + state.stack.first.next = NULL; \ + state.stack.index = 0; \ + state.level = 1; \ + state.count = 0; \ + state.level = 0; \ + state.point = 0; \ +} + +/* Free any memory that might have been malloc'd */ + +#define FREE_STATE(state) \ +while(state.stack.first.next != NULL) \ +{ \ + state.stack.current = state.stack.first.next; \ + state.stack.first.next = state.stack.current->next; \ + free(state.stack.current); \ +} + /* Discard the top 'count' stack items. */ #define STACK_DISCARD(stack, count, on_error) \ @@ -226,24 +291,6 @@ else \ #define STACK_EMPTY(stack) ((stack.index == 0) && \ (stack.current->prev == NULL)) - -/* Initialize a state object */ - -#define NEW_STATE(state) \ -memset(&state, 0, sizeof(match_state)); \ -state.stack.current = &state.stack.first; \ -state.level = 1 - -/* Free any memory that might have been malloc'd */ - -#define FREE_STATE(state) \ -while(state.stack.first.next != NULL) \ -{ \ - state.stack.current = state.stack.first.next; \ - state.stack.first.next = state.stack.current->next; \ - free(state.stack.current); \ -} - /* Return the start of register 'reg' */ #define GET_REG_START(state, reg) (state.start[reg]) @@ -302,22 +349,6 @@ state.end[reg] = text /* Update the last failure point with a new position in the text. */ -/* #define UPDATE_FAILURE(state, xtext, on_error) \ */ -/* { \ */ -/* item_t *item; \ */ -/* STACK_DISCARD(state.stack, state.count, on_error); \ */ -/* STACK_TOP(state.stack, item, on_error); \ */ -/* item->fail.text = xtext; \ */ -/* state.count = 0; \ */ -/* } */ - -/* #define UPDATE_FAILURE(state, xtext, on_error) \ */ -/* { \ */ -/* item_t *item; \ */ -/* STACK_BACK(state.stack, item, state.count + 1, on_error); \ */ -/* item->fail.text = xtext; \ */ -/* } */ - #define UPDATE_FAILURE(state, xtext, on_error) \ { \ item_t *item; \ @@ -391,7 +422,8 @@ enum regexp_compiled_ops /* opcodes for compiled regexp */ Cwordbound, /* match if at word boundary */ Cnotwordbound, /* match if not at word boundary */ Csyntaxspec, /* matches syntax code (1 byte follows) */ - Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/ + Cnotsyntaxspec, /* matches if syntax code does not match (1 byte foll)*/ + Crepeat1 }; enum regexp_syntax_op /* syntax codes for plain and quoted characters */ @@ -581,6 +613,8 @@ static void re_compile_fastmap_aux(char *code, case Cwordbound: case Cnotwordbound: { + for (a = 0; a < 256; a++) + fastmap[a] = 1; break; } case Csyntaxspec: @@ -648,7 +682,7 @@ static void re_compile_fastmap_aux(char *code, { a = (unsigned char)code[pos++]; a |= (unsigned char)code[pos++] << 8; - pos += (int)(short)a; + pos += (int)SHORT(a); if (visited[pos]) { /* argh... the regexp contains empty loops. This is not @@ -664,10 +698,15 @@ static void re_compile_fastmap_aux(char *code, { a = (unsigned char)code[pos++]; a |= (unsigned char)code[pos++] << 8; - a = pos + (int)(short)a; + a = pos + (int)SHORT(a); re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap); break; } + case Crepeat1: + { + pos += 2; + break; + } default: { abort(); /* probably some opcode is missing from this switch */ @@ -754,10 +793,11 @@ static int re_optimize_star_jump(regexp_t bufp, char *code) char ch; int a; int b; + int num_instructions = 0; a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; - a = (int)(short)a; + a = (int)SHORT(a); p1 = code + a + 3; /* skip the failure_jump */ assert(p1[-3] == Cfailure_jump); @@ -775,6 +815,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code) /* loop until we find something that consumes a character */ loop_p1: + num_instructions++; switch (*p1++) { case Cbol: @@ -824,6 +865,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code) /* now we know that we can't backtrack. */ while (p1 != p2 - 3) { + num_instructions++; switch (*p1++) { case Cend: @@ -873,11 +915,22 @@ static int re_optimize_star_jump(regexp_t bufp, char *code) } } + make_update_jump: code -= 3; a += 3; /* jump to after the Cfailure_jump */ code[0] = Cupdate_failure_jump; code[1] = a & 0xff; code[2] = a >> 8; + if (num_instructions > 1) + return 1; + assert(num_instructions == 1); + /* if the only instruction matches a single character, we can do + * better + */ + p1 = code + 3 + a; /* start of sole instruction */ + if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar || + *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec) + code[0] = Crepeat1; return 1; make_normal_jump: @@ -939,6 +992,7 @@ static int re_optimize(regexp_t bufp) case Cjump: case Cdummy_failure_jump: case Cfailure_jump: + case Crepeat1: { code += 2; break; @@ -1111,7 +1165,8 @@ char *re_compile_pattern(char *regex, int size, regexp_t bufp) re_compile_initialize(); bufp->used = 0; bufp->fastmap_accurate = 0; - bufp->uses_registers = 0; + bufp->uses_registers = 1; + bufp->num_registers = 1; translate = bufp->translate; pattern = bufp->buffer; alloc = bufp->allocated; @@ -1289,6 +1344,7 @@ char *re_compile_pattern(char *regex, int size, regexp_t bufp) STORE(Cstart_memory); STORE(next_register); open_registers[num_open_registers++] = next_register; + bufp->num_registers++; next_register++; } paren_depth++; @@ -1545,27 +1601,8 @@ int re_match(regexp_t bufp, code = bufp->buffer; translate = bufp->translate; -/* translated = NULL; */ -/* if (bufp->translate) */ -/* { */ -/* char *t1; */ -/* char *t2; */ - -/* translated = malloc(size); */ -/* if (translated == NULL) */ -/* goto error; */ - -/* t1 = string; */ -/* t2 = translated; */ -/* while(t1 < textend) */ -/* *t2++ = bufp->translate[*t1++]; */ - -/* text = translated + pos; */ -/* textstart = translated; */ -/* textend = translated + size; */ -/* } */ - NEW_STATE(state); + NEW_STATE(state, bufp->num_registers); continue_matching: switch (*code++) @@ -1587,7 +1624,7 @@ int re_match(regexp_t bufp, } else { - for (a = 1; a < RE_NREGS; a++) + for (a = 1; a < bufp->num_registers; a++) { if ((GET_REG_START(state, a) == NULL) || (GET_REG_END(state, a) == NULL)) @@ -1599,10 +1636,13 @@ int re_match(regexp_t bufp, old_regs->start[a] = GET_REG_START(state, a) - textstart; old_regs->end[a] = GET_REG_END(state, a) - textstart; } + for (; a < RE_NREGS; a++) + { + old_regs->start[a] = -1; + old_regs->end[a] = -1; + } } } -/* if(translated) */ -/* free(translated); */ FREE_STATE(state); return match_end - pos; } @@ -1703,18 +1743,18 @@ int re_match(regexp_t bufp, { a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; - code += (int)(short)a; + code += (int)SHORT(a); goto continue_matching; } case Cdummy_failure_jump: { a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; - a = (int)(short)a; + a = (int)SHORT(a); assert(*code == Cfailure_jump); b = (unsigned char)code[1]; b |= (unsigned char)code[2] << 8; - PUSH_FAILURE(state, code + (int)(short)b + 3, NULL, goto error); + PUSH_FAILURE(state, code + (int)SHORT(b) + 3, NULL, goto error); code += a; goto continue_matching; } @@ -1722,10 +1762,120 @@ int re_match(regexp_t bufp, { a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; - a = (int)(short)a; + a = (int)SHORT(a); PUSH_FAILURE(state, code + a, text, goto error); goto continue_matching; } + case Crepeat1: + { + char *pinst; + a = (unsigned char)*code++; + a |= (unsigned char)*code++ << 8; + a = (int)SHORT(a); + pinst = code + a; + /* pinst is sole instruction in loop, and it matches a + * single character. Since Crepeat1 was originally a + * Cupdate_failure_jump, we also know that backtracking is + * useless: so long as the single-character expression matches, + * it must be used. Also, in the case of +, we've already + * matched one character, so + can't fail: nothing here can + * cause a failure. + */ + switch (*pinst++) + { + case Cset: + { + if (translate) + { + while (text < textend) + { + ch = translate[(unsigned char)*text]; + if (pinst[ch/8] & (1<<(ch & 7))) + text++; + else + break; + } + } + else + { + while (text < textend) + { + ch = (unsigned char)*text; + if (pinst[ch/8] & (1<<(ch & 7))) + text++; + else + break; + } + } + break; + } + case Cexact: + { + ch = (unsigned char)*pinst; + if (translate) + { + while (text < textend && + translate[(unsigned char)*text] == ch) + text++; + } + else + { + while (text < textend && (unsigned char)*text == ch) + text++; + } + break; + } + case Canychar: + { + while (text < textend && (unsigned char)*text != '\n') + text++; + break; + } + case Csyntaxspec: + { + a = (unsigned char)*pinst; + if (translate) + { + while (text < textend && + translate[SYNTAX(*text)] == a) + text++; + } + else + { + while (text < textend && SYNTAX(*text) == a) + text++; + } + break; + } + case Cnotsyntaxspec: + { + a = (unsigned char)*pinst; + if (translate) + { + while (text < textend && + translate[SYNTAX(*text)] != a) + text++; + } + else + { + while (text < textend && SYNTAX(*text) != a) + text++; + } + break; + } + default: + { + abort(); + /*NOTREACHED*/ + } + } + /* due to the funky way + and * are compiled, the top failure- + * stack entry at this point is actually a success entry -- + * update it & pop it + */ + UPDATE_FAILURE(state, text, goto error); + goto fail; /* i.e., succeed <wink/sigh> */ + } case Cbegbuf: { if (text == textstart) diff --git a/Modules/regexpr.h b/Modules/regexpr.h index e623362..1221802 100644 --- a/Modules/regexpr.h +++ b/Modules/regexpr.h @@ -1,3 +1,7 @@ +/* + * -*- mode: c-mode; c-file-style: python -*- + */ + #ifndef Py_REGEXPR_H #define Py_REGEXPR_H #ifdef __cplusplus @@ -5,22 +9,22 @@ extern "C" { #endif /* - -regexpr.h - -Author: Tatu Ylonen <ylo@ngs.fi> - -Copyright (c) 1991 Tatu Ylonen, Espoo, Finland - -Permission to use, copy, modify, distribute, and sell this software -and its documentation for any purpose is hereby granted without fee, -provided that the above copyright notice appear in all copies. This -software is provided "as is" without express or implied warranty. - -Created: Thu Sep 26 17:15:36 1991 ylo -Last modified: Mon Nov 4 15:49:46 1991 ylo - -*/ + * regexpr.h + * + * Author: Tatu Ylonen <ylo@ngs.fi> + * + * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appear in all copies. This + * software is provided "as is" without express or implied warranty. + * + * Created: Thu Sep 26 17:15:36 1991 ylo + * Last modified: Mon Nov 4 15:49:46 1991 ylo + */ + +/* $Id$ */ #ifndef REGEXPR_H #define REGEXPR_H @@ -29,21 +33,22 @@ Last modified: Mon Nov 4 15:49:46 1991 ylo typedef struct re_pattern_buffer { - char *buffer; /* compiled pattern */ - int allocated; /* allocated size of compiled pattern */ - int used; /* actual length of compiled pattern */ - char *fastmap; /* fastmap[ch] is true if ch can start pattern */ - char *translate; /* translation to apply during compilation/matching */ - char fastmap_accurate; /* true if fastmap is valid */ - char can_be_null; /* true if can match empty string */ - char uses_registers; /* registers are used and need to be initialized */ - char anchor; /* anchor: 0=none 1=begline 2=begbuf */ + char *buffer; /* compiled pattern */ + int allocated; /* allocated size of compiled pattern */ + int used; /* actual length of compiled pattern */ + char *fastmap; /* fastmap[ch] is true if ch can start pattern */ + char *translate; /* translation to apply during compilation/matching */ + char fastmap_accurate; /* true if fastmap is valid */ + char can_be_null; /* true if can match empty string */ + char uses_registers; /* registers are used and need to be initialized */ + int num_registers; /* number of registers used */ + char anchor; /* anchor: 0=none 1=begline 2=begbuf */ } *regexp_t; typedef struct re_registers { - int start[RE_NREGS]; /* start offset of region */ - int end[RE_NREGS]; /* end offset of region */ + int start[RE_NREGS]; /* start offset of region */ + int end[RE_NREGS]; /* end offset of region */ } *regexp_registers_t; /* bit definitions for syntax */ @@ -77,52 +82,53 @@ typedef struct re_registers #ifdef HAVE_PROTOTYPES extern int re_syntax; -/* This is the actual syntax mask. It was added so that Python - could do syntax-dependent munging of patterns before compilation. */ +/* This is the actual syntax mask. It was added so that Python could do + * syntax-dependent munging of patterns before compilation. */ int re_set_syntax(int syntax); /* This sets the syntax to use and returns the previous syntax. The - syntax is specified by a bit mask of the above defined bits. */ + * syntax is specified by a bit mask of the above defined bits. */ char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled); /* This compiles the regexp (given in regex and length in regex_size). - This returns NULL if the regexp compiled successfully, and an error - message if an error was encountered. The buffer field must be - initialized to a memory area allocated by malloc (or to NULL) before - use, and the allocated field must be set to its length (or 0 if buffer is - NULL). Also, the translate field must be set to point to a valid - translation table, or NULL if it is not used. */ + * This returns NULL if the regexp compiled successfully, and an error + * message if an error was encountered. The buffer field must be + * initialized to a memory area allocated by malloc (or to NULL) before + * use, and the allocated field must be set to its length (or 0 if + * buffer is NULL). Also, the translate field must be set to point to a + * valid translation table, or NULL if it is not used. */ int re_match(regexp_t compiled, char *string, int size, int pos, regexp_registers_t old_regs); /* This tries to match the regexp against the string. This returns the - length of the matched portion, or -1 if the pattern could not be - matched and -2 if an error (such as failure stack overflow) is - encountered. */ + * length of the matched portion, or -1 if the pattern could not be + * matched and -2 if an error (such as failure stack overflow) is + * encountered. */ int re_search(regexp_t compiled, char *string, int size, int startpos, int range, regexp_registers_t regs); -/* This rearches for a substring matching the regexp. This returns the first - index at which a match is found. range specifies at how many positions to - try matching; positive values indicate searching forwards, and negative - values indicate searching backwards. mstop specifies the offset beyond - which a match must not go. This returns -1 if no match is found, and - -2 if an error (such as failure stack overflow) is encountered. */ +/* This rearches for a substring matching the regexp. This returns the + * first index at which a match is found. range specifies at how many + * positions to try matching; positive values indicate searching + * forwards, and negative values indicate searching backwards. mstop + * specifies the offset beyond which a match must not go. This returns + * -1 if no match is found, and -2 if an error (such as failure stack + * overflow) is encountered. */ void re_compile_fastmap(regexp_t compiled); /* This computes the fastmap for the regexp. For this to have any effect, - the calling program must have initialized the fastmap field to point - to an array of 256 characters. */ + * the calling program must have initialized the fastmap field to point + * to an array of 256 characters. */ char *re_comp(char *s); /* BSD 4.2 regex library routine re_comp. This compiles the regexp into - an internal buffer. This returns NULL if the regexp was compiled - successfully, and an error message if there was an error. */ + * an internal buffer. This returns NULL if the regexp was compiled + * successfully, and an error message if there was an error. */ int re_exec(char *s); -/* BSD 4.2 regexp library routine re_exec. This returns true if the string - matches the regular expression (that is, a matching part is found - anywhere in the string). */ +/* BSD 4.2 regexp library routine re_exec. This returns true if the + * string matches the regular expression (that is, a matching part is + * found anywhere in the string). */ #else /* HAVE_PROTOTYPES */ diff --git a/Modules/reopmodule.c b/Modules/reopmodule.c new file mode 100644 index 0000000..0d12210 --- /dev/null +++ b/Modules/reopmodule.c @@ -0,0 +1,375 @@ +/*********************************************************** +Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, +The Netherlands. + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the names of Stichting Mathematisch +Centrum or CWI or Corporation for National Research Initiatives or +CNRI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +While CWI is the initial source for this software, a modified version +is made available by the Corporation for National Research Initiatives +(CNRI) at the Internet address ftp://ftp.python.org. + +STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH +CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL +DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR +PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. + +******************************************************************/ + +/* $Id$ */ + +/* Regular expression objects */ +/* This uses Tatu Ylonen's copyleft-free reimplementation of + GNU regular expressions */ + +#include "Python.h" + +#include <ctype.h> + +#include "regexpr.h" + +static PyObject *ReopError; /* Exception */ + +static PyObject * +makeresult(regs, num_regs) + struct re_registers *regs; + int num_regs; +{ + PyObject *v; + int i; + static PyObject *filler = NULL; + + if (filler == NULL) { + filler = Py_BuildValue("(ii)", -1, -1); + if (filler == NULL) + return NULL; + } + v = PyTuple_New(num_regs); + if (v == NULL) + return NULL; + + for (i = 0; i < num_regs; i++) { + int lo = regs->start[i]; + int hi = regs->end[i]; + PyObject *w; + if (lo == -1 && hi == -1) { + w = filler; + Py_INCREF(w); + } + else + w = Py_BuildValue("(ii)", lo, hi); + if (w == NULL || PyTuple_SetItem(v, i, w) < 0) { + Py_DECREF(v); + return NULL; + } + } + return v; +} + +static PyObject * +reop_match(self, args) + PyObject *self; + PyObject *args; +{ + char *string; + int fastmaplen, stringlen; + int can_be_null, anchor, i; + int num_regs, flags, pos, result; + struct re_pattern_buffer bufp; + struct re_registers re_regs; + + if (!PyArg_Parse(args, "(s#iiis#is#i)", + &(bufp.buffer), &(bufp.allocated), + &num_regs, &flags, &can_be_null, + &(bufp.fastmap), &fastmaplen, + &anchor, + &string, &stringlen, + &pos)) + return NULL; + + /* XXX sanity-check the input data */ + bufp.used=bufp.allocated; + bufp.translate=NULL; + bufp.fastmap_accurate=1; + bufp.can_be_null=can_be_null; + bufp.uses_registers=1; + bufp.num_registers=num_regs; + bufp.anchor=anchor; + + for(i=0; i<num_regs; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;} + + result = re_match(&bufp, + string, stringlen, pos, + &re_regs); + if (result < -1) { + /* Failure like stack overflow */ + PyErr_SetString(ReopError, "match failure"); + return NULL; + } + return makeresult(&re_regs, num_regs); +} + +static PyObject * +reop_search(self, args) + PyObject *self; + PyObject *args; +{ + char *string; + int fastmaplen, stringlen; + int can_be_null, anchor, i; + int num_regs, flags, pos, result; + struct re_pattern_buffer bufp; + struct re_registers re_regs; + + if (!PyArg_Parse(args, "(s#iiis#is#i)", + &(bufp.buffer), &(bufp.allocated), + &num_regs, &flags, &can_be_null, + &(bufp.fastmap), &fastmaplen, + &anchor, + &string, &stringlen, + &pos)) + return NULL; + + /* XXX sanity-check the input data */ + bufp.used=bufp.allocated; + bufp.translate=NULL; + bufp.fastmap_accurate=1; + bufp.can_be_null=can_be_null; + bufp.uses_registers=1; + bufp.num_registers=1; + bufp.anchor=anchor; + + for(i=0; i<num_regs; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;} + + result = re_search(&bufp, + string, stringlen, pos, stringlen-pos, + &re_regs); + if (result < -1) { + /* Failure like stack overflow */ + PyErr_SetString(ReopError, "match failure"); + return NULL; + } + return makeresult(&re_regs, num_regs); +} + +#if 0 +/* Functions originally in the regsub module. + Added June 1, 1997. + */ + +/* A cache of previously used patterns is maintained. Notice that if + you change the reop syntax flag, entries in the cache are + invalidated. + XXX Solution: use (syntax flag, pattern) as keys? Clear the cache + every so often, or once it gets past a certain size? +*/ + +static PyObject *cache_dict=NULL; + +/* Accept an object; if it's a reop pattern, Py_INCREF it and return + it. If it's a string, a reop object is compiled and cached. +*/ + +static reopobject * +cached_compile(pattern) + PyObject *pattern; +{ + reopobject *p2; + + if (!PyString_Check(pattern)) + { + /* It's not a string, so assume it's a compiled reop object */ + /* XXX check that! */ + Py_INCREF(pattern); + return (reopobject*)pattern; + } + if (cache_dict==NULL) + { + cache_dict=PyDict_New(); + if (cache_dict==NULL) + { + return (reopobject*)NULL; + } + } + + /* See if the pattern has already been cached; if so, return that + reop object */ + p2=(reopobject*)PyDict_GetItem(cache_dict, pattern); + if (p2) + { + Py_INCREF(p2); + return (reopobject*)p2; + } + + /* Compile the pattern and cache it */ + p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL); + if (!p2) return p2; + PyDict_SetItem(cache_dict, pattern, (PyObject*)p2); + return p2; +} + + +static PyObject * +internal_split(args, retain) + PyObject *args; + int retain; +{ + PyObject *newlist, *s; + reopobject *pattern; + int maxsplit=0, count=0, length, next=0, result; + int match_end=0; /* match_start is defined below */ + char *start; + + if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern, + &maxsplit)) + { + PyErr_Clear(); + if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern)) + return NULL; + } + pattern=cached_compile((PyObject *)pattern); + if (!pattern) return NULL; + + newlist=PyList_New(0); + if (!newlist) return NULL; + + do + { + result = re_search(&pattern->re_patbuf, + start, length, next, length-next, + &pattern->re_regs); + if (result < -1) + { /* Erk... an error happened during the reop search */ + Py_DECREF(newlist); + PyErr_SetString(ReopError, "match failure"); + return NULL; + } + if (next<=result) + { + int match_start=pattern->re_regs.start[0]; + int oldmatch_end=match_end; + match_end=pattern->re_regs.end[0]; + + if (match_start==match_end) + { /* A zero-length match; increment to the next position */ + next=result+1; + match_end=oldmatch_end; + continue; + } + + /* Append the string up to the start of the match */ + s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end); + if (!s) + { + Py_DECREF(newlist); + return NULL; + } + PyList_Append(newlist, s); + Py_DECREF(s); + + if (retain) + { + /* Append a string containing whatever matched */ + s=PyString_FromStringAndSize(start+match_start, match_end-match_start); + if (!s) + { + Py_DECREF(newlist); + return NULL; + } + PyList_Append(newlist, s); + Py_DECREF(s); + } + /* Update the pointer, and increment the count of splits */ + next=match_end; count++; + } + } while (result!=-1 && !(maxsplit && maxsplit==count) && + next<length); + s=PyString_FromStringAndSize(start+match_end, length-match_end); + if (!s) + { + Py_DECREF(newlist); + return NULL; + } + PyList_Append(newlist, s); + Py_DECREF(s); + Py_DECREF(pattern); + return newlist; +} + +static PyObject * +reop_split(self, args) + PyObject *self; + PyObject *args; +{ + return internal_split(args, 0); +} + +static PyObject * +reop_splitx(self, args) + PyObject *self; + PyObject *args; +{ + return internal_split(args, 1); +} +#endif + +static struct PyMethodDef reop_global_methods[] = { + {"match", reop_match, 0}, + {"search", reop_search, 0}, +#if 0 + {"split", reop_split, 0}, + {"splitx", reop_splitx, 0}, +#endif + {NULL, NULL} /* sentinel */ +}; + +void +initreop() +{ + PyObject *m, *d, *v; + int i; + char *s; + + m = Py_InitModule("reop", reop_global_methods); + d = PyModule_GetDict(m); + + /* Initialize reop.error exception */ + v = ReopError = PyString_FromString("reop.error"); + if (v == NULL || PyDict_SetItemString(d, "error", v) != 0) + goto finally; + + /* Initialize reop.casefold constant */ + if (!(v = PyString_FromStringAndSize((char *)NULL, 256))) + goto finally; + + if (!(s = PyString_AsString(v))) + goto finally; + + for (i = 0; i < 256; i++) { + if (isupper(i)) + s[i] = tolower(i); + else + s[i] = i; + } + if (PyDict_SetItemString(d, "casefold", v) < 0) + goto finally; + Py_DECREF(v); + + if (!PyErr_Occurred()) + return; + finally: + Py_FatalError("can't initialize reop module"); +} |