diff options
-rw-r--r-- | Lib/re.py | 35 | ||||
-rwxr-xr-x | Lib/test/re_tests.py | 15 | ||||
-rw-r--r-- | Lib/test/regex_tests.py | 8 | ||||
-rw-r--r-- | Lib/test/test_re.py | 13 | ||||
-rw-r--r-- | Modules/regexmodule.c | 12 | ||||
-rw-r--r-- | Modules/regexpr.c | 169 | ||||
-rw-r--r-- | Modules/regexpr.h | 38 | ||||
-rw-r--r-- | Modules/reopmodule.c | 50 |
8 files changed, 226 insertions, 114 deletions
@@ -317,10 +317,19 @@ class Eol(Instruction): class Set(Instruction): name = 'set' - def __init__(self, set): + def __init__(self, set, flags=0): self.set = set - Instruction.__init__(self, chr(3), 33) + if flags & IGNORECASE: self.set=map(string.lower, self.set) + if len(set)==1: + # If only one element, use the "exact" opcode (it'll be faster) + Instruction.__init__(self, chr(4), 2) + else: + # Use the "set" opcode + Instruction.__init__(self, chr(3), 33) def assemble(self, position, labels): + if len(self.set)==1: + # If only one character in set, generate an "exact" opcode + return self.opcode + self.set[0] result = self.opcode temp = 0 for i, c in map(lambda x: (x, chr(x)), range(256)): @@ -333,14 +342,16 @@ class Set(Instruction): def __repr__(self): result = '%-15s' % (self.name) self.set.sort() + # XXX this should print more intelligently for char in self.set: result = result + char return result class Exact(Instruction): name = 'exact' - def __init__(self, char): + def __init__(self, char, flags): self.char = char + if flags & IGNORECASE: self.char=string.lower(self.char) Instruction.__init__(self, chr(4), 2) def assemble(self, position, labels): return self.opcode + self.char @@ -881,7 +892,7 @@ def compile(pattern, flags=0): escape_type, value, index = expand_escape(pattern, index) if escape_type == CHAR: - stack.append([Exact(value)]) + stack.append([Exact(value, flags)]) lastop = '\\' + value elif escape_type == MEMORY_REFERENCE: @@ -1306,7 +1317,7 @@ def compile(pattern, flags=0): elif char == '.': if flags & DOTALL: - stack.append([Set(map(chr, range(256)))]) + stack.append([Set(map(chr, range(256)), flags)]) else: stack.append([AnyChar()]) lastop = '.' @@ -1336,12 +1347,12 @@ def compile(pattern, flags=0): index = end + 1 # do not change lastop else: - stack.append([Exact(char)]) + stack.append([Exact(char, flags)]) lastop = '#' elif char in string.whitespace: if not (flags & VERBOSE): - stack.append([Exact(char)]) + stack.append([Exact(char, flags)]) lastop = char elif char == '[': @@ -1449,22 +1460,25 @@ def compile(pattern, flags=0): index = index + 1 if negate: + # If case is being ignored, then both upper- and lowercase + # versions of the letters must be excluded. + if flags & IGNORECASE: set=set+map(string.upper, set) notset = [] for char in map(chr, range(256)): if char not in set: notset.append(char) if len(notset) == 0: raise error, 'empty negated set' - stack.append([Set(notset)]) + stack.append([Set(notset, flags)]) else: if len(set) == 0: raise error, 'empty set' - stack.append([Set(set)]) + stack.append([Set(set, flags)]) lastop = '[]' else: - stack.append([Exact(char)]) + stack.append([Exact(char, flags)]) lastop = char code = [] @@ -1485,6 +1499,7 @@ def compile(pattern, flags=0): code.append(Label(label)) label = label + 1 code.append(End()) +# print code return RegexObject(pattern, flags, code, register, groupindex) # Replace expand_escape and _expand functions with their C equivalents. diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py index eb50558..9143938 100755 --- a/Lib/test/re_tests.py +++ b/Lib/test/re_tests.py @@ -318,6 +318,7 @@ tests = [ # ('((((((((((a))))))))))\\41', 'aa', FAIL), # ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'), ('((((((((((a))))))))))\\41', '', SYNTAX_ERROR), + ('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR), ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), ('multiple words of text', 'uh-uh', FAIL), ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), @@ -448,7 +449,6 @@ tests = [ ('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'), #('(?i)((((((((((a))))))))))\\41', 'AA', FAIL), #('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'), - ('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR), ('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'), ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'), ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'), @@ -506,10 +506,21 @@ xyzabc ('a.b', 'a\nb', FAIL), ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), - # test \w, etc. + # test \w, etc. both inside and outside character classes ('\\w+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'), + ('[\\w]+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'), ('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'), + ('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'), ('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'), ('[\\d-x]', '-', SYNTAX_ERROR), + (r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), + (r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), + + (r'\xff', '\377', SUCCEED, 'found', chr(255)), + (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), + (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), + ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), + (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)), + (r'[\t][\n][\v][\r][\f][\a][\A][\b][\B][\Z][\g]', '\t\n\v\r\f\aA\bBZg', SUCCEED, 'found', '\t\n\v\r\f\aA\bBZg'), ] diff --git a/Lib/test/regex_tests.py b/Lib/test/regex_tests.py index 70ecdab..dcb980a 100644 --- a/Lib/test/regex_tests.py +++ b/Lib/test/regex_tests.py @@ -278,6 +278,12 @@ tests = [ ('\\([xyz]*\\)x', 'abcx', SUCCEED, 'found+"-"+g1', 'x-'), ('\\(a\\)+b\\|aac', 'aac', SUCCEED, - 'found+"-"+g1', 'aac-None') + 'found+"-"+g1', 'aac-None'), +('\<a', 'a', SUCCEED, 'found', 'a'), +('\<a', '!', FAIL), +('a\<b', 'ab', FAIL), +('a\>', 'ab', FAIL), +('a\>', 'a!', SUCCEED, 'found', 'a'), +('a\>', 'a', SUCCEED, 'found', 'a'), ] diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 1581856..c4b21cf 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -31,6 +31,10 @@ try: assert re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx') == 'xxxx' + assert re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a') == '\t\n\v\r\f\a\bBZ\aAwWsSdD' + assert re.sub('a', '\t\n\v\r\f\a', 'a') == '\t\n\v\r\f\a' + assert re.sub('a', '\t\n\v\r\f\a', 'a') == (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)) + except AssertionError: raise TestFailed, "re.sub" @@ -120,7 +124,6 @@ if verbose: print 'Running re_tests test suite' for t in tests: - print t sys.stdout.flush() pattern=s=outcome=repl=expected=None if len(t)==5: @@ -136,6 +139,7 @@ for t in tests: if outcome==SYNTAX_ERROR: pass # Expected a syntax error else: print '=== Syntax error:', t + except KeyboardInterrupt: raise KeyboardInterrupt except: print '*** Unexpected error ***' if verbose: @@ -182,3 +186,10 @@ for t in tests: print repr(repl)+' should be '+repr(expected) else: print '=== Failed incorrectly', t + + # Try the match with IGNORECASE enabled, and check that it + # still succeeds. + obj=re.compile(pattern, re.IGNORECASE) + result=obj.search(s) + if result==None: + print '=== Fails on case-insensitive match', t diff --git a/Modules/regexmodule.c b/Modules/regexmodule.c index ad86068..32360f2 100644 --- a/Modules/regexmodule.c +++ b/Modules/regexmodule.c @@ -132,8 +132,10 @@ regobj_match(re, args) re->re_lastok = NULL; result = re_match(&re->re_patbuf, buffer, size, offset, &re->re_regs); if (result < -1) { - /* Failure like stack overflow */ - PyErr_SetString(RegexError, "match failure"); + /* Serious failure of some sort; if re_match didn't + set an exception, raise a generic error */ + if (!PyErr_Occurred()) + PyErr_SetString(RegexError, "match failure"); return NULL; } if (result >= 0) { @@ -174,8 +176,10 @@ regobj_search(re, args) result = re_search(&re->re_patbuf, buffer, size, offset, range, &re->re_regs); if (result < -1) { - /* Failure like stack overflow */ - PyErr_SetString(RegexError, "match failure"); + /* Serious failure of some sort; if re_match didn't + set an exception, raise a generic error */ + if (!PyErr_Occurred()) + PyErr_SetString(RegexError, "match failure"); return NULL; } if (result >= 0) { diff --git a/Modules/regexpr.c b/Modules/regexpr.c index 4ea6a3e..08fcc3a 100644 --- a/Modules/regexpr.c +++ b/Modules/regexpr.c @@ -33,6 +33,7 @@ #include "myproto.h" /* For PROTO macro --Guido */ #include <stdio.h> +#include "Python.h" #ifndef NDEBUG #define NDEBUG 1 @@ -85,16 +86,16 @@ typedef union item_t { int num; int level; - char *start; - char *end; + unsigned char *start; + unsigned char *end; } reg; struct { int count; int level; int phantom; - char *code; - char *text; + unsigned char *code; + unsigned char *text; } fail; struct { @@ -139,8 +140,8 @@ typedef struct match_state * offsets from the beginning of the string before returning the * registers to the calling program. */ - char *start[NUM_REGISTERS]; - char *end[NUM_REGISTERS]; + unsigned char *start[NUM_REGISTERS]; + unsigned char *end[NUM_REGISTERS]; /* Keeps track of whether a register has changed recently. */ @@ -422,7 +423,7 @@ enum regexp_compiled_ops /* opcodes for compiled regexp */ Cwordbound, /* match if at word boundary */ Cnotwordbound, /* match if not at word boundary */ Csyntaxspec, /* matches syntax code (1 byte follows) */ - Cnotsyntaxspec, /* matches if syntax code does not match (1 byte foll)*/ + Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */ Crepeat1 }; @@ -469,7 +470,7 @@ static int regexp_ansi_sequences; #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)] -char re_syntax_table[256]; +unsigned char re_syntax_table[256]; void re_compile_initialize(void) { @@ -593,11 +594,11 @@ static int hex_char_to_decimal(int ch) return 16; } -static void re_compile_fastmap_aux(char *code, +static void re_compile_fastmap_aux(unsigned char *code, int pos, - char *visited, - char *can_be_null, - char *fastmap) + unsigned char *visited, + unsigned char *can_be_null, + unsigned char *fastmap) { int a; int b; @@ -717,19 +718,20 @@ static void re_compile_fastmap_aux(char *code, } default: { - abort(); /* probably some opcode is missing from this switch */ + PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?"); + return; /*NOTREACHED*/ } } } -static int re_do_compile_fastmap(char *buffer, +static int re_do_compile_fastmap(unsigned char *buffer, int used, int pos, - char *can_be_null, - char *fastmap) + unsigned char *can_be_null, + unsigned char *fastmap) { - char small_visited[512], *visited; + unsigned char small_visited[512], *visited; if (used <= sizeof(small_visited)) visited = small_visited; @@ -759,6 +761,7 @@ void re_compile_fastmap(regexp_t bufp) &bufp->can_be_null, bufp->fastmap)) return; + if (PyErr_Occurred()) return; if (bufp->buffer[0] == Cbol) bufp->anchor = 1; /* begline */ else @@ -792,22 +795,29 @@ void re_compile_fastmap(regexp_t bufp) * */ -static int re_optimize_star_jump(regexp_t bufp, char *code) +static int re_optimize_star_jump(regexp_t bufp, unsigned char *code) { - char map[256]; - char can_be_null; - char *p1; - char *p2; - char ch; + unsigned char map[256]; + unsigned char can_be_null; + unsigned char *p1; + unsigned char *p2; + unsigned char ch; int a; int b; int num_instructions = 0; - + a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; a = (int)SHORT(a); p1 = code + a + 3; /* skip the failure_jump */ + /* Check that the jump is within the pattern */ + if (p1<bufp->buffer || bufp->buffer+bufp->used<p1) + { + PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (failure_jump opt)"); + return 0; + } + assert(p1[-3] == Cfailure_jump); p2 = code; /* p1 points inside loop, p2 points to after loop */ @@ -923,7 +933,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code) } } - make_update_jump: + /* make_update_jump: */ code -= 3; a += 3; /* jump to after the Cfailure_jump */ code[0] = Cupdate_failure_jump; @@ -948,7 +958,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code) static int re_optimize(regexp_t bufp) { - char *code; + unsigned char *code; code = bufp->buffer; @@ -1073,7 +1083,7 @@ else \ #define GETHEX(var) \ { \ - char gethex_ch, gethex_value; \ + unsigned char gethex_ch, gethex_value; \ NEXTCHAR(gethex_ch); \ gethex_value = hex_char_to_decimal(gethex_ch); \ if (gethex_value == 16) \ @@ -1147,7 +1157,7 @@ else \ } \ } -char *re_compile_pattern(char *regex, int size, regexp_t bufp) +unsigned char *re_compile_pattern(unsigned char *regex, int size, regexp_t bufp) { int a; int pos; @@ -1161,8 +1171,8 @@ char *re_compile_pattern(char *regex, int size, regexp_t bufp) int future_jumps[MAX_NESTING]; int num_jumps; unsigned char ch = '\0'; - char *pattern; - char *translate; + unsigned char *pattern; + unsigned char *translate; int next_register; int paren_depth; int num_open_registers; @@ -1580,23 +1590,23 @@ if (translate) \ var = translate[var] int re_match(regexp_t bufp, - char *string, + unsigned char *string, int size, int pos, regexp_registers_t old_regs) { - char *code; - char *translate; - char *text; - char *textstart; - char *textend; + unsigned char *code; + unsigned char *translate; + unsigned char *text; + unsigned char *textstart; + unsigned char *textend; int a; int b; int ch; int reg; int match_end; - char *regstart; - char *regend; + unsigned char *regstart; + unsigned char *regend; int regsize; match_state state; @@ -1738,18 +1748,36 @@ int re_match(regexp_t bufp, a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; code += (int)SHORT(a); + if (code<bufp->buffer || bufp->buffer+bufp->used<code) { + PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cjump)"); + FREE_STATE(state); + return -2; + } goto continue_matching; } case Cdummy_failure_jump: { + unsigned char *failuredest; + a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; a = (int)SHORT(a); assert(*code == Cfailure_jump); b = (unsigned char)code[1]; b |= (unsigned char)code[2] << 8; - PUSH_FAILURE(state, code + (int)SHORT(b) + 3, NULL, goto error); + failuredest = code + (int)SHORT(b) + 3; + if (failuredest<bufp->buffer || bufp->buffer+bufp->used < failuredest) { + PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)"); + FREE_STATE(state); + return -2; + } + PUSH_FAILURE(state, failuredest, NULL, goto error); code += a; + if (code<bufp->buffer || bufp->buffer+bufp->used < code) { + PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump code)"); + FREE_STATE(state); + return -2; + } goto continue_matching; } case Cfailure_jump: @@ -1757,16 +1785,26 @@ int re_match(regexp_t bufp, a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; a = (int)SHORT(a); + if (code+a<bufp->buffer || bufp->buffer+bufp->used < code+a) { + PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cfailure_jump)"); + FREE_STATE(state); + return -2; + } PUSH_FAILURE(state, code + a, text, goto error); goto continue_matching; } case Crepeat1: { - char *pinst; + unsigned char *pinst; a = (unsigned char)*code++; a |= (unsigned char)*code++ << 8; a = (int)SHORT(a); pinst = code + a; + if (pinst<bufp->buffer || bufp->buffer+bufp->used<pinst) { + PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Crepeat1)"); + FREE_STATE(state); + return -2; + } /* pinst is sole instruction in loop, and it matches a * single character. Since Crepeat1 was originally a * Cupdate_failure_jump, we also know that backtracking @@ -1777,8 +1815,8 @@ int re_match(regexp_t bufp, switch (*pinst++) { case Cset: - { - if (translate) + { + if (translate) { while (text < textend) { @@ -1801,7 +1839,7 @@ int re_match(regexp_t bufp, } } break; - } + } case Cexact: { ch = (unsigned char)*pinst; @@ -1858,7 +1896,9 @@ int re_match(regexp_t bufp, } default: { - abort(); + FREE_STATE(state); + PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?"); + return -2; /*NOTREACHED*/ } } @@ -1884,7 +1924,7 @@ int re_match(regexp_t bufp, { if (text == textend) goto fail; - if (!(SYNTAX(*text) & Sword)) + if (!(SYNTAX(*text) & Sword)) goto fail; if (text == textstart) goto continue_matching; @@ -1900,9 +1940,9 @@ int re_match(regexp_t bufp, goto fail; if (text == textend) goto continue_matching; - if (SYNTAX(*text) & Sword) - goto fail; - goto continue_matching; + if (!(SYNTAX(*text) & Sword)) + goto continue_matching; + goto fail; } case Cwordbound: { @@ -1936,15 +1976,19 @@ int re_match(regexp_t bufp, { NEXTCHAR(ch); if (SYNTAX(ch) & (unsigned char)*code++) - break; + goto fail; goto continue_matching; } default: { - abort(); + FREE_STATE(state); + PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?"); + return -2; /*NOTREACHED*/ } } + + #if 0 /* This line is never reached --Guido */ abort(); @@ -1952,7 +1996,8 @@ int re_match(regexp_t bufp, /* *NOTREACHED */ - + + /* Using "break;" in the above switch statement is equivalent to "goto fail;" */ fail: POP_FAILURE(state, code, text, goto done_matching, goto error); goto continue_matching; @@ -1969,33 +2014,37 @@ int re_match(regexp_t bufp, FREE_STATE(state); return -2; } + #undef PREFETCH #undef NEXTCHAR int re_search(regexp_t bufp, - char *string, + unsigned char *string, int size, int pos, int range, regexp_registers_t regs) { - char *fastmap; - char *translate; - char *text; - char *partstart; - char *partend; + unsigned char *fastmap; + unsigned char *translate; + unsigned char *text; + unsigned char *partstart; + unsigned char *partend; int dir; int ret; - char anchor; + unsigned char anchor; assert(size >= 0 && pos >= 0); assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */ fastmap = bufp->fastmap; translate = bufp->translate; - if (fastmap && !bufp->fastmap_accurate) - re_compile_fastmap(bufp); + if (fastmap && !bufp->fastmap_accurate) { + re_compile_fastmap(bufp); + if (PyErr_Occurred()) return -2; + } + anchor = bufp->anchor; if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */ fastmap = NULL; diff --git a/Modules/regexpr.h b/Modules/regexpr.h index 9ac2ab9..729088e 100644 --- a/Modules/regexpr.h +++ b/Modules/regexpr.h @@ -33,16 +33,16 @@ extern "C" { typedef struct re_pattern_buffer { - char *buffer; /* compiled pattern */ + unsigned char *buffer; /* compiled pattern */ int allocated; /* allocated size of compiled pattern */ int used; /* actual length of compiled pattern */ - char *fastmap; /* fastmap[ch] is true if ch can start pattern */ - char *translate; /* translation to apply during compilation/matching */ - char fastmap_accurate; /* true if fastmap is valid */ - char can_be_null; /* true if can match empty string */ - char uses_registers; /* registers are used and need to be initialized */ + unsigned char *fastmap; /* fastmap[ch] is true if ch can start pattern */ + unsigned char *translate; /* translation to apply during compilation/matching */ + unsigned char fastmap_accurate; /* true if fastmap is valid */ + unsigned char can_be_null; /* true if can match empty string */ + unsigned char uses_registers; /* registers are used and need to be initialized */ int num_registers; /* number of registers used */ - char anchor; /* anchor: 0=none 1=begline 2=begbuf */ + unsigned char anchor; /* anchor: 0=none 1=begline 2=begbuf */ } *regexp_t; typedef struct re_registers @@ -93,7 +93,7 @@ extern int re_syntax; /* This is the actual syntax mask. It was added so that Python could do * syntax-dependent munging of patterns before compilation. */ -extern char re_syntax_table[256]; +extern unsigned char re_syntax_table[256]; void re_compile_initialize(void); @@ -101,7 +101,7 @@ int re_set_syntax(int syntax); /* This sets the syntax to use and returns the previous syntax. The * syntax is specified by a bit mask of the above defined bits. */ -char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled); +unsigned char *re_compile_pattern(unsigned char *regex, int regex_size, regexp_t compiled); /* This compiles the regexp (given in regex and length in regex_size). * This returns NULL if the regexp compiled successfully, and an error * message if an error was encountered. The buffer field must be @@ -110,14 +110,14 @@ char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled); * buffer is NULL). Also, the translate field must be set to point to a * valid translation table, or NULL if it is not used. */ -int re_match(regexp_t compiled, char *string, int size, int pos, +int re_match(regexp_t compiled, unsigned char *string, int size, int pos, regexp_registers_t old_regs); /* This tries to match the regexp against the string. This returns the * length of the matched portion, or -1 if the pattern could not be * matched and -2 if an error (such as failure stack overflow) is * encountered. */ -int re_search(regexp_t compiled, char *string, int size, int startpos, +int re_search(regexp_t compiled, unsigned char *string, int size, int startpos, int range, regexp_registers_t regs); /* This rearches for a substring matching the regexp. This returns the * first index at which a match is found. range specifies at how many @@ -132,28 +132,16 @@ void re_compile_fastmap(regexp_t compiled); * the calling program must have initialized the fastmap field to point * to an array of 256 characters. */ -char *re_comp(char *s); -/* BSD 4.2 regex library routine re_comp. This compiles the regexp into - * an internal buffer. This returns NULL if the regexp was compiled - * successfully, and an error message if there was an error. */ - -int re_exec(char *s); -/* BSD 4.2 regexp library routine re_exec. This returns true if the - * string matches the regular expression (that is, a matching part is - * found anywhere in the string). */ - #else /* HAVE_PROTOTYPES */ extern int re_syntax; -extern char re_syntax_table[256]; +extern unsigned char re_syntax_table[256]; void re_compile_initialize(); int re_set_syntax(); -char *re_compile_pattern(); +unsigned char *re_compile_pattern(); int re_match(); int re_search(); void re_compile_fastmap(); -char *re_comp(); -int re_exec(); #endif /* HAVE_PROTOTYPES */ diff --git a/Modules/reopmodule.c b/Modules/reopmodule.c index 3578ac7..39e6ece 100644 --- a/Modules/reopmodule.c +++ b/Modules/reopmodule.c @@ -62,7 +62,7 @@ static PyObject *ReopError; /* Exception */ #define BEGINNING_OF_BUFFER 7 #define END_OF_BUFFER 8 -static char *reop_casefold; +static unsigned char *reop_casefold; static PyObject * makeresult(regs, num_regs) @@ -105,7 +105,7 @@ reop_match(self, args) PyObject *self; PyObject *args; { - char *string; + unsigned char *string; int fastmaplen, stringlen; int can_be_null, anchor, i; int flags, pos, result; @@ -163,8 +163,8 @@ reop_match(self, args) if (result < -1) { /* Failure like stack overflow */ - PyErr_SetString(ReopError, "match failure"); - + if (!PyErr_Occurred()) + PyErr_SetString(ReopError, "match failure"); return NULL; } if (result == -1) { @@ -174,12 +174,38 @@ reop_match(self, args) return makeresult(&re_regs, bufp.num_registers); } +#if 0 +static PyObject * +reop_optimize(self, args) + PyObject *self; + PyObject *args; +{ + unsigned char *buffer; + int buflen; + struct re_pattern_buffer bufp; + + PyObject *opt_code; + + if (!PyArg_Parse(args, "(s#)", &buffer, &buflen)) return NULL; + /* Create a new string for the optimized code */ + opt_code=PyString_FromStringAndSize(buffer, buflen); + if (opt_code!=NULL) + { + bufp.buffer = PyString_AsString(opt_code); + bufp.used=bufp.allocated=buflen; + + } + return opt_code; + +} +#endif + static PyObject * reop_search(self, args) PyObject *self; PyObject *args; { - char *string; + unsigned char *string; int fastmaplen, stringlen; int can_be_null, anchor, i; int flags, pos, result; @@ -237,7 +263,8 @@ reop_search(self, args) if (result < -1) { /* Failure like stack overflow */ - PyErr_SetString(ReopError, "match failure"); + if (!PyErr_Occurred()) + PyErr_SetString(ReopError, "match failure"); return NULL; } @@ -626,7 +653,7 @@ reop__expand(self, args) { PyObject *results, *match_obj; PyObject *repl_obj, *newstring; - char *repl; + unsigned char *repl; int size, total_len, i, start, pos; if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj)) @@ -810,7 +837,7 @@ internal_split(args, retain) reopobject *pattern; int maxsplit=0, count=0, length, next=0, result; int match_end=0; /* match_start is defined below */ - char *start; + unsigned char *start; if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern, &maxsplit)) @@ -911,6 +938,7 @@ static struct PyMethodDef reop_global_methods[] = { {"expand_escape", reop_expand_escape, 1}, {"_expand", reop__expand, 1}, #if 0 + {"_optimize", reop_optimize, 0}, {"split", reop_split, 0}, {"splitx", reop_splitx, 0}, #endif @@ -922,8 +950,8 @@ initreop() { PyObject *m, *d, *k, *v, *o; int i; - char *s; - char j[2]; + unsigned char *s; + unsigned char j[2]; re_compile_initialize(); @@ -936,7 +964,7 @@ initreop() goto finally; /* Initialize reop.casefold constant */ - if (!(v = PyString_FromStringAndSize((char *)NULL, 256))) + if (!(v = PyString_FromStringAndSize((unsigned char *)NULL, 256))) goto finally; if (!(s = PyString_AsString(v))) |