From 43b3b49b5ab486295baef3a35cd8e836f735c065 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Fri, 30 Jun 2000 10:41:31 +0000 Subject: - fixed lookahead assertions (#10, #11, #12) - untabified sre_constants.py --- Lib/sre_compile.py | 100 +++++++++++++++++++++++------------------------ Lib/sre_constants.py | 32 +++++++++++---- Lib/sre_parse.py | 19 +++++++++ Lib/test/output/test_sre | 3 -- Modules/_sre.c | 25 ++++++++---- Modules/sre_constants.h | 57 ++++++++++++++++----------- 6 files changed, 146 insertions(+), 90 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 9fdc8f3..0829c00 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -26,52 +26,12 @@ def _compile(code, pattern, flags): # internal: compile a (sub)pattern emit = code.append for op, av in pattern: - if op is ANY: - if flags & SRE_FLAG_DOTALL: - emit(OPCODES[op]) - else: - emit(OPCODES[CATEGORY]) - emit(CHCODES[CATEGORY_NOT_LINEBREAK]) - elif op in (SUCCESS, FAILURE): - emit(OPCODES[op]) - elif op is AT: - emit(OPCODES[op]) - if flags & SRE_FLAG_MULTILINE: - emit(ATCODES[AT_MULTILINE[av]]) - else: - emit(ATCODES[av]) - elif op is BRANCH: - emit(OPCODES[op]) - tail = [] - for av in av[1]: - skip = len(code); emit(0) - _compile(code, av, flags) - emit(OPCODES[JUMP]) - tail.append(len(code)); emit(0) - code[skip] = len(code) - skip - emit(0) # end of branch - for tail in tail: - code[tail] = len(code) - tail - elif op is CALL: - emit(OPCODES[op]) - skip = len(code); emit(0) - _compile(code, av, flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - elif op is CATEGORY: - emit(OPCODES[op]) - if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) - elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) - else: - emit(CHCODES[av]) - elif op is GROUP: + if op in (LITERAL, NOT_LITERAL): if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) else: emit(OPCODES[op]) - emit(av-1) + emit(ord(av)) elif op is IN: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) @@ -101,15 +61,12 @@ def _compile(code, pattern, flags): raise error, "internal: unsupported set operator" emit(OPCODES[FAILURE]) code[skip] = len(code) - skip - elif op in (LITERAL, NOT_LITERAL): - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - else: + elif op is ANY: + if flags & SRE_FLAG_DOTALL: emit(OPCODES[op]) - emit(ord(av)) - elif op is MARK: - emit(OPCODES[op]) - emit(av) + else: + emit(OPCODES[CATEGORY]) + emit(CHCODES[CATEGORY_NOT_LINEBREAK]) elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): if flags & SRE_FLAG_TEMPLATE: emit(OPCODES[REPEAT]) @@ -150,6 +107,49 @@ def _compile(code, pattern, flags): if group: emit(OPCODES[MARK]) emit((group-1)*2+1) + elif op in (SUCCESS, FAILURE): + emit(OPCODES[op]) + elif op in (ASSERT, ASSERT_NOT, CALL): + emit(OPCODES[op]) + skip = len(code); emit(0) + _compile(code, av, flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip + elif op is AT: + emit(OPCODES[op]) + if flags & SRE_FLAG_MULTILINE: + emit(ATCODES[AT_MULTILINE[av]]) + else: + emit(ATCODES[av]) + elif op is BRANCH: + emit(OPCODES[op]) + tail = [] + for av in av[1]: + skip = len(code); emit(0) + _compile(code, av, flags) + emit(OPCODES[JUMP]) + tail.append(len(code)); emit(0) + code[skip] = len(code) - skip + emit(0) # end of branch + for tail in tail: + code[tail] = len(code) - tail + elif op is CATEGORY: + emit(OPCODES[op]) + if flags & SRE_FLAG_LOCALE: + emit(CHCODES[CH_LOCALE[av]]) + elif flags & SRE_FLAG_UNICODE: + emit(CHCODES[CH_UNICODE[av]]) + else: + emit(CHCODES[av]) + elif op is GROUP: + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + else: + emit(OPCODES[op]) + emit(av-1) + elif op is MARK: + emit(OPCODES[op]) + emit(av) else: raise ValueError, ("unsupported operand type", op) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index f5e7894..45f4f48 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -23,6 +23,7 @@ SUCCESS = "success" ANY = "any" ASSERT = "assert" +ASSERT_NOT = "assert_not" AT = "at" BRANCH = "branch" CALL = "call" @@ -81,7 +82,7 @@ OPCODES = [ FAILURE, SUCCESS, ANY, - ASSERT, + ASSERT, ASSERT_NOT, AT, BRANCH, CALL, @@ -121,8 +122,8 @@ def makedict(list): d = {} i = 0 for item in list: - d[item] = i - i = i + 1 + d[item] = i + i = i + 1 return d OPCODES = makedict(OPCODES) @@ -176,12 +177,27 @@ SRE_FLAG_VERBOSE = 64 if __name__ == "__main__": import string def dump(f, d, prefix): - items = d.items() - items.sort(lambda a, b: cmp(a[1], b[1])) - for k, v in items: - f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v)) + items = d.items() + items.sort(lambda a, b: cmp(a[1], b[1])) + for k, v in items: + f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v)) f = open("sre_constants.h", "w") - f.write("/* generated from sre_constants.py */\n") + f.write("""\ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by sre_constants.py. If you need + * to change anything in here, edit sre_constants.py and run it. + * + * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +""") + dump(f, OPCODES, "SRE_OP") dump(f, ATCODES, "SRE") dump(f, CHCODES, "SRE") diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index a6f3082..d3dbe00 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -470,6 +470,25 @@ def _parse(source, state, flags=0): if source.next is None or source.next == ")": break source.get() + elif source.next in ("=", "!"): + # lookahead assertions + char = source.get() + b = [] + while 1: + p = _parse(source, state, flags) + if source.next == ")": + if b: + b.append(p) + p = _branch(state, b) + if char == "=": + subpattern.append((ASSERT, p)) + else: + subpattern.append((ASSERT_NOT, p)) + break + elif source.match("|"): + b.append(p) + else: + raise error, "pattern not properly closed" else: # flags while FLAGS.has_key(source.next): diff --git a/Lib/test/output/test_sre b/Lib/test/output/test_sre index 75caa55..d3732b5 100644 --- a/Lib/test/output/test_sre +++ b/Lib/test/output/test_sre @@ -6,7 +6,4 @@ test_support -- test failed re module cPickle === grouping error ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/') 'd:msgs/tdir/sub1/-trial/' should be 'd:msgs/tdir/sub1/-tdir/' === grouping error ('([abc])*bcd', 'abcd', 0, 'found+"-"+g1', 'abcd-a') 'abcd-c' should be 'abcd-a' === grouping error ('(?i)([abc])*bcd', 'ABCD', 0, 'found+"-"+g1', 'ABCD-A') 'ABCD-C' should be 'ABCD-A' -=== Syntax error: ('a(?!b).', 'abad', 0, 'found', 'ad') -=== Syntax error: ('a(?=d).', 'abad', 0, 'found', 'ad') -=== Syntax error: ('a(?=c|d).', 'abad', 0, 'found', 'ad') === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A') diff --git a/Modules/_sre.c b/Modules/_sre.c index 6fcd65e..22b6c73 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -20,6 +20,7 @@ * 00-06-28 fl fixed findall (0.9.1) * 00-06-29 fl fixed split, added more scanner features (0.9.2) * 00-06-30 fl tuning, fast search (0.9.3) + * 00-06-30 fl added assert (lookahead) primitives (0.9.4) * * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * @@ -30,7 +31,7 @@ #ifndef SRE_RECURSIVE -char copyright[] = " SRE 0.9.3 Copyright (c) 1997-2000 by Secret Labs AB "; +char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB "; #include "Python.h" @@ -576,11 +577,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) pattern += pattern[0]; break; -#if 0 - case SRE_OP_CALL: - /* match subpattern, without backtracking */ + case SRE_OP_ASSERT: + /* assert subpattern */ /* args: */ - TRACE(("%8d: subpattern\n", PTR(ptr))); + TRACE(("%8d: assert subpattern\n", PTR(ptr))); state->ptr = ptr; i = SRE_MATCH(state, pattern + 1); if (i < 0) @@ -588,9 +588,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) if (!i) goto failure; pattern += pattern[0]; - ptr = state->ptr; break; -#endif + + case SRE_OP_ASSERT_NOT: + /* assert not subpattern */ + /* args: */ + TRACE(("%8d: assert not subpattern\n", PTR(ptr))); + state->ptr = ptr; + i = SRE_MATCH(state, pattern + 1); + if (i < 0) + return i; + if (i) + goto failure; + pattern += pattern[0]; + break; #if 0 case SRE_OP_MAX_REPEAT_ONE: diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 01c8448..2ec00ba 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -1,29 +1,42 @@ -/* generated from sre_constants.py */ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by sre_constants.py. If you need + * to change anything in here, edit sre_constants.py and run it. + * + * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 #define SRE_OP_ASSERT 3 -#define SRE_OP_AT 4 -#define SRE_OP_BRANCH 5 -#define SRE_OP_CALL 6 -#define SRE_OP_CATEGORY 7 -#define SRE_OP_GROUP 8 -#define SRE_OP_GROUP_IGNORE 9 -#define SRE_OP_IN 10 -#define SRE_OP_IN_IGNORE 11 -#define SRE_OP_INFO 12 -#define SRE_OP_JUMP 13 -#define SRE_OP_LITERAL 14 -#define SRE_OP_LITERAL_IGNORE 15 -#define SRE_OP_MARK 16 -#define SRE_OP_MAX_REPEAT 17 -#define SRE_OP_MAX_REPEAT_ONE 18 -#define SRE_OP_MIN_REPEAT 19 -#define SRE_OP_NOT_LITERAL 20 -#define SRE_OP_NOT_LITERAL_IGNORE 21 -#define SRE_OP_NEGATE 22 -#define SRE_OP_RANGE 23 -#define SRE_OP_REPEAT 24 +#define SRE_OP_ASSERT_NOT 4 +#define SRE_OP_AT 5 +#define SRE_OP_BRANCH 6 +#define SRE_OP_CALL 7 +#define SRE_OP_CATEGORY 8 +#define SRE_OP_GROUP 9 +#define SRE_OP_GROUP_IGNORE 10 +#define SRE_OP_IN 11 +#define SRE_OP_IN_IGNORE 12 +#define SRE_OP_INFO 13 +#define SRE_OP_JUMP 14 +#define SRE_OP_LITERAL 15 +#define SRE_OP_LITERAL_IGNORE 16 +#define SRE_OP_MARK 17 +#define SRE_OP_MAX_REPEAT 18 +#define SRE_OP_MAX_REPEAT_ONE 19 +#define SRE_OP_MIN_REPEAT 20 +#define SRE_OP_NOT_LITERAL 21 +#define SRE_OP_NOT_LITERAL_IGNORE 22 +#define SRE_OP_NEGATE 23 +#define SRE_OP_RANGE 24 +#define SRE_OP_REPEAT 25 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BOUNDARY 2 -- cgit v0.12