From 41c99e7f96f7a0f192839801c568d8a80dcc7091 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 14 Apr 2003 17:59:34 +0000 Subject: SF patch #720991 by Gary Herron: A small fix for bug #545855 and Greg Chapman's addition of op code SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of pattern '*?' on a long string. --- Lib/sre_compile.py | 7 ++++-- Lib/sre_constants.py | 4 +++- Lib/sre_parse.py | 4 +++- Lib/test/test_sre.py | 13 +++++++++++ Misc/NEWS | 4 ++++ Modules/_sre.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ Modules/sre_constants.h | 1 + 7 files changed, 89 insertions(+), 4 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index bb17649..3e54819 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -55,8 +55,11 @@ def _compile(code, pattern, flags): _compile(code, av[2], flags) emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip - elif _simple(av) and op == MAX_REPEAT: - emit(OPCODES[REPEAT_ONE]) + elif _simple(av) and op != REPEAT: + if op == MAX_REPEAT: + emit(OPCODES[REPEAT_ONE]) + else: + emit(OPCODES[MIN_REPEAT_ONE]) skip = len(code); emit(0) emit(av[0]) emit(av[1]) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index b0b61d9..2cd85a3 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -60,6 +60,7 @@ RANGE = "range" REPEAT = "repeat" REPEAT_ONE = "repeat_one" SUBPATTERN = "subpattern" +MIN_REPEAT_ONE = "min_repeat_one" # positions AT_BEGINNING = "at_beginning" @@ -120,7 +121,8 @@ OPCODES = [ RANGE, REPEAT, REPEAT_ONE, - SUBPATTERN + SUBPATTERN, + MIN_REPEAT_ONE ] diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 1b52967..fdf6767 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -419,7 +419,7 @@ def _parse(source, state): set.append(code1) set.append((LITERAL, ord("-"))) break - else: + elif this: if this[0] == "\\": code2 = _class_escape(source, this) else: @@ -431,6 +431,8 @@ def _parse(source, state): if hi < lo: raise error, "bad character range" set.append((RANGE, (lo, hi))) + else: + raise error, "unexpected end of regular expression" else: if code1[0] is IN: code1 = code1[1][0] diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py index 6a00aff..e4eb08d 100644 --- a/Lib/test/test_sre.py +++ b/Lib/test/test_sre.py @@ -83,6 +83,19 @@ test(r"""sre.match(r'(a)?a','a').lastindex""", None) test(r"""sre.match(r'(a)(b)?b','ab').lastindex""", 1) test(r"""sre.match(r'(?Pa)(?Pb)?b','ab').lastgroup""", 'a') +# bug 545855 -- This pattern failed to cause a compile error as it +# should, instead provoking a TypeError. +test(r"""sre.compile('foo[a-')""", None, sre.error) + +# bugs 418626 at al. -- Testing Greg Chapman's addition of op code +# SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of +# pattern '*?' on a long string. +test(r"""sre.match('.*?c', 10000*'ab'+'cd').end(0)""", 20001) +test(r"""sre.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0)""", 20003) +test(r"""sre.match('.*?cd', 20000*'abc'+'de').end(0)""", 60001) +# non-simple '*?' still recurses and hits the recursion limit +test(r"""sre.search('(a|b)*?c', 10000*'ab'+'cd').end(0)""", None, RuntimeError) + if verbose: print 'Running tests on sre.sub' diff --git a/Misc/NEWS b/Misc/NEWS index a85273a..054559b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -54,6 +54,10 @@ Core and builtins Extension modules ----------------- +- The .*? pattern in the re module is now special-cased to avoid the + recursion limit. (SF patch #720991 -- many thanks to Gary Herron + and Greg Chapman.) + - New function sys.call_tracing() allows pdb to debug code recursively. diff --git a/Modules/_sre.c b/Modules/_sre.c index 619d39b..dde365b 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -993,6 +993,66 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) } return 0; + case SRE_OP_MIN_REPEAT_ONE: + /* match repeated sequence (minimizing regexp) */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. for other cases, + use the MIN_REPEAT operator */ + + /* <1=min> <2=max> item tail */ + + TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr, + pattern[1], pattern[2])); + + if (ptr + pattern[1] > end) + return 0; /* cannot match */ + + state->ptr = ptr; + + if (pattern[1] == 0) + count = 0; + else { + /* count using pattern min as the maximum */ + count = SRE_COUNT(state, pattern + 3, pattern[1], level + 1); + + if (count < 0) + return count; /* exception */ + if (count < (int) pattern[1]) + return 0; /* did not match minimum number of times */ + ptr += count; /* advance past minimum matches of repeat */ + } + + if (pattern[pattern[0]] == SRE_OP_SUCCESS) { + /* tail is empty. we're finished */ + state->ptr = ptr; + return 1; + + } else { + /* general case */ + int matchmax = ((int)pattern[2] == 65535); + int c; + lastmark = state->lastmark; + while (matchmax || count <= (int) pattern[2]) { + state->ptr = ptr; + i = SRE_MATCH(state, pattern + pattern[0], level + 1); + if (i) + return i; + state->ptr = ptr; + c = SRE_COUNT(state, pattern+3, 1, level+1); + if (c < 0) + return c; + if (c == 0) + break; + assert(c == 1); + ptr++; + count++; + } + lastmark_restore(state, lastmark); + } + return 0; + case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index ebb9fd0..540008e 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -42,6 +42,7 @@ #define SRE_OP_REPEAT 27 #define SRE_OP_REPEAT_ONE 28 #define SRE_OP_SUBPATTERN 29 +#define SRE_OP_MIN_REPEAT_ONE 30 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 -- cgit v0.12