summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>2003-04-14 17:59:34 (GMT)
committerGuido van Rossum <guido@python.org>2003-04-14 17:59:34 (GMT)
commit41c99e7f96f7a0f192839801c568d8a80dcc7091 (patch)
tree5733d278d932546d0b642dce7e631512da483f76
parent44c62ef5ee2d48cfa0bd024507f19e47d987e6b3 (diff)
downloadcpython-41c99e7f96f7a0f192839801c568d8a80dcc7091.zip
cpython-41c99e7f96f7a0f192839801c568d8a80dcc7091.tar.gz
cpython-41c99e7f96f7a0f192839801c568d8a80dcc7091.tar.bz2
SF patch #720991 by Gary Herron:
A small fix for bug #545855 and Greg Chapman's addition of op code SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of pattern '*?' on a long string.
-rw-r--r--Lib/sre_compile.py7
-rw-r--r--Lib/sre_constants.py4
-rw-r--r--Lib/sre_parse.py4
-rw-r--r--Lib/test/test_sre.py13
-rw-r--r--Misc/NEWS4
-rw-r--r--Modules/_sre.c60
-rw-r--r--Modules/sre_constants.h1
7 files changed, 89 insertions, 4 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index bb17649..3e54819 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -55,8 +55,11 @@ def _compile(code, pattern, flags):
_compile(code, av[2], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
- elif _simple(av) and op == MAX_REPEAT:
- emit(OPCODES[REPEAT_ONE])
+ elif _simple(av) and op != REPEAT:
+ if op == MAX_REPEAT:
+ emit(OPCODES[REPEAT_ONE])
+ else:
+ emit(OPCODES[MIN_REPEAT_ONE])
skip = len(code); emit(0)
emit(av[0])
emit(av[1])
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index b0b61d9..2cd85a3 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -60,6 +60,7 @@ RANGE = "range"
REPEAT = "repeat"
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern"
+MIN_REPEAT_ONE = "min_repeat_one"
# positions
AT_BEGINNING = "at_beginning"
@@ -120,7 +121,8 @@ OPCODES = [
RANGE,
REPEAT,
REPEAT_ONE,
- SUBPATTERN
+ SUBPATTERN,
+ MIN_REPEAT_ONE
]
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 1b52967..fdf6767 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -419,7 +419,7 @@ def _parse(source, state):
set.append(code1)
set.append((LITERAL, ord("-")))
break
- else:
+ elif this:
if this[0] == "\\":
code2 = _class_escape(source, this)
else:
@@ -431,6 +431,8 @@ def _parse(source, state):
if hi < lo:
raise error, "bad character range"
set.append((RANGE, (lo, hi)))
+ else:
+ raise error, "unexpected end of regular expression"
else:
if code1[0] is IN:
code1 = code1[1][0]
diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py
index 6a00aff..e4eb08d 100644
--- a/Lib/test/test_sre.py
+++ b/Lib/test/test_sre.py
@@ -83,6 +83,19 @@ test(r"""sre.match(r'(a)?a','a').lastindex""", None)
test(r"""sre.match(r'(a)(b)?b','ab').lastindex""", 1)
test(r"""sre.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup""", 'a')
+# bug 545855 -- This pattern failed to cause a compile error as it
+# should, instead provoking a TypeError.
+test(r"""sre.compile('foo[a-')""", None, sre.error)
+
+# bugs 418626 at al. -- Testing Greg Chapman's addition of op code
+# SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
+# pattern '*?' on a long string.
+test(r"""sre.match('.*?c', 10000*'ab'+'cd').end(0)""", 20001)
+test(r"""sre.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0)""", 20003)
+test(r"""sre.match('.*?cd', 20000*'abc'+'de').end(0)""", 60001)
+# non-simple '*?' still recurses and hits the recursion limit
+test(r"""sre.search('(a|b)*?c', 10000*'ab'+'cd').end(0)""", None, RuntimeError)
+
if verbose:
print 'Running tests on sre.sub'
diff --git a/Misc/NEWS b/Misc/NEWS
index a85273a..054559b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -54,6 +54,10 @@ Core and builtins
Extension modules
-----------------
+- The .*? pattern in the re module is now special-cased to avoid the
+ recursion limit. (SF patch #720991 -- many thanks to Gary Herron
+ and Greg Chapman.)
+
- New function sys.call_tracing() allows pdb to debug code
recursively.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 619d39b..dde365b 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -993,6 +993,66 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
}
return 0;
+ case SRE_OP_MIN_REPEAT_ONE:
+ /* match repeated sequence (minimizing regexp) */
+
+ /* this operator only works if the repeated item is
+ exactly one character wide, and we're not already
+ collecting backtracking points. for other cases,
+ use the MIN_REPEAT operator */
+
+ /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
+
+ TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr,
+ pattern[1], pattern[2]));
+
+ if (ptr + pattern[1] > end)
+ return 0; /* cannot match */
+
+ state->ptr = ptr;
+
+ if (pattern[1] == 0)
+ count = 0;
+ else {
+ /* count using pattern min as the maximum */
+ count = SRE_COUNT(state, pattern + 3, pattern[1], level + 1);
+
+ if (count < 0)
+ return count; /* exception */
+ if (count < (int) pattern[1])
+ return 0; /* did not match minimum number of times */
+ ptr += count; /* advance past minimum matches of repeat */
+ }
+
+ if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
+ /* tail is empty. we're finished */
+ state->ptr = ptr;
+ return 1;
+
+ } else {
+ /* general case */
+ int matchmax = ((int)pattern[2] == 65535);
+ int c;
+ lastmark = state->lastmark;
+ while (matchmax || count <= (int) pattern[2]) {
+ state->ptr = ptr;
+ i = SRE_MATCH(state, pattern + pattern[0], level + 1);
+ if (i)
+ return i;
+ state->ptr = ptr;
+ c = SRE_COUNT(state, pattern+3, 1, level+1);
+ if (c < 0)
+ return c;
+ if (c == 0)
+ break;
+ assert(c == 1);
+ ptr++;
+ count++;
+ }
+ lastmark_restore(state, lastmark);
+ }
+ return 0;
+
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index ebb9fd0..540008e 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -42,6 +42,7 @@
#define SRE_OP_REPEAT 27
#define SRE_OP_REPEAT_ONE 28
#define SRE_OP_SUBPATTERN 29
+#define SRE_OP_MIN_REPEAT_ONE 30
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2