From fdc0ebcebdabf8d44d0fb97275444ef980028f66 Mon Sep 17 00:00:00 2001 From: dkf Date: Tue, 18 Dec 2007 11:23:11 +0000 Subject: Fixes for problems created when processing regular expressions that generate very large automata. An enormous number of thanks to Will Drewry , Tavis Ormandy , and Tom Lane from the Postgresql crowd for their help in tracking these problems down. [Bug 1810264] --- ChangeLog | 9 ++++++++ generic/regc_color.c | 27 +++++++++++----------- generic/regc_nfa.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++- generic/regerrs.h | 1 + generic/regex.h | 1 + generic/regguts.h | 8 +++++++ tests/reg.test | 15 +++++++++++- 7 files changed, 110 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index e21c61e..c7dfe53 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2007-12-18 Donal K. Fellows + + * generic/regguts.h, generic/regc_color.c, generic/regc_nfa.c: + Fixes for problems created when processing regular expressions that + generate very large automata. An enormous number of thanks to Will + Drewry , Tavis Ormandy , and Tom + Lane from the Postgresql crowd for their help in + tracking these problems down. [Bug 1810264] + 2007-12-14 Jeff Hobbs * win/README: updated notes diff --git a/generic/regc_color.c b/generic/regc_color.c index 5aed21c..f6716be 100644 --- a/generic/regc_color.c +++ b/generic/regc_color.c @@ -552,12 +552,9 @@ struct colormap *cm; scd->sub = NOSUB; while ((a = cd->arcs) != NULL) { assert(a->co == co); - /* uncolorchain(cm, a); */ - cd->arcs = a->colorchain; + uncolorchain(cm, a); a->co = sco; - /* colorchain(cm, a); */ - a->colorchain = scd->arcs; - scd->arcs = a; + colorchain(cm, a); } freecolor(cm, co); } else { @@ -586,7 +583,10 @@ struct arc *a; { struct colordesc *cd = &cm->cd[a->co]; + if (cd->arcs) + cd->arcs->colorchain_rev = a; a->colorchain = cd->arcs; + a->colorchain_rev = NULL; cd->arcs = a; } @@ -600,18 +600,19 @@ struct colormap *cm; struct arc *a; { struct colordesc *cd = &cm->cd[a->co]; - struct arc *aa; + struct arc *aa = a->colorchain_rev; - aa = cd->arcs; - if (aa == a) /* easy case */ + if (aa == NULL) { + assert(cd->arcs == a); cd->arcs = a->colorchain; - else { - for (; aa != NULL && aa->colorchain != a; aa = aa->colorchain) - continue; - assert(aa != NULL); + } else { + assert(aa->colorchain == a); aa->colorchain = a->colorchain; } - a->colorchain = NULL; /* paranoia */ + if (a->colorchain) + a->colorchain->colorchain_rev = aa; + a->colorchain = NULL; /* paranoia */ + a->colorchain_rev = NULL; } /* diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c index dd26e64..48f56a9 100644 --- a/generic/regc_nfa.c +++ b/generic/regc_nfa.c @@ -61,11 +61,12 @@ struct nfa *parent; /* NULL if primary NFA */ nfa->nstates = 0; nfa->cm = cm; nfa->v = v; + nfa->size = 0; nfa->bos[0] = nfa->bos[1] = COLORLESS; nfa->eos[0] = nfa->eos[1] = COLORLESS; + nfa->parent = parent; nfa->post = newfstate(nfa, '@'); /* number 0 */ nfa->pre = newfstate(nfa, '>'); /* number 1 */ - nfa->parent = parent; nfa->init = newstate(nfa); /* may become invalid later */ nfa->final = newstate(nfa); @@ -88,6 +89,57 @@ struct nfa *parent; /* NULL if primary NFA */ } /* + - too_many_states - checks if the max states exceeds the compile-time value + ^ static int too_many_states(struct nfa *); + */ +static int +too_many_states(nfa) +struct nfa *nfa; +{ + struct nfa *parent = nfa->parent; + size_t sz = nfa->size; + while (parent != NULL) { + sz = parent->size; + parent = parent->parent; + } + if (sz > REG_MAX_STATES) + return 1; + return 0; +} + +/* + - increment_size - increases the tracked size of the NFA and its parents. + ^ static void increment_size(struct nfa *); + */ +static void +increment_size(nfa) +struct nfa *nfa; +{ + struct nfa *parent = nfa->parent; + nfa->size++; + while (parent != NULL) { + parent->size++; + parent = parent->parent; + } +} + +/* + - decrement_size - increases the tracked size of the NFA and its parents. + ^ static void decrement_size(struct nfa *); + */ +static void +decrement_size(nfa) +struct nfa *nfa; +{ + struct nfa *parent = nfa->parent; + nfa->size--; + while (parent != NULL) { + parent->size--; + parent = parent->parent; + } +} + +/* - freenfa - free an entire NFA ^ static VOID freenfa(struct nfa *); */ @@ -123,6 +175,11 @@ struct nfa *nfa; { struct state *s; + if (too_many_states(nfa)) { + /* XXX: add specific error for this */ + NERR(REG_ETOOBIG); + return NULL; + } if (nfa->free != NULL) { s = nfa->free; nfa->free = s->next; @@ -154,6 +211,8 @@ struct nfa *nfa; } s->prev = nfa->slast; nfa->slast = s; + /* Track the current size and the parent size */ + increment_size(nfa); return s; } @@ -221,6 +280,7 @@ struct state *s; s->prev = NULL; s->next = nfa->free; /* don't delete it, put it on the free list */ nfa->free = s; + decrement_size(nfa); } /* @@ -651,6 +711,8 @@ struct state *stmp; /* s's duplicate, or NULL */ for (a = s->outs; a != NULL && !NISERR(); a = a->outchain) { duptraverse(nfa, a->to, (struct state *)NULL); + if (NISERR()) + break; assert(a->to->tmp != NULL); cparc(nfa, a, s->tmp, a->to->tmp); } diff --git a/generic/regerrs.h b/generic/regerrs.h index a3d98b6..259c0cb 100644 --- a/generic/regerrs.h +++ b/generic/regerrs.h @@ -16,3 +16,4 @@ { REG_INVARG, "REG_INVARG", "invalid argument to regex function" }, { REG_MIXED, "REG_MIXED", "character widths of regex and string differ" }, { REG_BADOPT, "REG_BADOPT", "invalid embedded option" }, +{ REG_ETOOBIG, "REG_ETOOBIG", "nfa has too many states" }, diff --git a/generic/regex.h b/generic/regex.h index 8289a50..a35925a 100644 --- a/generic/regex.h +++ b/generic/regex.h @@ -292,6 +292,7 @@ typedef struct { #define REG_INVARG 16 /* invalid argument to regex function */ #define REG_MIXED 17 /* character widths of regex and string differ */ #define REG_BADOPT 18 /* invalid embedded option */ +#define REG_ETOOBIG 19 /* nfa has too many states */ /* two specials for debugging and testing */ #define REG_ATOI 101 /* convert error-code name to number */ #define REG_ITOA 102 /* convert error-code number to name */ diff --git a/generic/regguts.h b/generic/regguts.h index b6152f4..c77a8fc 100644 --- a/generic/regguts.h +++ b/generic/regguts.h @@ -290,6 +290,7 @@ struct arc { # define freechain outchain struct arc *inchain; /* *to's ins chain */ struct arc *colorchain; /* color's arc chain */ + struct arc *colorchain_rev; /* back-link in color's arc chain */ }; struct arcbatch { /* for bulk allocation of arcs */ @@ -326,6 +327,8 @@ struct nfa { struct colormap *cm; /* the color map */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ color eos[2]; /* colors, if any, assigned to EOS and EOL */ + size_t size; /* current NFA size; differs from nstates as + * it will be incremented by its children */ struct vars *v; /* simplifies compile error reporting */ struct nfa *parent; /* parent NFA, if any */ }; @@ -356,6 +359,11 @@ struct cnfa { #define NULLCNFA(cnfa) ((cnfa).nstates == 0) +/* Used to limit the maximum NFA size */ +#ifndef REG_MAX_STATES +#define REG_MAX_STATES 100000 +#endif + /* * subexpression tree diff --git a/tests/reg.test b/tests/reg.test index 8994fdc..2abe7b6 100644 --- a/tests/reg.test +++ b/tests/reg.test @@ -9,7 +9,7 @@ # # Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. # -# RCS: @(#) $Id: reg.test,v 1.16.2.3 2004/11/27 05:44:13 dgp Exp $ +# RCS: @(#) $Id: reg.test,v 1.16.2.4 2007/12/18 11:23:16 dkf Exp $ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest 2 @@ -1130,6 +1130,19 @@ test reg-33.11 {Bug 840258} { "TQ\r\n.?<5000267>Test already stopped\r\n" {} tmp } 1 +test reg-33.12 {Bug 1810264 - bad read} { + regexp {\3161573148} {\3161573148} +} 0 +test reg-33.13 {Bug 1810264 - infinite loop} { + regexp {($|^)*} {x} +} 1 +test reg-33.14 {Bug 1810264 - super-expensive expression} { + set start [clock seconds] + regexp {(x{200}){200}$y} {x} + set time [expr {[clock seconds] - $start}] + expr {$time < 5 ? "ok" : "Complex RE took $time seconds - bad!"} +} ok + # cleanup ::tcltest::cleanupTests return -- cgit v0.12