From d6374c239eef894ece7e7472ada26b15b0abe33e Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 5 Mar 2013 14:01:48 +0000 Subject: Contributed patch from Tom Lane . Rewrites parts of the regexp engine to avoid infinite loops. --- generic/regc_nfa.c | 309 +++++++++++++++++++++++++++++++++++++++-------------- generic/regcomp.c | 7 +- 2 files changed, 235 insertions(+), 81 deletions(-) diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c index 65147d4..5857372 100644 --- a/generic/regc_nfa.c +++ b/generic/regc_nfa.c @@ -497,6 +497,42 @@ freearc( } /* + - nonemptyouts - count non-EMPTY out arcs of a state + ^ static int nonemptyouts(struct state *); + */ +static int +nonemptyouts( + struct state *s) +{ + int n = 0; + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) { + if (a->type != EMPTY) + n++; + } + return n; +} + +/* + - nonemptyins - count non-EMPTY in arcs of a state + ^ static int nonemptyins(struct state *); + */ +static int +nonemptyins( + struct state *s) +{ + int n = 0; + struct arc *a; + + for (a = s->ins; a != NULL; a = a->inchain) { + if (a->type != EMPTY) + n++; + } + return n; +} + +/* - findarc - find arc, if any, from given source with given type and color * If there is more than one such arc, the result is random. ^ static struct arc *findarc(struct state *, int, pcolor); @@ -578,6 +614,26 @@ copyins( } /* + - copynonemptyins - as above, but ignore empty arcs + ^ static void copynonemptyins(struct nfa *, struct state *, struct state *); + */ +static void +copynonemptyins( + struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + struct arc *a; + + assert(oldState != newState); + + for (a=oldState->ins ; a!=NULL ; a=a->inchain) { + if (a->type != EMPTY) + cparc(nfa, a, a->from, newState); + } +} + +/* - moveouts - move all out arcs of a state to another state ^ static void moveouts(struct nfa *, struct state *, struct state *); */ @@ -617,6 +673,26 @@ copyouts( } /* + - copynonemptyouts - as above, but ignore empty arcs + ^ static void copynonemptyouts(struct nfa *, struct state *, struct state *); + */ +static void +copynonemptyouts( + struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + struct arc *a; + + assert(oldState != newState); + + for (a=oldState->outs ; a!=NULL ; a=a->outchain) { + if (a->type != EMPTY) + cparc(nfa, a, newState, a->to); + } +} + +/* - cloneouts - copy out arcs of a state to another state pair, modifying type ^ static void cloneouts(struct nfa *, struct state *, struct state *, ^ struct state *, int); @@ -1247,118 +1323,191 @@ fixempties( FILE *f) /* for debug output; NULL none */ { struct state *s; + struct state *s2; struct state *nexts; - struct state *to; struct arc *a; struct arc *nexta; - int progress; /* - * Find and eliminate empties until there are no more. + * First, get rid of any states whose sole out-arc is an EMPTY, since + * they're basically just aliases for their successor. The parsing + * algorithm creates enough of these that it's worth special-casing this. */ + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) { + nexts = s->next; + if (s->nouts == 1 && !s->flag) { + a = s->outs; + assert(a != NULL && a->outchain == NULL); + if (a->type == EMPTY) { + if (s != a->to) + moveins(nfa, s, a->to); + dropstate(nfa, s); + } + } + } - do { - progress = 0; - for (s = nfa->states; s != NULL && !NISERR(); s = nexts) { - nexts = s->next; - for (a = s->outs; a != NULL && !NISERR(); a = a->outchain) { - if (a->type == EMPTY) { - - /* - * Mark a for deletion; copy arcs to preserve graph - * connectivity after it is gone. - */ - - unempty(nfa, a); - } + /* + * Similarly, get rid of any state with a single EMPTY in-arc, by folding + * it into its predecessor. + */ + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) { + nexts = s->next; + /* while we're at it, ensure tmp fields are clear for next step */ + s->tmp = NULL; + if (s->nins == 1 && !s->flag) { + a = s->ins; + assert(a != NULL && a->inchain == NULL); + if (a->type == EMPTY) { + if (s != a->from) + moveouts(nfa, s, a->from); + dropstate(nfa, s); } + } + } + /* + * For each remaining NFA state, find all other states that are reachable + * from it by a chain of one or more EMPTY arcs. Then generate new arcs + * that eliminate the need for each such chain. + * + * If we just do this straightforwardly, the algorithm gets slow in + * complex graphs, because the same arcs get copied to all intermediate + * states of an EMPTY chain, and then uselessly pushed repeatedly to the + * chain's final state; we waste a lot of time in newarc's duplicate + * checking. To improve matters, we decree that any state with only EMPTY + * out-arcs is "doomed" and will not be part of the final NFA. That can be + * ensured by not adding any new out-arcs to such a state. Having ensured + * that, we need not update the state's in-arcs list either; all arcs that + * might have gotten pushed forward to it will just get pushed directly to + * successor states. This eliminates most of the useless duplicate arcs. + */ + for (s = nfa->states; s != NULL && !NISERR(); s = s->next) { + for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts) { /* - * Now pass through and delete the marked arcs. Doing all the - * deletion after all the marking prevents arc copying from - * resurrecting deleted arcs which can cause failure to converge. - * [Tcl Bug 3604074] + * If s2 is doomed, we decide that (1) we will always push arcs + * forward to it, not pull them back to s; and (2) we can optimize + * away the push-forward, per comment above. So do nothing. */ + if (s2->flag || nonemptyouts(s2) > 0) + replaceempty(nfa, s, s2); - for (a = s->outs; a != NULL; a = nexta) { - nexta = a->outchain; - if (a->from == NULL) { - progress = 1; - to = a->to; - a->from = s; - freearc(nfa, a); - if (to->nins == 0) { - while ((a = to->outs)) { - freearc(nfa, a); - } - if (nexts == to) { - nexts = to->next; - } - freestate(nfa, to); - } - if (s->nouts == 0) { - while ((a = s->ins)) { - freearc(nfa, a); - } - freestate(nfa, s); - } - } - } + /* Reset the tmp fields as we walk back */ + nexts = s2->tmp; + s2->tmp = NULL; } - if (progress && f != NULL) { - dumpnfa(nfa, f); + s->tmp = NULL; + } + + /* + * Now remove all the EMPTY arcs, since we don't need them anymore. + */ + for (s = nfa->states; s != NULL && !NISERR(); s = s->next) { + for (a = s->outs; a != NULL; a = nexta) { + nexta = a->outchain; + if (a->type == EMPTY) + freearc(nfa, a); } - } while (progress && !NISERR()); + } + + /* + * And remove any states that have become useless. (This cleanup is not + * very thorough, and would be even less so if we tried to combine it with + * the previous step; but cleanup() will take care of anything we miss.) + */ + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) { + nexts = s->next; + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); + } + + if (f != NULL && !NISERR()) + dumpnfa(nfa, f); } /* - - unempty - optimize out an EMPTY arc, if possible - * Actually, as it stands this function always succeeds, but the return value - * is kept with an eye on possible future changes. - ^ static int unempty(struct nfa *, struct arc *); + - emptyreachable - recursively find all states reachable from s by EMPTY arcs + * The return value is the last such state found. Its tmp field links back + * to the next-to-last such state, and so on back to s, so that all these + * states can be located without searching the whole NFA. + * The maximum recursion depth here is equal to the length of the longest + * loop-free chain of EMPTY arcs, which is surely no more than the size of + * the NFA, and in practice will be a lot less than that. + ^ static struct state *emptyreachable(struct state *, struct state *); */ -static int /* 0 couldn't, 1 could */ -unempty( - struct nfa *nfa, - struct arc *a) +static struct state * +emptyreachable( + struct state *s, + struct state *lastfound) { - struct state *from = a->from; - struct state *to = a->to; - - assert(a->type == EMPTY); - assert(from != nfa->pre && to != nfa->post); + struct arc *a; - if (from == to) { /* vacuous loop */ - freearc(nfa, a); - return 1; + s->tmp = lastfound; + lastfound = s; + for (a = s->outs; a != NULL; a = a->outchain) { + if (a->type == EMPTY && a->to->tmp == NULL) + lastfound = emptyreachable(a->to, lastfound); } + return lastfound; +} + +/* + - replaceempty - replace an EMPTY arc chain with some non-empty arcs + * The EMPTY arc(s) should be deleted later, but we can't do it here because + * they may still be needed to identify other arc chains during fixempties(). + ^ static void replaceempty(struct nfa *, struct state *, struct state *); + */ +static void +replaceempty( + struct nfa *nfa, + struct state *from, + struct state *to) +{ + int fromouts; + int toins; + + assert(from != to); /* - * Mark arc for deletion. + * Create replacement arcs that bypass the need for the EMPTY chain. We + * can do this either by pushing arcs forward (linking directly from + * "from"'s predecessors to "to") or by pulling them back (linking + * directly from "from" to "to"'s successors). In general, we choose + * whichever way creates greater fan-out or fan-in, so as to improve the + * odds of reducing the other state to zero in-arcs or out-arcs and + * thereby being able to delete it. However, if "from" is doomed (has no + * non-EMPTY out-arcs), we must keep it so, so always push forward in that + * case. + * + * The fan-out/fan-in comparison should count only non-EMPTY arcs. If + * "from" is doomed, we can skip counting "to"'s arcs, since we want to + * force taking the copynonemptyins path in that case. */ + fromouts = nonemptyouts(from); + toins = (fromouts == 0) ? 1 : nonemptyins(to); - a->from = NULL; - - if (from->nouts > to->nins) { - copyouts(nfa, to, from); - return 1; + if (fromouts > toins) { + copynonemptyouts(nfa, to, from); + return; } - if (from->nouts < to->nins) { - copyins(nfa, from, to); - return 1; + if (fromouts < toins) { + copynonemptyins(nfa, from, to); + return; } /* - * from->nouts == to->nins . decide on secondary issue: copy fewest arcs + * fromouts == toins. Decide on secondary issue: copy fewest arcs. + * + * Doesn't seem to be worth the trouble to exclude empties from these + * comparisons; that takes extra time and doesn't seem to improve the + * resulting graph much. */ - if (from->nins > to->nouts) { - copyouts(nfa, to, from); - return 1; + copynonemptyouts(nfa, to, from); + return; + } else { + copynonemptyins(nfa, from, to); + return; } - - copyins(nfa, from, to); - return 1; } /* diff --git a/generic/regcomp.c b/generic/regcomp.c index b15117b..3dd89d8 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -121,12 +121,16 @@ static void destroystate(struct nfa *, struct state *); static void newarc(struct nfa *, int, pcolor, struct state *, struct state *); static struct arc *allocarc(struct nfa *, struct state *); static void freearc(struct nfa *, struct arc *); +static int nonemptyouts(struct state *); +static int nonemptyins(struct state *); static struct arc *findarc(struct state *, int, pcolor); static void cparc(struct nfa *, struct arc *, struct state *, struct state *); static void moveins(struct nfa *, struct state *, struct state *); static void copyins(struct nfa *, struct state *, struct state *); +static void copynonemptyins(struct nfa *, struct state *, struct state *); static void moveouts(struct nfa *, struct state *, struct state *); static void copyouts(struct nfa *, struct state *, struct state *); +static void copynonemptyouts(struct nfa *, struct state *, struct state *); static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); static void delsub(struct nfa *, struct state *, struct state *); static void deltraverse(struct nfa *, struct state *, struct state *); @@ -144,7 +148,8 @@ static int push(struct nfa *, struct arc *); #define COMPATIBLE 3 /* compatible but not satisfied yet */ static int combine(struct arc *, struct arc *); static void fixempties(struct nfa *, FILE *); -static int unempty(struct nfa *, struct arc *); +static struct state *emptyreachable(struct state *, struct state *); +static void replaceempty(struct nfa *, struct state *, struct state *); static void cleanup(struct nfa *); static void markreachable(struct nfa *, struct state *, struct state *, struct state *); static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); -- cgit v0.12