From 9aa93b55692af9e4c8b34f44c1f4ba06dfc8230e Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Tue, 20 Oct 2015 18:32:25 +0000 Subject: Adaptation of re-better-fixempties.patch from Tom Lane @ postgres. --- generic/regc_nfa.c | 265 +++++++++++++++++++++++++++-------------------------- generic/regcomp.c | 6 +- 2 files changed, 136 insertions(+), 135 deletions(-) diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c index 0f572b8..5582e4e 100644 --- a/generic/regc_nfa.c +++ b/generic/regc_nfa.c @@ -603,44 +603,6 @@ hasnonemptyout( } /* - - nonemptyouts - count non-EMPTY out arcs of a state - ^ static int nonemptyouts(struct state *); - */ -static int -nonemptyouts( - struct state *s) -{ - int n = 0; - struct arc *a; - - for (a = s->outs; a != NULL; a = a->outchain) { - if (a->type != EMPTY) { - n++; - } - } - return n; -} - -/* - - nonemptyins - count non-EMPTY in arcs of a state - ^ static int nonemptyins(struct state *); - */ -static int -nonemptyins( - struct state *s) -{ - int n = 0; - struct arc *a; - - for (a = s->ins; a != NULL; a = a->inchain) { - if (a->type != EMPTY) { - n++; - } - } - return n; -} - -/* - findarc - find arc, if any, from given source with given type and color * If there is more than one such arc, the result is random. ^ static struct arc *findarc(struct state *, int, pcolor); @@ -1897,6 +1859,12 @@ fixempties( struct state *nexts; struct arc *a; struct arc *nexta; + int totalinarcs; + struct arc **inarcsorig; + struct arc **arcarray; + int arccount; + int prevnins; + int nskip; /* * First, get rid of any states whose sole out-arc is an EMPTY, @@ -1942,42 +1910,129 @@ fixempties( dropstate(nfa, s); } + if (NISERR()) { + return; + } + /* - * For each remaining NFA state, find all other states that are - * reachable from it by a chain of one or more EMPTY arcs. Then - * generate new arcs that eliminate the need for each such chain. + * For each remaining NFA state, find all other states from which it is + * reachable by a chain of one or more EMPTY arcs. Then generate new arcs + * that eliminate the need for each such chain. + * + * We could replace a chain of EMPTY arcs that leads from a "from" state + * to a "to" state either by pushing non-EMPTY arcs forward (linking + * directly from "from"'s predecessors to "to") or by pulling them back + * (linking directly from "from" to "to"'s successors). We choose to + * always do the former; this choice is somewhat arbitrary, but the + * approach below requires that we uniformly do one or the other. + * + * Suppose we have a chain of N successive EMPTY arcs (where N can easily + * approach the size of the NFA). All of the intermediate states must + * have additional inarcs and outarcs, else they'd have been removed by + * the steps above. Assuming their inarcs are mostly not empties, we will + * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one + * state in the chain must be duplicated to lead to all its successor + * states as well. So there is no hope of doing less than O(N^2) work; + * however, we should endeavor to keep the big-O cost from being even + * worse than that, which it can easily become without care. In + * particular, suppose we were to copy all S1's inarcs forward to S2, and + * then also to S3, and then later we consider pushing S2's inarcs forward + * to S3. If we include the arcs already copied from S1 in that, we'd be + * doing O(N^3) work. (The duplicate-arc elimination built into newarc() + * and its cohorts would get rid of the extra arcs, but not without cost.) + * + * We can avoid this cost by treating only arcs that existed at the start + * of this phase as candidates to be pushed forward. To identify those, + * we remember the first inarc each state had to start with. We rely on + * the fact that newarc() and friends put new arcs on the front of their + * to-states' inchains, and that this phase never deletes arcs, so that + * the original arcs must be the last arcs in their to-states' inchains. + * + * So the process here is that, for each state in the NFA, we gather up + * all non-EMPTY inarcs of states that can reach the target state via + * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the + * target state's inchain. (We can safely use sort-merge for this as long + * as we update each state's original-arcs pointer after we add arcs to + * it; the sort step of mergeins probably changed the order of the old + * arcs.) * - * If we just do this straightforwardly, the algorithm gets slow in - * complex graphs, because the same arcs get copied to all - * intermediate states of an EMPTY chain, and then uselessly pushed - * repeatedly to the chain's final state; we waste a lot of time in - * newarc's duplicate checking. To improve matters, we decree that - * any state with only EMPTY out-arcs is "doomed" and will not be - * part of the final NFA. That can be ensured by not adding any new - * out-arcs to such a state. Having ensured that, we need not update - * the state's in-arcs list either; all arcs that might have gotten - * pushed forward to it will just get pushed directly to successor - * states. This eliminates most of the useless duplicate arcs. + * Another refinement worth making is that, because we only add non-EMPTY + * arcs during this phase, and all added arcs have the same from-state as + * the non-EMPTY arc they were cloned from, we know ahead of time that any + * states having only EMPTY outarcs will be useless for lack of outarcs + * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if + * they had none to start with.) So we need not bother to update the + * inchains of such states at all. */ + + /* Remember the states' first original inarcs */ + /* ... and while at it, count how many old inarcs there are altogether */ + inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *)); + if (inarcsorig == NULL) { + NERR(REG_ESPACE); + return; + } + totalinarcs = 0; + for (s = nfa->states; s != NULL; s = s->next) { + inarcsorig[s->no] = s->ins; + totalinarcs += s->nins; + } + + /* + * Create a workspace for accumulating the inarcs to be added to the + * current target state. totalinarcs is probably a considerable + * overestimate of the space needed, but the NFA is unlikely to be large + * enough at this point to make it worth being smarter. + */ + arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *)); + if (arcarray == NULL) { + NERR(REG_ESPACE); + FREE(inarcsorig); + return; + } + + /* And iterate over the target states */ for (s = nfa->states; s != NULL && !NISERR(); s = s->next) { - for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); - s2 = nexts) { - /* - * If s2 is doomed, we decide that (1) we will always push - * arcs forward to it, not pull them back to s; and (2) we - * can optimize away the push-forward, per comment above. - * So do nothing. - */ - if (s2->flag || hasnonemptyout(s2)) { - replaceempty(nfa, s, s2); + /* Ignore target states without non-EMPTY outarcs, per note above */ + if (!s->flag && !hasnonemptyout(s)) { + continue; + } + + /* Find predecessor states and accumulate their original inarcs */ + arccount = 0; + for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts) { + /* Add s2's original inarcs to arcarray[], but ignore empties */ + for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain) { + if (a->type != EMPTY) { + arcarray[arccount++] = a; + } } + + /* Reset the tmp fields as we walk back */ + nexts = s2->tmp; + s2->tmp = NULL; + } + s->tmp = NULL; + assert(arccount <= totalinarcs); - /* Reset the tmp fields as we walk back */ - nexts = s2->tmp; - s2->tmp = NULL; + /* Remember how many original inarcs this state has */ + prevnins = s->nins; + + /* Add non-duplicate inarcs to target state */ + mergeins(nfa, s, arcarray, arccount); + + /* Now we must update the state's inarcsorig pointer */ + nskip = s->nins - prevnins; + a = s->ins; + while (nskip-- > 0) { + a = a->inchain; } - s->tmp = NULL; + inarcsorig[s->no] = a; } + + FREE(arcarray); + FREE(inarcsorig); + if (NISERR()) { return; } @@ -2013,90 +2068,38 @@ fixempties( } /* - - emptyreachable - recursively find all states reachable from s by EMPTY arcs + - emptyreachable - recursively find all states that can reach s by EMPTY arcs * The return value is the last such state found. Its tmp field links back * to the next-to-last such state, and so on back to s, so that all these * states can be located without searching the whole NFA. + * + * Since this is only used in fixempties(), we pass in the inarcsorig[] array + * maintained by that function. This lets us skip over all new inarcs, which + * are certainly not EMPTY arcs. + * * The maximum recursion depth here is equal to the length of the longest * loop-free chain of EMPTY arcs, which is surely no more than the size of - * the NFA, and in practice will be a lot less than that. + * the NFA, and in practice will be less than that. ^ static struct state *emptyreachable(struct state *, struct state *); */ static struct state * emptyreachable( + struct nfa *nfa, struct state *s, - struct state *lastfound) + struct state *lastfound, + struct arc **inarcsorig) { struct arc *a; s->tmp = lastfound; lastfound = s; - for (a = s->outs; a != NULL; a = a->outchain) { - if (a->type == EMPTY && a->to->tmp == NULL) { - lastfound = emptyreachable(a->to, lastfound); + for (a = inarcsorig[s->no]; a != NULL; a = a->inchain) { + if (a->type == EMPTY && a->from->tmp == NULL) { + lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig); } } return lastfound; } - -/* - - replaceempty - replace an EMPTY arc chain with some non-empty arcs - * The EMPTY arc(s) should be deleted later, but we can't do it here because - * they may still be needed to identify other arc chains during fixempties(). - ^ static void replaceempty(struct nfa *, struct state *, struct state *); - */ -static void -replaceempty( - struct nfa *nfa, - struct state *from, - struct state *to) -{ - int fromouts; - int toins; - - assert(from != to); - - /* - * Create replacement arcs that bypass the need for the EMPTY chain. We - * can do this either by pushing arcs forward (linking directly from - * "from"'s predecessors to "to") or by pulling them back (linking - * directly from "from" to "to"'s successors). In general, we choose - * whichever way creates greater fan-out or fan-in, so as to improve the - * odds of reducing the other state to zero in-arcs or out-arcs and - * thereby being able to delete it. However, if "from" is doomed (has no - * non-EMPTY out-arcs), we must keep it so, so always push forward in that - * case. - * - * The fan-out/fan-in comparison should count only non-EMPTY arcs. If - * "from" is doomed, we can skip counting "to"'s arcs, since we want to - * force taking the copynonemptyins path in that case. - */ - fromouts = nonemptyouts(from); - toins = (fromouts == 0) ? 1 : nonemptyins(to); - - if (fromouts > toins) { - copyouts(nfa, to, from, 0); - return; - } - if (fromouts < toins) { - copyins(nfa, from, to, 0); - return; - } - - /* - * fromouts == toins. Decide on secondary issue: copy fewest arcs. - * - * Doesn't seem to be worth the trouble to exclude empties from these - * comparisons; that takes extra time and doesn't seem to improve the - * resulting graph much. - */ - if (from->nins > to->nouts) { - copyouts(nfa, to, from, 0); - return; - } - - copyins(nfa, from, to, 0); -} /* * isconstraintarc - detect whether an arc is of a constraint type diff --git a/generic/regcomp.c b/generic/regcomp.c index b01311b..985cb93 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -124,8 +124,6 @@ static struct arc *allocarc(struct nfa *, struct state *); static void freearc(struct nfa *, struct arc *); static void changearctarget(struct arc *, struct state *); static int hasnonemptyout(struct state *); -static int nonemptyouts(struct state *); -static int nonemptyins(struct state *); static struct arc *findarc(struct state *, int, pcolor); static void cparc(struct nfa *, struct arc *, struct state *, struct state *); static void sortins(struct nfa *, struct state *); @@ -154,8 +152,8 @@ static int push(struct nfa *, struct arc *); #define COMPATIBLE 3 /* compatible but not satisfied yet */ static int combine(struct arc *, struct arc *); static void fixempties(struct nfa *, FILE *); -static struct state *emptyreachable(struct state *, struct state *); -static void replaceempty(struct nfa *, struct state *, struct state *); +static struct state *emptyreachable(struct nfa *, struct state *, + struct state *, struct arc **); static int isconstraintarc(struct arc *); static int hasconstraintout(struct state *); static void fixconstraintloops(struct nfa *, FILE *); -- cgit v0.12