1 files changed, 301 insertions, 115 deletions
diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c
index 19dbe63..42489dd 100644
--- a/generic/regc_nfa.c
+++ b/generic/regc_nfa.c
@@ -142,7 +142,7 @@ DecrementSize(
 
 /*
  - freenfa - free an entire NFA
- ^ static VOID freenfa(struct nfa *);
+ ^ static void freenfa(struct nfa *);
  */
 static void
 freenfa(
@@ -242,7 +242,7 @@ newfstate(
 
 /*
  - dropstate - delete a state's inarcs and outarcs and free it
- ^ static VOID dropstate(struct nfa *, struct state *);
+ ^ static void dropstate(struct nfa *, struct state *);
  */
 static void
 dropstate(
@@ -262,7 +262,7 @@ dropstate(
 
 /*
  - freestate - free a state, which has no in-arcs or out-arcs
- ^ static VOID freestate(struct nfa *, struct state *);
+ ^ static void freestate(struct nfa *, struct state *);
  */
 static void
 freestate(
@@ -294,7 +294,7 @@ freestate(
 
 /*
  - destroystate - really get rid of an already-freed state
- ^ static VOID destroystate(struct nfa *, struct state *);
+ ^ static void destroystate(struct nfa *, struct state *);
  */
 static void
 destroystate(
@@ -317,7 +317,7 @@ destroystate(
 
 /*
  - newarc - set up a new arc within an NFA
- ^ static VOID newarc(struct nfa *, int, pcolor, struct state *,
+ ^ static void newarc(struct nfa *, int, pcolor, struct state *,
  ^	struct state *);
  */
 static void
@@ -426,7 +426,7 @@ allocarc(
 
 /*
  - freearc - free an arc
- ^ static VOID freearc(struct nfa *, struct arc *);
+ ^ static void freearc(struct nfa *, struct arc *);
  */
 static void
 freearc(
@@ -497,6 +497,62 @@ freearc(
 }
 
 /*
+ - hasnonemptyout - Does state have a non-EMPTY out arc?
+ ^ static int hasnonemptyout(struct state *);
+ */
+static int
+hasnonemptyout(
+    struct state *s)
+{
+    struct arc *arc = s->outs;
+
+    /* Step past EMPTY out arcs; stopping on an arc means it is non-EMPTY,
+     * while running off the end of the chain means there was none. */
+    while (arc != NULL && arc->type == EMPTY) {
+	arc = arc->outchain;
+    }
+    return (arc != NULL);
+}
+
+/*
+ - nonemptyouts - count non-EMPTY out arcs of a state
+ ^ static int nonemptyouts(struct state *);
+ */
+static int
+nonemptyouts(
+    struct state *s)
+{
+    struct arc *arc = s->outs;
+    int count = 0;
+
+    while (arc != NULL) {
+	/* The comparison yields exactly 1 or 0, so just add it in. */
+	count += (arc->type != EMPTY);
+	arc = arc->outchain;
+    }
+    return count;
+}
+
+/*
+ - nonemptyins - count non-EMPTY in arcs of a state
+ ^ static int nonemptyins(struct state *);
+ */
+static int
+nonemptyins(
+    struct state *s)
+{
+    struct arc *arc = s->ins;
+    int count = 0;
+
+    while (arc != NULL) {
+	/* The comparison yields exactly 1 or 0, so just add it in. */
+	count += (arc->type != EMPTY);
+	arc = arc->inchain;
+    }
+    return count;
+}
+
+/*
  - findarc - find arc, if any, from given source with given type and color
  * If there is more than one such arc, the result is random.
  ^ static struct arc *findarc(struct state *, int, pcolor);
@@ -519,7 +575,7 @@ findarc(
 
 /*
  - cparc - allocate a new arc within an NFA, copying details from old one
- ^ static VOID cparc(struct nfa *, struct arc *, struct state *,
+ ^ static void cparc(struct nfa *, struct arc *, struct state *,
  ^ 	struct state *);
  */
 static void
@@ -538,7 +594,7 @@ cparc(
  * existing arcs, and you would be right if it weren't for the desire
  * for duplicate suppression, which makes it easier to just make new
  * ones to exploit the suppression built into newarc.
- ^ static VOID moveins(struct nfa *, struct state *, struct state *);
+ ^ static void moveins(struct nfa *, struct state *, struct state *);
  */
 static void
 moveins(
@@ -559,27 +615,31 @@ moveins(
 }
 
 /*
- - copyins - copy all in arcs of a state to another state
- ^ static VOID copyins(struct nfa *, struct state *, struct state *);
+ - copyins - copy in arcs of a state to another state
+ * Copies every in arc if "all" is nonzero, else only the non-EMPTY ones.
+ ^ static void copyins(struct nfa *, struct state *, struct state *, int);
  */
 static void
 copyins(
     struct nfa *nfa,
     struct state *oldState,
-    struct state *newState)
+    struct state *newState,
+    int all)			/* nonzero: copy EMPTY arcs as well */
 {
     struct arc *a;
 
     assert(oldState != newState);
 
     for (a=oldState->ins ; a!=NULL ; a=a->inchain) {
-	cparc(nfa, a, a->from, newState);
+	if (all || a->type != EMPTY) {
+	    cparc(nfa, a, a->from, newState);
+	}
     }
 }
 
 /*
  - moveouts - move all out arcs of a state to another state
- ^ static VOID moveouts(struct nfa *, struct state *, struct state *);
+ ^ static void moveouts(struct nfa *, struct state *, struct state *);
  */
 static void
 moveouts(
@@ -598,27 +658,31 @@ moveouts(
 }
 
 /*
- - copyouts - copy all out arcs of a state to another state
- ^ static VOID copyouts(struct nfa *, struct state *, struct state *);
+ - copyouts - copy out arcs of a state to another state
+ * Copies every out arc if "all" is nonzero, else only the non-EMPTY ones.
+ ^ static void copyouts(struct nfa *, struct state *, struct state *, int);
  */
 static void
 copyouts(
     struct nfa *nfa,
     struct state *oldState,
-    struct state *newState)
+    struct state *newState,
+    int all)			/* nonzero: copy EMPTY arcs as well */
 {
     struct arc *a;
 
     assert(oldState != newState);
 
     for (a=oldState->outs ; a!=NULL ; a=a->outchain) {
-	cparc(nfa, a, newState, a->to);
+	if (all || a->type != EMPTY) {
+	    cparc(nfa, a, newState, a->to);
+	}
     }
 }
 
 /*
  - cloneouts - copy out arcs of a state to another state pair, modifying type
- ^ static VOID cloneouts(struct nfa *, struct state *, struct state *,
+ ^ static void cloneouts(struct nfa *, struct state *, struct state *,
  ^ 	struct state *, int);
  */
 static void
@@ -642,7 +706,7 @@ cloneouts(
  - delsub - delete a sub-NFA, updating subre pointers if necessary
  * This uses a recursive traversal of the sub-NFA, marking already-seen
  * states using their tmp pointer.
- ^ static VOID delsub(struct nfa *, struct state *, struct state *);
+ ^ static void delsub(struct nfa *, struct state *, struct state *);
  */
 static void
 delsub(
@@ -665,7 +729,7 @@ delsub(
 /*
  - deltraverse - the recursive heart of delsub
  * This routine's basic job is to destroy all out-arcs of the state.
- ^ static VOID deltraverse(struct nfa *, struct state *, struct state *);
+ ^ static void deltraverse(struct nfa *, struct state *, struct state *);
  */
 static void
 deltraverse(
@@ -708,7 +772,7 @@ deltraverse(
  * Another recursive traversal, this time using tmp to point to duplicates as
  * well as mark already-seen states. (You knew there was a reason why it's a
  * state pointer, didn't you? :-))
- ^ static VOID dupnfa(struct nfa *, struct state *, struct state *,
+ ^ static void dupnfa(struct nfa *, struct state *, struct state *,
  ^ 	struct state *, struct state *);
  */
 static void
@@ -725,7 +789,7 @@ dupnfa(
     }
 
     stop->tmp = to;
-    duptraverse(nfa, start, from);
+    duptraverse(nfa, start, from, 0);
     /* done, except for clearing out the tmp pointers */
 
     stop->tmp = NULL;
@@ -734,13 +798,14 @@ dupnfa(
 
 /*
  - duptraverse - recursive heart of dupnfa
- ^ static VOID duptraverse(struct nfa *, struct state *, struct state *);
+ ^ static void duptraverse(struct nfa *, struct state *, struct state *, int);
  */
 static void
 duptraverse(
     struct nfa *nfa,
     struct state *s,
-    struct state *stmp)		/* s's duplicate, or NULL */
+    struct state *stmp,		/* s's duplicate, or NULL */
+    int depth)
 {
     struct arc *a;
 
@@ -754,8 +819,20 @@ duptraverse(
 	return;
     }
 
+    /*
+     * Arbitrary depth limit. Needs tuning, but this value is sufficient to
+     * make all normal tests (not reg-33.14) pass.
+     */
+#ifndef DUPTRAVERSE_MAX_DEPTH
+#define DUPTRAVERSE_MAX_DEPTH 15000
+#endif
+
+    if (depth++ > DUPTRAVERSE_MAX_DEPTH) {
+	NERR(REG_ESPACE);
+    }
+
     for (a=s->outs ; a!=NULL && !NISERR() ; a=a->outchain) {
-	duptraverse(nfa, a->to, NULL);
+	duptraverse(nfa, a->to, NULL, depth);
 	if (NISERR()) {
 	    break;
 	}
@@ -766,7 +843,7 @@ duptraverse(
 
 /*
  - cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
- ^ static VOID cleartraverse(struct nfa *, struct state *);
+ ^ static void cleartraverse(struct nfa *, struct state *);
  */
 static void
 cleartraverse(
@@ -787,7 +864,7 @@ cleartraverse(
 
 /*
  - specialcolors - fill in special colors for an NFA
- ^ static VOID specialcolors(struct nfa *);
+ ^ static void specialcolors(struct nfa *);
  */
 static void
 specialcolors(
@@ -850,7 +927,7 @@ optimize(
 
 /*
  - pullback - pull back constraints backward to (with luck) eliminate them
- ^ static VOID pullback(struct nfa *, FILE *);
+ ^ static void pullback(struct nfa *, FILE *);
  */
 static void
 pullback(
@@ -957,9 +1034,9 @@ pull(
 	if (NISERR()) {
 	    return 0;
 	}
-	assert(to != from);	/* con is not an inarc */
-	copyins(nfa, from, s);	/* duplicate inarcs */
-	cparc(nfa, con, s, to);	/* move constraint arc */
+	assert(to != from);		/* con is not an inarc */
+	copyins(nfa, from, s, 1);	/* duplicate inarcs */
+	cparc(nfa, con, s, to);		/* move constraint arc */
 	freearc(nfa, con);
 	from = s;
 	con = from->outs;
@@ -1007,7 +1084,7 @@ pull(
 
 /*
  - pushfwd - push forward constraints forward to (with luck) eliminate them
- ^ static VOID pushfwd(struct nfa *, FILE *);
+ ^ static void pushfwd(struct nfa *, FILE *);
  */
 static void
 pushfwd(
@@ -1117,7 +1194,7 @@ push(
 	if (NISERR()) {
 	    return 0;
 	}
-	copyouts(nfa, to, s);	/* duplicate outarcs */
+	copyouts(nfa, to, s, 1);	/* duplicate outarcs */
 	cparc(nfa, con, from, s);	/* move constraint */
 	freearc(nfa, con);
 	to = s;
@@ -1226,7 +1303,7 @@ combine(
 
 /*
  - fixempties - get rid of EMPTY arcs
- ^ static VOID fixempties(struct nfa *, FILE *);
+ ^ static void fixempties(struct nfa *, FILE *);
  */
 static void
 fixempties(
@@ -1234,105 +1311,214 @@ fixempties(
     FILE *f)			/* for debug output; NULL none */
 {
     struct state *s;
+    struct state *s2;
     struct state *nexts;
     struct arc *a;
     struct arc *nexta;
-    int progress;
 
     /*
-     * Find and eliminate empties until there are no more.
+     * First, get rid of any states whose sole out-arc is an EMPTY,
+     * since they're basically just aliases for their successor.  The
+     * parsing algorithm creates enough of these that it's worth
+     * special-casing this.
      */
+    for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
+	nexts = s->next;
+	if (s->flag || s->nouts != 1) {
+	    continue;
+	}
+	a = s->outs;
+	assert(a != NULL && a->outchain == NULL);
+	if (a->type != EMPTY) {
+	    continue;
+	}
+	if (s != a->to) {
+	    moveins(nfa, s, a->to);
+	}
+	dropstate(nfa, s);
+    }
 
-    do {
-	progress = 0;
-	for (s = nfa->states; s != NULL && !NISERR()
-		&& s->no != FREESTATE; s = nexts) {
-	    nexts = s->next;
-	    for (a = s->outs; a != NULL && !NISERR(); a = nexta) {
-		nexta = a->outchain;
-		if (a->type == EMPTY && unempty(nfa, a)) {
-		    progress = 1;
-		}
-		assert(nexta == NULL || s->no != FREESTATE);
+    /*
+     * Similarly, get rid of any state with a single EMPTY in-arc, by
+     * folding it into its predecessor.
+     */
+    for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
+	nexts = s->next;
+	/* Check (never modify) that tmp is clear, as the next step needs */
+	assert(s->tmp == NULL);
+	if (s->flag || s->nins != 1) {
+	    continue;
+	}
+	a = s->ins;
+	assert(a != NULL && a->inchain == NULL);
+	if (a->type != EMPTY) {
+	    continue;
+	}
+	if (s != a->from) {
+	    moveouts(nfa, s, a->from);
+	}
+	dropstate(nfa, s);
+    }
+
+    /*
+     * For each remaining NFA state, find all other states that are
+     * reachable from it by a chain of one or more EMPTY arcs.  Then
+     * generate new arcs that eliminate the need for each such chain.
+     *
+     * If we just do this straightforwardly, the algorithm gets slow in
+     * complex graphs, because the same arcs get copied to all
+     * intermediate states of an EMPTY chain, and then uselessly pushed
+     * repeatedly to the chain's final state; we waste a lot of time in
+     * newarc's duplicate checking.  To improve matters, we decree that
+     * any state with only EMPTY out-arcs is "doomed" and will not be
+     * part of the final NFA. That can be ensured by not adding any new
+     * out-arcs to such a state. Having ensured that, we need not update
+     * the state's in-arcs list either; all arcs that might have gotten
+     * pushed forward to it will just get pushed directly to successor
+     * states.  This eliminates most of the useless duplicate arcs.
+     */
+    for (s = nfa->states; s != NULL && !NISERR(); s = s->next) {
+	for (s2 = emptyreachable(s, s); s2 != s && !NISERR();
+		s2 = nexts) {
+	    /*
+	     * If s2 is doomed, we decide that (1) we will always push
+	     * arcs forward to it, not pull them back to s; and (2) we
+	     * can optimize away the push-forward, per comment above.
+	     * So do nothing.
+	     */
+	    if (s2->flag || hasnonemptyout(s2)) {
+		replaceempty(nfa, s, s2);
 	    }
+
+	    /* Reset the tmp fields as we walk back */
+	    nexts = s2->tmp;
+	    s2->tmp = NULL;
 	}
-	if (progress && f != NULL) {
-	    dumpnfa(nfa, f);
+	s->tmp = NULL;
+    }
+    if (NISERR()) {
+	return;
+    }
+
+    /*
+     * Remove all the EMPTY arcs, since we don't need them anymore.
+     */
+    for (s = nfa->states; s != NULL; s = s->next) {
+	for (a = s->outs; a != NULL; a = nexta) {
+	    nexta = a->outchain;
+	    if (a->type == EMPTY) {
+		freearc(nfa, a);
+	    }
 	}
-    } while (progress && !NISERR());
+    }
+
+    /*
+     * And remove any states that have become useless.  (This cleanup is
+     * not very thorough, and would be even less so if we tried to
+     * combine it with the previous step; but cleanup() will take care
+     * of anything we miss.)
+     */
+    for (s = nfa->states; s != NULL; s = nexts) {
+	nexts = s->next;
+	if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
+	    dropstate(nfa, s);
+	}
+    }
+
+    if (f != NULL) {
+	dumpnfa(nfa, f);
+    }
 }
 
 /*
- - unempty - optimize out an EMPTY arc, if possible
- * Actually, as it stands this function always succeeds, but the return value
- * is kept with an eye on possible future changes.
- ^ static int unempty(struct nfa *, struct arc *);
+ - emptyreachable - recursively find all states reachable from s by EMPTY arcs
+ * The return value is the last such state found.  Its tmp field links back
+ * to the next-to-last such state, and so on back to s, so that all these
+ * states can be located without searching the whole NFA.
+ * The maximum recursion depth here is equal to the length of the longest
+ * loop-free chain of EMPTY arcs, which is surely no more than the size of
+ * the NFA, and in practice will be a lot less than that.
+ ^ static struct state *emptyreachable(struct state *, struct state *);
  */
-static int			/* 0 couldn't, 1 could */
-unempty(
-    struct nfa *nfa,
-    struct arc *a)
+static struct state *
+emptyreachable(
+    struct state *s,
+    struct state *lastfound)
 {
-    struct state *from = a->from;
-    struct state *to = a->to;
-    int usefrom;		/* work on from, as opposed to to? */
-
-    assert(a->type == EMPTY);
-    assert(from != nfa->pre && to != nfa->post);
+    struct arc *a;
 
-    if (from == to) {		/* vacuous loop */
-	freearc(nfa, a);
-	return 1;
+    s->tmp = lastfound;
+    lastfound = s;
+    for (a = s->outs; a != NULL; a = a->outchain) {
+	if (a->type == EMPTY && a->to->tmp == NULL) {
+	    lastfound = emptyreachable(a->to, lastfound);
+	}
     }
+    return lastfound;
+}
+
+/*
+ - replaceempty - replace an EMPTY arc chain with some non-empty arcs
+ * The EMPTY arc(s) should be deleted later, but we can't do it here because
+ * they may still be needed to identify other arc chains during fixempties().
+ ^ static void replaceempty(struct nfa *, struct state *, struct state *);
+ */
+static void
+replaceempty(
+    struct nfa *nfa,
+    struct state *from,
+    struct state *to)
+{
+    int fromouts;
+    int toins;
+
+    assert(from != to);
 
     /*
-     * Decide which end to work on.
+     * Create replacement arcs that bypass the need for the EMPTY chain.  We
+     * can do this either by pushing arcs forward (linking directly from
+     * "from"'s predecessors to "to") or by pulling them back (linking
+     * directly from "from" to "to"'s successors).  In general, we choose
+     * whichever way creates greater fan-out or fan-in, so as to improve the
+     * odds of reducing the other state to zero in-arcs or out-arcs and
+     * thereby being able to delete it.  However, if "from" is doomed (has no
+     * non-EMPTY out-arcs), we must keep it so, so always push forward in that
+     * case.
+     *
+     * The fan-out/fan-in comparison should count only non-EMPTY arcs.  If
+     * "from" is doomed, we can skip counting "to"'s arcs, since we want to
+     * force taking the copyins (push-forward) path in that case.
      */
+    fromouts = nonemptyouts(from);
+    toins = (fromouts == 0) ? 1 : nonemptyins(to);
 
-    usefrom = 1;		/* default: attack from */
-    if (from->nouts > to->nins) {
-	usefrom = 0;
-    } else if (from->nouts == to->nins) {
-	/*
-	 * Decide on secondary issue: move/copy fewest arcs.
-	 */
-
-	if (from->nins > to->nouts) {
-	    usefrom = 0;
-	}
+    if (fromouts > toins) {
+	copyouts(nfa, to, from, 0);
+	return;
+    }
+    if (fromouts < toins) {
+	copyins(nfa, from, to, 0);
+	return;
     }
 
-    freearc(nfa, a);
-    if (usefrom) {
-	if (from->nouts == 0) {
-	    /*
-	     * Was the state's only outarc.
-	     */
-
-	    moveins(nfa, from, to);
-	    freestate(nfa, from);
-	} else {
-	    copyins(nfa, from, to);
-	}
-    } else {
-	if (to->nins == 0) {
-	    /*
-	     * Was the state's only inarc.
-	     */
-
-	    moveouts(nfa, to, from);
-	    freestate(nfa, to);
-	} else {
-	    copyouts(nfa, to, from);
-	}
+    /*
+     * fromouts == toins.  Decide on secondary issue: copy fewest arcs.
+     *
+     * Doesn't seem to be worth the trouble to exclude empties from these
+     * comparisons; that takes extra time and doesn't seem to improve the
+     * resulting graph much.
+     */
+    if (from->nins > to->nouts) {
+	copyouts(nfa, to, from, 0);
+	return;
     }
 
-    return 1;
+    copyins(nfa, from, to, 0);
 }
 
 /*
  - cleanup - clean up NFA after optimizations
- ^ static VOID cleanup(struct nfa *);
+ ^ static void cleanup(struct nfa *);
  */
 static void
 cleanup(
@@ -1373,7 +1559,7 @@ cleanup(
 
 /*
  - markreachable - recursive marking of reachable states
- ^ static VOID markreachable(struct nfa *, struct state *, struct state *,
+ ^ static void markreachable(struct nfa *, struct state *, struct state *,
  ^ 	struct state *);
  */
 static void
@@ -1397,7 +1583,7 @@ markreachable(
 
 /*
  - markcanreach - recursive marking of states which can reach here
- ^ static VOID markcanreach(struct nfa *, struct state *, struct state *,
+ ^ static void markcanreach(struct nfa *, struct state *, struct state *,
  ^ 	struct state *);
  */
 static void
@@ -1445,7 +1631,7 @@ analyze(
 
 /*
  - compact - compact an NFA
- ^ static VOID compact(struct nfa *, struct cnfa *);
+ ^ static void compact(struct nfa *, struct cnfa *);
  */
 static void
 compact(
@@ -1539,7 +1725,7 @@ compact(
  - carcsort - sort compacted-NFA arcs by color
  * Really dumb algorithm, but if the list is long enough for that to matter,
  * you're in real trouble anyway.
- ^ static VOID carcsort(struct carc *, struct carc *);
+ ^ static void carcsort(struct carc *, struct carc *);
  */
 static void
 carcsort(
@@ -1568,7 +1754,7 @@ carcsort(
 
 /*
  - freecnfa - free a compacted NFA
- ^ static VOID freecnfa(struct cnfa *);
+ ^ static void freecnfa(struct cnfa *);
  */
 static void
 freecnfa(
@@ -1582,7 +1768,7 @@ freecnfa(
 
 /*
  - dumpnfa - dump an NFA in human-readable form
- ^ static VOID dumpnfa(struct nfa *, FILE *);
+ ^ static void dumpnfa(struct nfa *, FILE *);
  */
 static void
 dumpnfa(
@@ -1623,7 +1809,7 @@ dumpnfa(
 
 /*
  - dumpstate - dump an NFA state in human-readable form
- ^ static VOID dumpstate(struct state *, FILE *);
+ ^ static void dumpstate(struct state *, FILE *);
  */
 static void
 dumpstate(
@@ -1653,7 +1839,7 @@ dumpstate(
 
 /*
  - dumparcs - dump out-arcs in human-readable form
- ^ static VOID dumparcs(struct state *, FILE *);
+ ^ static void dumparcs(struct state *, FILE *);
  */
 static void
 dumparcs(
@@ -1696,7 +1882,7 @@ dumprarcs(
 
 /*
  - dumparc - dump one outarc in readable form, including prefixing tab
- ^ static VOID dumparc(struct arc *, struct state *, FILE *);
+ ^ static void dumparc(struct arc *, struct state *, FILE *);
  */
 static void
 dumparc(
@@ -1770,7 +1956,7 @@ dumparc(
 
 /*
  - dumpcnfa - dump a compacted NFA in human-readable form
- ^ static VOID dumpcnfa(struct cnfa *, FILE *);
+ ^ static void dumpcnfa(struct cnfa *, FILE *);
  */
 static void
 dumpcnfa(
@@ -1811,7 +1997,7 @@ dumpcnfa(
 
 /*
  - dumpcstate - dump a compacted-NFA state in human-readable form
- ^ static VOID dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+ ^ static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
  */
 static void
 dumpcstate(