1 files changed, 148 insertions, 98 deletions
diff --git a/generic/regcomp.c b/generic/regcomp.c
index c93eb24..211cd70 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -83,9 +83,6 @@ static int lexdigits(struct vars *, int, int, int);
 static int brenext(struct vars *, pchr);
 static void skip(struct vars *);
 static chr newline(NOPARMS);
-#ifdef REG_DEBUG
-static const chr *ch(NOPARMS);
-#endif
 static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
 /* === regc_color.c === */
 static void initcm(struct vars *, struct colormap *);
@@ -119,17 +116,22 @@ static void dropstate(struct nfa *, struct state *);
 static void freestate(struct nfa *, struct state *);
 static void destroystate(struct nfa *, struct state *);
 static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
+static void createarc(struct nfa *, int, pcolor, struct state *, struct state *);
 static struct arc *allocarc(struct nfa *, struct state *);
 static void freearc(struct nfa *, struct arc *);
+static void changearctarget(struct arc *, struct state *);
 static int hasnonemptyout(struct state *);
-static int nonemptyouts(struct state *);
-static int nonemptyins(struct state *);
 static struct arc *findarc(struct state *, int, pcolor);
 static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
+static void sortins(struct nfa *, struct state *);
+static int sortins_cmp(const void *, const void *);
+static void sortouts(struct nfa *, struct state *);
+static int sortouts_cmp(const void *, const void *);
 static void moveins(struct nfa *, struct state *, struct state *);
-static void copyins(struct nfa *, struct state *, struct state *, int);
+static void copyins(struct nfa *, struct state *, struct state *);
+static void mergeins(struct nfa *, struct state *, struct arc **, int);
 static void moveouts(struct nfa *, struct state *, struct state *);
-static void copyouts(struct nfa *, struct state *, struct state *, int);
+static void copyouts(struct nfa *, struct state *, struct state *);
 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
 static void delsub(struct nfa *, struct state *, struct state *);
 static void deltraverse(struct nfa *, struct state *, struct state *);
@@ -139,33 +141,40 @@ static void cleartraverse(struct nfa *, struct state *);
 static void specialcolors(struct nfa *);
 static long optimize(struct nfa *, FILE *);
 static void pullback(struct nfa *, FILE *);
-static int pull(struct nfa *, struct arc *);
+static int pull(struct nfa *, struct arc *, struct state **);
 static void pushfwd(struct nfa *, FILE *);
-static int push(struct nfa *, struct arc *);
+static int push(struct nfa *, struct arc *, struct state **);
 #define	INCOMPATIBLE	1	/* destroys arc */
 #define	SATISFIED	2	/* constraint satisfied */
 #define	COMPATIBLE	3	/* compatible but not satisfied yet */
 static int combine(struct arc *, struct arc *);
 static void fixempties(struct nfa *, FILE *);
-static struct state *emptyreachable(struct state *, struct state *);
-static void replaceempty(struct nfa *, struct state *, struct state *);
+static struct state *emptyreachable(struct nfa *, struct state *,
+			struct state *, struct arc **);
+static int	isconstraintarc(struct arc *);
+static int	hasconstraintout(struct state *);
+static void fixconstraintloops(struct nfa *, FILE *);
+static int	findconstraintloop(struct nfa *, struct state *);
+static void breakconstraintloop(struct nfa *, struct state *);
+static void clonesuccessorstates(struct nfa *, struct state *, struct state *,
+		 struct state *, struct arc *, char *, char *, int);
 static void cleanup(struct nfa *);
 static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
 static long analyze(struct nfa *);
 static void compact(struct nfa *, struct cnfa *);
-static void carcsort(struct carc *, struct carc *);
+static void carcsort(struct carc *, size_t);
+static int carc_cmp(const void *, const void *);
 static void freecnfa(struct cnfa *);
 static void dumpnfa(struct nfa *, FILE *);
 #ifdef REG_DEBUG
 static void dumpstate(struct state *, FILE *);
 static void dumparcs(struct state *, FILE *);
-static int dumprarcs(struct arc *, struct state *, FILE *, int);
 static void dumparc(struct arc *, struct state *, FILE *);
 #endif
 static void dumpcnfa(struct cnfa *, FILE *);
 #ifdef REG_DEBUG
-static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
 #endif
 /* === regc_cvec.c === */
 static struct cvec *clearcvec(struct cvec *);
@@ -210,11 +219,12 @@ struct vars {
     struct subre *tree;		/* subexpression tree */
     struct subre *treechain;	/* all tree nodes allocated */
     struct subre *treefree;	/* any free tree nodes */
-    int ntree;			/* number of tree nodes */
+    int ntree;			/* number of tree nodes, plus one */
     struct cvec *cv;		/* interface cvec */
     struct cvec *cv2;		/* utility cvec */
     struct subre *lacons;	/* lookahead-constraint vector */
     int nlacons;		/* size of lacons */
+    size_t spaceused;		/* approx. space used for compilation */
 };
 
 /* parsing macros; most know that `v' is the struct vars pointer */
@@ -223,13 +233,13 @@ struct vars {
 #define	EAT(t)	(SEE(t) && next(v))	/* if next is this, swallow it */
 #define	VISERR(vv)	((vv)->err != 0)/* have we seen an error yet? */
 #define	ISERR()	VISERR(v)
-#define	VERR(vv,e) \
-	((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e)))
+#define VERR(vv,e)	((vv)->nexttype = EOS, \
+			 (vv)->err = ((vv)->err ? (vv)->err : (e)))
 #define	ERR(e)	VERR(v, e)		/* record an error */
 #define	NOERR()	{if (ISERR()) return;}	/* if error seen, return */
 #define	NOERRN()	{if (ISERR()) return NULL;}	/* NOERR with retval */
 #define	NOERRZ()	{if (ISERR()) return 0;}	/* NOERR with retval */
-#define	INSIST(c, e)	((c) ? 0 : ERR(e))	/* if condition false, error */
+#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0)	/* error if c false */
 #define	NOTE(b)	(v->re->re_info |= (b))		/* note visible condition */
 #define	EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)
 
@@ -258,12 +268,14 @@ struct vars {
 	((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
 
 /* static function list */
-static struct fns functions = {
+static const struct fns functions = {
     rfree,			/* regfree insides */
 };
 
 /*
  - compile - compile regular expression
+ * Note: on failure, no resources remain allocated, so regfree()
+ * need not be applied to re.
  ^ int compile(regex_t *, const chr *, size_t, int);
  */
 int
@@ -324,6 +336,7 @@ compile(
     v->cv2 = NULL;
     v->lacons = NULL;
     v->nlacons = 0;
+    v->spaceused = 0;
     re->re_magic = REMAGIC;
     re->re_info = 0;		/* bits get set during parse */
     re->re_csize = sizeof(chr);
@@ -593,13 +606,15 @@ makesearch(
 		break;
 	    }
 	}
-	if (b != NULL && s->tmp == NULL) {
-	    /*
-	     * Must be split if not already in the list (fixes bugs 505048,
-	     * 230589, 840258, 504785).
-	     */
 
-	    s->tmp = slist;
+	/*
+	 * We want to mark states as being in the list already by having
+	 * non-NULL tmp fields, but we can't just store the old slist value in
+	 * tmp because that doesn't work for the first such state.  Instead,
+	 * the first list entry gets its own address in tmp.
+	 */
+	if (b != NULL && s->tmp == NULL) {
+	    s->tmp = (slist != NULL) ? slist : s;
 	    slist = s;
 	}
     }
@@ -610,8 +625,9 @@ makesearch(
 
     for (s=slist ; s!=NULL ; s=s2) {
 	s2 = newstate(nfa);
-
-	copyouts(nfa, s, s2, 1);
+	NOERR();
+	copyouts(nfa, s, s2);
+	NOERR();
 	for (a=s->ins ; a!=NULL ; a=b) {
 	    b = a->inchain;
 
@@ -620,7 +636,7 @@ makesearch(
 		freearc(nfa, a);
 	    }
 	}
-	s2 = s->tmp;
+	s2 = (s->tmp != s) ? s->tmp : NULL;
 	s->tmp = NULL;		/* clean up while we're at it */
     }
 }
@@ -982,6 +998,7 @@ parseqatom(
 	NOERR();
 	assert(v->nextvalue > 0);
 	atom = subre(v, 'b', BACKR, lp, rp);
+	NOERR();
 	subno = v->nextvalue;
 	atom->subno = subno;
 	EMPTYARC(lp, rp);	/* temporarily, so there's something */
@@ -996,13 +1013,13 @@ parseqatom(
     switch (v->nexttype) {
     case '*':
 	m = 0;
-	n = INFINITY;
+	n = DUPINF;
 	qprefer = (v->nextvalue) ? LONGER : SHORTER;
 	NEXT();
 	break;
     case '+':
 	m = 1;
-	n = INFINITY;
+	n = DUPINF;
 	qprefer = (v->nextvalue) ? LONGER : SHORTER;
 	NEXT();
 	break;
@@ -1019,7 +1036,7 @@ parseqatom(
 	    if (SEE(DIGIT)) {
 		n = scannum(v);
 	    } else {
-		n = INFINITY;
+		n = DUPINF;
 	    }
 	    if (m > n) {
 		ERR(REG_BADBR);
@@ -1102,11 +1119,17 @@ parseqatom(
     /*
      * Prepare a general-purpose state skeleton.
      *
-     *    ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
-     *   /                                            /
-     * [lp] ----> [s2] ----bypass---------------------
+     * In the no-backrefs case, we want this:
+     *
+     * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
      *
-     * where bypass is an empty, and prefix is some repetitions of atom
+     * where prefix is some repetitions of atom.  In the general case we need
+     *
+     * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+     *
+     * where the iterator wraps around [begin] ---atom---> [end]
+     *
+     * We make the s state here for both cases; s2 is made below if needed
      */
 
     s = newstate(v->nfa);	/* first, new endpoints for the atom */
@@ -1117,11 +1140,9 @@ parseqatom(
     NOERR();
     atom->begin = s;
     atom->end = s2;
-    s = newstate(v->nfa);	/* and spots for prefix and bypass */
-    s2 = newstate(v->nfa);
+    s = newstate(v->nfa);	/* set up starting state */
     NOERR();
     EMPTYARC(lp, s);
-    EMPTYARC(lp, s2);
     NOERR();
 
     /*
@@ -1129,6 +1150,7 @@ parseqatom(
      */
 
     t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
+    NOERR();
     t->left = atom;
     atomp = &t->left;
 
@@ -1142,6 +1164,7 @@ parseqatom(
 
     assert(top->op == '=' && top->left == NULL && top->right == NULL);
     top->left = subre(v, '=', top->flags, top->begin, lp);
+    NOERR();
     top->op = '.';
     top->right = t;
 
@@ -1166,27 +1189,8 @@ parseqatom(
     }
 
     /*
-     * It's quantifier time; first, turn x{0,...} into x{1,...}|empty
-     */
-
-    if (m == 0) {
-	EMPTYARC(s2, atom->end);/* the bypass */
-	assert(PREF(qprefer) != 0);
-	f = COMBINE(qprefer, atom->flags);
-	t = subre(v, '|', f, lp, atom->end);
-	NOERR();
-	t->left = atom;
-	t->right = subre(v, '|', PREF(f), s2, atom->end);
-	NOERR();
-	t->right->left = subre(v, '=', 0, s2, atom->end);
-	NOERR();
-	*atomp = t;
-	atomp = &t->left;
-	m = 1;
-    }
-
-    /*
-     * Deal with the rest of the quantifier.
+     * It's quantifier time.  If the atom is just a backref, we'll let it deal
+     * with quantifiers internally.
      */
 
     if (atomtype == BACKREF) {
@@ -1204,21 +1208,29 @@ parseqatom(
 	atom->min = (short) m;
 	atom->max = (short) n;
 	atom->flags |= COMBINE(qprefer, atom->flags);
+	/* rest of branch can be strung starting from atom->end */
+	s2 = atom->end;
     } else if (m == 1 && n == 1) {
 	/*
 	 * No/vacuous quantifier: done.
 	 */
 
 	EMPTYARC(s, atom->begin);	/* empty prefix */
-    } else {
+	/* rest of branch can be strung starting from atom->end */
+	s2 = atom->end;
+    } else if (m > 0 && !(atom->flags & BACKR)) {
 	/*
-	 * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second
-	 * x
+	 * If there are no backrefs involved, we can turn x{m,n} into
+	 * x{m-1,n-1}x, with capturing parens in only the second x.  This
+	 * is valid because we only care about capturing matches from the
+	 * final iteration of the quantifier.  It's a win because we can
+	 * implement the backref-free left side as a plain DFA node, since
+	 * we don't really care where its submatches are.
 	 */
 
 	dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
-	assert(m >= 1 && m != INFINITY && n >= 1);
-	repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
+	assert(m >= 1 && m != DUPINF && n >= 1);
+	repeat(v, s, atom->begin, m-1, (n == DUPINF) ? n : n-1);
 	f = COMBINE(qprefer, atom->flags);
 	t = subre(v, '.', f, s, atom->end);	/* prefix and atom */
 	NOERR();
@@ -1226,6 +1238,24 @@ parseqatom(
 	NOERR();
 	t->right = atom;
 	*atomp = t;
+	/* rest of branch can be strung starting from atom->end */
+	s2 = atom->end;
+    } else {
+	/* general case: need an iteration node */
+	s2 = newstate(v->nfa);
+	NOERR();
+	moveouts(v->nfa, atom->end, s2);
+	NOERR();
+	dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+	repeat(v, s, s2, m, n);
+	f = COMBINE(qprefer, atom->flags);
+	t = subre(v, '*', f, s, s2);
+	NOERR();
+	t->min = (short) m;
+	t->max = (short) n;
+	t->left = atom;
+	*atomp = t;
+	/* rest of branch is to be strung from iteration's end state */
     }
 
     /*
@@ -1234,10 +1264,10 @@ parseqatom(
 
     t = top->right;
     if (!(SEE('|') || SEE(stopper) || SEE(EOS))) {
-	t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+	t->right = parsebranch(v, stopper, type, s2, rp, 1);
     } else {
-	EMPTYARC(atom->end, rp);
-	t->right = subre(v, '=', 0, atom->end, rp);
+	EMPTYARC(s2, rp);
+	t->right = subre(v, '=', 0, s2, rp);
     }
     NOERR();
     assert(SEE('|') || SEE(stopper) || SEE(EOS));
@@ -1304,6 +1334,8 @@ scannum(
 
 /*
  - repeat - replicate subNFA for quantifiers
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
  * The duplication sequences used here are chosen carefully so that any
  * pointers starting out pointing into the subexpression end up pointing into
  * the last occurrence. (Note that it may not be strung between the same left
@@ -1323,7 +1355,7 @@ repeat(
 #define	SOME		2
 #define	INF		3
 #define	PAIR(x, y)	((x)*4 + (y))
-#define	REDUCE(x)	( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
+#define	REDUCE(x)	( ((x) == DUPINF) ? INF : (((x) > 1) ? SOME : (x)) )
     const int rm = REDUCE(m);
     const int rn = REDUCE(n);
     struct state *s, *s2;
@@ -1713,11 +1745,11 @@ subre(
 	v->treechain = ret;
     }
 
-    assert(strchr("|.b(=", op) != NULL);
+    assert(strchr("=b|.*(", op) != NULL);
 
     ret->op = op;
     ret->flags = flags;
-    ret->retry = 0;
+    ret->id = 0;		/* will be assigned later */
     ret->subno = 0;
     ret->min = ret->max = 1;
     ret->left = NULL;
@@ -1770,7 +1802,8 @@ freesrnode(
     }
     sr->flags = 0;
 
-    if (v != NULL) {
+    if (v != NULL && v->treechain != NULL) {
+	/* we're still parsing, maybe we can reuse the subre */
 	sr->left = v->treefree;
 	v->treefree = sr;
     } else {
@@ -1798,7 +1831,7 @@ optst(
 }
 
 /*
- - numst - number tree nodes (assigning retry indexes)
+ - numst - number tree nodes (assigning "id" indexes)
  ^ static int numst(struct subre *, int);
  */
 static int			/* next number */
@@ -1811,7 +1844,7 @@ numst(
     assert(t != NULL);
 
     i = start;
-    t->retry = (short) i++;
+    t->id = (short) i++;
     if (t->left != NULL) {
 	i = numst(t->left, i);
     }
@@ -1823,6 +1856,19 @@ numst(
 
 /*
  - markst - mark tree nodes as INUSE
+ * Note: this is a great deal more subtle than it looks.  During initial
+ * parsing of a regex, all subres are linked into the treechain list;
+ * discarded ones are also linked into the treefree list for possible reuse.
+ * After we are done creating all subres required for a regex, we run markst()
+ * then cleanst(), which results in discarding all subres not reachable from
+ * v->tree.  We then clear v->treechain, indicating that subres must be found
+ * by descending from v->tree.  This changes the behavior of freesubre(): it
+ * will henceforth FREE() unwanted subres rather than sticking them into the
+ * treefree list.  (Doing that any earlier would result in dangling links in
+ * the treechain list.)  This all means that freev() will clean up correctly
+ * if invoked before or after markst()+cleanst(); but it would not work if
+ * called partway through this state conversion, so we mustn't error out
+ * in or between these two functions.
  ^ static void markst(struct subre *);
  */
 static void
@@ -1929,24 +1975,26 @@ newlacon(
     struct state *end,
     int pos)
 {
-    struct subre *sub;
     int n;
+    struct subre *newlacons;
+    struct subre *sub;
 
     if (v->nlacons == 0) {
-	v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
 	n = 1;		/* skip 0th */
-	v->nlacons = 2;
+	newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
     } else {
-	v->lacons = (struct subre *) REALLOC(v->lacons,
-		(v->nlacons+1)*sizeof(struct subre));
-	n = v->nlacons++;
+	n = v->nlacons;
+	newlacons = (struct subre *) REALLOC(v->lacons,
+					     (n + 1) * sizeof(struct subre));
     }
 
-    if (v->lacons == NULL) {
+    if (newlacons == NULL) {
 	ERR(REG_ESPACE);
 	return 0;
     }
 
+    v->lacons = newlacons;
+    v->nlacons = n + 1;
     sub = &v->lacons[n];
     sub->begin = begin;
     sub->end = end;
@@ -1994,18 +2042,20 @@ rfree(
     g = (struct guts *) re->re_guts;
     re->re_guts = NULL;
     re->re_fns = NULL;
-    g->magic = 0;
-    freecm(&g->cmap);
-    if (g->tree != NULL) {
-	freesubre(NULL, g->tree);
-    }
-    if (g->lacons != NULL) {
-	freelacons(g->lacons, g->nlacons);
-    }
-    if (!NULLCNFA(g->search)) {
-	freecnfa(&g->search);
+    if (g != NULL) {
+	g->magic = 0;
+	freecm(&g->cmap);
+	if (g->tree != NULL) {
+	    freesubre(NULL, g->tree);
+	}
+	if (g->lacons != NULL) {
+	    freelacons(g->lacons, g->nlacons);
+	}
+	if (!NULLCNFA(g->search)) {
+	    freecnfa(&g->search);
+	}
+	FREE(g);
     }
-    FREE(g);
 }
 
 /*
@@ -2037,11 +2087,11 @@ dump(
 
     fprintf(f, "\n\n\n========= DUMP ==========\n");
     fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
-	    re->re_nsub, re->re_info, re->re_csize, g->ntree);
+	    (int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
 
     dumpcolors(&g->cmap, f);
     if (!NULLCNFA(g->search)) {
-	printf("\nsearch:\n");
+	fprintf(f, "\nsearch:\n");
 	dumpcnfa(&g->search, f);
     }
     for (i = 1; i < g->nlacons; i++) {
@@ -2108,7 +2158,7 @@ stdump(
     }
     if (t->min != 1 || t->max != 1) {
 	fprintf(f, " {%d,", t->min);
-	if (t->max != INFINITY) {
+	if (t->max != DUPINF) {
 	    fprintf(f, "%d", t->max);
 	}
 	fprintf(f, "}");
@@ -2146,14 +2196,14 @@ stid(
     size_t bufsize)
 {
     /*
-     * Big enough for hex int or decimal t->retry?
+     * Big enough for hex int or decimal t->id?
      */
 
-    if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) {
+    if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->id)*3 + 1) {
 	return "unable";
     }
-    if (t->retry != 0) {
-	sprintf(buf, "%d", t->retry);
+    if (t->id != 0) {
+	sprintf(buf, "%d", t->id);
     } else {
 	sprintf(buf, "%p", t);
     }