[1115587][0e0e150e49] Major fix for regexp handling of quantified backrefs.

Contributed by Tom Lane from the Postgres project.
author: dgp <dgp@users.sourceforge.net> 2015-09-21 19:04:40 (GMT)
committer: dgp <dgp@users.sourceforge.net> 2015-09-21 19:04:40 (GMT)
commit: 4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6 (patch)
tree: 0a529f6ed9e006d0521d481343a52d9f8f2818bd /generic/regcomp.c
parent: 87744f43c95c4da096d19a16453155500adadf6f (diff)
download: tcl-4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6.zip
tcl-4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6.tar.gz
tcl-4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6.tar.bz2
1 files changed, 55 insertions, 42 deletions
diff --git a/generic/regcomp.c b/generic/regcomp.c
index 6fa3964..b8a5a87 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -1105,11 +1105,17 @@ parseqatom(
     /*
      * Prepare a general-purpose state skeleton.
      *
-     *    ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
-     *   /                                            /
-     * [lp] ----> [s2] ----bypass---------------------
+     * In the no-backrefs case, we want this:
      *
-     * where bypass is an empty, and prefix is some repetitions of atom
+     * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+     *
+     * where prefix is some repetitions of atom.  In the general case we need
+     *
+     * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+     *
+     * where the iterator wraps around [begin] ---atom---> [end]
+     *
+     * We make the s state here for both cases; s2 is made below if needed
      */
 
     s = newstate(v->nfa);	/* first, new endpoints for the atom */
@@ -1120,11 +1126,9 @@ parseqatom(
     NOERR();
     atom->begin = s;
     atom->end = s2;
-    s = newstate(v->nfa);	/* and spots for prefix and bypass */
-    s2 = newstate(v->nfa);
+    s = newstate(v->nfa);	/* set up starting state */
     NOERR();
     EMPTYARC(lp, s);
-    EMPTYARC(lp, s2);
     NOERR();
 
     /*
@@ -1171,27 +1175,8 @@ parseqatom(
     }
 
     /*
-     * It's quantifier time; first, turn x{0,...} into x{1,...}|empty
-     */
-
-    if (m == 0) {
-	EMPTYARC(s2, atom->end);/* the bypass */
-	assert(PREF(qprefer) != 0);
-	f = COMBINE(qprefer, atom->flags);
-	t = subre(v, '|', f, lp, atom->end);
-	NOERR();
-	t->left = atom;
-	t->right = subre(v, '|', PREF(f), s2, atom->end);
-	NOERR();
-	t->right->left = subre(v, '=', 0, s2, atom->end);
-	NOERR();
-	*atomp = t;
-	atomp = &t->left;
-	m = 1;
-    }
-
-    /*
-     * Deal with the rest of the quantifier.
+     * It's quantifier time.  If the atom is just a backref, we'll let it deal
+     * with quantifiers internally.
      */
 
     if (atomtype == BACKREF) {
@@ -1209,16 +1194,24 @@ parseqatom(
 	atom->min = (short) m;
 	atom->max = (short) n;
 	atom->flags |= COMBINE(qprefer, atom->flags);
+	/* rest of branch can be strung starting from atom->end */
+	s2 = atom->end;
     } else if (m == 1 && n == 1) {
 	/*
 	 * No/vacuous quantifier: done.
 	 */
 
 	EMPTYARC(s, atom->begin);	/* empty prefix */
-    } else {
+	/* rest of branch can be strung starting from atom->end */
+	s2 = atom->end;
+    } else if (m > 0 && !(atom->flags & BACKR)) {
 	/*
-	 * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second
-	 * x
+	 * If there's no backrefs involved, we can turn x{m,n} into
+	 * x{m-1,n-1}x, with capturing parens in only the second x.  This
+	 * is valid because we only care about capturing matches from the
+	 * final iteration of the quantifier.  It's a win because we can
+	 * implement the backref-free left side as a plain DFA node, since
+	 * we don't really care where its submatches are.
 	 */
 
 	dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
@@ -1231,6 +1224,24 @@ parseqatom(
 	NOERR();
 	t->right = atom;
 	*atomp = t;
+	/* rest of branch can be strung starting from atom->end */
+	s2 = atom->end;
+    } else {
+	/* general case: need an iteration node */
+	s2 = newstate(v->nfa);
+	NOERR();
+	moveouts(v->nfa, atom->end, s2);
+	NOERR();
+	dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+	repeat(v, s, s2, m, n);
+	f = COMBINE(qprefer, atom->flags);
+	t = subre(v, '*', f, s, s2);
+	NOERR();
+	t->min = (short) m;
+	t->max = (short) n;
+	t->left = atom;
+	*atomp = t;
+	/* rest of branch is to be strung from iteration's end state */
     }
 
     /*
@@ -1239,10 +1250,10 @@ parseqatom(
 
     t = top->right;
     if (!(SEE('|') || SEE(stopper) || SEE(EOS))) {
-	t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+	t->right = parsebranch(v, stopper, type, s2, rp, 1);
     } else {
-	EMPTYARC(atom->end, rp);
-	t->right = subre(v, '=', 0, atom->end, rp);
+	EMPTYARC(s2, rp);
+	t->right = subre(v, '=', 0, s2, rp);
     }
     NOERR();
     assert(SEE('|') || SEE(stopper) || SEE(EOS));
@@ -1309,6 +1320,8 @@ scannum(
 
 /*
  - repeat - replicate subNFA for quantifiers
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
  * The duplication sequences used here are chosen carefully so that any
  * pointers starting out pointing into the subexpression end up pointing into
  * the last occurrence. (Note that it may not be strung between the same left
@@ -1718,11 +1731,11 @@ subre(
 	v->treechain = ret;
     }
 
-    assert(strchr("|.b(=", op) != NULL);
+    assert(strchr("=b|.*(", op) != NULL);
 
     ret->op = op;
     ret->flags = flags;
-    ret->retry = 0;
+    ret->id = 0;		/* will be assigned later */
     ret->subno = 0;
     ret->min = ret->max = 1;
     ret->left = NULL;
@@ -1803,7 +1816,7 @@ optst(
 }
 
 /*
- - numst - number tree nodes (assigning retry indexes)
+ - numst - number tree nodes (assigning "id" indexes)
  ^ static int numst(struct subre *, int);
  */
 static int			/* next number */
@@ -1816,7 +1829,7 @@ numst(
     assert(t != NULL);
 
     i = start;
-    t->retry = (short) i++;
+    t->id = (short) i++;
     if (t->left != NULL) {
 	i = numst(t->left, i);
     }
@@ -2151,14 +2164,14 @@ stid(
     size_t bufsize)
 {
     /*
-     * Big enough for hex int or decimal t->retry?
+     * Big enough for hex int or decimal t->id?
      */
 
-    if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) {
+    if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->id)*3 + 1) {
 	return "unable";
     }
-    if (t->retry != 0) {
-	sprintf(buf, "%d", t->retry);
+    if (t->id != 0) {
+	sprintf(buf, "%d", t->id);
     } else {
 	sprintf(buf, "%p", t);
     }
author	dgp <dgp@users.sourceforge.net>	2015-09-21 19:04:40 (GMT)
committer	dgp <dgp@users.sourceforge.net>	2015-09-21 19:04:40 (GMT)
commit	4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6 (patch)
tree	0a529f6ed9e006d0521d481343a52d9f8f2818bd /generic/regcomp.c
parent	87744f43c95c4da096d19a16453155500adadf6f (diff)
download	tcl-4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6.zip tcl-4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6.tar.gz tcl-4669ea6d0c14cef0c13cf12ba3e54e97e197a5d6.tar.bz2