summaryrefslogtreecommitdiffstats
path: root/generic/regcomp.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2016-01-31 00:20:59 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2016-01-31 00:20:59 (GMT)
commitba6805a340d02c987ab075a0b9cd0c799f279441 (patch)
treef6fa856abeb8108e1a588f714f802829655bee43 /generic/regcomp.c
parent4dfcbbea544e672812bc7d4de3b7571e14cccc22 (diff)
parent7f3416400096e5498584c1ee4cad6b110314a936 (diff)
downloadtcl-ba6805a340d02c987ab075a0b9cd0c799f279441.zip
tcl-ba6805a340d02c987ab075a0b9cd0c799f279441.tar.gz
tcl-ba6805a340d02c987ab075a0b9cd0c799f279441.tar.bz2
Minor clean-ups (e.g. turn back const -> CONST, as common in 8.5 branch)8_5_with_8_6_regexp
Diffstat (limited to 'generic/regcomp.c')
-rw-r--r--generic/regcomp.c185
1 files changed, 109 insertions, 76 deletions
diff --git a/generic/regcomp.c b/generic/regcomp.c
index 8287b7e..2bc0744 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -139,7 +139,7 @@ static void cloneouts(struct nfa *, struct state *, struct state *, struct state
static void delsub(struct nfa *, struct state *, struct state *);
static void deltraverse(struct nfa *, struct state *, struct state *);
static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
-static void duptraverse(struct nfa *, struct state *, struct state *);
+static void duptraverse(struct nfa *, struct state *, struct state *, int);
static void cleartraverse(struct nfa *, struct state *);
static void specialcolors(struct nfa *);
static long optimize(struct nfa *, FILE *);
@@ -177,7 +177,7 @@ static void dumparc(struct arc *, struct state *, FILE *);
#endif
static void dumpcnfa(struct cnfa *, FILE *);
#ifdef REG_DEBUG
-static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
#endif
/* === regc_cvec.c === */
static struct cvec *clearcvec(struct cvec *);
@@ -222,7 +222,7 @@ struct vars {
struct subre *tree; /* subexpression tree */
struct subre *treechain; /* all tree nodes allocated */
struct subre *treefree; /* any free tree nodes */
- int ntree; /* number of tree nodes */
+ int ntree; /* number of tree nodes, plus one */
struct cvec *cv; /* interface cvec */
struct cvec *cv2; /* utility cvec */
struct subre *lacons; /* lookahead-constraint vector */
@@ -236,13 +236,13 @@ struct vars {
#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
#define VISERR(vv) ((vv)->err != 0)/* have we seen an error yet? */
#define ISERR() VISERR(v)
-#define VERR(vv,e) \
- ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e)))
+#define VERR(vv,e) ((vv)->nexttype = EOS, \
+ (vv)->err = ((vv)->err ? (vv)->err : (e)))
#define ERR(e) VERR(v, e) /* record an error */
#define NOERR() {if (ISERR()) return;} /* if error seen, return */
#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
-#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
+#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */
#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
@@ -271,12 +271,14 @@ struct vars {
((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
/* static function list */
-static struct fns functions = {
+static const struct fns functions = {
rfree, /* regfree insides */
};
/*
- compile - compile regular expression
+ * Note: on failure, no resources remain allocated, so regfree()
+ * need not be applied to re.
^ int compile(regex_t *, const chr *, size_t, int);
*/
int
@@ -607,7 +609,7 @@ makesearch(
break;
}
}
-
+
/*
* We want to mark states as being in the list already by having non
* NULL tmp fields, but we can't just store the old slist value in tmp
@@ -1120,11 +1122,17 @@ parseqatom(
/*
* Prepare a general-purpose state skeleton.
*
- * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
- * / /
- * [lp] ----> [s2] ----bypass---------------------
+ * In the no-backrefs case, we want this:
+ *
+ * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+ *
+ * where prefix is some repetitions of atom. In the general case we need
+ *
+ * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+ *
+ * where the iterator wraps around [begin] ---atom---> [end]
*
- * where bypass is an empty, and prefix is some repetitions of atom
+ * We make the s state here for both cases; s2 is made below if needed
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
@@ -1135,11 +1143,9 @@ parseqatom(
NOERR();
atom->begin = s;
atom->end = s2;
- s = newstate(v->nfa); /* and spots for prefix and bypass */
- s2 = newstate(v->nfa);
+ s = newstate(v->nfa); /* set up starting state */
NOERR();
EMPTYARC(lp, s);
- EMPTYARC(lp, s2);
NOERR();
/*
@@ -1186,27 +1192,8 @@ parseqatom(
}
/*
- * It's quantifier time; first, turn x{0,...} into x{1,...}|empty
- */
-
- if (m == 0) {
- EMPTYARC(s2, atom->end);/* the bypass */
- assert(PREF(qprefer) != 0);
- f = COMBINE(qprefer, atom->flags);
- t = subre(v, '|', f, lp, atom->end);
- NOERR();
- t->left = atom;
- t->right = subre(v, '|', PREF(f), s2, atom->end);
- NOERR();
- t->right->left = subre(v, '=', 0, s2, atom->end);
- NOERR();
- *atomp = t;
- atomp = &t->left;
- m = 1;
- }
-
- /*
- * Deal with the rest of the quantifier.
+ * It's quantifier time. If the atom is just a backref, we'll let it deal
+ * with quantifiers internally.
*/
if (atomtype == BACKREF) {
@@ -1224,16 +1211,24 @@ parseqatom(
atom->min = (short) m;
atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags);
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
} else if (m == 1 && n == 1) {
/*
* No/vacuous quantifier: done.
*/
EMPTYARC(s, atom->begin); /* empty prefix */
- } else {
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
+ } else if (m > 0 && !(atom->flags & BACKR)) {
/*
- * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second
- * x
+ * If there's no backrefs involved, we can turn x{m,n} into
+ * x{m-1,n-1}x, with capturing parens in only the second x. This
+ * is valid because we only care about capturing matches from the
+ * final iteration of the quantifier. It's a win because we can
+ * implement the backref-free left side as a plain DFA node, since
+ * we don't really care where its submatches are.
*/
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
@@ -1246,6 +1241,24 @@ parseqatom(
NOERR();
t->right = atom;
*atomp = t;
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
+ } else {
+ /* general case: need an iteration node */
+ s2 = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, atom->end, s2);
+ NOERR();
+ dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+ repeat(v, s, s2, m, n);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '*', f, s, s2);
+ NOERR();
+ t->min = (short) m;
+ t->max = (short) n;
+ t->left = atom;
+ *atomp = t;
+ /* rest of branch is to be strung from iteration's end state */
}
/*
@@ -1254,10 +1267,10 @@ parseqatom(
t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS))) {
- t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+ t->right = parsebranch(v, stopper, type, s2, rp, 1);
} else {
- EMPTYARC(atom->end, rp);
- t->right = subre(v, '=', 0, atom->end, rp);
+ EMPTYARC(s2, rp);
+ t->right = subre(v, '=', 0, s2, rp);
}
NOERR();
assert(SEE('|') || SEE(stopper) || SEE(EOS));
@@ -1324,6 +1337,8 @@ scannum(
/*
- repeat - replicate subNFA for quantifiers
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
* The duplication sequences used here are chosen carefully so that any
* pointers starting out pointing into the subexpression end up pointing into
* the last occurrence. (Note that it may not be strung between the same left
@@ -1484,7 +1499,7 @@ brackpart(
celt startc, endc;
struct cvec *cv;
const chr *startp, *endp;
- chr c[1];
+ chr c;
/*
* Parse something, get rid of special cases, take shortcuts.
@@ -1496,7 +1511,7 @@ brackpart(
return;
break;
case PLAIN:
- c[0] = v->nextvalue;
+ c = v->nextvalue;
NEXT();
/*
@@ -1504,10 +1519,10 @@ brackpart(
*/
if (!SEE(RANGE)) {
- onechr(v, c[0], lp, rp);
+ onechr(v, c, lp, rp);
return;
}
- startc = element(v, c, c+1);
+ startc = element(v, &c, &c+1);
NOERR();
break;
case COLLEL:
@@ -1551,9 +1566,9 @@ brackpart(
switch (v->nexttype) {
case PLAIN:
case RANGE:
- c[0] = v->nextvalue;
+ c = v->nextvalue;
NEXT();
- endc = element(v, c, c+1);
+ endc = element(v, &c, &c+1);
NOERR();
break;
case COLLEL:
@@ -1733,11 +1748,11 @@ subre(
v->treechain = ret;
}
- assert(strchr("|.b(=", op) != NULL);
+ assert(strchr("=b|.*(", op) != NULL);
ret->op = op;
ret->flags = flags;
- ret->retry = 0;
+ ret->id = 0; /* will be assigned later */
ret->subno = 0;
ret->min = ret->max = 1;
ret->left = NULL;
@@ -1790,7 +1805,8 @@ freesrnode(
}
sr->flags = 0;
- if (v != NULL) {
+ if (v != NULL && v->treechain != NULL) {
+ /* we're still parsing, maybe we can reuse the subre */
sr->left = v->treefree;
v->treefree = sr;
} else {
@@ -1818,7 +1834,7 @@ optst(
}
/*
- - numst - number tree nodes (assigning retry indexes)
+ - numst - number tree nodes (assigning "id" indexes)
^ static int numst(struct subre *, int);
*/
static int /* next number */
@@ -1831,7 +1847,7 @@ numst(
assert(t != NULL);
i = start;
- t->retry = (short) i++;
+ t->id = (short) i++;
if (t->left != NULL) {
i = numst(t->left, i);
}
@@ -1843,6 +1859,19 @@ numst(
/*
- markst - mark tree nodes as INUSE
+ * Note: this is a great deal more subtle than it looks. During initial
+ * parsing of a regex, all subres are linked into the treechain list;
+ * discarded ones are also linked into the treefree list for possible reuse.
+ * After we are done creating all subres required for a regex, we run markst()
+ * then cleanst(), which results in discarding all subres not reachable from
+ * v->tree. We then clear v->treechain, indicating that subres must be found
+ * by descending from v->tree. This changes the behavior of freesubre(): it
+ * will henceforth FREE() unwanted subres rather than sticking them into the
+ * treefree list. (Doing that any earlier would result in dangling links in
+ * the treechain list.) This all means that freev() will clean up correctly
+ * if invoked before or after markst()+cleanst(); but it would not work if
+ * called partway through this state conversion, so we mustn't error out
+ * in or between these two functions.
^ static void markst(struct subre *);
*/
static void
@@ -1949,24 +1978,26 @@ newlacon(
struct state *end,
int pos)
{
- struct subre *sub;
int n;
+ struct subre *newlacons;
+ struct subre *sub;
if (v->nlacons == 0) {
- v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
n = 1; /* skip 0th */
- v->nlacons = 2;
+ newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
} else {
- v->lacons = (struct subre *) REALLOC(v->lacons,
- (v->nlacons+1)*sizeof(struct subre));
- n = v->nlacons++;
+ n = v->nlacons;
+ newlacons = (struct subre *) REALLOC(v->lacons,
+ (n + 1) * sizeof(struct subre));
}
- if (v->lacons == NULL) {
+ if (newlacons == NULL) {
ERR(REG_ESPACE);
return 0;
}
+ v->lacons = newlacons;
+ v->nlacons = n + 1;
sub = &v->lacons[n];
sub->begin = begin;
sub->end = end;
@@ -2014,18 +2045,20 @@ rfree(
g = (struct guts *) re->re_guts;
re->re_guts = NULL;
re->re_fns = NULL;
- g->magic = 0;
- freecm(&g->cmap);
- if (g->tree != NULL) {
- freesubre(NULL, g->tree);
- }
- if (g->lacons != NULL) {
- freelacons(g->lacons, g->nlacons);
- }
- if (!NULLCNFA(g->search)) {
- freecnfa(&g->search);
+ if (g != NULL) {
+ g->magic = 0;
+ freecm(&g->cmap);
+ if (g->tree != NULL) {
+ freesubre(NULL, g->tree);
+ }
+ if (g->lacons != NULL) {
+ freelacons(g->lacons, g->nlacons);
+ }
+ if (!NULLCNFA(g->search)) {
+ freecnfa(&g->search);
+ }
+ FREE(g);
}
- FREE(g);
}
/*
@@ -2057,7 +2090,7 @@ dump(
fprintf(f, "\n\n\n========= DUMP ==========\n");
fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
- re->re_nsub, re->re_info, re->re_csize, g->ntree);
+ (int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
dumpcolors(&g->cmap, f);
if (!NULLCNFA(g->search)) {
@@ -2166,14 +2199,14 @@ stid(
size_t bufsize)
{
/*
- * Big enough for hex int or decimal t->retry?
+ * Big enough for hex int or decimal t->id?
*/
- if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) {
+ if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->id)*3 + 1) {
return "unable";
}
- if (t->retry != 0) {
- sprintf(buf, "%d", t->retry);
+ if (t->id != 0) {
+ sprintf(buf, "%d", t->id);
} else {
sprintf(buf, "%p", t);
}