diff options
Diffstat (limited to 'generic/regcomp.c')
-rw-r--r-- | generic/regcomp.c | 246 |
1 files changed, 148 insertions, 98 deletions
diff --git a/generic/regcomp.c b/generic/regcomp.c index c93eb24..211cd70 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -83,9 +83,6 @@ static int lexdigits(struct vars *, int, int, int); static int brenext(struct vars *, pchr); static void skip(struct vars *); static chr newline(NOPARMS); -#ifdef REG_DEBUG -static const chr *ch(NOPARMS); -#endif static chr chrnamed(struct vars *, const chr *, const chr *, pchr); /* === regc_color.c === */ static void initcm(struct vars *, struct colormap *); @@ -119,17 +116,22 @@ static void dropstate(struct nfa *, struct state *); static void freestate(struct nfa *, struct state *); static void destroystate(struct nfa *, struct state *); static void newarc(struct nfa *, int, pcolor, struct state *, struct state *); +static void createarc(struct nfa *, int, pcolor, struct state *, struct state *); static struct arc *allocarc(struct nfa *, struct state *); static void freearc(struct nfa *, struct arc *); +static void changearctarget(struct arc *, struct state *); static int hasnonemptyout(struct state *); -static int nonemptyouts(struct state *); -static int nonemptyins(struct state *); static struct arc *findarc(struct state *, int, pcolor); static void cparc(struct nfa *, struct arc *, struct state *, struct state *); +static void sortins(struct nfa *, struct state *); +static int sortins_cmp(const void *, const void *); +static void sortouts(struct nfa *, struct state *); +static int sortouts_cmp(const void *, const void *); static void moveins(struct nfa *, struct state *, struct state *); -static void copyins(struct nfa *, struct state *, struct state *, int); +static void copyins(struct nfa *, struct state *, struct state *); +static void mergeins(struct nfa *, struct state *, struct arc **, int); static void moveouts(struct nfa *, struct state *, struct state *); -static void copyouts(struct nfa *, struct state *, struct state *, int); +static void copyouts(struct nfa *, struct state *, struct state *); static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); static void delsub(struct nfa *, struct state *, struct state *); static void deltraverse(struct nfa *, struct state *, struct state *); @@ -139,33 +141,40 @@ static void cleartraverse(struct nfa *, struct state *); static void specialcolors(struct nfa *); static long optimize(struct nfa *, FILE *); static void pullback(struct nfa *, FILE *); -static int pull(struct nfa *, struct arc *); +static int pull(struct nfa *, struct arc *, struct state **); static void pushfwd(struct nfa *, FILE *); -static int push(struct nfa *, struct arc *); +static int push(struct nfa *, struct arc *, struct state **); #define INCOMPATIBLE 1 /* destroys arc */ #define SATISFIED 2 /* constraint satisfied */ #define COMPATIBLE 3 /* compatible but not satisfied yet */ static int combine(struct arc *, struct arc *); static void fixempties(struct nfa *, FILE *); -static struct state *emptyreachable(struct state *, struct state *); -static void replaceempty(struct nfa *, struct state *, struct state *); +static struct state *emptyreachable(struct nfa *, struct state *, + struct state *, struct arc **); +static int isconstraintarc(struct arc *); +static int hasconstraintout(struct state *); +static void fixconstraintloops(struct nfa *, FILE *); +static int findconstraintloop(struct nfa *, struct state *); +static void breakconstraintloop(struct nfa *, struct state *); +static void clonesuccessorstates(struct nfa *, struct state *, struct state *, + struct state *, struct arc *, char *, char *, int); static void cleanup(struct nfa *); static void markreachable(struct nfa *, struct state *, struct state *, struct state *); static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); static long analyze(struct nfa *); static void compact(struct nfa *, struct cnfa *); -static void carcsort(struct carc *, struct carc *); +static void carcsort(struct carc *, size_t); +static int carc_cmp(const void *, const void *); static void freecnfa(struct cnfa *); static void dumpnfa(struct nfa *, FILE *); #ifdef REG_DEBUG static void dumpstate(struct state *, FILE *); static void dumparcs(struct state *, FILE *); -static int dumprarcs(struct arc *, struct state *, FILE *, int); static void dumparc(struct arc *, struct state *, FILE *); #endif static void dumpcnfa(struct cnfa *, FILE *); #ifdef REG_DEBUG -static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); +static void dumpcstate(int, struct cnfa *, FILE *); #endif /* === regc_cvec.c === */ static struct cvec *clearcvec(struct cvec *); @@ -210,11 +219,12 @@ struct vars { struct subre *tree; /* subexpression tree */ struct subre *treechain; /* all tree nodes allocated */ struct subre *treefree; /* any free tree nodes */ - int ntree; /* number of tree nodes */ + int ntree; /* number of tree nodes, plus one */ struct cvec *cv; /* interface cvec */ struct cvec *cv2; /* utility cvec */ struct subre *lacons; /* lookahead-constraint vector */ int nlacons; /* size of lacons */ + size_t spaceused; /* approx. space used for compilation */ }; /* parsing macros; most know that `v' is the struct vars pointer */ @@ -223,13 +233,13 @@ struct vars { #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */ #define VISERR(vv) ((vv)->err != 0)/* have we seen an error yet? */ #define ISERR() VISERR(v) -#define VERR(vv,e) \ - ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e))) +#define VERR(vv,e) ((vv)->nexttype = EOS, \ + (vv)->err = ((vv)->err ? (vv)->err : (e))) #define ERR(e) VERR(v, e) /* record an error */ #define NOERR() {if (ISERR()) return;} /* if error seen, return */ #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */ #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */ -#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */ +#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */ #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) @@ -258,12 +268,14 @@ struct vars { ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND) /* static function list */ -static struct fns functions = { +static const struct fns functions = { rfree, /* regfree insides */ }; /* - compile - compile regular expression + * Note: on failure, no resources remain allocated, so regfree() + * need not be applied to re. ^ int compile(regex_t *, const chr *, size_t, int); */ int @@ -324,6 +336,7 @@ compile( v->cv2 = NULL; v->lacons = NULL; v->nlacons = 0; + v->spaceused = 0; re->re_magic = REMAGIC; re->re_info = 0; /* bits get set during parse */ re->re_csize = sizeof(chr); @@ -593,13 +606,15 @@ makesearch( break; } } - if (b != NULL && s->tmp == NULL) { - /* - * Must be split if not already in the list (fixes bugs 505048, - * 230589, 840258, 504785). - */ - s->tmp = slist; + /* + * We want to mark states as being in the list already by having non + * NULL tmp fields, but we can't just store the old slist value in tmp + * because that doesn't work for the first such state. Instead, the + * first list entry gets its own address in tmp. + */ + if (b != NULL && s->tmp == NULL) { + s->tmp = (slist != NULL) ? slist : s; slist = s; } } @@ -610,8 +625,9 @@ makesearch( for (s=slist ; s!=NULL ; s=s2) { s2 = newstate(nfa); - - copyouts(nfa, s, s2, 1); + NOERR(); + copyouts(nfa, s, s2); + NOERR(); for (a=s->ins ; a!=NULL ; a=b) { b = a->inchain; @@ -620,7 +636,7 @@ makesearch( freearc(nfa, a); } } - s2 = s->tmp; + s2 = (s->tmp != s) ? s->tmp : NULL; s->tmp = NULL; /* clean up while we're at it */ } } @@ -982,6 +998,7 @@ parseqatom( NOERR(); assert(v->nextvalue > 0); atom = subre(v, 'b', BACKR, lp, rp); + NOERR(); subno = v->nextvalue; atom->subno = subno; EMPTYARC(lp, rp); /* temporarily, so there's something */ @@ -996,13 +1013,13 @@ parseqatom( switch (v->nexttype) { case '*': m = 0; - n = INFINITY; + n = DUPINF; qprefer = (v->nextvalue) ? LONGER : SHORTER; NEXT(); break; case '+': m = 1; - n = INFINITY; + n = DUPINF; qprefer = (v->nextvalue) ? LONGER : SHORTER; NEXT(); break; @@ -1019,7 +1036,7 @@ parseqatom( if (SEE(DIGIT)) { n = scannum(v); } else { - n = INFINITY; + n = DUPINF; } if (m > n) { ERR(REG_BADBR); @@ -1102,11 +1119,17 @@ parseqatom( /* * Prepare a general-purpose state skeleton. * - * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] - * / / - * [lp] ----> [s2] ----bypass--------------------- + * In the no-backrefs case, we want this: + * + * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp] * - * where bypass is an empty, and prefix is some repetitions of atom + * where prefix is some repetitions of atom. In the general case we need + * + * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp] + * + * where the iterator wraps around [begin] ---atom---> [end] + * + * We make the s state here for both cases; s2 is made below if needed */ s = newstate(v->nfa); /* first, new endpoints for the atom */ @@ -1117,11 +1140,9 @@ parseqatom( NOERR(); atom->begin = s; atom->end = s2; - s = newstate(v->nfa); /* and spots for prefix and bypass */ - s2 = newstate(v->nfa); + s = newstate(v->nfa); /* set up starting state */ NOERR(); EMPTYARC(lp, s); - EMPTYARC(lp, s2); NOERR(); /* @@ -1129,6 +1150,7 @@ parseqatom( */ t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); + NOERR(); t->left = atom; atomp = &t->left; @@ -1142,6 +1164,7 @@ parseqatom( assert(top->op == '=' && top->left == NULL && top->right == NULL); top->left = subre(v, '=', top->flags, top->begin, lp); + NOERR(); top->op = '.'; top->right = t; @@ -1166,27 +1189,8 @@ parseqatom( } /* - * It's quantifier time; first, turn x{0,...} into x{1,...}|empty - */ - - if (m == 0) { - EMPTYARC(s2, atom->end);/* the bypass */ - assert(PREF(qprefer) != 0); - f = COMBINE(qprefer, atom->flags); - t = subre(v, '|', f, lp, atom->end); - NOERR(); - t->left = atom; - t->right = subre(v, '|', PREF(f), s2, atom->end); - NOERR(); - t->right->left = subre(v, '=', 0, s2, atom->end); - NOERR(); - *atomp = t; - atomp = &t->left; - m = 1; - } - - /* - * Deal with the rest of the quantifier. + * It's quantifier time. If the atom is just a backref, we'll let it deal + * with quantifiers internally. */ if (atomtype == BACKREF) { @@ -1204,21 +1208,29 @@ parseqatom( atom->min = (short) m; atom->max = (short) n; atom->flags |= COMBINE(qprefer, atom->flags); + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; } else if (m == 1 && n == 1) { /* * No/vacuous quantifier: done. */ EMPTYARC(s, atom->begin); /* empty prefix */ - } else { + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } else if (m > 0 && !(atom->flags & BACKR)) { /* - * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second - * x + * If there's no backrefs involved, we can turn x{m,n} into + * x{m-1,n-1}x, with capturing parens in only the second x. This + * is valid because we only care about capturing matches from the + * final iteration of the quantifier. It's a win because we can + * implement the backref-free left side as a plain DFA node, since + * we don't really care where its submatches are. */ dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); - assert(m >= 1 && m != INFINITY && n >= 1); - repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1); + assert(m >= 1 && m != DUPINF && n >= 1); + repeat(v, s, atom->begin, m-1, (n == DUPINF) ? n : n-1); f = COMBINE(qprefer, atom->flags); t = subre(v, '.', f, s, atom->end); /* prefix and atom */ NOERR(); @@ -1226,6 +1238,24 @@ parseqatom( NOERR(); t->right = atom; *atomp = t; + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } else { + /* general case: need an iteration node */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, atom->end, s2); + NOERR(); + dupnfa(v->nfa, atom->begin, atom->end, s, s2); + repeat(v, s, s2, m, n); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '*', f, s, s2); + NOERR(); + t->min = (short) m; + t->max = (short) n; + t->left = atom; + *atomp = t; + /* rest of branch is to be strung from iteration's end state */ } /* @@ -1234,10 +1264,10 @@ parseqatom( t = top->right; if (!(SEE('|') || SEE(stopper) || SEE(EOS))) { - t->right = parsebranch(v, stopper, type, atom->end, rp, 1); + t->right = parsebranch(v, stopper, type, s2, rp, 1); } else { - EMPTYARC(atom->end, rp); - t->right = subre(v, '=', 0, atom->end, rp); + EMPTYARC(s2, rp); + t->right = subre(v, '=', 0, s2, rp); } NOERR(); assert(SEE('|') || SEE(stopper) || SEE(EOS)); @@ -1304,6 +1334,8 @@ scannum( /* - repeat - replicate subNFA for quantifiers + * The sub-NFA strung from lp to rp is modified to represent m to n + * repetitions of its initial contents. * The duplication sequences used here are chosen carefully so that any * pointers starting out pointing into the subexpression end up pointing into * the last occurrence. (Note that it may not be strung between the same left @@ -1323,7 +1355,7 @@ repeat( #define SOME 2 #define INF 3 #define PAIR(x, y) ((x)*4 + (y)) -#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) +#define REDUCE(x) ( ((x) == DUPINF) ? INF : (((x) > 1) ? SOME : (x)) ) const int rm = REDUCE(m); const int rn = REDUCE(n); struct state *s, *s2; @@ -1713,11 +1745,11 @@ subre( v->treechain = ret; } - assert(strchr("|.b(=", op) != NULL); + assert(strchr("=b|.*(", op) != NULL); ret->op = op; ret->flags = flags; - ret->retry = 0; + ret->id = 0; /* will be assigned later */ ret->subno = 0; ret->min = ret->max = 1; ret->left = NULL; @@ -1770,7 +1802,8 @@ freesrnode( } sr->flags = 0; - if (v != NULL) { + if (v != NULL && v->treechain != NULL) { + /* we're still parsing, maybe we can reuse the subre */ sr->left = v->treefree; v->treefree = sr; } else { @@ -1798,7 +1831,7 @@ optst( } /* - - numst - number tree nodes (assigning retry indexes) + - numst - number tree nodes (assigning "id" indexes) ^ static int numst(struct subre *, int); */ static int /* next number */ @@ -1811,7 +1844,7 @@ numst( assert(t != NULL); i = start; - t->retry = (short) i++; + t->id = (short) i++; if (t->left != NULL) { i = numst(t->left, i); } @@ -1823,6 +1856,19 @@ numst( /* - markst - mark tree nodes as INUSE + * Note: this is a great deal more subtle than it looks. During initial + * parsing of a regex, all subres are linked into the treechain list; + * discarded ones are also linked into the treefree list for possible reuse. + * After we are done creating all subres required for a regex, we run markst() + * then cleanst(), which results in discarding all subres not reachable from + * v->tree. We then clear v->treechain, indicating that subres must be found + * by descending from v->tree. This changes the behavior of freesubre(): it + * will henceforth FREE() unwanted subres rather than sticking them into the + * treefree list. (Doing that any earlier would result in dangling links in + * the treechain list.) This all means that freev() will clean up correctly + * if invoked before or after markst()+cleanst(); but it would not work if + * called partway through this state conversion, so we mustn't error out + * in or between these two functions. ^ static void markst(struct subre *); */ static void @@ -1929,24 +1975,26 @@ newlacon( struct state *end, int pos) { - struct subre *sub; int n; + struct subre *newlacons; + struct subre *sub; if (v->nlacons == 0) { - v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); n = 1; /* skip 0th */ - v->nlacons = 2; + newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); } else { - v->lacons = (struct subre *) REALLOC(v->lacons, - (v->nlacons+1)*sizeof(struct subre)); - n = v->nlacons++; + n = v->nlacons; + newlacons = (struct subre *) REALLOC(v->lacons, + (n + 1) * sizeof(struct subre)); } - if (v->lacons == NULL) { + if (newlacons == NULL) { ERR(REG_ESPACE); return 0; } + v->lacons = newlacons; + v->nlacons = n + 1; sub = &v->lacons[n]; sub->begin = begin; sub->end = end; @@ -1994,18 +2042,20 @@ rfree( g = (struct guts *) re->re_guts; re->re_guts = NULL; re->re_fns = NULL; - g->magic = 0; - freecm(&g->cmap); - if (g->tree != NULL) { - freesubre(NULL, g->tree); - } - if (g->lacons != NULL) { - freelacons(g->lacons, g->nlacons); - } - if (!NULLCNFA(g->search)) { - freecnfa(&g->search); + if (g != NULL) { + g->magic = 0; + freecm(&g->cmap); + if (g->tree != NULL) { + freesubre(NULL, g->tree); + } + if (g->lacons != NULL) { + freelacons(g->lacons, g->nlacons); + } + if (!NULLCNFA(g->search)) { + freecnfa(&g->search); + } + FREE(g); } - FREE(g); } /* @@ -2037,11 +2087,11 @@ dump( fprintf(f, "\n\n\n========= DUMP ==========\n"); fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", - re->re_nsub, re->re_info, re->re_csize, g->ntree); + (int) re->re_nsub, re->re_info, re->re_csize, g->ntree); dumpcolors(&g->cmap, f); if (!NULLCNFA(g->search)) { - printf("\nsearch:\n"); + fprintf(f, "\nsearch:\n"); dumpcnfa(&g->search, f); } for (i = 1; i < g->nlacons; i++) { @@ -2108,7 +2158,7 @@ stdump( } if (t->min != 1 || t->max != 1) { fprintf(f, " {%d,", t->min); - if (t->max != INFINITY) { + if (t->max != DUPINF) { fprintf(f, "%d", t->max); } fprintf(f, "}"); @@ -2146,14 +2196,14 @@ stid( size_t bufsize) { /* - * Big enough for hex int or decimal t->retry? + * Big enough for hex int or decimal t->id? */ - if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) { + if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->id)*3 + 1) { return "unable"; } - if (t->retry != 0) { - sprintf(buf, "%d", t->retry); + if (t->id != 0) { + sprintf(buf, "%d", t->id); } else { sprintf(buf, "%p", t); } |