From 144c26987f8a6f6285efd2f4e878e844b9c13d33 Mon Sep 17 00:00:00 2001 From: stanton Date: Thu, 5 Aug 1999 01:16:57 +0000 Subject: * generic/regc_nfa.c: * generic/regcomp.c: * generic/rege_dfa.c: * generic/regexec.c: * generic/regguts.h: Applied patches supplied by Henry Spencer to greatly enhance the performance of certain classes of regular expressions. [Bug: 2440, 2447] --- generic/regc_nfa.c | 8 ++-- generic/regcomp.c | 128 ++++++++++++++++++++++++++++++++--------------------- generic/rege_dfa.c | 2 +- generic/regexec.c | 42 ++++++++++++------ generic/regguts.h | 3 +- 5 files changed, 111 insertions(+), 72 deletions(-) diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c index 897b3eb..9881cd4 100644 --- a/generic/regc_nfa.c +++ b/generic/regc_nfa.c @@ -703,9 +703,9 @@ struct nfa *nfa; /* - optimize - optimize an NFA - ^ static int optimize(struct nfa *, FILE *); + ^ static long optimize(struct nfa *, FILE *); */ -static int /* re_info bits */ +static long /* re_info bits */ optimize(nfa, f) struct nfa *nfa; FILE *f; /* for debug output; NULL none */ @@ -1187,9 +1187,9 @@ struct state *mark; /* the value to mark with */ /* - analyze - ascertain potentially-useful facts about an optimized NFA - ^ static int analyze(struct nfa *); + ^ static long analyze(struct nfa *); */ -static int /* re_info bits to be ORed in */ +static long /* re_info bits to be ORed in */ analyze(nfa) struct nfa *nfa; { diff --git a/generic/regcomp.c b/generic/regcomp.c index b292012..85639a1 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -41,7 +41,7 @@ int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int)); static VOID moresubs _ANSI_ARGS_((struct vars *, int)); static int freev _ANSI_ARGS_((struct vars *, int)); -static VOID makescan _ANSI_ARGS_((struct vars *, struct nfa *)); +static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *)); static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *)); static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int)); static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *)); @@ -65,14 +65,15 @@ static VOID optst _ANSI_ARGS_((struct vars *, struct subre *)); static int numst _ANSI_ARGS_((struct subre *, int)); static VOID markst _ANSI_ARGS_((struct subre *)); static VOID cleanst _ANSI_ARGS_((struct vars *)); -static int nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *)); -static int nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *)); +static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *)); +static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *)); static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int)); static VOID freelacons _ANSI_ARGS_((struct subre *, int)); static VOID rfree _ANSI_ARGS_((regex_t *)); static VOID dump _ANSI_ARGS_((regex_t *, FILE *)); static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int)); -static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int, int)); +static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int)); +static char *stid _ANSI_ARGS_((struct subre *, char *, size_t)); /* === regc_lex.c === */ static VOID lexstart _ANSI_ARGS_((struct vars *)); static VOID prefixes _ANSI_ARGS_((struct vars *)); @@ -138,7 +139,7 @@ static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, st static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *)); static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *)); static VOID specialcolors _ANSI_ARGS_((struct nfa *)); -static int optimize _ANSI_ARGS_((struct nfa *, FILE *)); +static long optimize _ANSI_ARGS_((struct nfa *, FILE *)); static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *)); static int pull _ANSI_ARGS_((struct nfa *, struct arc *)); static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *)); @@ -152,7 +153,7 @@ static int unempty _ANSI_ARGS_((struct nfa *, struct arc *)); static VOID cleanup _ANSI_ARGS_((struct nfa *)); static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *)); static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *)); -static int analyze _ANSI_ARGS_((struct nfa *)); +static long analyze _ANSI_ARGS_((struct nfa *)); static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *)); static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *)); static VOID freecnfa _ANSI_ARGS_((struct cnfa *)); @@ -229,7 +230,6 @@ struct vars { struct subre *lacons; /* lookahead-constraint vector */ int nlacons; /* size of lacons */ int usedshorter; /* used short-preferring quantifiers */ - int unmatchable; /* can never match */ }; /* parsing macros; most know that `v' is the struct vars pointer */ @@ -382,36 +382,37 @@ int flags; specialcolors(v->nfa); CNOERR(); if (debug != NULL) { + fprintf(debug, "\n\n\n========= RAW ==========\n"); dumpnfa(v->nfa, debug); dumpst(v->tree, debug, 1); } v->usedshorter = 0; - v->unmatchable = 0; optst(v, v->tree); v->ntree = numst(v->tree, 1); markst(v->tree); cleanst(v); if (debug != NULL) { + fprintf(debug, "\n\n\n========= TREE FIXED ==========\n"); fprintf(debug, "-->\n"); dumpst(v->tree, debug, 1); } /* build compacted NFAs for tree, lacons, fast search */ re->re_info |= nfatree(v, v->tree, debug); - if (debug != NULL) { - fprintf(debug, "---->\n"); - dumpst(v->tree, debug, 1); - } CNOERR(); - if (re->re_info®_UIMPOSSIBLE) - v->unmatchable = 1; assert(v->nlacons == 0 || v->lacons != NULL); - for (i = 1; i < v->nlacons; i++) + for (i = 1; i < v->nlacons; i++) { + if (debug != NULL) + fprintf(debug, "\n\n\n========= LA%d ==========\n", i); nfanode(v, &v->lacons[i], debug); + } CNOERR(); + if (debug != NULL) + fprintf(debug, "\n\n\n========= SEARCH ==========\n"); + /* can sacrifice main NFA now, so use it as work area */ (DISCARD)optimize(v->nfa, debug); CNOERR(); - makescan(v, v->nfa); + makesearch(v, v->nfa); CNOERR(); compact(v->nfa, &g->search); CNOERR(); @@ -431,7 +432,6 @@ int flags; v->lacons = NULL; g->nlacons = v->nlacons; g->usedshorter = v->usedshorter; - g->unmatchable = v->unmatchable; if (flags®_DUMP) dump(re, stdout); @@ -507,12 +507,12 @@ int err; } /* - - makescan - turn an NFA into a fast-scan NFA (implicit prepend of .*?) + - makesearch - turn an NFA into a fast-scan NFA (implicit prepend of .*?) * NFA must have been optimize()d already. - ^ static VOID makescan(struct vars *, struct nfa *); + ^ static VOID makesearch(struct vars *, struct nfa *); */ static VOID -makescan(v, nfa) +makesearch(v, nfa) struct vars *v; struct nfa *nfa; { @@ -1907,9 +1907,9 @@ struct vars *v; /* - nfatree - turn a subRE subtree into a tree of compacted NFAs - ^ static int nfatree(struct vars *, struct subre *, FILE *); + ^ static long nfatree(struct vars *, struct subre *, FILE *); */ -static int /* optimize results from top node */ +static long /* optimize results from top node */ nfatree(v, t, f) struct vars *v; struct subre *t; @@ -1927,19 +1927,23 @@ FILE *f; /* for debug output */ /* - nfanode - do one NFA for nfatree - ^ static int nfanode(struct vars *, struct subre *, FILE *); + ^ static long nfanode(struct vars *, struct subre *, FILE *); */ -static int /* optimize results */ +static long /* optimize results */ nfanode(v, t, f) struct vars *v; struct subre *t; FILE *f; /* for debug output */ { struct nfa *nfa; - int ret = 0; + long ret = 0; + char idbuf[50]; assert(t->begin != NULL); + if (f != NULL) + fprintf(f, "\n\n\n========= TREE NODE %s ==========\n", + stid(t, idbuf, sizeof(idbuf))); nfa = newnfa(v, v->cm, v->nfa); NOERRZ(); dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); @@ -2061,20 +2065,22 @@ FILE *f; fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic, GUTSMAGIC); - fprintf(f, "nsub %d, info 0%o, csize %d, ntree %d, usedshort %d\n", + fprintf(f, "\n\n\n========= DUMP ==========\n"); + fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d, usedshort %d\n", re->re_nsub, re->re_info, re->re_csize, g->ntree, g->usedshorter); dumpcolors(&g->cmap, f); if (!NULLCNFA(g->search)) { - printf("search:\n"); + printf("\nsearch:\n"); dumpcnfa(&g->search, f); } for (i = 1; i < g->nlacons; i++) { - fprintf(f, "la%d (%s):\n", i, + fprintf(f, "\nla%d (%s):\n", i, (g->lacons[i].subno) ? "positive" : "negative"); dumpcnfa(&g->lacons[i].cnfa, f); } + fprintf(f, "\n"); dumpst(g->tree, f, 0); #endif } @@ -2092,59 +2098,79 @@ int nfapresent; /* is the original NFA still around? */ if (t == NULL) fprintf(f, "null tree\n"); else - stdump(t, f, nfapresent, 0); + stdump(t, f, nfapresent); fflush(f); } /* - stdump - recursive guts of dumpst - ^ static VOID stdump(struct subre *, FILE *, int, int); + ^ static VOID stdump(struct subre *, FILE *, int); */ static VOID -stdump(t, f, nfapresent, level) +stdump(t, f, nfapresent) struct subre *t; FILE *f; int nfapresent; /* is the original NFA still around? */ -int level; { int i; -# define RTSEP " " + char idbuf[50]; - for (i = 0; i < level; i++) - fprintf(f, RTSEP); - fprintf(f, "%c (", t->op); + fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op); if (t->flags&LONGER) - fprintf(f, "L"); + fprintf(f, " longest"); if (t->flags&SHORTER) - fprintf(f, "S"); + fprintf(f, " shortest"); if (t->flags&MIXED) - fprintf(f, "M"); + fprintf(f, " hasmixed"); if (t->flags&CAP) - fprintf(f, "c"); + fprintf(f, " hascapture"); if (t->flags&BACKR) - fprintf(f, "b"); + fprintf(f, " hasbackref"); if (!(t->flags&INUSE)) - fprintf(f, "!u"); - fprintf(f, ") r%d", t->retry); + fprintf(f, " UNUSED"); if (t->subno != 0) - fprintf(f, " #%d", t->subno); + fprintf(f, " (#%d)", t->subno); if (t->min != 1 || t->max != 1) { - fprintf(f, "{%d,", t->min); + fprintf(f, " {%d,", t->min); if (t->max != INFINITY) fprintf(f, "%d", t->max); fprintf(f, "}"); } if (nfapresent) fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no); - if (!NULLCNFA(t->cnfa)) - fprintf(f, ":"); - fprintf(f, "\n"); if (t->left != NULL) - stdump(t->left, f, nfapresent, level+1); - if (!NULLCNFA(t->cnfa)) + fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf))); + if (t->right != NULL) + fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf))); + if (!NULLCNFA(t->cnfa)) { + fprintf(f, "\n"); dumpcnfa(&t->cnfa, f); + fprintf(f, "\n"); + } + if (t->left != NULL) + stdump(t->left, f, nfapresent); if (t->right != NULL) - stdump(t->right, f, nfapresent, level+1); + stdump(t->right, f, nfapresent); +} + +/* + - stid - identify a subtree node for dumping + ^ static char *stid(struct subre *, char *, size_t); + */ +static char * /* points to buf or constant string */ +stid(t, buf, bufsize) +struct subre *t; +char *buf; +size_t bufsize; +{ + /* big enough for hex int or decimal t->retry? */ + if (bufsize < sizeof(int)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) + return "unable"; + if (t->retry != 0) + sprintf(buf, "%d", t->retry); + else + sprintf(buf, "0x%x", (int)t); /* may lose bits, that's okay */ + return buf; } #include "regc_lex.c" diff --git a/generic/rege_dfa.c b/generic/rege_dfa.c index 1628150..313892c 100644 --- a/generic/rege_dfa.c +++ b/generic/rege_dfa.c @@ -503,7 +503,7 @@ chr *start; /* where the attempt got started */ } if (!sawlacons) { /* lookahead conds. always cache miss */ - FDEBUG(("c%d[%d]->%d\n", css - d->ssets, co, p - d->ssets)); + FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets)); css->outs[co] = p; css->inchain[co] = p->ins; p->ins.ss = css; diff --git a/generic/regexec.c b/generic/regexec.c index ab9dc21..3d216db 100644 --- a/generic/regexec.c +++ b/generic/regexec.c @@ -172,7 +172,7 @@ int flags; register struct vars *v = &var; int st; size_t n; - int complications; + int backref; # define LOCALMAT 20 regmatch_t mat[LOCALMAT]; # define LOCALMEM 40 @@ -189,16 +189,14 @@ int flags; v->g = (struct guts *)re->re_guts; if ((v->g->cflags®_EXPECT) && details == NULL) return REG_INVARG; - if (v->g->unmatchable) + if (v->g->info®_UIMPOSSIBLE) return REG_NOMATCH; - complications = (v->g->info®_UBACKREF) ? 1 : 0; - if (v->g->usedshorter) - complications = 1; + backref = (v->g->info®_UBACKREF) ? 1 : 0; v->eflags = flags; if (v->g->cflags®_NOSUB) nmatch = 0; /* override client */ v->nmatch = nmatch; - if (complications && v->nmatch < v->g->nsub + 1) { + if (backref && v->nmatch < v->g->nsub + 1) { /* need work area bigger than what user gave us */ if (v->g->nsub + 1 <= LOCALMAT) v->pmatch = mat; @@ -214,7 +212,8 @@ int flags; v->start = (chr *)string; v->stop = (chr *)string + len; v->err = 0; - if (complications) { + if (backref) { + /* need retry memory */ assert(v->g->ntree >= 0); n = (size_t)v->g->ntree; if (n <= LOCALMEM) @@ -231,7 +230,7 @@ int flags; /* do it */ assert(v->g->tree != NULL); - if (complications) + if (backref) st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); else st = find(v, &v->g->tree->cnfa, &v->g->cmap); @@ -266,11 +265,12 @@ struct colormap *cm; struct smalldfa sa; struct dfa *s = newdfa(v, &v->g->search, cm, &sa); chr *begin; - chr *end = NULL; + chr *end; chr *cold; chr *open; /* open and close of range of possible starts */ chr *close; int hitend; + int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0; if (ISERR()) { /* should be newdfa() failure */ if (d != NULL) @@ -311,7 +311,11 @@ struct colormap *cm; MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); for (begin = open; begin <= close; begin++) { MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); - end = longest(v, d, begin, v->stop, &hitend); + if (shorter) + end = shortest(v, d, begin, begin, v->stop, + (chr **)NULL, &hitend); + else + end = longest(v, d, begin, v->stop, &hitend); if (hitend && cold == NULL) cold = begin; if (end != NULL) @@ -328,7 +332,7 @@ struct colormap *cm; if (cold != NULL) v->details->rm_extend.rm_so = OFF(cold); else - v->details->rm_extend.rm_eo = OFF(v->stop); + v->details->rm_extend.rm_so = OFF(v->stop); v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ } if (v->nmatch > 1) { @@ -591,6 +595,8 @@ chr *end; /* end of same */ struct dfa *d2; chr *mid; int i; + int shorter = (t->left->flags&SHORTER) ? 1 : 0; + chr *stop = (shorter) ? end : begin; assert(t->op == '.'); assert(t->left != NULL && t->left->cnfa.nstates > 0); @@ -606,7 +612,11 @@ chr *end; /* end of same */ } /* pick a tentative midpoint */ - mid = longest(v, d, begin, end, (int *)NULL); + if (shorter) + mid = shortest(v, d, begin, begin, end, (chr **)NULL, + (int *)NULL); + else + mid = longest(v, d, begin, end, (int *)NULL); if (mid == NULL) { freedfa(d); freedfa(d2); @@ -617,14 +627,18 @@ chr *end; /* end of same */ /* iterate until satisfaction or failure */ while (longest(v, d2, mid, end, (int *)NULL) != end) { /* that midpoint didn't work, find a new one */ - if (mid == begin) { + if (mid == stop) { /* all possibilities exhausted! */ MDEBUG(("no midpoint!\n")); freedfa(d); freedfa(d2); return REG_ASSERT; } - mid = longest(v, d, begin, mid-1, (int *)NULL); + if (shorter) + mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, + (int *)NULL); + else + mid = longest(v, d, begin, mid-1, (int *)NULL); if (mid == NULL) { /* failed to find a new one! */ MDEBUG(("failed midpoint!\n")); diff --git a/generic/regguts.h b/generic/regguts.h index 29b1721..45c47f6 100644 --- a/generic/regguts.h +++ b/generic/regguts.h @@ -401,7 +401,7 @@ struct guts { int magic; # define GUTSMAGIC 0xfed9 int cflags; /* copy of compile flags */ - int info; /* copy of re_info */ + long info; /* copy of re_info */ size_t nsub; /* copy of re_nsub */ struct subre *tree; struct cnfa search; /* for fast preliminary search */ @@ -411,5 +411,4 @@ struct guts { struct subre *lacons; /* lookahead-constraint vector */ int nlacons; /* size of lacons */ int usedshorter; /* used non-greedy quantifiers? */ - int unmatchable; /* cannot match anything? */ }; -- cgit v0.12