From 066e9abdc308995186dbc4d9bbc2fedc07674296 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 21 Sep 2015 19:25:20 +0000 Subject: [187d7f499b] Sync the regexp engine to the Postgres version. --- generic/regc_color.c | 13 +++--- generic/regc_cvec.c | 1 + generic/regc_lex.c | 18 -------- generic/regc_nfa.c | 36 ++++++++-------- generic/regcomp.c | 75 ++++++++++++++++++++------------- generic/rege_dfa.c | 29 ++++++------- generic/regerror.c | 4 +- generic/regexec.c | 115 ++++++++++++++++++++++++++++----------------------- generic/regguts.h | 38 +++++++++++++---- 9 files changed, 182 insertions(+), 147 deletions(-) diff --git a/generic/regc_color.c b/generic/regc_color.c index 2e167fe..92e0aad 100644 --- a/generic/regc_color.c +++ b/generic/regc_color.c @@ -777,18 +777,19 @@ dumpcolors( } /* - * It's hard to do this more efficiently. + * Unfortunately, it's hard to do this next bit more efficiently. + * + * Spencer's original coding has the loop iterating from CHR_MIN + * to CHR_MAX, but that's utterly unusable for 32-bit chr, or + * even 16-bit. For debugging purposes it seems fine to print + * only chr codes up to 1000 or so. */ - for (c=CHR_MIN ; cnchrs < cv->chrspace); cv->chrs[cv->nchrs++] = (chr)c; } diff --git a/generic/regc_lex.c b/generic/regc_lex.c index 132e757..16e3ae9 100644 --- a/generic/regc_lex.c +++ b/generic/regc_lex.c @@ -1139,24 +1139,6 @@ newline(void) } /* - - ch - return the chr sequence for regc_locale.c's fake collating element ch - * This helps confine use of CHR to this source file. Beware that the caller - * knows how long the sequence is. - ^ #ifdef REG_DEBUG - ^ static const chr *ch(NOPARMS); - ^ #endif - */ -#ifdef REG_DEBUG -static const chr * -ch(void) -{ - static const chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; - - return chstr; -} -#endif - -/* - chrnamed - return the chr known by a given (chr string) name * The code is a bit clumsy, but this routine gets only such specialized * use that it hardly matters. diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c index 9361d34..1fad85f 100644 --- a/generic/regc_nfa.c +++ b/generic/regc_nfa.c @@ -1652,13 +1652,16 @@ compact( narcs = 0; for (s = nfa->states; s != NULL; s = s->next) { nstates++; - narcs += 1 + s->nouts + 1; - /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */ + narcs += s->nouts + 1; /* need one extra for endmarker */ } + cnfa->stflags = (char *) MALLOC(nstates * sizeof(char)); cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *)); cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc)); - if (cnfa->states == NULL || cnfa->arcs == NULL) { + if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL) { + if (cnfa->stflags != NULL) { + FREE(cnfa->stflags); + } if (cnfa->states != NULL) { FREE(cnfa->states); } @@ -1681,9 +1684,8 @@ compact( ca = cnfa->arcs; for (s = nfa->states; s != NULL; s = s->next) { assert((size_t) s->no < nstates); + cnfa->stflags[s->no] = 0; cnfa->states[s->no] = ca; - ca->co = 0; /* clear and skip flags "arc" */ - ca++; first = ca; for (a = s->outs; a != NULL; a = a->outchain) { switch (a->type) { @@ -1717,9 +1719,9 @@ compact( */ for (a = nfa->pre->outs; a != NULL; a = a->outchain) { - cnfa->states[a->to->no]->co = 1; + cnfa->stflags[a->to->no] = CNFA_NOPROGRESS; } - cnfa->states[nfa->pre->no]->co = 1; + cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS; } /* @@ -1763,6 +1765,7 @@ freecnfa( { assert(cnfa->nstates != 0); /* not empty already */ cnfa->nstates = 0; + FREE(cnfa->stflags); FREE(cnfa->states); FREE(cnfa->arcs); } @@ -1985,7 +1988,7 @@ dumpcnfa( } fprintf(f, "\n"); for (st = 0; st < cnfa->nstates; st++) { - dumpcstate(st, cnfa->states[st], cnfa, f); + dumpcstate(st, cnfa, f); } fflush(f); #endif @@ -1998,25 +2001,24 @@ dumpcnfa( /* - dumpcstate - dump a compacted-NFA state in human-readable form - ^ static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); + ^ static void dumpcstate(int, struct cnfa *, FILE *); */ static void dumpcstate( int st, - struct carc *ca, struct cnfa *cnfa, FILE *f) { - int i; + struct carc *ca; int pos; - fprintf(f, "%d%s", st, (ca[0].co) ? ":" : "."); + fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : "."); pos = 1; - for (i = 1; ca[i].co != COLORLESS; i++) { - if (ca[i].co < cnfa->ncolors) { - fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to); + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { + if (ca->co < cnfa->ncolors) { + fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to); } else { - fprintf(f, "\t:%ld:->%d", (long) ca[i].co-cnfa->ncolors,ca[i].to); + fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to); } if (pos == 5) { fprintf(f, "\n"); @@ -2025,7 +2027,7 @@ dumpcstate( pos++; } } - if (i == 1 || pos != 1) { + if (ca == cnfa->states[st] || pos != 1) { fprintf(f, "\n"); } fflush(f); diff --git a/generic/regcomp.c b/generic/regcomp.c index b8a5a87..11a389a 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -83,9 +83,6 @@ static int lexdigits(struct vars *, int, int, int); static int brenext(struct vars *, pchr); static void skip(struct vars *); static chr newline(NOPARMS); -#ifdef REG_DEBUG -static const chr *ch(NOPARMS); -#endif static chr chrnamed(struct vars *, const chr *, const chr *, pchr); /* === regc_color.c === */ static void initcm(struct vars *, struct colormap *); @@ -165,7 +162,7 @@ static void dumparc(struct arc *, struct state *, FILE *); #endif static void dumpcnfa(struct cnfa *, FILE *); #ifdef REG_DEBUG -static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); +static void dumpcstate(int, struct cnfa *, FILE *); #endif /* === regc_cvec.c === */ static struct cvec *clearcvec(struct cvec *); @@ -210,7 +207,7 @@ struct vars { struct subre *tree; /* subexpression tree */ struct subre *treechain; /* all tree nodes allocated */ struct subre *treefree; /* any free tree nodes */ - int ntree; /* number of tree nodes */ + int ntree; /* number of tree nodes, plus one */ struct cvec *cv; /* interface cvec */ struct cvec *cv2; /* utility cvec */ struct subre *lacons; /* lookahead-constraint vector */ @@ -223,13 +220,13 @@ struct vars { #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */ #define VISERR(vv) ((vv)->err != 0)/* have we seen an error yet? */ #define ISERR() VISERR(v) -#define VERR(vv,e) \ - ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e))) +#define VERR(vv,e) ((vv)->nexttype = EOS, \ + (vv)->err = ((vv)->err ? (vv)->err : (e))) #define ERR(e) VERR(v, e) /* record an error */ #define NOERR() {if (ISERR()) return;} /* if error seen, return */ #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */ #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */ -#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */ +#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */ #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) @@ -258,12 +255,14 @@ struct vars { ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND) /* static function list */ -static struct fns functions = { +static const struct fns functions = { rfree, /* regfree insides */ }; /* - compile - compile regular expression + * Note: on failure, no resources remain allocated, so regfree() + * need not be applied to re. ^ int compile(regex_t *, const chr *, size_t, int); */ int @@ -1788,7 +1787,8 @@ freesrnode( } sr->flags = 0; - if (v != NULL) { + if (v != NULL && v->treechain != NULL) { + /* we're still parsing, maybe we can reuse the subre */ sr->left = v->treefree; v->treefree = sr; } else { @@ -1841,6 +1841,19 @@ numst( /* - markst - mark tree nodes as INUSE + * Note: this is a great deal more subtle than it looks. During initial + * parsing of a regex, all subres are linked into the treechain list; + * discarded ones are also linked into the treefree list for possible reuse. + * After we are done creating all subres required for a regex, we run markst() + * then cleanst(), which results in discarding all subres not reachable from + * v->tree. We then clear v->treechain, indicating that subres must be found + * by descending from v->tree. This changes the behavior of freesubre(): it + * will henceforth FREE() unwanted subres rather than sticking them into the + * treefree list. (Doing that any earlier would result in dangling links in + * the treechain list.) This all means that freev() will clean up correctly + * if invoked before or after markst()+cleanst(); but it would not work if + * called partway through this state conversion, so we mustn't error out + * in or between these two functions. ^ static void markst(struct subre *); */ static void @@ -1947,24 +1960,26 @@ newlacon( struct state *end, int pos) { - struct subre *sub; int n; + struct subre *newlacons; + struct subre *sub; if (v->nlacons == 0) { - v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); n = 1; /* skip 0th */ - v->nlacons = 2; + newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); } else { - v->lacons = (struct subre *) REALLOC(v->lacons, - (v->nlacons+1)*sizeof(struct subre)); - n = v->nlacons++; + n = v->nlacons; + newlacons = (struct subre *) REALLOC(v->lacons, + (n + 1) * sizeof(struct subre)); } - if (v->lacons == NULL) { + if (newlacons == NULL) { ERR(REG_ESPACE); return 0; } + v->lacons = newlacons; + v->nlacons = n + 1; sub = &v->lacons[n]; sub->begin = begin; sub->end = end; @@ -2012,18 +2027,20 @@ rfree( g = (struct guts *) re->re_guts; re->re_guts = NULL; re->re_fns = NULL; - g->magic = 0; - freecm(&g->cmap); - if (g->tree != NULL) { - freesubre(NULL, g->tree); - } - if (g->lacons != NULL) { - freelacons(g->lacons, g->nlacons); - } - if (!NULLCNFA(g->search)) { - freecnfa(&g->search); + if (g != NULL) { + g->magic = 0; + freecm(&g->cmap); + if (g->tree != NULL) { + freesubre(NULL, g->tree); + } + if (g->lacons != NULL) { + freelacons(g->lacons, g->nlacons); + } + if (!NULLCNFA(g->search)) { + freecnfa(&g->search); + } + FREE(g); } - FREE(g); } /* @@ -2055,7 +2072,7 @@ dump( fprintf(f, "\n\n\n========= DUMP ==========\n"); fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", - re->re_nsub, re->re_info, re->re_csize, g->ntree); + (int) re->re_nsub, re->re_info, re->re_csize, g->ntree); dumpcolors(&g->cmap, f); if (!NULLCNFA(g->search)) { diff --git a/generic/rege_dfa.c b/generic/rege_dfa.c index 920ea6c..e5f22c4 100644 --- a/generic/rege_dfa.c +++ b/generic/rege_dfa.c @@ -84,7 +84,7 @@ longest( if (v->eflags®_FTRACE) { while (cp < realstop) { - FDEBUG(("+++ at c%d +++\n", css - d->ssets)); + FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets))); co = GETCOLOR(cm, *cp); FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); ss = css->outs[co]; @@ -118,7 +118,7 @@ longest( * Shutdown. */ - FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets)); + FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets))); if (cp == v->stop && stop == v->stop) { if (hitstopp != NULL) { *hitstopp = 1; @@ -213,7 +213,7 @@ shortest( if (v->eflags®_FTRACE) { while (cp < realmax) { - FDEBUG(("--- at c%d ---\n", css - d->ssets)); + FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets))); co = GETCOLOR(cm, *cp); FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); ss = css->outs[co]; @@ -516,14 +516,14 @@ miss( gotState = 0; for (i = 0; i < d->nstates; i++) { if (ISBSET(css->states, i)) { - for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++) { + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) { if (ca->co == co) { BSET(d->work, ca->to); gotState = 1; if (ca->to == cnfa->post) { isPost = 1; } - if (!cnfa->states[ca->to]->co) { + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) { noProgress = 0; } FDEBUG(("%d -> %d\n", i, ca->to)); @@ -537,8 +537,8 @@ miss( doLAConstraints = 0; for (i = 0; i < d->nstates; i++) { if (ISBSET(d->work, i)) { - for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++) { - if (ca->co <= cnfa->ncolors) { + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) { + if (ca->co < cnfa->ncolors) { continue; /* NOTE CONTINUE */ } sawLAConstraints = 1; @@ -553,7 +553,7 @@ miss( if (ca->to == cnfa->post) { isPost = 1; } - if (!cnfa->states[ca->to]->co) { + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) { noProgress = 0; } FDEBUG(("%d :> %d\n", i, ca->to)); @@ -572,7 +572,7 @@ miss( for (p = d->ssets, i = d->nssused; i > 0; p++, i--) { if (HIT(h, d->work, p, d->wordsper)) { - FDEBUG(("cached c%d\n", p - d->ssets)); + FDEBUG(("cached c%d\n", (int) (p - d->ssets))); break; /* NOTE BREAK OUT */ } } @@ -594,7 +594,8 @@ miss( } if (!sawLAConstraints) { /* lookahead conds. always cache miss */ - FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets)); + FDEBUG(("c%d[%d]->c%d\n", + (int) (css - d->ssets), co, (int) (p - d->ssets))); css->outs[co] = p; css->inchain[co] = p->ins; p->ins.ss = css; @@ -663,7 +664,7 @@ getVacantSS( ap = ss->ins; while ((p = ap.ss) != NULL) { co = ap.co; - FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co)); + FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long)co)); p->outs[co] = NULL; ap = p->inchain[co]; p->inchain[co].ss = NULL; /* paranoia */ @@ -680,7 +681,7 @@ getVacantSS( if (p == NULL) { continue; /* NOTE CONTINUE */ } - FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets)); + FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets))); if (p->ins.ss == ss && p->ins.co == i) { p->ins = ss->inchain[i]; } else { @@ -772,7 +773,7 @@ pickNextSS( if ((ss->lastseen == NULL || ss->lastseen < ancient) && !(ss->flags&LOCKED)) { d->search = ss + 1; - FDEBUG(("replacing c%d\n", ss - d->ssets)); + FDEBUG(("replacing c%d\n", (int) (ss - d->ssets))); return ss; } } @@ -780,7 +781,7 @@ pickNextSS( if ((ss->lastseen == NULL || ss->lastseen < ancient) && !(ss->flags&LOCKED)) { d->search = ss + 1; - FDEBUG(("replacing c%d\n", ss - d->ssets)); + FDEBUG(("replacing c%d\n", (int) (ss - d->ssets))); return ss; } } diff --git a/generic/regerror.c b/generic/regerror.c index a1a0163..49d93ed 100644 --- a/generic/regerror.c +++ b/generic/regerror.c @@ -41,7 +41,7 @@ static const char unk[] = "*** unknown regex error code 0x%x ***"; * Struct to map among codes, code names, and explanations. */ -static struct rerr { +static const struct rerr { int code; const char *name; const char *explain; @@ -62,7 +62,7 @@ regerror( char *errbuf, /* Result buffer (unless errbuf_size==0) */ size_t errbuf_size) /* Available space in errbuf, can be 0 */ { - struct rerr *r; + const struct rerr *r; const char *msg; char convbuf[sizeof(unk)+50]; /* 50 = plenty for int */ size_t len; diff --git a/generic/regexec.c b/generic/regexec.c index e502ca5..3030913 100644 --- a/generic/regexec.c +++ b/generic/regexec.c @@ -107,12 +107,13 @@ struct vars { chr *start; /* start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ + struct dfa **subdfas; /* per-subre DFAs */ struct smalldfa dfa1; struct smalldfa dfa2; }; #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ #define ISERR() VISERR(v) -#define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e))) +#define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e))) #define ERR(e) VERR(v, e) /* record an error */ #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ #define OFF(p) ((p) - v->start) @@ -125,6 +126,7 @@ struct vars { /* automatically gathered by fwd; do not hand-edit */ /* === regexec.c === */ int exec(regex_t *, const chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int); +static struct dfa *getsubdfa(struct vars *, struct subre *); static int simpleFind(struct vars *const, struct cnfa *const, struct colormap *const); static int complicatedFind(struct vars *const, struct cnfa *const, struct colormap *const); static int complicatedFindLoop(struct vars *const, struct cnfa *const, struct colormap *const, struct dfa *const, struct dfa *const, chr **const); @@ -171,8 +173,11 @@ exec( AllocVars(v); int st, backref; size_t n; + size_t i; #define LOCALMAT 20 regmatch_t mat[LOCALMAT]; +#define LOCALDFAS 40 + struct dfa *subdfas[LOCALDFAS]; /* * Sanity checks. @@ -230,6 +235,20 @@ exec( v->start = (chr *)string; v->stop = (chr *)string + len; v->err = 0; + assert(v->g->ntree >= 0); + n = (size_t) v->g->ntree; + if (n <= LOCALDFAS) + v->subdfas = subdfas; + else + v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->subdfas == NULL) { + if (v->pmatch != pmatch && v->pmatch != mat) + FREE(v->pmatch); + FreeVars(v); + return REG_ESPACE; + } + for (i = 0; i < n; i++) + v->subdfas[i] = NULL; /* * Do it. @@ -259,11 +278,35 @@ exec( if (v->pmatch != pmatch && v->pmatch != mat) { FREE(v->pmatch); } + n = (size_t) v->g->ntree; + for (i = 0; i < n; i++) { + if (v->subdfas[i] != NULL) + freeDFA(v->subdfas[i]); + } + if (v->subdfas != subdfas) + FREE(v->subdfas); FreeVars(v); return st; } /* + - getsubdfa - create or re-fetch the DFA for a subre node + * We only need to create the DFA once per overall regex execution. + * The DFA will be freed by the cleanup step in exec(). + */ +static struct dfa * +getsubdfa(struct vars * v, + struct subre * t) +{ + if (v->subdfas[t->id] == NULL) { + v->subdfas[t->id] = newDFA(v, &t->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + return NULL; + } + return v->subdfas[t->id]; +} + +/* - simpleFind - find a match for the main NFA (no-complications case) ^ static int simpleFind(struct vars *, struct cnfa *, struct colormap *); */ @@ -327,7 +370,10 @@ simpleFind( } else { end = longest(v, d, begin, v->stop, &hitend); } - NOERR(); + if (ISERR()) { + freeDFA(d); + return v->err; + } if (hitend && cold == NULL) { cold = begin; } @@ -470,6 +516,7 @@ complicatedFindLoop( } if (er != REG_NOMATCH) { ERR(er); + *coldp = cold; return er; } if ((shorter) ? end == estop : end == begin) { @@ -636,7 +683,7 @@ cdissect( } /* - - ccondissect - concatenation subexpression matches (with complications) + - ccondissect - dissect match for concatenation node ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *); */ static int /* regexec return code */ @@ -654,15 +701,11 @@ ccondissect( assert(t->right != NULL && t->right->cnfa.nstates > 0); assert(!(t->left->flags & SHORTER)); - d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) { - return v->err; - } - d2 = newDFA(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) { - freeDFA(d); - return v->err; - } + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); + MDEBUG(("cConcat %d\n", t->id)); /* @@ -670,8 +713,6 @@ ccondissect( */ mid = longest(v, d, begin, end, (int *) NULL); if (mid == NULL) { - freeDFA(d); - freeDFA(d2); return REG_NOMATCH; } MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); @@ -696,14 +737,10 @@ ccondissect( */ MDEBUG(("successful\n")); - freeDFA(d); - freeDFA(d2); return REG_OKAY; } } if (er != REG_NOMATCH) { - freeDFA(d); - freeDFA(d2); return er; } } @@ -718,8 +755,6 @@ ccondissect( */ MDEBUG(("%d no midpoint\n", t->id)); - freeDFA(d); - freeDFA(d2); return REG_NOMATCH; } mid = longest(v, d, begin, mid-1, NULL); @@ -729,8 +764,6 @@ ccondissect( */ MDEBUG(("%d failed midpoint\n", t->id)); - freeDFA(d); - freeDFA(d2); return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); @@ -758,15 +791,11 @@ crevcondissect( assert(t->right != NULL && t->right->cnfa.nstates > 0); assert(t->left->flags&SHORTER); - d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) { - return v->err; - } - d2 = newDFA(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) { - freeDFA(d); - return v->err; - } + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); + MDEBUG(("crevcon %d\n", t->id)); /* @@ -775,8 +804,6 @@ crevcondissect( mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); if (mid == NULL) { - freeDFA(d); - freeDFA(d2); return REG_NOMATCH; } MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); @@ -801,14 +828,10 @@ crevcondissect( */ MDEBUG(("successful\n")); - freeDFA(d); - freeDFA(d2); return REG_OKAY; } } if (er != REG_NOMATCH) { - freeDFA(d); - freeDFA(d2); return er; } } @@ -823,8 +846,6 @@ crevcondissect( */ MDEBUG(("%d no midpoint\n", t->id)); - freeDFA(d); - freeDFA(d2); return REG_NOMATCH; } mid = shortest(v, d, begin, mid+1, end, NULL, NULL); @@ -834,8 +855,6 @@ crevcondissect( */ MDEBUG(("%d failed midpoint\n", t->id)); - freeDFA(d); - freeDFA(d2); return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); @@ -943,17 +962,15 @@ caltdissect( MDEBUG(("calt n%d\n", t->id)); - d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); NOERR(); if (longest(v, d, begin, end, (int *) NULL) == end) { - freeDFA(d); MDEBUG(("calt matched\n")); er = cdissect(v, t->left, begin, end); if (er != REG_NOMATCH) { return er; } } - freeDFA(d); t = t->right; } @@ -1017,7 +1034,7 @@ citerdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); return v->err; @@ -1094,7 +1111,6 @@ citerdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freeDFA(d); FREE(endpts); return er; } @@ -1102,7 +1118,6 @@ citerdissect(struct vars * v, if (i > k) { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freeDFA(d); FREE(endpts); return REG_OKAY; } @@ -1132,7 +1147,6 @@ citerdissect(struct vars * v, /* all possibilities exhausted */ MDEBUG(("%d failed\n", t->id)); - freeDFA(d); FREE(endpts); return REG_NOMATCH; } @@ -1193,7 +1207,7 @@ creviterdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); return v->err; @@ -1276,7 +1290,6 @@ creviterdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freeDFA(d); FREE(endpts); return er; } @@ -1284,7 +1297,6 @@ creviterdissect(struct vars * v, if (i > k) { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freeDFA(d); FREE(endpts); return REG_OKAY; } @@ -1308,7 +1320,6 @@ creviterdissect(struct vars * v, /* all possibilities exhausted */ MDEBUG(("%d failed\n", t->id)); - freeDFA(d); FREE(endpts); return REG_NOMATCH; } diff --git a/generic/regguts.h b/generic/regguts.h index e1c60ce..de760b0 100644 --- a/generic/regguts.h +++ b/generic/regguts.h @@ -186,7 +186,14 @@ struct colordesc { union tree *block; /* block of solid color, if any */ }; -/* the color map itself */ +/* + * The color map itself + * + * Much of the data in the colormap struct is only used at compile time. + * However, the bulk of the space usage is in the "tree" structure, so it's + * not clear that there's much point in converting the rest to a more compact + * form when compilation is finished. + */ struct colormap { int magic; #define CMMAGIC 0x876 @@ -242,15 +249,14 @@ struct cvec { struct state; struct arc { - int type; -#define ARCFREE '\0' + int type; /* 0 if free, else an NFA arc type code */ color co; struct state *from; /* where it's from (and contained within) */ struct state *to; /* where it's to */ - struct arc *outchain; /* *from's outs chain or free chain */ + struct arc *outchain; /* link in *from's outs chain or free chain */ #define freechain outchain - struct arc *inchain; /* *to's ins chain */ - struct arc *colorchain; /* color's arc chain */ + struct arc *inchain; /* link in *to's ins chain */ + struct arc *colorchain; /* link in color's arc chain */ struct arc *colorchainRev; /* back-link in color's arc chain */ }; @@ -297,11 +303,22 @@ struct nfa { /* * definitions for compacted NFA + * + * The main space savings in a compacted NFA is from making the arcs as small + * as possible. We store only the transition color and next-state number for + * each arc. The list of out arcs for each state is an array beginning at + * cnfa.states[statenumber], and terminated by a dummy carc struct with + * co == COLORLESS. + * + * The non-dummy carc structs are of two types: plain arcs and LACON arcs. + * Plain arcs just store the transition color number as "co". LACON arcs + * store the lookahead constraint number plus cnfa.ncolors as "co". LACON + * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. */ struct carc { color co; /* COLORLESS is list terminator */ - int to; /* state number */ + int to; /* next-state number */ }; struct cnfa { @@ -313,7 +330,10 @@ struct cnfa { int post; /* teardown state number */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ color eos[2]; /* colors, if any, assigned to EOS and EOL */ + char *stflags; /* vector of per-state flags bytes */ +#define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */ struct carc **states; /* vector of pointers to outarc lists */ + /* states[n] are pointers into a single malloc'd array of arcs */ struct carc *arcs; /* the area for the lists */ }; #define ZAPCNFA(cnfa) ((cnfa).nstates = 0) @@ -366,7 +386,7 @@ struct subre { #define PREF(f) ((f)&NOPROP) #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) - short id; /* ID of subre (1..ntree) */ + short id; /* ID of subre (1..ntree-1) */ int subno; /* subexpression number (for 'b' and '(') */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ @@ -399,7 +419,7 @@ struct guts { size_t nsub; /* copy of re_nsub */ struct subre *tree; struct cnfa search; /* for fast preliminary search */ - int ntree; + int ntree; /* number of subre's, plus one */ struct colormap cmap; int FUNCPTR(compare, (const chr *, const chr *, size_t)); struct subre *lacons; /* lookahead-constraint vector */ -- cgit v0.12