From 96c82d7028795dedc3360764dc6bfd03782a626f Mon Sep 17 00:00:00 2001 From: stanton Date: Tue, 17 Nov 1998 21:38:37 +0000 Subject: integrated latest changes from Henry Spencer, mostly performance improvements renamed re_u* interfaces to TclRe* to avoid conflicts with other software that uses the regular expression library --- generic/regc_nfa.c | 21 +- generic/regcomp.c | 80 ++++- generic/regcustom.h | 13 +- generic/rege_dfa.c | 180 +++++++---- generic/regerrs.h | 2 +- generic/regex.h | 12 +- generic/regexec.c | 229 +++++++++----- generic/regguts.h | 1 - generic/tclRegexp.c | 14 +- tests/reg.test | 881 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 1243 insertions(+), 190 deletions(-) create mode 100644 tests/reg.test diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c index 8143181..80e9ac7 100644 --- a/generic/regc_nfa.c +++ b/generic/regc_nfa.c @@ -1192,7 +1192,8 @@ struct cnfa *cnfa; narcs = 0; for (s = nfa->states; s != NULL; s = s->next) { nstates++; - narcs += s->nouts + 1; + narcs += 1 + s->nouts + 1; + /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */ } cnfa->states = (struct carc **)MALLOC(nstates * sizeof(struct carc *)); @@ -1213,12 +1214,14 @@ struct cnfa *cnfa; cnfa->eos[0] = nfa->eos[0]; cnfa->eos[1] = nfa->eos[1]; cnfa->ncolors = maxcolor(nfa->cm) + 1; - cnfa->flags = LEFTANCH; /* tentatively */ + cnfa->flags = 0; ca = cnfa->arcs; for (s = nfa->states; s != NULL; s = s->next) { assert((size_t)s->no < nstates); cnfa->states[s->no] = ca; + ca->co = 0; /* clear and skip flags "arc" */ + ca++; first = ca; for (a = s->outs; a != NULL; a = a->outchain) switch (a->type) { @@ -1246,10 +1249,10 @@ struct cnfa *cnfa; assert(ca == &cnfa->arcs[narcs]); assert(cnfa->nstates != 0); + /* mark no-progress states */ for (a = nfa->pre->outs; a != NULL; a = a->outchain) - if (a->type == PLAIN && a->co != nfa->bos[0] && - a->co != nfa->bos[1]) - cnfa->flags &= ~LEFTANCH; + cnfa->states[a->to->no]->co = 1; + cnfa->states[nfa->pre->no]->co = 1; } /* @@ -1480,8 +1483,6 @@ FILE *f; fprintf(f, ", eol [%ld]", (long)cnfa->eos[1]); if (cnfa->flags&HASLACONS) fprintf(f, ", haslacons"); - if (cnfa->flags&LEFTANCH) - fprintf(f, ", leftanch"); fprintf(f, "\n"); for (st = 0; st < cnfa->nstates; st++) dumpcstate(st, cnfa->states[st], cnfa, f); @@ -1505,9 +1506,9 @@ FILE *f; int i; int pos; - fprintf(f, "%d.", st); + fprintf(f, "%d%s", st, (ca[0].co) ? ":" : "."); pos = 1; - for (i = 0; ca[i].co != COLORLESS; i++) { + for (i = 1; ca[i].co != COLORLESS; i++) { if (ca[i].co < cnfa->ncolors) fprintf(f, "\t[%ld]->%d", (long)ca[i].co, ca[i].to); else @@ -1519,7 +1520,7 @@ FILE *f; } else pos++; } - if (i == 0 || pos != 1) + if (i == 1 || pos != 1) fprintf(f, "\n"); fflush(f); } diff --git a/generic/regcomp.c b/generic/regcomp.c index 620dc1c..7a4ae34 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -14,6 +14,7 @@ int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int)); static VOID moresubs _ANSI_ARGS_((struct vars *, int)); static int freev _ANSI_ARGS_((struct vars *, int)); +static VOID makescan _ANSI_ARGS_((struct vars *, struct nfa *)); static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *)); static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int)); static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *)); @@ -369,13 +370,10 @@ int flags; for (i = 1; i < v->nlacons; i++) nfanode(v, &v->lacons[i], debug); CNOERR(); - rainbow(v->nfa, v->cm, PLAIN, COLORLESS, v->nfa->pre, v->nfa->pre); - newarc(v->nfa, PLAIN, v->nfa->bos[0], v->nfa->pre, v->nfa->pre); - newarc(v->nfa, PLAIN, v->nfa->bos[1], v->nfa->pre, v->nfa->pre); - newarc(v->nfa, PLAIN, v->nfa->eos[0], v->nfa->pre, v->nfa->pre); - newarc(v->nfa, PLAIN, v->nfa->eos[1], v->nfa->pre, v->nfa->pre); (DISCARD)optimize(v->nfa, debug); CNOERR(); + makescan(v, v->nfa); + CNOERR(); compact(v->nfa, &g->search); CNOERR(); @@ -470,6 +468,78 @@ int err; } /* + - makescan - turn an NFA into a fast-scan NFA (implicit prepend of .*?) + * NFA must have been optimize()d already. + ^ static VOID makescan(struct vars *, struct nfa *); + */ +static VOID +makescan(v, nfa) +struct vars *v; +struct nfa *nfa; +{ + struct arc *a; + struct arc *b; + struct state *pre = nfa->pre; + struct state *s; + struct state *s2; + struct state *slist; + + /* no changes are needed if it's anchored */ + for (a = pre->outs; a != NULL; a = a->outchain) { + assert(a->type == PLAIN); + if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) + break; + } + if (a == NULL) + return; + + /* add implicit .* in front */ + rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre); + + /* and ^* and \Z* too -- not always necessary, but harmless */ + newarc(nfa, PLAIN, nfa->bos[0], pre, pre); + newarc(nfa, PLAIN, nfa->bos[1], pre, pre); + + /* + * Now here's the subtle part. Because many REs have no lookback + * constraints, often knowing when you were in the pre state tells + * you little; it's the next state(s) that are informative. But + * some of them may have other inarcs, i.e. it may be possible to + * make actual progress and then return to one of them. We must + * de-optimize such cases, splitting each such state into progress + * and no-progress states. + */ + + /* first, make a list of the states */ + slist = NULL; + for (a = pre->outs; a != NULL; a = a->outchain) { + s = a->to; + for (b = s->ins; b != NULL; b = b->inchain) + if (b->from != pre) + break; + if (b != NULL) { /* must be split */ + s->tmp = slist; + slist = s; + } + } + + /* do the splits */ + for (s = slist; s != NULL; s = s2) { + s2 = newstate(nfa); + copyouts(nfa, s, s2); + for (a = s->ins; a != NULL; a = b) { + b = a->inchain; + if (a->from != pre) { + cparc(nfa, a, a->from, s2); + freearc(nfa, a); + } + } + s2 = s->tmp; + s->tmp = NULL; /* clean up while we're at it */ + } +} + +/* - parse - parse an RE * This is actually just the top level, which parses a bunch of branches * tied together with '|'. They appear in the tree as the left children diff --git a/generic/regcustom.h b/generic/regcustom.h index eeaafa0..9ba4a2a 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -1,6 +1,5 @@ /* headers (which also pick up the standard ones, or equivalents) */ #include "tclInt.h" -#include "tclPort.h" /* overrides for regguts.h definitions */ /* function-pointer declarations */ @@ -41,16 +40,16 @@ #define __REG_VOID_T VOID #define __REG_CONST CONST /* names and declarations */ -#define __REG_WIDE_COMPILE re_ucomp -#define __REG_WIDE_EXEC re_uexec +#define __REG_WIDE_COMPILE TclReComp +#define __REG_WIDE_EXEC TclReExec #ifndef __REG_NOFRONT #define __REG_NOFRONT /* don't want regcomp() and regexec() */ #endif #ifndef __REG_NOCHAR #define __REG_NOCHAR /* or the char versions */ #endif -#define regfree re_ufree -#define regerror re_uerror +#define regfree TclReFree +#define regerror TclReError /* --- end --- */ @@ -74,8 +73,8 @@ typedef int celt; /* type to hold chr, MCCE number, or NOCELT */ #define iscspace(x) TclUniCharIsSpace(x) /* name the external functions */ -#define compile re_ucomp -#define exec re_uexec +#define compile TclReComp +#define exec TclReExec /* enable/disable debugging code (by whether REG_DEBUG is defined or not) */ #ifdef notdef diff --git a/generic/rege_dfa.c b/generic/rege_dfa.c index 6e4b941..eb31ffc6 100644 --- a/generic/rege_dfa.c +++ b/generic/rege_dfa.c @@ -115,13 +115,13 @@ chr **coldp; /* store coldstart pointer here, if nonNULL */ chr *realmax = (max == v->stop) ? max : max + 1; color co; struct sset *css; - struct sset *firstss; struct sset *ss; struct colormap *cm = d->cm; + chr *nopr; + int i; /* initialize */ css = initialize(v, d, start); - firstss = css; cp = start; /* startup */ @@ -175,72 +175,93 @@ chr **coldp; /* store coldstart pointer here, if nonNULL */ if (ss == NULL) return NULL; - if (ss->flags&POSTSTATE) { - assert(firstss->flags&STARTER); - assert(firstss->lastseen != NULL); - if (coldp != NULL) - *coldp = firstss->lastseen; + else if (ss->flags&POSTSTATE) { assert(cp >= realmin); - return cp - 1; - } - - /* shutdown */ - FDEBUG(("--- shutdown at c%d ---\n", css - d->ssets)); - if (cp == v->stop && max == v->stop) { + cp--; + } else if (cp == v->stop && max == v->stop) { co = d->cnfa->eos[(v->eflags®_NOTEOL) ? 0 : 1]; FDEBUG(("color %ld\n", (long)co)); ss = miss(v, d, css, co, cp, start); - /* special case: match ended at eol? */ - if (ss != NULL && (ss->flags&POSTSTATE)) { - assert(firstss->flags&STARTER); - assert(firstss->lastseen != NULL); - if (coldp != NULL) - *coldp = firstss->lastseen; - return cp; - } + /* match might have ended at eol */ } - return NULL; + if (ss == NULL || !(ss->flags&POSTSTATE)) + return NULL; + + /* find last no-progress state set, if any */ + nopr = d->lastnopr; + for (ss = d->ssets, i = 0; i < d->nssused; ss++, i++) + if ((ss->flags&NOPROGRESS) && nopr != ss->lastseen && + (nopr == NULL || nopr < ss->lastseen)) + nopr = ss->lastseen; + assert(nopr != NULL); + if (coldp != NULL) + *coldp = (nopr == v->start) ? nopr : nopr-1; + return cp; } /* - newdfa - set up a fresh DFA ^ static struct dfa *newdfa(struct vars *, struct cnfa *, - ^ struct colormap *); + ^ struct colormap *, struct smalldfa *); */ static struct dfa * -newdfa(v, cnfa, cm) +newdfa(v, cnfa, cm, small) struct vars *v; struct cnfa *cnfa; struct colormap *cm; +struct smalldfa *small; /* preallocated space, may be NULL */ { - struct dfa *d = (struct dfa *)MALLOC(sizeof(struct dfa)); + struct dfa *d; + size_t nss = cnfa->nstates * 2; int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; - struct sset *ss; - int i; + struct smalldfa *smallwas = small; assert(cnfa != NULL && cnfa->nstates != 0); - if (d == NULL) { - ERR(REG_ESPACE); - return NULL; - } - d->ssets = (struct sset *)MALLOC(CACHE * sizeof(struct sset)); - d->statesarea = (unsigned *)MALLOC((CACHE+WORK) * wordsper * + if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) { + assert(wordsper == 1); + if (small == NULL) { + small = (struct smalldfa *)MALLOC( + sizeof(struct smalldfa)); + if (small == NULL) { + ERR(REG_ESPACE); + return NULL; + } + } + d = &small->dfa; + d->ssets = small->ssets; + d->statesarea = small->statesarea; + d->work = &d->statesarea[nss]; + d->outsarea = small->outsarea; + d->incarea = small->incarea; + d->cptsmalloced = 0; + d->mallocarea = (smallwas == NULL) ? (char *)small : NULL; + } else { + d = (struct dfa *)MALLOC(sizeof(struct dfa)); + if (d == NULL) { + ERR(REG_ESPACE); + return NULL; + } + d->ssets = (struct sset *)MALLOC(nss * sizeof(struct sset)); + d->statesarea = (unsigned *)MALLOC((nss+WORK) * wordsper * sizeof(unsigned)); - d->work = &d->statesarea[CACHE * wordsper]; - d->outsarea = (struct sset **)MALLOC(CACHE * cnfa->ncolors * + d->work = &d->statesarea[nss * wordsper]; + d->outsarea = (struct sset **)MALLOC(nss * cnfa->ncolors * sizeof(struct sset *)); - d->incarea = (struct arcp *)MALLOC(CACHE * cnfa->ncolors * + d->incarea = (struct arcp *)MALLOC(nss * cnfa->ncolors * sizeof(struct arcp)); - if (d->ssets == NULL || d->statesarea == NULL || d->outsarea == NULL || - d->incarea == NULL) { - freedfa(d); - ERR(REG_ESPACE); - return NULL; + d->cptsmalloced = 1; + d->mallocarea = (char *)d; + if (d->ssets == NULL || d->statesarea == NULL || + d->outsarea == NULL || d->incarea == NULL) { + freedfa(d); + ERR(REG_ESPACE); + return NULL; + } } - d->nssets = (v->eflags®_SMALL) ? 7 : CACHE; + d->nssets = (v->eflags®_SMALL) ? 7 : nss; d->nssused = 0; d->nstates = cnfa->nstates; d->ncolors = cnfa->ncolors; @@ -248,14 +269,10 @@ struct colormap *cm; d->cnfa = cnfa; d->cm = cm; d->lastpost = NULL; + d->lastnopr = NULL; d->search = d->ssets; - for (ss = d->ssets, i = 0; i < d->nssets; ss++, i++) { - /* initialization of most fields is done as needed */ - ss->states = &d->statesarea[i * d->wordsper]; - ss->outs = &d->outsarea[i * d->ncolors]; - ss->inchain = &d->incarea[i * d->ncolors]; - } + /* initialization of sset fields is done as needed */ return d; } @@ -268,15 +285,19 @@ static VOID freedfa(d) struct dfa *d; { - if (d->ssets != NULL) - FREE(d->ssets); - if (d->statesarea != NULL) - FREE(d->statesarea); - if (d->outsarea != NULL) - FREE(d->outsarea); - if (d->incarea != NULL) - FREE(d->incarea); - FREE(d); + if (d->cptsmalloced) { + if (d->ssets != NULL) + FREE(d->ssets); + if (d->statesarea != NULL) + FREE(d->statesarea); + if (d->outsarea != NULL) + FREE(d->outsarea); + if (d->incarea != NULL) + FREE(d->incarea); + } + + if (d->mallocarea != NULL) + FREE(d->mallocarea); } /* @@ -319,9 +340,9 @@ chr *start; for (i = 0; i < d->wordsper; i++) ss->states[i] = 0; BSET(ss->states, d->cnfa->pre); - ss->hash = hash(ss->states, d->wordsper); + ss->hash = HASH(ss->states, d->wordsper); assert(d->cnfa->pre != d->cnfa->post); - ss->flags = STARTER|LOCKED; + ss->flags = STARTER|LOCKED|NOPROGRESS; /* lastseen dealt with below */ } @@ -329,6 +350,7 @@ chr *start; d->ssets[i].lastseen = NULL; ss->lastseen = start; /* maybe untrue, but harmless */ d->lastpost = NULL; + d->lastnopr = NULL; return ss; } @@ -352,6 +374,7 @@ chr *start; /* where the attempt got started */ struct carc *ca; struct sset *p; int ispost; + int noprogress; int gotstate; int dolacons; int didlacons; @@ -367,15 +390,18 @@ chr *start; /* where the attempt got started */ for (i = 0; i < d->wordsper; i++) d->work[i] = 0; ispost = 0; + noprogress = 1; gotstate = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(css->states, i)) - for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) + for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++) if (ca->co == co) { BSET(d->work, ca->to); gotstate = 1; if (ca->to == cnfa->post) ispost = 1; + if (!cnfa->states[ca->to]->co) + noprogress = 0; FDEBUG(("%d -> %d\n", i, ca->to)); } dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0; @@ -384,7 +410,7 @@ chr *start; /* where the attempt got started */ dolacons = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(d->work, i)) - for (ca = cnfa->states[i]; ca->co != COLORLESS; + for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++) if (ca->co > cnfa->ncolors && !ISBSET(d->work, ca->to) && @@ -395,17 +421,23 @@ chr *start; /* where the attempt got started */ didlacons = 1; if (ca->to == cnfa->post) ispost = 1; + if (!cnfa->states[ca->to]->co) + noprogress = 0; FDEBUG(("%d :> %d\n",i,ca->to)); } } if (!gotstate) return NULL; - h = hash(d->work, d->wordsper); + h = HASH(d->work, d->wordsper); /* next, is that in the cache? */ for (p = d->ssets, i = d->nssused; i > 0; p++, i--) - if (p->hash == h && memcmp(VS(d->work), VS(p->states), + if (HIT(h, d->work, p, d->wordsper)) { +#ifndef xxx +p->hash == h && +memcmp(VS(d->work), VS(p->states), d->wordsper*sizeof(unsigned)) == 0) { +#endif FDEBUG(("cached c%d\n", p - d->ssets)); break; /* NOTE BREAK OUT */ } @@ -416,6 +448,8 @@ chr *start; /* where the attempt got started */ p->states[i] = d->work[i]; p->hash = h; p->flags = (ispost) ? POSTSTATE : 0; + if (noprogress) + p->flags |= NOPROGRESS; /* lastseen to be dealt with by caller */ } @@ -442,13 +476,14 @@ pcolor co; /* "color" of the lookahead constraint */ int n; struct subre *sub; struct dfa *d; + struct smalldfa sd; chr *end; n = co - pcnfa->ncolors; assert(n < v->g->nlacons && v->g->lacons != NULL); FDEBUG(("=== testing lacon %d\n", n)); sub = &v->g->lacons[n]; - d = newdfa(v, &sub->cnfa, &v->g->cmap); + d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd); if (d == NULL) { ERR(REG_ESPACE); return 0; @@ -520,6 +555,11 @@ chr *start; (d->lastpost == NULL || d->lastpost < ss->lastseen)) d->lastpost = ss->lastseen; + /* likewise for a no-progress state */ + if ((ss->flags&NOPROGRESS) && ss->lastseen != d->lastnopr && + (d->lastnopr == NULL || d->lastnopr < ss->lastseen)) + d->lastnopr = ss->lastseen; + return ss; } @@ -541,17 +581,21 @@ chr *start; /* shortcut for cases where cache isn't full */ if (d->nssused < d->nssets) { - ss = &d->ssets[d->nssused]; + i = d->nssused; d->nssused++; - FDEBUG(("new c%d\n", ss - d->ssets)); - /* must make innards consistent */ + ss = &d->ssets[i]; + FDEBUG(("new c%d\n", i)); + /* set up innards */ + ss->states = &d->statesarea[i * d->wordsper]; + ss->flags = 0; ss->ins.ss = NULL; ss->ins.co = WHITE; /* give it some value */ + ss->outs = &d->outsarea[i * d->ncolors]; + ss->inchain = &d->incarea[i * d->ncolors]; for (i = 0; i < d->ncolors; i++) { ss->outs[i] = NULL; ss->inchain[i].ss = NULL; } - ss->flags = 0; return ss; } diff --git a/generic/regerrs.h b/generic/regerrs.h index af2805c..1b6552c 100644 --- a/generic/regerrs.h +++ b/generic/regerrs.h @@ -1,6 +1,6 @@ { REG_OKAY, "REG_OKAY", "no errors detected" }, { REG_NOMATCH, "REG_NOMATCH", "failed to match" }, -{ REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.1)" }, +{ REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.2)" }, { REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" }, { REG_ECTYPE, "REG_ECTYPE", "invalid character class" }, { REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence" }, diff --git a/generic/regex.h b/generic/regex.h index 7ca6ad0..2f3ebfa 100644 --- a/generic/regex.h +++ b/generic/regex.h @@ -79,16 +79,16 @@ extern "C" { #define __REG_VOID_T VOID #define __REG_CONST CONST /* names and declarations */ -#define __REG_WIDE_COMPILE re_ucomp -#define __REG_WIDE_EXEC re_uexec +#define __REG_WIDE_COMPILE TclReComp +#define __REG_WIDE_EXEC TclReExec #ifndef __REG_NOFRONT #define __REG_NOFRONT /* don't want regcomp() and regexec() */ #endif #ifndef __REG_NOCHAR #define __REG_NOCHAR /* or the char versions */ #endif -#define regfree re_ufree -#define regerror re_uerror +#define regfree TclReFree +#define regerror TclReError /* --- end --- */ @@ -289,8 +289,8 @@ int regexec _ANSI_ARGS_((regex_t *, __REG_CONST char *, size_t, regmatch_t [], i #ifdef __REG_WIDE_T int __REG_WIDE_EXEC _ANSI_ARGS_((regex_t *, __REG_CONST __REG_WIDE_T *, size_t, rm_detail_t *, size_t, regmatch_t [], int)); #endif -re_void re_ufree _ANSI_ARGS_((regex_t *)); -extern size_t re_uerror _ANSI_ARGS_((int, __REG_CONST regex_t *, char *, size_t)); +re_void regfree _ANSI_ARGS_((regex_t *)); +extern size_t regerror _ANSI_ARGS_((int, __REG_CONST regex_t *, char *, size_t)); /* automatically gathered by fwd; do not hand-edit */ /* =====^!^===== end forwards =====^!^===== */ diff --git a/generic/regexec.c b/generic/regexec.c index 1671866..50fcc70 100644 --- a/generic/regexec.c +++ b/generic/regexec.c @@ -24,6 +24,7 @@ struct vars { #define ERR(e) VERR(v, e) /* record an error */ #define NOERR() {if (ISERR()) return;} /* if error seen, return */ #define OFF(p) ((p) - v->start) +#define LOFF(p) ((long)OFF(p)) @@ -35,11 +36,15 @@ struct arcp { /* "pointer" to an outarc */ struct sset { /* state set */ unsigned *states; /* pointer to bitvector */ - unsigned hash; /* xor of bitvector */ + unsigned hash; /* hash of bitvector */ +# define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) +# define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ + memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) int flags; # define STARTER 01 /* the initial state set */ # define POSTSTATE 02 /* includes the goal state */ # define LOCKED 04 /* locked in cache */ +# define NOPROGRESS 010 /* zero-progress state set */ struct arcp ins; /* chain of inarcs pointing here */ chr *lastseen; /* last entered on arrival here */ struct sset **outs; /* outarc vector indexed by color */ @@ -60,12 +65,26 @@ struct dfa { struct cnfa *cnfa; struct colormap *cm; chr *lastpost; /* location of last cache-flushed success */ + chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ struct sset *search; /* replacement-search-pointer memory */ + int cptsmalloced; /* were the areas individually malloced? */ + char *mallocarea; /* self, or master malloced area, or NULL */ }; -#define CACHE 200 #define WORK 1 /* number of work bitvectors needed */ +/* setup for non-malloc allocation for small cases */ +#define FEWSTATES 20 /* must be less than UBITS */ +#define FEWCOLORS 15 +struct smalldfa { + struct dfa dfa; + struct sset ssets[FEWSTATES*2]; + unsigned statesarea[FEWSTATES*2 + WORK]; + struct sset *outsarea[FEWSTATES*2 * FEWCOLORS]; + struct arcp incarea[FEWSTATES*2 * FEWCOLORS]; +}; + + /* @@ -91,7 +110,7 @@ static int caltdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *) /* === rege_dfa.c === */ static chr *longest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *, chr **)); -static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); +static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct smalldfa *)); static VOID freedfa _ANSI_ARGS_((struct dfa *)); static unsigned hash _ANSI_ARGS_((unsigned *, int)); static struct sset *initialize _ANSI_ARGS_((struct vars *, struct dfa *, chr *)); @@ -124,6 +143,10 @@ int flags; int st; size_t n; int complications; +# define LOCALMAT 20 + regmatch_t mat[LOCALMAT]; +# define LOCALMEM 40 + regoff_t mem[LOCALMEM]; /* sanity checks */ if (re == NULL || string == NULL || re->re_magic != REMAGIC) @@ -145,7 +168,10 @@ int flags; v->nmatch = nmatch; if (complications && v->nmatch < v->g->nsub + 1) { /* need work area bigger than what user gave us */ - v->pmatch = (regmatch_t *)MALLOC((v->g->nsub + 1) * + if (v->g->nsub + 1 <= LOCALMAT) + v->pmatch = mat; + else + v->pmatch = (regmatch_t *)MALLOC((v->g->nsub + 1) * sizeof(regmatch_t)); if (v->pmatch == NULL) return REG_ESPACE; @@ -156,9 +182,14 @@ int flags; v->stop = (chr *)string + len; v->err = 0; if (complications) { - v->mem = (regoff_t *)MALLOC(v->g->ntree*sizeof(regoff_t)); + assert(v->g->ntree >= 0); + n = (size_t)v->g->ntree; + if (n <= LOCALMEM) + v->mem = mem; + else + v->mem = (regoff_t *)MALLOC(n*sizeof(regoff_t)); if (v->mem == NULL) { - if (v->pmatch != pmatch) + if (v->pmatch != pmatch && v->pmatch != mat) FREE(v->pmatch); return REG_ESPACE; } @@ -171,14 +202,18 @@ int flags; st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); else st = find(v, &v->g->tree->cnfa, &v->g->cmap); + + /* copy (portion of) match vector over if necessary */ if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { zapsubs(pmatch, nmatch); n = (nmatch < v->nmatch) ? nmatch : v->nmatch; memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t)); } - if (v->pmatch != pmatch) + + /* clean up */ + if (v->pmatch != pmatch && v->pmatch != mat) FREE(v->pmatch); - if (v->mem != NULL) + if (v->mem != NULL && v->mem != mem) FREE(v->mem); return st; } @@ -193,15 +228,14 @@ struct vars *v; struct cnfa *cnfa; struct colormap *cm; { - struct dfa *d = newdfa(v, cnfa, cm); - struct dfa *s = newdfa(v, &v->g->search, cm); + struct smalldfa da; + struct dfa *d = newdfa(v, cnfa, cm, &da); + struct smalldfa sa; + struct dfa *s = newdfa(v, &v->g->search, cm, &sa); chr *begin; chr *end; -#ifdef notdef chr *open; /* open and close of range of possible starts */ chr *close; -#endif - chr *stop = (cnfa->flags&LEFTANCH) ? v->start : v->stop; if (d == NULL) return v->err; @@ -210,17 +244,15 @@ struct colormap *cm; return v->err; } -#ifdef notdef close = v->start; - while (close < stop) { - MDEBUG(("\nsearch at %ld\n", (long)OFF(close))); + do { + MDEBUG(("\nsearch at %ld\n", LOFF(close))); close = shortest(v, s, close, close, v->stop, &open); if (close == NULL) break; /* NOTE BREAK */ - MDEBUG(("between %ld and %ld\n", (long)OFF(open), - (long)OFF(close))); -#endif - for (begin = v->start; begin <= stop; begin++) { + MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); + for (begin = open; begin <= close; begin++) { + MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); end = longest(v, d, begin, v->stop); if (end != NULL) { if (v->nmatch > 0) { @@ -239,9 +271,7 @@ struct colormap *cm; return REG_OKAY; } } -#ifdef notdef - } -#endif + } while (close < v->stop); freedfa(d); freedfa(s); @@ -260,10 +290,14 @@ struct vars *v; struct cnfa *cnfa; struct colormap *cm; { - struct dfa *d = newdfa(v, cnfa, cm); + struct smalldfa da; + struct dfa *d = newdfa(v, cnfa, cm, &da); + struct smalldfa sa; + struct dfa *s = newdfa(v, &v->g->search, cm, &sa); chr *begin; chr *end; - chr *stop = (cnfa->flags&LEFTANCH) ? v->start : v->stop; + chr *open; /* open and close of range of possible starts */ + chr *close; chr *estart; chr *estop; int er; @@ -271,55 +305,72 @@ struct colormap *cm; if (d == NULL) return v->err; + if (s == NULL) { + freedfa(d); + return v->err; + } - for (begin = v->start; begin <= stop; begin++) { - MDEBUG(("\ncfind trying at %ld\n", (long)OFF(begin))); - estart = begin; - estop = v->stop; - for (;;) { - if (shorter) - end = shortest(v, d, begin, estart, estop, - (chr **)NULL); - else - end = longest(v, d, begin, estop); - if (end == NULL) - break; /* NOTE BREAK OUT */ - MDEBUG(("tentative end %ld\n", (long)OFF(end))); - zapsubs(v->pmatch, v->nmatch); - zapmem(v, v->g->tree); - er = cdissect(v, v->g->tree, begin, end); - switch (er) { - case REG_OKAY: - if (v->nmatch > 0) { - v->pmatch[0].rm_so = OFF(begin); - v->pmatch[0].rm_eo = OFF(end); - } - freedfa(d); - if (ISERR()) - return v->err; - return REG_OKAY; - break; - case REG_NOMATCH: - /* go around and try again */ - if ((shorter) ? end == estop : end == begin) { - /* no point in trying again */ - freedfa(d); - return REG_NOMATCH; - } + close = v->start; + do { + MDEBUG(("\ncsearch at %ld\n", LOFF(close))); + close = shortest(v, s, close, close, v->stop, &open); + if (close == NULL) + break; /* NOTE BREAK */ + MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); + for (begin = open; begin <= close; begin++) { + MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); + estart = begin; + estop = v->stop; + for (;;) { if (shorter) - estart = end + 1; + end = shortest(v, d, begin, estart, + estop, (chr **)NULL); else - estop = end - 1; - break; - default: - freedfa(d); - return er; - break; + end = longest(v, d, begin, estop); + if (end == NULL) + break; /* NOTE BREAK OUT */ + MDEBUG(("tentative end %ld\n", LOFF(end))); + zapsubs(v->pmatch, v->nmatch); + zapmem(v, v->g->tree); + er = cdissect(v, v->g->tree, begin, end); + switch (er) { + case REG_OKAY: + if (v->nmatch > 0) { + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + } + freedfa(d); + freedfa(s); + if (ISERR()) + return v->err; + return REG_OKAY; + break; + case REG_NOMATCH: + /* go around and try again */ + if ((shorter) ? end == estop : + end == begin) { + /* no point in trying again */ + freedfa(s); + freedfa(d); + return REG_NOMATCH; + } + if (shorter) + estart = end + 1; + else + estop = end - 1; + break; + default: + freedfa(d); + freedfa(s); + return er; + break; + } } } - } + } while (close < v->stop); freedfa(d); + freedfa(s); if (ISERR()) return v->err; return REG_NOMATCH; @@ -402,7 +453,7 @@ chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { assert(t != NULL); - MDEBUG(("dissect %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); + MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); switch (t->op) { case '=': /* terminal node */ @@ -443,7 +494,9 @@ struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { + struct smalldfa da; struct dfa *d; + struct smalldfa d2a; struct dfa *d2; chr *mid; int i; @@ -452,10 +505,10 @@ chr *end; /* end of same */ assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(t->right != NULL && t->right->cnfa.nstates > 0); - d = newdfa(v, &t->left->cnfa, &v->g->cmap); + d = newdfa(v, &t->left->cnfa, &v->g->cmap, &da); if (ISERR()) return v->err; - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap); + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &d2a); if (ISERR()) { freedfa(d); return v->err; @@ -468,7 +521,7 @@ chr *end; /* end of same */ freedfa(d2); return REG_ASSERT; } - MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid))); + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ while (longest(v, d2, mid, end) != end) { @@ -488,7 +541,7 @@ chr *end; /* end of same */ freedfa(d2); return REG_ASSERT; } - MDEBUG(("new midpoint %ld\n", (long)OFF(mid))); + MDEBUG(("new midpoint %ld\n", LOFF(mid))); } /* satisfaction */ @@ -512,6 +565,7 @@ struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { + struct smalldfa da; struct dfa *d; int i; @@ -521,7 +575,7 @@ chr *end; /* end of same */ for (i = 0; t != NULL; t = t->right, i++) { MDEBUG(("trying %dth\n", i)); assert(t->left != NULL && t->left->cnfa.nstates > 0); - d = newdfa(v, &t->left->cnfa, &v->g->cmap); + d = newdfa(v, &t->left->cnfa, &v->g->cmap, &da); if (ISERR()) return v->err; if (longest(v, d, begin, end) == end) { @@ -550,7 +604,7 @@ chr *end; /* end of same */ int er; assert(t != NULL); - MDEBUG(("cdissect %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); + MDEBUG(("cdissect %ld-%ld\n", LOFF(begin), LOFF(end))); switch (t->op) { case '=': /* terminal node */ @@ -596,7 +650,9 @@ struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { + struct smalldfa da; struct dfa *d; + struct smalldfa d2a; struct dfa *d2; chr *mid; int er; @@ -608,10 +664,10 @@ chr *end; /* end of same */ if (t->left->flags&SHORTER) /* reverse scan */ return crevdissect(v, t, begin, end); - d = newdfa(v, &t->left->cnfa, &v->g->cmap); + d = newdfa(v, &t->left->cnfa, &v->g->cmap, &da); if (ISERR()) return v->err; - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap); + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &d2a); if (ISERR()) { freedfa(d); return v->err; @@ -626,11 +682,11 @@ chr *end; /* end of same */ freedfa(d2); return REG_NOMATCH; } - MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid))); + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); v->mem[t->retry] = (mid - begin) + 1; } else { mid = begin + (v->mem[t->retry] - 1); - MDEBUG(("working midpoint %ld\n", (long)OFF(mid))); + MDEBUG(("working midpoint %ld\n", LOFF(mid))); } /* iterate until satisfaction or failure */ @@ -663,7 +719,7 @@ chr *end; /* end of same */ freedfa(d2); return REG_NOMATCH; } - MDEBUG(("%d: new midpoint %ld\n", t->retry, (long)OFF(mid))); + MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); v->mem[t->retry] = (mid - begin) + 1; zapmem(v, t->left); zapmem(v, t->right); @@ -689,7 +745,9 @@ struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { + struct smalldfa da; struct dfa *d; + struct smalldfa d2a; struct dfa *d2; chr *mid; int er; @@ -700,10 +758,10 @@ chr *end; /* end of same */ assert(t->left->flags&SHORTER); /* concatenation -- need to split the substring between parts */ - d = newdfa(v, &t->left->cnfa, &v->g->cmap); + d = newdfa(v, &t->left->cnfa, &v->g->cmap, &da); if (ISERR()) return v->err; - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap); + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &d2a); if (ISERR()) { freedfa(d); return v->err; @@ -718,11 +776,11 @@ chr *end; /* end of same */ freedfa(d2); return REG_NOMATCH; } - MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid))); + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); v->mem[t->retry] = (mid - begin) + 1; } else { mid = begin + (v->mem[t->retry] - 1); - MDEBUG(("working midpoint %ld\n", (long)OFF(mid))); + MDEBUG(("working midpoint %ld\n", LOFF(mid))); } /* iterate until satisfaction or failure */ @@ -755,7 +813,7 @@ chr *end; /* end of same */ freedfa(d2); return REG_NOMATCH; } - MDEBUG(("%d: new midpoint %ld\n", t->retry, (long)OFF(mid))); + MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); v->mem[t->retry] = (mid - begin) + 1; zapmem(v, t->left); zapmem(v, t->right); @@ -846,6 +904,7 @@ struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { + struct smalldfa da; struct dfa *d; int er; # define UNTRIED 0 /* not yet tried at all */ @@ -862,7 +921,7 @@ chr *end; /* end of same */ assert(t->left != NULL); if (v->mem[t->retry] == UNTRIED) { - d = newdfa(v, &t->left->cnfa, &v->g->cmap); + d = newdfa(v, &t->left->cnfa, &v->g->cmap, &da); if (ISERR()) return v->err; if (longest(v, d, begin, end) != end) { diff --git a/generic/regguts.h b/generic/regguts.h index 5f9a0ab..73aed2f 100644 --- a/generic/regguts.h +++ b/generic/regguts.h @@ -308,7 +308,6 @@ struct cnfa { int ncolors; /* number of colors */ int flags; # define HASLACONS 01 /* uses lookahead constraints */ -# define LEFTANCH 02 /* anchored on left */ int pre; /* setup state number */ int post; /* teardown state number */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index a619d5b..c605591 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -10,7 +10,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclRegexp.c,v 1.1.2.5 1998/11/11 04:54:19 stanton Exp $ + * RCS: @(#) $Id: tclRegexp.c,v 1.1.2.6 1998/11/17 21:38:39 stanton Exp $ */ #include "tclInt.h" @@ -163,7 +163,7 @@ Tcl_RegExpCompile(interp, string) if (iPtr->patterns[NUM_REGEXPS-1] != NULL) { ckfree(iPtr->patterns[NUM_REGEXPS-1]); - re_ufree(&(iPtr->regexps[NUM_REGEXPS-1]->re)); + TclReFree(&(iPtr->regexps[NUM_REGEXPS-1]->re)); ckfree((char *) iPtr->regexps[NUM_REGEXPS-1]); } for (i = NUM_REGEXPS - 2; i >= 0; i--) { @@ -326,7 +326,7 @@ TclRegExpExecUniChar(interp, re, wString, numChars, nmatches, flags) if (nmatches >= 0 && (size_t) nmatches < nm) nm = (size_t) nmatches; - status = re_uexec(®expPtr->re, wString, (size_t) numChars, + status = TclReExec(®expPtr->re, wString, (size_t) numChars, (rm_detail_t *)NULL, nm, regexpPtr->matches, flags); /* @@ -621,12 +621,12 @@ TclRegError(interp, msg, status) char *p; Tcl_ResetResult(interp); - n = re_uerror(status, (regex_t *)NULL, buf, sizeof(buf)); + n = TclReError(status, (regex_t *)NULL, buf, sizeof(buf)); p = (n > sizeof(buf)) ? "..." : ""; Tcl_AppendResult(interp, msg, buf, p, NULL); sprintf(cbuf, "%d", status); - (VOID) re_uerror(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf)); + (VOID) TclReError(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf)); Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL); } @@ -654,7 +654,7 @@ FreeRegexpInternalRep(objPtr) { TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; - re_ufree(®expRepPtr->re); + TclReFree(®expRepPtr->re); if (regexpRepPtr->matches) { ckfree((char *) regexpRepPtr->matches); } @@ -765,7 +765,7 @@ CompileRegexp(interp, string, length, flags) */ regexpPtr->flags = flags; - status = re_ucomp(®expPtr->re, uniString, (size_t) numChars, flags); + status = TclReComp(®expPtr->re, uniString, (size_t) numChars, flags); Tcl_DStringFree(&stringBuf); if (status != REG_OKAY) { diff --git a/tests/reg.test b/tests/reg.test new file mode 100644 index 0000000..26a981e --- /dev/null +++ b/tests/reg.test @@ -0,0 +1,881 @@ +# regexp tests + +if {[string compare test [info procs test]] == 1} then {source defs} + +# This file uses some custom procedures, defined below, for regexp regression +# testing. The name of the procedure indicates the general nature of the +# test: e for compile error expected, f for match failure expected, m +# for a successful match, and i for a successful match with -indices (used +# in checking things like nonparticipating subexpressions). There is also +# a "doing" procedure which sets up title and major test number for each +# block of tests, and an "xx" procedure which ignores its arguments and +# arranges for the next invocation of "doing" to announce that some tests +# were bypassed (which is better than just commenting them out). + +# The first 3 arguments are constant: a minor number (which often gets +# a letter or two suffixed to it internally), some flags, and the RE itself. +# For e, the remaining argument is the name of the compile error expected, +# less the leading "REG_". For the rest, the next argument is the string +# to try the match against. Remaining arguments are the substring expected +# to be matched, and any substrings expected to be matched by subexpressions. +# (For f, these arguments are optional, and if present are ignored except +# that they indicate how many subexpressions should be presents in the RE.) +# It is an error for the number of subexpression arguments to be wrong. +# Cases involving nonparticipating subexpressions, checking where empty +# substrings are located, etc. should be done using i. + +# The flag characters are complex and a bit eclectic. Generally speaking, +# lowercase letters are compile options, uppercase are expected re_info +# bits, and nonalphabetics are match options, controls for how the test is +# run, or testing options. The one small surprise is that AREs are the +# default, and you must explicitly request lesser flavors of RE. The flags +# are as follows. It is admitted that some are not very mnemonic. +# There are some others which are purely debugging tools and are not +# useful in this file. +# +# - no-op (placeholder) +# + provide fake xy equivalence class +# % force small state-set cache in matcher (to test cache replace) +# ^ beginning of string is not beginning of line +# $ end of string is not end of line +# +# & test as both ARE and BRE +# b BRE +# e ERE +# a turn advanced-features bit on (error unless ERE already) +# q literal string, no metacharacters at all +# +# i case-independent matching +# o ("opaque") no subexpression capture +# p newlines are half-magic, excluded from . and [^ only +# w newlines are half-magic, significant to ^ and $ only +# n newlines are fully magic, both effects +# x expanded RE syntax +# +# A backslash-_a_lphanumeric seen +# B ERE/ARE literal-_b_race heuristic used +# E backslash (_e_scape) seen within [] +# H looka_h_ead constraint seen +# I _i_mpossible to match +# L _l_ocale-specific construct seen +# M unportable (_m_achine-specific) construct seen +# N RE can match empty (_n_ull) string +# P non-_P_OSIX construct seen +# Q {} _q_uantifier seen +# R back _r_eference seen +# S POSIX-un_s_pecified syntax seen +# U saw original-POSIX botch: unmatched right paren in ERE (_u_gh) + +# The one area we can't easily test is memory-allocation failures (which +# are hard to provoke on command). Embedded NULs also are not tested at +# the moment, but this is a historical accident which should be fixed. + + + +# test procedures and related + +set ask "about" +set xflags "xflags" +set testbypassed 0 + +# re_info abbreviation mapping table +set infonames(A) "REG_UBSALNUM" +set infonames(B) "REG_UBRACES" +set infonames(E) "REG_UBBS" +set infonames(H) "REG_ULOOKAHEAD" +set infonames(I) "REG_UIMPOSSIBLE" +set infonames(L) "REG_ULOCALE" +set infonames(M) "REG_UUNPORT" +set infonames(N) "REG_UEMPTYMATCH" +set infonames(P) "REG_UNONPOSIX" +set infonames(Q) "REG_UBOUNDS" +set infonames(R) "REG_UBACKREF" +set infonames(S) "REG_UUNSPEC" +set infonames(U) "REG_UPBOTCH" +set infonameorder "RHQBAUEPSMLNI" ;# must match bit order, lsb first + +# set major test number and description +proc doing {major desc} { + global prefix description testbypassed + + if {$testbypassed != 0} { + puts stdout "!!! bypassed $testbypassed tests in\ + reg-$major, `$description'" + } + + set prefix reg-$major + set description "reg $desc" + set testbypassed 0 +} + +# build test number (internal) +proc tno {testid} { + return [lindex $testid 0] +} + +# build description, with possible modifiers (internal) +proc desc {testid} { + global description + + set d $description + if {[llength $testid] > 1} { + set d "([lreplace $testid 0 0]) $d" + } + return $d +} + +# build trailing options and flags argument from a flags string (internal) +proc flags {fl} { + global xflags + + set args [list] + set flags "" + foreach f [split $fl ""] { + switch -exact -- $f { + "i" { lappend args "-nocase" } + "x" { lappend args "-expanded" } + "n" { lappend args "-line" } + "p" { lappend args "-linestop" } + "w" { lappend args "-lineanchor" } + "-" { } + default { append flags $f } + } + } + if {[string compare $flags ""] != 0} { + lappend args -$xflags $flags + } + return $args +} + +# build info-flags list from a flags string (internal) +proc infoflags {fl} { + global infonames infonameorder + + set ret [list] + foreach f [split $infonameorder ""] { + if {[string first $f $fl] >= 0} { + lappend ret $infonames($f) + } + } + return $ret +} + +# compilation error expected +proc e {testid flags re err} { + global prefix ask errorCode + + # if &, test as both ARE and BRE + set amp [string first "&" $flags] + if {$amp >= 0} { + set f [string range $flags 0 [expr $amp - 1]] + append f [string range $flags [expr $amp + 1] end] + e [linsert $testid end ARE] ${f} $re $err + e [linsert $testid end BRE] ${f}b $re $err + return + } + + set cmd [concat [list testregexp -$ask] [flags $flags] [list $re]] + set run "list \[catch \{$cmd\}\] \[lindex \$errorCode 1\]" + test $prefix.[tno $testid] [desc $testid] $run [list 1 REG_$err] +} + +# match failure expected +proc f {testid flags re target args} { + global prefix description ask + + # if &, test as both ARE and BRE + set amp [string first "&" $flags] + if {$amp >= 0} { + set f [string range $flags 0 [expr $amp - 1]] + append f [string range $flags [expr $amp + 1] end] + eval [linsert $args 0 f [linsert $testid end ARE] ${f} $re \ + $target] + eval [linsert $args 0 f [linsert $testid end BRE] ${f}b $re \ + $target] + return + } + + set f [flags $flags] + set infoflags [infoflags $flags] + set ccmd [concat [list testregexp -$ask] $f [list $re]] + set nsub [expr [llength $args] - 1] + if {$nsub == -1} { + # didn't tell us number of subexps + set ccmd "lreplace \[$ccmd\] 0 0" + set info [list $infoflags] + } else { + set info [list $nsub $infoflags] + } + lappend testid "compile" + test $prefix.[tno $testid] [desc $testid] $ccmd $info + + set testid [lreplace $testid end end "execute"] + set ecmd [concat [list testregexp] $f [list $re $target]] + test $prefix.[tno $testid] [desc $testid] $ecmd 0 +} + +# match expected, internal routine that does the work +# parameters like the "real" routines except they don't have "opts", +# which is a possibly-empty list of switches for the regexp match attempt +proc matchexpected {opts testid flags re target args} { + global prefix description ask + + # if &, test as both BRE and ARE + set amp [string first "&" $flags] + if {$amp >= 0} { + set f [string range $flags 0 [expr $amp - 1]] + append f [string range $flags [expr $amp + 1] end] + eval [concat [list matchexpected $opts \ + [linsert $testid end ARE] ${f} $re $target] $args] + eval [concat [list matchexpected $opts \ + [linsert $testid end BRE] ${f}b $re $target] $args] + return + } + + set f [flags $flags] + set infoflags [infoflags $flags] + set ccmd [concat [list testregexp -$ask] $f [list $re]] + set ecmd [concat [list testregexp] $opts $f [list $re $target]] + + set nsub [expr [llength $args] - 1] + set names [list] + set refs "" + for {set i 0} {$i <= $nsub} {incr i} { + if {$i == 0} { + set name match + } else { + set name sub$i + } + lappend names $name + append refs " \$$name" + set $name "" + } + if {[string first "o" $flags] >= 0} { ;# REG_NOSUB + set nsub 0 ;# unsigned value cannot be -1 + } + set ecmd [concat $ecmd $names] + set erun "list \[$ecmd\] $refs" + set result [concat [list 1] $args] + + set info [list $nsub $infoflags] + lappend testid "compile" + test $prefix.[tno $testid] [desc $testid] $ccmd $info + set testid [lreplace $testid end end "execute"] + test $prefix.[tno $testid] [desc $testid] $erun $result +} + +# match expected (no missing, empty, or ambiguous submatches) +# m testno flags re target mat submat ... +proc m {args} { + eval matchexpected [linsert $args 0 [list]] +} + +# match expected (full fanciness) +# i testno flags re target mat submat ... +proc i {args} { + eval matchexpected [linsert $args 0 [list "-indices"]] +} + +# test temporarily unimplemented +proc xx {args} { + global testbypassed + + incr testbypassed +} + + + +# the tests themselves + + + +# support functions and preliminary misc. +# This is sensitive to changes in message wording, but we really have to +# test the code->message expansion at least once. +test regexp-0.1 "regexp error reporting" { + list [catch {regexp (*) ign} msg] $msg +} {1 {couldn't compile regular expression pattern: quantifier operand invalid}} + + + +doing 1 "basic sanity checks" +m 1 & abc abc abc +f 2 & abc def +m 3 & abc xyabxabce abc + + + +doing 2 "invalid option combinations" +e 1 qe a INVARG +e 2 qa a INVARG +e 3 qx a INVARG +e 4 qn a INVARG +e 5 ba a INVARG + + + +doing 3 "basic syntax" +i 1 &NS "" a {0 -1} +m 2 NS a| a a +m 3 - a|b a a +m 4 - a|b b b +m 5 NS a||b b b +m 6 & ab ab ab + + + +doing 4 "parentheses" +m 1 - (a)e ae ae a +m 2 o (a)e ae +m 3 b {\(a\)b} ab ab a +m 4 - a((b)c) abc abc bc b +m 5 - a(b)(c) abc abc b c +e 6 - a(b EPAREN +e 7 b {a\(b} EPAREN +# sigh, we blew it on the specs here... someday this will be fixed in POSIX, +# but meanwhile, it's fixed in AREs +m 8 eU a)b a)b a)b +e 9 - a)b EPAREN +e 10 b {a\)b} EPAREN +m 11 P a(?:b)c abc abc +e 12 e a(?:b)c BADRPT +i 13 S a()b ab {0 1} {1 0} +m 14 SP a(?:)b ab ab +i 15 S a(|b)c ac {0 1} {1 0} +m 16 S a(b|)c abc abc b + + + +doing 5 "simple one-char matching" +# general case of brackets done later +m 1 & a.b axb axb +f 2 &n "a.b" "a\nb" +m 3 & {a[bc]d} abd abd +m 4 & {a[bc]d} acd acd +f 5 & {a[bc]d} aed +f 6 & {a[^bc]d} abd +m 7 & {a[^bc]d} aed aed +f 8 &p "a\[^bc]d" "a\nd" + + + +doing 6 "context-dependent syntax" +# plus odds and ends +e 1 - * BADRPT +m 2 b * * * +m 3 b {\(*\)} * * * +e 4 - (*) BADRPT +m 5 b ^* * * +e 6 - ^* BADRPT +f 7 & ^b ^b +m 8 b x^ x^ x^ +f 9 I x^ x +m 10 n "\n^" "x\nb" "\n" +f 11 bS {\(^b\)} ^b +m 12 - (^b) b b b +m 13 & {x$} x x +m 14 bS {\(x$\)} x x x +m 15 - {(x$)} x x x +m 16 b {x$y} "x\$y" "x\$y" +f 17 I {x$y} xy +m 18 n "x\$\n" "x\n" "x\n" +e 19 - + BADRPT +e 20 - ? BADRPT + + + +doing 7 "simple quantifiers" +m 1 &N a* aa aa +i 2 &N a* b {0 -1} +m 3 - a+ aa aa +m 4 - a?b ab ab +m 5 - a?b b b +e 6 - ** BADRPT +m 7 bN ** *** *** +e 8 & a** BADRPT +e 9 & a**b BADRPT +e 10 & *** BADRPT +e 11 * a++ BADRPT +e 12 * a?+ BADRPT +e 13 * a?* BADRPT +e 14 * a+* BADRPT +e 15 * a*+ BADRPT + + + +doing 8 "braces" +m 1 NQ "a{0,1}" "" "" +m 2 NQ "a{0,1}" ac a +e 3 - "a{1,0}" BADBR +e 4 - "a{1,2,3}" BADBR +e 5 - "a{257}" BADBR +e 6 - "a{1000}" BADBR +e 7 - "a{1" EBRACE +e 8 - "a{1n}" BADBR +m 9 BS "a{b" "a\{b" "a\{b" +m 10 BS "a{" "a\{" "a\{" +m 11 bQ {a\{0,1\}b} cb b +e 12 b {a\{0,1} EBRACE +e 13 - "a{0,1\\" BADBR +m 14 Q "a{0}b" ab b +m 15 Q "a{0,0}b" ab b +m 16 Q "a{0,1}b" ab ab +m 17 Q "a{0,2}b" b b +m 18 Q "a{0,2}b" aab aab +m 19 Q "a{0,}b" aab aab +m 20 Q "a{1,1}b" aab ab +m 21 Q "a{1,3}b" aaaab aaab +f 22 Q "a{1,3}b" b +m 23 Q "a{1,}b" aab aab +f 24 Q "a{2,3}b" ab +m 25 Q "a{2,3}b" aaaab aaab +f 26 Q "a{2,}b" ab +m 27 Q "a{2,}b" aaaab aaaab + + + +doing 9 "brackets" +m 1 & {a[bc]} ac ac +m 2 & {a[-]} a- a- +m 3 & {a[[.-.]]} a- a- +m 4 &L {a[[.zero.]]} a0 a0 +m 5 &LM {a[[.zero.]-9]} a2 a2 +m 6 &M {a[0-[.9.]]} a2 a2 +m 7 &+L {a[[=x=]]} ax ax +m 8 &+L {a[[=x=]]} ay ay +f 9 &+L {a[[=x=]]} az +e 10 & {a[0-[=x=]]} ERANGE +m 11 &L {a[[:digit:]]} a0 a0 +e 12 & {a[[:woopsie:]]} ECTYPE +f 13 &L {a[[:digit:]]} ab +e 14 & {a[0-[:digit:]]} ERANGE +m 15 &LP {[[:<:]]a} a a +m 16 &LP {a[[:>:]]} a a +e 17 & {a[[..]]b} ECOLLATE +e 18 & {a[[==]]b} ECOLLATE +e 19 & {a[[::]]b} ECTYPE +e 20 & {a[[.a} EBRACK +e 21 & {a[[=a} EBRACK +e 22 & {a[[:a} EBRACK +e 23 & {a[} EBRACK +e 24 & {a[b} EBRACK +e 25 & {a[b-} EBRACK +e 26 & {a[b-c} EBRACK +m 27 &M {a[b-c]} ab ab +m 28 & {a[b-b]} ab ab +m 29 &M {a[1-2]} a2 a2 +e 30 & {a[c-b]} ERANGE +e 31 & {a[a-b-c]} ERANGE +m 32 &M {a[--?]b} a?b a?b +m 33 & {a[---]b} a-b a-b +m 34 & {a[]b]c} a]c a]c +m 35 EP {a[\]]b} a]b a]b +f 36 bE {a[\]]b} a]b +m 37 bE {a[\]]b} "a\\]b" "a\\]b" +m 38 eE {a[\]]b} "a\\]b" "a\\]b" +m 39 EP {a[\\]b} "a\\b" "a\\b" +m 40 eE {a[\\]b} "a\\b" "a\\b" +m 41 bE {a[\\]b} "a\\b" "a\\b" +e 42 - {a[\Z]b} EESCAPE +m 43 & {a[[b]c} "a\[c" "a\[c" +m 44 EMP {a[\u00fe-\u0507][\u00ff-\u0300]b} \ + "a\u0102\u02ffb" "a\u0102\u02ffb" + + + +doing 10 "anchors and newlines" +m 1 & ^a a a +f 2 &^ ^a a +i 3 &N ^ a {0 -1} +i 4 & {a$} aba {2 2} +f 5 {&$} {a$} a +i 6 &N {$} ab {2 1} +m 7 &n ^a a a +m 8 &n "^a" "b\na" "a" +i 9 &w "^a" "a\na" {0 0} +i 10 &n^ "^a" "a\na" {2 2} +m 11 &n {a$} a a +m 12 &n "a\$" "a\nb" "a" +i 13 &n "a\$" "a\na" {0 0} +i 14 N ^^ a {0 -1} +m 15 b ^^ ^ ^ +i 16 N {$$} a {1 0} +m 17 b {$$} "\$" "\$" +m 18 &N {^$} "" "" +f 19 &N {^$} a +i 20 &nN "^\$" "a\n\nb" {2 1} +m 21 N {$^} "" "" +m 22 b {$^} "\$^" "\$^" +m 23 P {\Aa} a a +m 24 ^P {\Aa} a a +f 25 ^nP {\Aa} "b\na" +m 26 P {a\Z} a a +m 27 {$P} {a\Z} a a +f 28 {$nP} {a\Z} "a\nb" +e 29 - ^* BADRPT +e 30 - {$*} BADRPT +e 31 - {\A*} BADRPT +e 32 - {\Z*} BADRPT + + + +doing 11 "boundary constraints" +m 1 &LP {[[:<:]]a} a a +m 2 &LP {[[:<:]]a} -a a +f 3 &LP {[[:<:]]a} ba +m 4 &LP {a[[:>:]]} a a +m 5 &LP {a[[:>:]]} a- a +f 6 &LP {a[[:>:]]} ab +m 7 bLP {\} a a +f 10 bLP {a\>} ab +m 11 LP {\ya} a a +f 12 LP {\ya} ba +m 13 LP {a\y} a a +f 14 LP {a\y} ab +m 15 LP {a\Y} ab a +f 16 LP {a\Y} a- +f 17 LP {a\Y} a +f 18 LP {-\Y} -a +m 19 LP {-\Y} -% - +f 20 LP {\Y-} a- +e 21 - {[[:<:]]*} BADRPT +e 22 - {[[:>:]]*} BADRPT +e 23 b {\<*} BADRPT +e 24 b {\>*} BADRPT +e 25 - {\y*} BADRPT +e 26 - {\Y*} BADRPT +m 27 LP {\ma} a a +f 28 LP {\ma} ba +m 29 LP {a\M} a a +f 30 LP {a\M} ab +f 31 ILP {\Ma} a +f 32 ILP {a\m} a + + + +doing 12 "character classes" +m 1 LP {a\db} a0b a0b +f 2 LP {a\db} axb +f 3 LP {a\Db} a0b +m 4 LP {a\Db} axb axb +m 5 LP "a\\sb" "a b" "a b" +m 6 LP "a\\sb" "a\tb" "a\tb" +m 7 LP "a\\sb" "a\nb" "a\nb" +f 8 LP {a\sb} axb +m 9 LP {a\Sb} axb axb +f 10 LP "a\\Sb" "a b" +m 11 LP {a\wb} axb axb +f 12 LP {a\wb} a-b +f 13 LP {a\Wb} axb +m 14 LP {a\Wb} a-b a-b +m 15 LP {\y\w+z\y} adze-guz guz +m 16 LPE {a[\d]b} a1b a1b +m 17 LPE "a\[\\s]b" "a b" "a b" +m 18 LPE {a[\w]b} axb axb + + + +doing 13 "escapes" +e 1 & "a\\" EESCAPE +m 2 - {a\