diff options
Diffstat (limited to 'generic/regcomp.c')
| -rw-r--r-- | generic/regcomp.c | 398 |
1 files changed, 38 insertions, 360 deletions
diff --git a/generic/regcomp.c b/generic/regcomp.c index b9169f9..c93eb24 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -53,12 +53,8 @@ static void bracket(struct vars *, struct state *, struct state *); static void cbracket(struct vars *, struct state *, struct state *); static void brackpart(struct vars *, struct state *, struct state *); static const chr *scanplain(struct vars *); -static void leaders(struct vars *, struct cvec *); static void onechr(struct vars *, pchr, struct state *, struct state *); static void dovec(struct vars *, struct cvec *, struct state *, struct state *); -#ifdef REGEXP_MCCE_ENABLED -static celt nextleader(struct vars *, pchr, pchr); -#endif static void wordchrs(struct vars *); static struct subre *subre(struct vars *, int, int, struct state *, struct state *); static void freesubre(struct vars *, struct subre *); @@ -83,7 +79,7 @@ static void lexnest(struct vars *, const chr *, const chr *); static void lexword(struct vars *); static int next(struct vars *); static int lexescape(struct vars *); -static chr lexdigits(struct vars *, int, int, int); +static int lexdigits(struct vars *, int, int, int); static int brenext(struct vars *, pchr); static void skip(struct vars *); static chr newline(NOPARMS); @@ -107,9 +103,6 @@ static void subblock(struct vars *, pchr, struct state *, struct state *); static void okcolors(struct nfa *, struct colormap *); static void colorchain(struct colormap *, struct arc *); static void uncolorchain(struct colormap *, struct arc *); -#ifdef REGEXP_MCCE_ENABLED -static int singleton(struct colormap *, pchr c); -#endif static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *); static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *); #ifdef REG_DEBUG @@ -128,17 +121,20 @@ static void destroystate(struct nfa *, struct state *); static void newarc(struct nfa *, int, pcolor, struct state *, struct state *); static struct arc *allocarc(struct nfa *, struct state *); static void freearc(struct nfa *, struct arc *); +static int hasnonemptyout(struct state *); +static int nonemptyouts(struct state *); +static int nonemptyins(struct state *); static struct arc *findarc(struct state *, int, pcolor); static void cparc(struct nfa *, struct arc *, struct state *, struct state *); static void moveins(struct nfa *, struct state *, struct state *); -static void copyins(struct nfa *, struct state *, struct state *); +static void copyins(struct nfa *, struct state *, struct state *, int); static void moveouts(struct nfa *, struct state *, struct state *); -static void copyouts(struct nfa *, struct state *, struct state *); +static void copyouts(struct nfa *, struct state *, struct state *, int); static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); static void delsub(struct nfa *, struct state *, struct state *); static void deltraverse(struct nfa *, struct state *, struct state *); static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *); -static void duptraverse(struct nfa *, struct state *, struct state *); +static void duptraverse(struct nfa *, struct state *, struct state *, int); static void cleartraverse(struct nfa *, struct state *); static void specialcolors(struct nfa *); static long optimize(struct nfa *, FILE *); @@ -151,7 +147,8 @@ static int push(struct nfa *, struct arc *); #define COMPATIBLE 3 /* compatible but not satisfied yet */ static int combine(struct arc *, struct arc *); static void fixempties(struct nfa *, FILE *); -static int unempty(struct nfa *, struct arc *); +static struct state *emptyreachable(struct state *, struct state *); +static void replaceempty(struct nfa *, struct state *, struct state *); static void cleanup(struct nfa *); static void markreachable(struct nfa *, struct state *, struct state *, struct state *); static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); @@ -171,20 +168,13 @@ static void dumpcnfa(struct cnfa *, FILE *); static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); #endif /* === regc_cvec.c === */ -static struct cvec *newcvec(int, int, int); static struct cvec *clearcvec(struct cvec *); static void addchr(struct cvec *, pchr); static void addrange(struct cvec *, pchr, pchr); -#ifdef REGEXP_MCCE_ENABLED -static void addmcce(struct cvec *, const chr *, const chr *); -#endif -static int haschr(struct cvec *, pchr); -static struct cvec *getcvec(struct vars *, int, int, int); +static struct cvec *newcvec(int, int); +static struct cvec *getcvec(struct vars *, int, int); static void freecvec(struct cvec *); /* === regc_locale.c === */ -static int nmcces(struct vars *); -static int nleaders(struct vars *); -static struct cvec *allmcces(struct vars *, struct cvec *); static celt element(struct vars *, const chr *, const chr *); static struct cvec *range(struct vars *, celt, celt, int); static int before(celt, celt); @@ -223,10 +213,6 @@ struct vars { int ntree; /* number of tree nodes */ struct cvec *cv; /* interface cvec */ struct cvec *cv2; /* utility cvec */ - struct cvec *mcces; /* collating-element information */ -#define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c))) - struct state *mccepbegin; /* in nfa, start of MCCE prototypes */ - struct state *mccepend; /* in nfa, end of MCCE prototypes */ struct subre *lacons; /* lookahead-constraint vector */ int nlacons; /* size of lacons */ }; @@ -336,7 +322,6 @@ compile( v->treefree = NULL; v->cv = NULL; v->cv2 = NULL; - v->mcces = NULL; v->lacons = NULL; v->nlacons = 0; re->re_magic = REMAGIC; @@ -362,22 +347,10 @@ compile( ZAPCNFA(g->search); v->nfa = newnfa(v, v->cm, NULL); CNOERR(); - v->cv = newcvec(100, 20, 10); + v->cv = newcvec(100, 20); if (v->cv == NULL) { return freev(v, REG_ESPACE); } - i = nmcces(v); - if (i > 0) { - v->mcces = newcvec(nleaders(v), 0, i); - CNOERR(); - v->mcces = allmcces(v, v->mcces); - leaders(v, v->mcces); -#ifdef REGEXP_MCCE_ENABLED - /* Function does nothing with NULL pointers */ - addmcce(v->mcces, NULL, NULL); /* dummy */ -#endif - } - CNOERR(); /* * Parsing. @@ -550,9 +523,6 @@ freev( if (v->cv2 != NULL) { freecvec(v->cv2); } - if (v->mcces != NULL) { - freecvec(v->mcces); - } if (v->lacons != NULL) { freelacons(v->lacons, v->nlacons); } @@ -641,7 +611,7 @@ makesearch( for (s=slist ; s!=NULL ; s=s2) { s2 = newstate(nfa); - copyouts(nfa, s, s2); + copyouts(nfa, s, s2, 1); for (a=s->ins ; a!=NULL ; a=b) { b = a->inchain; @@ -772,6 +742,7 @@ parsebranch( /* NB, recursion in parseqatom() may swallow rest of branch */ parseqatom(v, stopper, type, lp, right, t); + NOERRN(); } if (!seencontent) { /* empty branch */ @@ -839,7 +810,6 @@ parseqatom( } NEXT(); return; - break; case '$': ARCV('$', 1); if (v->cflags®_NLANCH) { @@ -847,19 +817,16 @@ parseqatom( } NEXT(); return; - break; case SBEGIN: ARCV('^', 1); /* BOL */ ARCV('^', 0); /* or BOS */ NEXT(); return; - break; case SEND: ARCV('$', 1); /* EOL */ ARCV('$', 0); /* or EOS */ NEXT(); return; - break; case '<': wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); @@ -867,7 +834,6 @@ parseqatom( nonword(v, BEHIND, lp, s); word(v, AHEAD, s, rp); return; - break; case '>': wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); @@ -875,7 +841,6 @@ parseqatom( word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); return; - break; case WBDRY: wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); @@ -887,7 +852,6 @@ parseqatom( word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); return; - break; case NWBDRY: wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); @@ -899,7 +863,6 @@ parseqatom( nonword(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); return; - break; case LACON: /* lookahead constraint */ pos = v->nextvalue; NEXT(); @@ -914,7 +877,6 @@ parseqatom( NOERR(); ARCV(LACON, n); return; - break; /* * Then errors, to get them out of the way. @@ -926,11 +888,9 @@ parseqatom( case '{': ERR(REG_BADRPT); return; - break; default: ERR(REG_ASSERT); return; - break; /* * Then plain characters, and minor variants on that theme. @@ -1279,6 +1239,7 @@ parseqatom( EMPTYARC(atom->end, rp); t->right = subre(v, '=', 0, atom->end, rp); } + NOERR(); assert(SEE('|') || SEE(stopper) || SEE(EOS)); t->flags |= COMBINE(t->flags, t->right->flags); top->flags |= COMBINE(top->flags, t->flags); @@ -1467,13 +1428,6 @@ cbracket( { struct state *left = newstate(v->nfa); struct state *right = newstate(v->nfa); - struct state *s; - struct arc *a; /* arc from lp */ - struct arc *ba; /* arc from left, from bracket() */ - struct arc *pa; /* MCCE-prototype arc */ - color co; - const chr *p; - int i; NOERR(); bracket(v, left, right); @@ -1485,67 +1439,16 @@ cbracket( assert(lp->nouts == 0); /* all outarcs will be ours */ /* - * Easy part of complementing + * Easy part of complementing, and all there is to do since the MCCE code + * was removed. */ colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); NOERR(); - if (v->mcces == NULL) { /* no MCCEs -- we're done */ - dropstate(v->nfa, left); - assert(right->nins == 0); - freestate(v->nfa, right); - return; - } - - /* - * But complementing gets messy in the presence of MCCEs... - */ - - NOTE(REG_ULOCALE); - for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) { - co = GETCOLOR(v->cm, *p); - a = findarc(lp, PLAIN, co); - ba = findarc(left, PLAIN, co); - if (ba == NULL) { - assert(a != NULL); - freearc(v->nfa, a); - } else { - assert(a == NULL); - } - s = newstate(v->nfa); - NOERR(); - newarc(v->nfa, PLAIN, co, lp, s); - NOERR(); - pa = findarc(v->mccepbegin, PLAIN, co); - assert(pa != NULL); - if (ba == NULL) { /* easy case, need all of them */ - cloneouts(v->nfa, pa->to, s, rp, PLAIN); - newarc(v->nfa, '$', 1, s, rp); - newarc(v->nfa, '$', 0, s, rp); - colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp); - } else { /* must be selective */ - if (findarc(ba->to, '$', 1) == NULL) { - newarc(v->nfa, '$', 1, s, rp); - newarc(v->nfa, '$', 0, s, rp); - colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp); - } - for (pa = pa->to->outs; pa != NULL; pa = pa->outchain) { - if (findarc(ba->to, PLAIN, pa->co) == NULL) { - newarc(v->nfa, PLAIN, pa->co, s, rp); - } - } - if (s->nouts == 0) { /* limit of selectivity: none */ - dropstate(v->nfa, s); /* frees arc too */ - } - } - NOERR(); - } - - delsub(v->nfa, left, right); - assert(left->nouts == 0); - freestate(v->nfa, left); + dropstate(v->nfa, left); assert(right->nins == 0); freestate(v->nfa, right); + return; } /* @@ -1561,7 +1464,7 @@ brackpart( celt startc, endc; struct cvec *cv; const chr *startp, *endp; - chr c[1]; + chr c; /* * Parse something, get rid of special cases, take shortcuts. @@ -1573,18 +1476,18 @@ brackpart( return; break; case PLAIN: - c[0] = v->nextvalue; + c = v->nextvalue; NEXT(); /* - * Shortcut for ordinary chr (not range, not MCCE leader). + * Shortcut for ordinary chr (not range). */ - if (!SEE(RANGE) && !ISCELEADER(v, c[0])) { - onechr(v, c[0], lp, rp); + if (!SEE(RANGE)) { + onechr(v, c, lp, rp); return; } - startc = element(v, c, c+1); + startc = element(v, &c, &c+1); NOERR(); break; case COLLEL: @@ -1628,9 +1531,9 @@ brackpart( switch (v->nexttype) { case PLAIN: case RANGE: - c[0] = v->nextvalue; + c = v->nextvalue; NEXT(); - endc = element(v, c, c+1); + endc = element(v, &c, &c+1); NOERR(); break; case COLLEL: @@ -1691,48 +1594,6 @@ scanplain( } /* - - leaders - process a cvec of collating elements to also include leaders - * Also gives all characters involved their own colors, which is almost - * certainly necessary, and sets up little disconnected subNFA. - ^ static void leaders(struct vars *, struct cvec *); - */ -static void -leaders( - struct vars *v, - struct cvec *cv) -{ - int mcce; - const chr *p; - chr leader; - struct state *s; - struct arc *a; - - v->mccepbegin = newstate(v->nfa); - v->mccepend = newstate(v->nfa); - NOERR(); - - for (mcce = 0; mcce < cv->nmcces; mcce++) { - p = cv->mcces[mcce]; - leader = *p; - if (!haschr(cv, leader)) { - addchr(cv, leader); - s = newstate(v->nfa); - newarc(v->nfa, PLAIN, subcolor(v->cm, leader), v->mccepbegin, s); - okcolors(v->nfa, v->cm); - } else { - a = findarc(v->mccepbegin, PLAIN, GETCOLOR(v->cm, leader)); - assert(a != NULL); - s = a->to; - assert(s != v->mccepend); - } - p++; - assert(*p != 0 && *(p+1) == 0); /* only 2-char MCCEs for now */ - newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend); - okcolors(v->nfa, v->cm); - } -} - -/* - onechr - fill in arcs for a plain character, and possible case complements * This is mostly a shortcut for efficient handling of the common case. ^ static void onechr(struct vars *, pchr, struct state *, struct state *); @@ -1749,17 +1610,18 @@ onechr( return; } - /* rats, need general case anyway... */ + /* + * Rats, need general case anyway... + */ + dovec(v, allcases(v, c), lp, rp); } /* - dovec - fill in arcs for each element of a cvec - * This one has to handle the messy cases, like MCCEs and MCCE leaders. ^ static void dovec(struct vars *, struct cvec *, struct state *, ^ struct state *); */ -#ifndef REGEXP_MCCE_ENABLED static void dovec( struct vars *v, @@ -1785,184 +1647,6 @@ dovec( } } -#else /* REGEXP_MCCE_ENABLED */ -static void -dovec( - struct vars *v, - struct cvec *cv, - struct state *lp, - struct state *rp) -{ - chr ch, from, to; - celt ce; - const chr *p; - int i; - struct cvec *leads; - color co; - struct arc *a; - struct arc *pa; /* arc in prototype */ - struct state *s; - struct state *ps; /* state in prototype */ - - /* - * Need a place to store leaders, if any. - */ - - if (nmcces(v) > 0) { - assert(v->mcces != NULL); - if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) { - if (v->cv2 != NULL) { - free(v->cv2); - } - v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces); - NOERR(); - leads = v->cv2; - } else { - leads = clearcvec(v->cv2); - } - } else { - leads = NULL; - } - - /* - * First, get the ordinary characters out of the way. - */ - - for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) { - ch = *p; - if (!ISCELEADER(v, ch)) { - newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp); - } else { - assert(singleton(v->cm, ch)); - assert(leads != NULL); - if (!haschr(leads, ch)) { - addchr(leads, ch); - } - } - } - - /* - * And the ranges. - */ - - for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) { - from = *p; - to = *(p+1); - while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) { - if (from < ce) { - subrange(v, from, ce - 1, lp, rp); - } - assert(singleton(v->cm, ce)); - assert(leads != NULL); - if (!haschr(leads, ce)) { - addchr(leads, ce); - } - from = ce + 1; - } - if (from <= to) { - subrange(v, from, to, lp, rp); - } - } - - /* *** WARNING *** - * - * This was buggy, check before enabling: the original version would cause - * a segfault at the loopinit below if (leads==NULL && cv->nmcces!=0) - * Possibly just a problem with parens? The original condition was - * ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0) - */ - - if (leads == NULL || (leads->nchrs == 0 && cv->nmcces == 0)) { - return; - } - - /* - * Deal with the MCCE leaders. - */ - - NOTE(REG_ULOCALE); - for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) { - co = GETCOLOR(v->cm, *p); - a = findarc(lp, PLAIN, co); - if (a != NULL) { - s = a->to; - } else { - s = newstate(v->nfa); - NOERR(); - newarc(v->nfa, PLAIN, co, lp, s); - NOERR(); - } - pa = findarc(v->mccepbegin, PLAIN, co); - assert(pa != NULL); - ps = pa->to; - newarc(v->nfa, '$', 1, s, rp); - newarc(v->nfa, '$', 0, s, rp); - colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp); - NOERR(); - } - - /* - * And the MCCEs. - */ - - for (i = 0; i < cv->nmcces; i++) { - p = cv->mcces[i]; - assert(singleton(v->cm, *p)); - if (!singleton(v->cm, *p)) { - ERR(REG_ASSERT); - return; - } - ch = *p++; - co = GETCOLOR(v->cm, ch); - a = findarc(lp, PLAIN, co); - if (a != NULL) { - s = a->to; - } else { - s = newstate(v->nfa); - NOERR(); - newarc(v->nfa, PLAIN, co, lp, s); - NOERR(); - } - assert(*p != 0); /* at least two chars */ - assert(singleton(v->cm, *p)); - ch = *p++; - co = GETCOLOR(v->cm, ch); - assert(*p == 0); /* and only two, for now */ - newarc(v->nfa, PLAIN, co, s, rp); - NOERR(); - } -} - -/* - - nextleader - find next MCCE leader within range - ^ static celt nextleader(struct vars *, pchr, pchr); - */ -static celt /* NOCELT means none */ -nextleader( - struct vars *v, - pchr from, - pchr to) -{ - int i; - const chr *p; - chr ch; - celt it = NOCELT; - - if (v->mcces == NULL) { - return it; - } - - for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) { - ch = *p; - if (from <= ch && ch <= to) { - if (it == NOCELT || ch < it) { - it = ch; - } - } - } - return it; -} -#endif /* - wordchrs - set up word-chr list for word-boundary stuff, if needed @@ -2103,20 +1787,14 @@ optst( struct vars *v, struct subre *t) { - if (t == NULL) { - return; - } - /* - * Recurse through children. + * DGP (2007-11-13): I assume it was the programmer's intent to eventually + * come back and add code to optimize subRE trees, but the routine coded + * just spends effort traversing the tree and doing nothing. We can do + * nothing with less effort. */ - if (t->left != NULL) { - optst(v, t->left); - } - if (t->right != NULL) { - optst(v, t->right); - } + return; } /* @@ -2447,8 +2125,8 @@ stdump( if (!NULLCNFA(t->cnfa)) { fprintf(f, "\n"); dumpcnfa(&t->cnfa, f); - fprintf(f, "\n"); } + fprintf(f, "\n"); if (t->left != NULL) { stdump(t->left, f, nfapresent); } @@ -2459,7 +2137,7 @@ stdump( /* - stid - identify a subtree node for dumping - ^ static char *stid(struct subre *, char *, size_t); + ^ static const char *stid(struct subre *, char *, size_t); */ static const char * /* points to buf or constant string */ stid( |
