From 0a41c61107c36da0a8e4ca0fc259149e3bc1956d Mon Sep 17 00:00:00 2001 From: stanton Date: Wed, 11 Nov 1998 01:44:46 +0000 Subject: integrated latest regexp updates from Henry Spencer, includes new regexp switches "-line", "-lineanchor", and "-linestop" for controlling newline behavior --- changes | 10 +- generic/regc_color.c | 499 +++++++++------ generic/regc_lex.c | 25 +- generic/regc_locale.c | 16 +- generic/regc_nfa.c | 35 +- generic/regcomp.c | 1705 +++++++++++++++++++++++-------------------------- generic/regcustom.h | 20 +- generic/rege_dfa.c | 583 +++++++++++++++++ generic/regerror.c | 2 +- generic/regerrs.h | 37 +- generic/regex.h | 25 +- generic/regexec.c | 1349 ++++++++------------------------------ generic/regguts.h | 90 ++- generic/tclCmdMZ.c | 38 +- generic/tclRegexp.c | 206 ++---- generic/tclRegexp.h | 6 +- generic/tclTest.c | 321 +++++++++- tests/regexp.test | 4 +- tests/regtests.test | 37 +- unix/Makefile.in | 4 +- win/makefile.vc | 17 +- 21 files changed, 2525 insertions(+), 2504 deletions(-) create mode 100644 generic/rege_dfa.c diff --git a/changes b/changes index 7d7851b..5e8f335 100644 --- a/changes +++ b/changes @@ -1,6 +1,6 @@ Recent user-visible changes to Tcl: -RCS: @(#) $Id: changes,v 1.1.2.8 1998/11/04 04:47:50 stanton Exp $ +RCS: @(#) $Id: changes,v 1.1.2.9 1998/11/11 01:44:46 stanton Exp $ 1. No more [command1] [command2] construct for grouping multiple commands on a single command line. @@ -3933,3 +3933,11 @@ the encoding.n manual entry for more details. (stanton) 11/3/98 (bug fix) The regular expression character classification syntax now includes Unicode characters in the supported classes. (stanton) + +11/9/98 (bug fix) "format" now correctly handles multibyte characters +in %s format strings. (stanton) + +11/10/98 (new feature) "regexp" now accepts three new switches +("-line", "-lineanchor", and "-linestop") that control how regular +expressions treat line breaks. See the regexp manual entry for more +details. (stanton) diff --git a/generic/regc_color.c b/generic/regc_color.c index 4a8a87c..843c05e 100644 --- a/generic/regc_color.c +++ b/generic/regc_color.c @@ -8,90 +8,75 @@ -/* - * If this declaration draws a complaint about a negative array size, - * then CHRBITS is defined incorrectly for the chr type. - */ -static char isCHRBITSright[NEGIFNOT(sizeof(chr)*CHAR_BIT == CHRBITS)]; - - - #define CISERR() VISERR(cm->v) #define CERR(e) VERR(cm->v, (e)) /* - - newcm - get new colormap - ^ static struct colormap *newcm(struct vars *); + - initcm - set up new colormap + ^ static VOID initcm(struct vars *, struct colormap *); */ -static struct colormap * /* NULL for allocation failure */ -newcm(v) +static VOID +initcm(v, cm) struct vars *v; +struct colormap *cm; { - struct colormap *cm; int i; int j; union tree *t; union tree *nextt; struct colordesc *cd; - cm = (struct colormap *)MALLOC(sizeof(struct colormap)); - if (cm == NULL) { - ERR(REG_ESPACE); - return NULL; - } cm->magic = CMMAGIC; cm->v = v; - cm->rest = WHITE; - cm->filled = 0; cm->ncds = NINLINECDS; - cm->cd = cm->cds; - for (cd = cm->cd; cd < CDEND(cm); cd++) { - cd->nchrs = 0; - cd->sub = NOSUB; - cd->arcs = NULL; - cd->flags = 0; - } - cm->cd[WHITE].nchrs = CHR_MAX - CHR_MIN + 1; - - /* treetop starts as NULLs if there are lower levels */ - t = cm->tree; - if (NBYTS > 1) - for (i = BYTTAB-1; i >= 0; i--) - t->tptr[i] = NULL; - /* if no lower levels, treetop and last fill block are the same */ - - /* fill blocks point to next fill block... */ - for (t = &cm->tree[1], j = NBYTS-2; j > 0; t = nextt, j--) { + cm->cd = cm->cdspace; + cm->max = 0; + cm->free = 0; + + cd = cm->cd; /* cm->cd[WHITE] */ + cd->sub = NOSUB; + cd->arcs = NULL; + cd->flags = 0; + cd->nchrs = CHR_MAX - CHR_MIN + 1; + + /* upper levels of tree */ + for (t = &cm->tree[0], j = NBYTS-1; j > 0; t = nextt, j--) { nextt = t + 1; for (i = BYTTAB-1; i >= 0; i--) - t->tptr[i] = t + 1; + t->tptr[i] = nextt; } - /* ...except last which is solid white */ + /* bottom level is solid white */ t = &cm->tree[NBYTS-1]; for (i = BYTTAB-1; i >= 0; i--) t->tcolor[i] = WHITE; - - - return cm; + cd->block = t; } /* - - freecm - free a colormap + - freecm - free dynamically-allocated things in a colormap ^ static VOID freecm(struct colormap *); */ static VOID freecm(cm) struct colormap *cm; { + size_t i; + union tree *cb; + cm->magic = 0; if (NBYTS > 1) cmtreefree(cm, cm->tree, 0); - if (cm->cd != cm->cds) + for (i = 1; i < cm->max; i++) /* skip WHITE */ + if (!UNUSEDCOLOR(&cm->cd[i])) { + cb = cm->cd[i].block; + if (cb != NULL) + FREE(cb); + } + if (cm->cd != cm->cdspace) FREE(cm->cd); - FREE(cm); } /* @@ -107,92 +92,26 @@ int level; /* level number (top == 0) of this block */ int i; union tree *t; union tree *fillt = &cm->tree[level+1]; + union tree *cb; assert(level < NBYTS-1); /* this level has pointers */ for (i = BYTTAB-1; i >= 0; i--) { t = tree->tptr[i]; - if (t != NULL && t != fillt) { - if (level < NBYTS-2) /* more pointer blocks below */ + assert(t != NULL); + if (t != fillt) { + if (level < NBYTS-2) { /* more pointer blocks below */ cmtreefree(cm, t, level+1); - FREE(t); + FREE(t); + } else { /* color block below */ + cb = cm->cd[t->tcolor[0]].block; + if (t != cb) /* not a solid block */ + FREE(t); + } } } } /* - - fillcm - fill in a colormap, so no NULLs remain - * The point of this is that the tree traversal can then be a fixed set - * of table lookups with no conditional branching. It might be better - * to do reallocation for a more compacted structure, on the order of - * what's done for NFAs, but the colormap can be quite large and a total - * rebuild of it could be costly. - ^ static VOID fillcm(struct colormap *); - */ -static VOID -fillcm(cm) -struct colormap *cm; -{ - if (!cm->filled && NBYTS > 1) - cmtreefill(cm, cm->tree, 0); - cm->filled = 1; -} - -/* - - cmtreefill - fill a non-terminal part of a colormap tree - ^ static VOID cmtreefill(struct colormap *, union tree *, int); - */ -static VOID -cmtreefill(cm, tree, level) -struct colormap *cm; -union tree *tree; -int level; /* level number (top == 0) of this block */ -{ - int i; - union tree *t; - union tree *fillt = &cm->tree[level+1]; - - assert(level < NBYTS-1); /* this level has pointers */ - for (i = BYTTAB-1; i >= 0; i--) { - t = tree->tptr[i]; - if (t == fillt) /* oops */ - {} - else if (t == NULL) - tree->tptr[i] = fillt; - else if (level < NBYTS-2) /* more pointer blocks below */ - cmtreefill(cm, t, level+1); - } -} - -/* - - getcolor - get the color of a character from a colormap - ^ static color getcolor(struct colormap *, pchr); - */ -static color -getcolor(cm, c) -struct colormap *cm; -pchr c; -{ - uchr uc = c; - int shift; - int b; - union tree *t; - - assert(cm->magic == CMMAGIC); - - t = cm->tree; - for (shift = BYTBITS * (NBYTS - 1); t != NULL; shift -= BYTBITS) { - b = (uc >> shift) & BYTMASK; - if (shift == 0) /* reached the bottom */ - return t->tcolor[b]; - t = t->tptr[b]; - } - - /* we fell off an incomplete part of the tree */ - assert(!cm->filled); - return cm->rest; -} - -/* - setcolor - set the color of a character in a colormap ^ static color setcolor(struct colormap *, pchr, pcolor); */ @@ -204,11 +123,14 @@ pcolor co; { uchr uc = c; int shift; - int i; + int level; int b; int bottom; union tree *t; + union tree *newt; + union tree *fillt; union tree *lastt; + union tree *cb; color prev; assert(cm->magic == CMMAGIC); @@ -216,28 +138,32 @@ pcolor co; return COLORLESS; t = cm->tree; - for (shift = BYTBITS * (NBYTS - 1); shift > 0; shift -= BYTBITS) { + for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0; + level++, shift -= BYTBITS) { b = (uc >> shift) & BYTMASK; lastt = t; - t = t->tptr[b]; - if (t == NULL) { /* fell off an incomplete part */ - bottom = (shift <= BYTBITS) ? 1 : 0; - t = (union tree *)MALLOC((bottom) ? + t = lastt->tptr[b]; + assert(t != NULL); + fillt = &cm->tree[level+1]; + bottom = (shift <= BYTBITS) ? 1 : 0; + cb = (bottom) ? cm->cd[t->tcolor[0]].block : fillt; + if (t == fillt || t == cb) { /* must allocate a new block */ + newt = (union tree *)MALLOC((bottom) ? sizeof(struct colors) : sizeof(struct ptrs)); - if (t == NULL) { + if (newt == NULL) { CERR(REG_ESPACE); return COLORLESS; } if (bottom) - for (i = BYTTAB-1; i >= 0; i--) - t->tcolor[i] = cm->rest; + memcpy(VS(newt->tcolor), VS(t->tcolor), + BYTTAB*sizeof(color)); else - for (i = BYTTAB-1; i >= 0; i--) - t->tptr[i] = NULL; + memcpy(VS(newt->tptr), VS(t->tptr), + BYTTAB*sizeof(union tree *)); + t = newt; lastt->tptr[b] = t; } } - assert(shift == 0 && t != NULL); /* we hit bottom; it's there */ b = uc & BYTMASK; prev = t->tcolor[b]; @@ -253,20 +179,10 @@ static color maxcolor(cm) struct colormap *cm; { - struct colordesc *cd; - struct colordesc *end; - struct colordesc *lastused; - if (CISERR()) return COLORLESS; - lastused = NULL; - end = CDEND(cm); - for (cd = cm->cd; cd < end; cd++) - if (!UNUSEDCOLOR(cd)) - lastused = cd; - assert(lastused != NULL); - return (color)(lastused - cm->cd); + return (color)cm->max; } /* @@ -279,47 +195,85 @@ newcolor(cm) struct colormap *cm; { struct colordesc *cd; - struct colordesc *end; - struct colordesc *firstnew; + struct colordesc *new; size_t n; if (CISERR()) return COLORLESS; - end = CDEND(cm); - for (cd = cm->cd; cd < end; cd++) - if (UNUSEDCOLOR(cd)) { - assert(cd->arcs == NULL); - return (color)(cd - cm->cd); - } - - /* oops, must allocate more */ - n = cm->ncds * 2; - if (cm->cd == cm->cds) { - cd = (struct colordesc *)MALLOC(sizeof(struct colordesc) * n); - if (cd != NULL) - memcpy(VS(cd), VS(cm->cds), cm->ncds * - sizeof(struct colordesc)); + if (cm->free != 0) { + assert(cm->free > 0); + assert((size_t)cm->free < cm->ncds); + cd = &cm->cd[cm->free]; + assert(UNUSEDCOLOR(cd)); + assert(cd->arcs == NULL); + cm->free = cd->sub; + } else if (cm->max < cm->ncds - 1) { + cm->max++; + cd = &cm->cd[cm->max]; } else { - cd = (struct colordesc *)REALLOC(cm->cd, + /* oops, must allocate more */ + n = cm->ncds * 2; + if (cm->cd == cm->cdspace) { + new = (struct colordesc *)MALLOC(n * + sizeof(struct colordesc)); + if (new != NULL) + memcpy(VS(new), VS(cm->cdspace), cm->ncds * + sizeof(struct colordesc)); + } else + new = (struct colordesc *)REALLOC(cm->cd, n * sizeof(struct colordesc)); + if (new == NULL) { + CERR(REG_ESPACE); + return COLORLESS; + } + cm->cd = new; + cm->ncds = n; + assert(cm->max < cm->ncds - 1); + cm->max++; + cd = &cm->cd[cm->max]; } - if (cd == NULL) { - CERR(REG_ESPACE); - return COLORLESS; + + cd->nchrs = 0; + cd->sub = NOSUB; + cd->arcs = NULL; + cd->flags = 0; + cd->block = NULL; + + return (color)(cd - cm->cd); +} + +/* + - freecolor - free a color (must have no arcs or subcolor) + ^ static VOID freecolor(struct colormap *, pcolor); + */ +static VOID +freecolor(cm, co) +struct colormap *cm; +pcolor co; +{ + struct colordesc *cd = &cm->cd[co]; + + assert(co >= 0); + if (co == WHITE) + return; + + assert(cd->arcs == NULL); + assert(cd->sub == NOSUB); + assert(cd->nchrs == 0); + cd->flags = FREECOL; + if (cd->block != NULL) { + FREE(cd->block); + cd->block = NULL; /* just paranoia */ } - cm->cd = cd; - firstnew = CDEND(cm); - cm->ncds = n; - end = CDEND(cm); - for (cd = firstnew; cd < end; cd++) { - cd->nchrs = 0; - cd->sub = NOSUB; - cd->arcs = NULL; - cd->flags = 0; + + if ((size_t)co == cm->max) + while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max])) + cm->max--; + else { + cd->sub = cm->free; + cm->free = (color)(cd - cm->cd); } - assert(firstnew < CDEND(cm) && UNUSEDCOLOR(firstnew)); - return (color)(firstnew - cm->cd); } /* @@ -352,27 +306,170 @@ pchr c; color co; /* current color of c */ color sco; /* new subcolor */ - co = getcolor(cm, c); + co = GETCOLOR(cm, c); + sco = newsub(cm, co); + + if (co == sco) /* already in an open subcolor */ + return co; /* rest is redundant */ + cm->cd[co].nchrs--; + cm->cd[sco].nchrs++; + setcolor(cm, c, sco); + return sco; +} + +/* + - newsub - allocate a new subcolor (if necessary) for a color + ^ static color newsub(struct colormap *, pcolor); + */ +static color +newsub(cm, co) +struct colormap *cm; +pcolor co; +{ + color sco; /* new subcolor */ + sco = cm->cd[co].sub; - if (sco == NOSUB) { /* must create subcolor */ - if (cm->cd[co].nchrs == 1) /* shortcut */ + if (sco == NOSUB) { /* color has no open subcolor */ + if (cm->cd[co].nchrs == 1) /* optimization */ return co; - sco = newcolor(cm); + sco = newcolor(cm); /* must create subcolor */ if (sco == COLORLESS) return COLORLESS; cm->cd[co].sub = sco; - cm->cd[sco].sub = sco; /* self-referential subcolor ptr */ + cm->cd[sco].sub = sco; /* open subcolor points to self */ } + assert(sco != NOSUB); - if (co == sco) /* repeated character */ - return co; /* no further action needed */ - cm->cd[co].nchrs--; - cm->cd[sco].nchrs++; - setcolor(cm, c, sco); return sco; } /* + - subrange - allocate new subcolors to this range of chrs, fill in arcs + ^ static VOID subrange(struct vars *, pchr, pchr, struct state *, + ^ struct state *); + */ +static VOID +subrange(v, from, to, lp, rp) +struct vars *v; +pchr from; +pchr to; +struct state *lp; +struct state *rp; +{ + uchr uf; + int i; + + assert(from <= to); + + /* first, align "from" on a tree-block boundary */ + uf = (uchr)from; + i = (int)( ((uf + BYTTAB-1) & (uchr)~BYTMASK) - uf ); + for (; from <= to && i > 0; i--, from++) + newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp); + if (from > to) /* didn't reach a boundary */ + return; + + /* deal with whole blocks */ + for (; to - from >= BYTTAB; from += BYTTAB) + subblock(v, from, lp, rp); + + /* clean up any remaining partial table */ + for (; from <= to; from++) + newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp); +} + +/* + - subblock - allocate new subcolors for one tree block of chrs, fill in arcs + ^ static VOID subblock(struct vars *, pchr, struct state *, struct state *); + */ +static VOID +subblock(v, start, lp, rp) +struct vars *v; +pchr start; /* first of BYTTAB chrs */ +struct state *lp; +struct state *rp; +{ + uchr uc = start; + struct colormap *cm = v->cm; + int shift; + int level; + int i; + int b; + union tree *t; + union tree *cb; + union tree *fillt; + union tree *lastt; + int previ; + int ndone; + color co; + color sco; + + assert((uc & BYTMASK) == 0); + + /* find its color block, making new pointer blocks as needed */ + t = cm->tree; + fillt = NULL; + for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0; + level++, shift -= BYTBITS) { + b = (uc >> shift) & BYTMASK; + lastt = t; + t = lastt->tptr[b]; + assert(t != NULL); + fillt = &cm->tree[level+1]; + if (t == fillt && shift > BYTBITS) { /* need new ptr block */ + t = (union tree *)MALLOC(sizeof(struct ptrs)); + if (t == NULL) { + CERR(REG_ESPACE); + return; + } + memcpy(VS(t->tptr), VS(fillt->tptr), + BYTTAB*sizeof(union tree *)); + lastt->tptr[b] = t; + } + } + + /* special cases: fill block or solid block */ + co = t->tcolor[0]; + cb = cm->cd[co].block; + if (t == fillt || t == cb) { + /* either way, we want a subcolor solid block */ + sco = newsub(cm, co); + t = cm->cd[sco].block; + if (t == NULL) { /* must set it up */ + t = (union tree *)MALLOC(sizeof(struct colors)); + if (t == NULL) { + CERR(REG_ESPACE); + return; + } + for (i = 0; i < BYTTAB; i++) + t->tcolor[i] = sco; + cm->cd[sco].block = t; + } + /* find loop must have run at least once */ + lastt->tptr[b] = t; + newarc(v->nfa, PLAIN, sco, lp, rp); + cm->cd[co].nchrs -= BYTTAB; + cm->cd[sco].nchrs += BYTTAB; + return; + } + + /* general case, a mixed block to be altered */ + i = 0; + while (i < BYTTAB) { + co = t->tcolor[i]; + sco = newsub(cm, co); + newarc(v->nfa, PLAIN, sco, lp, rp); + previ = i; + do { + t->tcolor[i++] = sco; + } while (i < BYTTAB && t->tcolor[i] == co); + ndone = i - previ; + cm->cd[co].nchrs -= ndone; + cm->cd[sco].nchrs += ndone; + } +} + +/* - okcolors - promote subcolors to full colors ^ static VOID okcolors(struct nfa *, struct colormap *); */ @@ -390,7 +487,7 @@ struct colormap *cm; for (cd = cm->cd, co = 0; cd < end; cd++, co++) { sco = cd->sub; - if (sco == NOSUB) { + if (UNUSEDCOLOR(cd) || sco == NOSUB) { /* has no subcolor, no further action */ } else if (sco == co) { /* is subcolor, let parent deal with it */ @@ -410,6 +507,7 @@ struct colormap *cm; a->colorchain = scd->arcs; scd->arcs = a; } + freecolor(cm, co); } else { /* parent's arcs must gain parallel subcolor arcs */ cd->sub = NOSUB; @@ -475,7 +573,7 @@ pchr c; { color co; /* color of c */ - co = getcolor(cm, c); + co = GETCOLOR(cm, c); if (cm->cd[co].nchrs == 1 && cm->cd[co].sub == NOSUB) return 1; return 0; @@ -548,24 +646,27 @@ FILE *f; struct colordesc *end; color co; chr c; + char *has; - if (cm->filled) { - fprintf(f, "filled\n"); - if (NBYTS > 1) - fillcheck(cm, cm->tree, 0, f); - } + fprintf(f, "max %ld\n", (long)cm->max); + if (NBYTS > 1) + fillcheck(cm, cm->tree, 0, f); end = CDEND(cm); for (cd = cm->cd + 1, co = 1; cd < end; cd++, co++) /* skip 0 */ - if (cd->nchrs > 0) { + if (!UNUSEDCOLOR(cd)) { + assert(cd->nchrs > 0); + has = (cd->block != NULL) ? "#" : ""; if (cd->flags&PSEUDO) - fprintf(f, "#%2ld(ps): ", (long)co); + fprintf(f, "#%2ld%s(ps): ", (long)co, has); else - fprintf(f, "#%2ld(%2d): ", (long)co, cd->nchrs); + fprintf(f, "#%2ld%s(%2d): ", (long)co, + has, cd->nchrs); + /* it's hard to do this more efficiently */ for (c = CHR_MIN; c < CHR_MAX; c++) - if (getcolor(cm, c) == co) + if (GETCOLOR(cm, c) == co) dumpchr(c, f); assert(c == CHR_MAX); - if (getcolor(cm, c) == co) + if (GETCOLOR(cm, c) == co) dumpchr(c, f); fprintf(f, "\n"); } @@ -613,7 +714,7 @@ FILE *f; else if (c > ' ' && c <= '~') putc((char)c, f); else - fprintf(f, "\\0%lo", (long)c); + fprintf(f, "\\u%04lx", (long)c); } #endif /* ifdef REG_DEBUG */ diff --git a/generic/regc_lex.c b/generic/regc_lex.c index 820b404..0eb6c5d 100644 --- a/generic/regc_lex.c +++ b/generic/regc_lex.c @@ -364,7 +364,7 @@ struct vars *v; NOTE(REG_UNONPOSIX); if (ATEOS()) FAILW(REG_EESCAPE); - (DISCARD) lexescape(v); + (DISCARD)lexescape(v); switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: return 1; @@ -459,10 +459,10 @@ struct vars *v; break; } - /* that got rid of everything except EREs */ + /* that got rid of everything except EREs and AREs */ assert(INCON(L_ERE)); - /* deal with EREs, except for backslashes */ + /* deal with EREs and AREs, except for backslashes */ switch (c) { case CHR('|'): RET('|'); @@ -529,12 +529,6 @@ struct vars *v; NOTE(REG_ULOOKAHEAD); RETV(LACON, 0); break; - case CHR('<'): /* prefer short */ - RETV(PREFER, 0); - break; - case CHR('>'): /* prefer long */ - RETV(PREFER, 1); - break; default: FAILW(REG_BADRPT); break; @@ -547,8 +541,9 @@ struct vars *v; RETV('(', 1); break; case CHR(')'): - if (LASTTYPE('(')) + if (LASTTYPE('(')) { NOTE(REG_UUNSPEC); + } RETV(')', c); break; case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ @@ -589,7 +584,7 @@ struct vars *v; break; } - /* ERE backslash handling; backslash already eaten */ + /* ERE/ARE backslash handling; backslash already eaten */ assert(!ATEOS()); if (!(v->cflags®_ADVF)) { /* only AREs have non-trivial escapes */ if (iscalnum(*v->now)) { @@ -598,7 +593,7 @@ struct vars *v; } RETV(PLAIN, *v->now++); } - (DISCARD) lexescape(v); + (DISCARD)lexescape(v); if (ISERR()) FAILW(REG_EESCAPE); if (v->nexttype == CCLASS) { /* fudge at lexical level */ @@ -682,6 +677,12 @@ struct vars *v; case CHR('f'): RETV(PLAIN, CHR('\f')); break; + case CHR('m'): + RET('<'); + break; + case CHR('M'): + RET('>'); + break; case CHR('n'): RETV(PLAIN, CHR('\n')); break; diff --git a/generic/regc_locale.c b/generic/regc_locale.c index 7d504c8..5930378 100644 --- a/generic/regc_locale.c +++ b/generic/regc_locale.c @@ -1,9 +1,15 @@ -/* - * locale-specific stuff, including MCCE handling - * This file is #included by regcomp.c. +/* + * regc_locale.c -- + * + * This file contains the Unicode locale specific regexp routines. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * See the file "license.terms" for information on usage and redistribution + * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * No MCCEs for Tcl. The handling of character names and classes is - * still ASCII-centric, and needs to be extended to handle full Unicode. + * RCS: @(#) $Id: regc_locale.c,v 1.1.2.3 1998/11/11 01:44:49 stanton Exp $ */ /* ASCII character-name table */ diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c index 14ee077..8143181 100644 --- a/generic/regc_nfa.c +++ b/generic/regc_nfa.c @@ -575,7 +575,7 @@ struct state *s; static VOID dupnfa(nfa, start, stop, from, to) struct nfa *nfa; -struct state *start; /* duplicate starting here */ +struct state *start; /* duplicate of subNFA starting here */ struct state *stop; /* and stopping here */ struct state *from; /* stringing duplicate from here */ struct state *to; /* to here */ @@ -1161,7 +1161,8 @@ struct nfa *nfa; struct arc *a; struct arc *aa; - /* can the NFA match the empty string? */ + if (nfa->pre->outs == NULL) + return REG_UIMPOSSIBLE; for (a = nfa->pre->outs; a != NULL; a = a->outchain) for (aa = a->to->outs; aa != NULL; aa = aa->outchain) if (aa->to == nfa->post) @@ -1170,29 +1171,6 @@ struct nfa *nfa; } /* - - isempty - is a sub-NFA composed only of EMPTY arcs? - * Somewhat limited implementation, makes assumptions. - ^ static int isempty(struct state *, struct state *); - */ -static int -isempty(begin, end) -struct state *begin; -struct state *end; -{ - struct state *s; - - for (s = begin; s != end; s = s->outs->to) { - if (s->nouts != 1) - return 0; - assert(s->outs != NULL); - if (s->outs->type != EMPTY) - return 0; - } - - return 1; -} - -/* - compact - compact an NFA ^ static VOID compact(struct nfa *, struct cnfa *); */ @@ -1305,19 +1283,16 @@ struct carc *last; /* - freecnfa - free a compacted NFA - ^ static VOID freecnfa(struct cnfa *, int); + ^ static VOID freecnfa(struct cnfa *); */ static VOID -freecnfa(cnfa, dynalloc) +freecnfa(cnfa) struct cnfa *cnfa; -int dynalloc; /* is the cnfa struct itself dynamic? */ { assert(cnfa->nstates != 0); /* not empty already */ cnfa->nstates = 0; FREE(cnfa->states); FREE(cnfa->arcs); - if (dynalloc) - FREE(cnfa); } /* diff --git a/generic/regcomp.c b/generic/regcomp.c index b8c2119..620dc1c 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -14,7 +14,11 @@ int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int)); static VOID moresubs _ANSI_ARGS_((struct vars *, int)); static int freev _ANSI_ARGS_((struct vars *, int)); -static struct rtree *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int)); +static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *)); +static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int)); +static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *)); +static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *)); +static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *)); static int scannum _ANSI_ARGS_((struct vars *)); static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int)); static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *)); @@ -24,24 +28,23 @@ static chr *scanplain _ANSI_ARGS_((struct vars *)); static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *)); static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *)); static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *)); -static color nlcolor _ANSI_ARGS_((struct vars *)); +static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr)); static VOID wordchrs _ANSI_ARGS_((struct vars *)); -static struct subre subre _ANSI_ARGS_((struct state *, struct state *, int, int, struct rtree *)); -static struct rtree *newrt _ANSI_ARGS_((struct vars *)); -static VOID freert _ANSI_ARGS_((struct vars *, struct rtree *)); -static VOID freertnode _ANSI_ARGS_((struct vars *, struct rtree *)); -static VOID optrt _ANSI_ARGS_((struct vars *, struct rtree *)); -static int numrt _ANSI_ARGS_((struct rtree *, int)); -static VOID markrt _ANSI_ARGS_((struct rtree *)); -static VOID cleanrt _ANSI_ARGS_((struct vars *)); -static VOID nfatree _ANSI_ARGS_((struct vars *, struct rtree *, FILE *)); -static VOID nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *)); +static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *)); +static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *)); +static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *)); +static VOID optst _ANSI_ARGS_((struct vars *, struct subre *)); +static int numst _ANSI_ARGS_((struct subre *, int)); +static VOID markst _ANSI_ARGS_((struct subre *)); +static VOID cleanst _ANSI_ARGS_((struct vars *)); +static int nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *)); +static int nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *)); static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int)); static VOID freelacons _ANSI_ARGS_((struct subre *, int)); static VOID rfree _ANSI_ARGS_((regex_t *)); static VOID dump _ANSI_ARGS_((regex_t *, FILE *)); -static VOID dumprt _ANSI_ARGS_((struct rtree *, FILE *, int)); -static VOID rtdump _ANSI_ARGS_((struct rtree *, FILE *, int, int)); +static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int)); +static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int, int)); /* === regc_lex.c === */ static VOID lexstart _ANSI_ARGS_((struct vars *)); static VOID prefixes _ANSI_ARGS_((struct vars *)); @@ -56,17 +59,18 @@ static chr newline _ANSI_ARGS_((NOPARMS)); static chr *ch _ANSI_ARGS_((NOPARMS)); static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr)); /* === regc_color.c === */ -static struct colormap *newcm _ANSI_ARGS_((struct vars *)); +static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *)); static VOID freecm _ANSI_ARGS_((struct colormap *)); static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int)); -static VOID fillcm _ANSI_ARGS_((struct colormap *)); -static VOID cmtreefill _ANSI_ARGS_((struct colormap *, union tree *, int)); -static color getcolor _ANSI_ARGS_((struct colormap *, pchr)); static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor)); static color maxcolor _ANSI_ARGS_((struct colormap *)); static color newcolor _ANSI_ARGS_((struct colormap *)); +static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor)); static color pseudocolor _ANSI_ARGS_((struct colormap *)); static color subcolor _ANSI_ARGS_((struct colormap *, pchr c)); +static color newsub _ANSI_ARGS_((struct colormap *, pcolor)); +static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *)); +static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *)); static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *)); static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *)); static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *)); @@ -115,10 +119,9 @@ static VOID cleanup _ANSI_ARGS_((struct nfa *)); static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *)); static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *)); static int analyze _ANSI_ARGS_((struct nfa *)); -static int isempty _ANSI_ARGS_((struct state *, struct state *)); static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *)); static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *)); -static VOID freecnfa _ANSI_ARGS_((struct cnfa *, int)); +static VOID freecnfa _ANSI_ARGS_((struct cnfa *)); static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *)); static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *)); static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *)); @@ -173,9 +176,9 @@ struct vars { struct colormap *cm; /* character color map */ color nlcolor; /* color of newline */ struct state *wordchrs; /* state in nfa holding word-char outarcs */ - struct rtree *tree; /* subexpression tree */ - struct rtree *treechain; /* all tree nodes allocated */ - struct rtree *treefree; /* any free tree nodes */ + struct subre *tree; /* subexpression tree */ + struct subre *treechain; /* all tree nodes allocated */ + struct subre *treefree; /* any free tree nodes */ int ntree; /* number of tree nodes */ struct cvec *cv; /* interface cvec */ struct cvec *cv2; /* utility cvec */ @@ -186,6 +189,7 @@ struct vars { struct subre *lacons; /* lookahead-constraint vector */ int nlacons; /* size of lacons */ int usedshorter; /* used short-preferring quantifiers */ + int unmatchable; /* can never match */ }; /* parsing macros; most know that `v' is the struct vars pointer */ @@ -199,6 +203,7 @@ struct vars { #define ERR(e) VERR(v, e) /* record an error */ #define NOERR() {if (ISERR()) return;} /* if error seen, return */ #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */ +#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */ #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */ #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) @@ -259,7 +264,6 @@ int flags; if (re == NULL || string == NULL) return REG_INVARG; - assert(REG_ADVANCED == (REG_EXTENDED|REG_ADVF)); if ((flags®_QUOTE) && (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))) return REG_INVARG; @@ -297,19 +301,18 @@ int flags; re->re_fns = VS(&functions); /* more complex setup, malloced things */ - v->cm = newcm(v); - CNOERR(); - v->nfa = newnfa(v, v->cm, (struct nfa *)NULL); - CNOERR(); re->re_guts = VS(MALLOC(sizeof(struct guts))); if (re->re_guts == NULL) return freev(v, REG_ESPACE); g = (struct guts *)re->re_guts; - ZAPCNFA(g->cnfa); g->tree = NULL; - g->cm = NULL; + initcm(v, &g->cmap); + v->cm = &g->cmap; g->lacons = NULL; g->nlacons = 0; + ZAPCNFA(g->search); + v->nfa = newnfa(v, v->cm, (struct nfa *)NULL); + CNOERR(); v->cv = newcvec(100, 20, 10); if (v->cv == NULL) return freev(v, REG_ESPACE); @@ -324,53 +327,56 @@ int flags; /* parsing */ lexstart(v); /* also handles prefixes */ - v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final, NONEYET); + if ((v->cflags®_NLSTOP) || (v->cflags®_NLANCH)) { + /* assign newline a unique color */ + v->nlcolor = subcolor(v->cm, newline()); + okcolors(v->nfa, v->cm); + } + CNOERR(); + v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final); assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */ CNOERR(); + assert(v->tree != NULL); /* finish setup of nfa and its subre tree */ specialcolors(v->nfa); CNOERR(); if (debug != NULL) { dumpnfa(v->nfa, debug); - dumprt(v->tree, debug, 1); + dumpst(v->tree, debug, 1); } v->usedshorter = 0; - optrt(v, v->tree); - if (v->tree != NULL) { - v->ntree = numrt(v->tree, 1); - markrt(v->tree); - } else - v->ntree = 0; - cleanrt(v); + v->unmatchable = 0; + optst(v, v->tree); + v->ntree = numst(v->tree, 1); + markst(v->tree); + cleanst(v); if (debug != NULL) { fprintf(debug, "-->\n"); - dumprt(v->tree, debug, 1); + dumpst(v->tree, debug, 1); } - /* build compacted NFAs for tree, lacons, main nfa */ - nfatree(v, v->tree, debug); + /* build compacted NFAs for tree, lacons, fast search */ + re->re_info |= nfatree(v, v->tree, debug); if (debug != NULL) { fprintf(debug, "---->\n"); - dumprt(v->tree, debug, 1); + dumpst(v->tree, debug, 1); } CNOERR(); + if (re->re_info®_UIMPOSSIBLE) + v->unmatchable = 1; assert(v->nlacons == 0 || v->lacons != NULL); for (i = 1; i < v->nlacons; i++) nfanode(v, &v->lacons[i], debug); CNOERR(); - re->re_info |= optimize(v->nfa, debug); + rainbow(v->nfa, v->cm, PLAIN, COLORLESS, v->nfa->pre, v->nfa->pre); + newarc(v->nfa, PLAIN, v->nfa->bos[0], v->nfa->pre, v->nfa->pre); + newarc(v->nfa, PLAIN, v->nfa->bos[1], v->nfa->pre, v->nfa->pre); + newarc(v->nfa, PLAIN, v->nfa->eos[0], v->nfa->pre, v->nfa->pre); + newarc(v->nfa, PLAIN, v->nfa->eos[1], v->nfa->pre, v->nfa->pre); + (DISCARD)optimize(v->nfa, debug); CNOERR(); - if (v->nfa->post->nins <= 0) - return freev(v, REG_IMPOSS); /* end unreachable! */ - assert(v->nfa->pre->nouts > 0); - compact(v->nfa, &g->cnfa); - CNOERR(); - freenfa(v->nfa); - v->nfa = NULL; - - /* fill color map */ - fillcm(v->cm); + compact(v->nfa, &g->search); CNOERR(); /* looks okay, package it up */ @@ -380,8 +386,6 @@ int flags; g->cflags = v->cflags; g->info = re->re_info; g->nsub = re->re_nsub; - g->cm = v->cm; - v->cm = NULL; g->tree = v->tree; v->tree = NULL; g->ntree = v->ntree; @@ -390,6 +394,7 @@ int flags; v->lacons = NULL; g->nlacons = v->nlacons; g->usedshorter = v->usedshorter; + g->unmatchable = v->unmatchable; if (flags®_DUMP) dump(re, stdout); @@ -447,12 +452,10 @@ int err; FREE(v->subs); if (v->nfa != NULL) freenfa(v->nfa); - if (v->cm != NULL) - freecm(v->cm); if (v->tree != NULL) - freert(v, v->tree); + freesubre(v, v->tree); if (v->treechain != NULL) - cleanrt(v); + cleanst(v); if (v->cv != NULL) freecvec(v->cv); if (v->cv2 != NULL) @@ -468,541 +471,562 @@ int err; /* - parse - parse an RE - * Arguably this is too big and too complex and ought to be divided up. - * However, the code is somewhat intertwined... - * - * Note that it is no longer necessary to be rigorous about freeing tree - * nodes on error exits, as the tree machinery keeps track of them. - ^ static struct rtree *parse(struct vars *, int, int, struct state *, - ^ struct state *, int); + * This is actually just the top level, which parses a bunch of branches + * tied together with '|'. They appear in the tree as the left children + * of a chain of '|' subres. + ^ static struct subre *parse(struct vars *, int, int, struct state *, + ^ struct state *); */ -static struct rtree * /* NULL if no interesting substructure */ -parse(v, stopper, type, init, final, pprefer) +static struct subre * +parse(v, stopper, type, init, final) struct vars *v; int stopper; /* EOS or ')' */ int type; /* LACON (lookahead subRE) or PLAIN */ struct state *init; /* initial state */ struct state *final; /* final state */ -int pprefer; /* parent's short/long preference */ { struct state *left; /* scaffolding for branch */ struct state *right; - struct state *lp; /* scaffolding for current construct */ - struct state *rp; - struct state *s; /* temporaries for new states */ - struct state *s2; -# define ARCV(t, val) newarc(v->nfa, t, val, lp, rp) - int m, n; - int emptybranch; /* is there anything in this branch yet? */ - struct rtree *branches; /* top level */ - struct rtree *branch; /* current branch */ - struct subre *now; /* current subtree's top */ - struct subre sub; /* communication variable */ - struct rtree *rt1; /* temporaries */ - struct rtree *rt2; - struct subre *t; /* work pointer, top of interesting subtree */ + struct subre *branches; /* top level */ + struct subre *branch; /* current branch */ + struct subre *t; /* temporary */ int firstbranch; /* is this the first branch? */ - int capture; /* any capturing parens within this? */ - int constraint; /* is the current atom a constraint? */ assert(stopper == ')' || stopper == EOS); - capture = 0; - branches = newrt(v); + branches = subre(v, '|', LONGER, init, final); + NOERRN(); branch = branches; - rt1 = NULL; /* shut up lint */ firstbranch = 1; - NOERRN(); - do { - /* a branch */ - emptybranch = 1; /* tentatively */ - left = newstate(v->nfa); - right = newstate(v->nfa); - NOERRN(); + do { /* a branch */ if (!firstbranch) { - rt1 = newrt(v); + /* need a place to hang it */ + branch->right = subre(v, '|', LONGER, init, final); NOERRN(); - branch->next = rt1; - branch = rt1; + branch = branch->right; } + firstbranch = 0; + left = newstate(v->nfa); + right = newstate(v->nfa); + NOERRN(); EMPTYARC(init, left); EMPTYARC(right, final); - lp = left; - rp = right; - branch->op = '|'; - now = &branch->left; - *now = subre(left, right, NONEYET, 0, (struct rtree *)NULL); - firstbranch = 0; NOERRN(); + branch->left = parsebranch(v, stopper, type, left, right, 0); + NOERRN(); + branch->flags |= UP(branch->flags | branch->left->flags); + if ((branch->flags &~ branches->flags) != 0) /* new flags */ + for (t = branches; t != branch; t = t->right) + t->flags |= branch->flags; + } while (EAT('|')); + assert(SEE(stopper) || SEE(EOS)); - while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) { - /* initial bookkeeping */ - sub.begin = NULL; /* no substructure seen yet */ - sub.subno = 0; - sub.prefer = NONEYET; - constraint = 0; - if (emptybranch) /* first of the branch */ - emptybranch = 0; - else { /* implicit concat operator */ - lp = newstate(v->nfa); - NOERRN(); - moveins(v->nfa, rp, lp); - } - assert(lp->nouts == 0); /* must string new code */ - assert(rp->nins == 0); /* between lp and rp */ - - /* an atom... */ - switch (v->nexttype) { - case '(': /* value flags as capturing or non */ - m = (type == LACON) ? 0 : v->nextvalue; - if (m) { - v->nsubexp++; - sub.subno = v->nsubexp; - if ((size_t)sub.subno >= v->nsubs) - moresubs(v, sub.subno); - assert((size_t)sub.subno < v->nsubs); - } else - sub.subno = 0; - NEXT(); - sub.begin = lp; /* NB, substructure seen */ - sub.end = rp; - /* use now->tree as temporary, so */ - /* things get freed on error returns */ - assert(now->tree == NULL); - now->tree = parse(v, ')', PLAIN, lp, rp, - now->prefer); - assert(SEE(')') || ISERR()); - NEXT(); - NOERRN(); - if (!m && now->tree == NULL) { - /* actually no relevant substructure */ - sub.begin = NULL; - } - if (now->tree != NULL) { - if (now->tree->op == '|') - sub.prefer = LONGER; - else - sub.prefer = - now->tree->left.prefer; - } - /* must postpone other processing until we */ - /* know about any {0,0} quantifier */ - break; - case BACKREF: /* the Feature From The Black Lagoon */ - INSIST(type != LACON, REG_ESUBREG); - INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); - INSIST(v->subs[v->nextvalue] != NULL, - REG_ESUBREG); - NOERRN(); - assert(v->nextvalue > 0); - sub.subno = -v->nextvalue; - sub.begin = lp; /* NB, substructure seen */ - sub.end = rp; - EMPTYARC(lp, rp); /* temporarily */ - assert(now->tree == NULL); - NEXT(); - break; - case LACON: /* lookahead constraint */ - m = v->nextvalue; /* is positive? */ - NEXT(); - s = newstate(v->nfa); - s2 = newstate(v->nfa); - NOERRN(); - rt1 = parse(v, ')', LACON, s, s2, NONEYET); - assert(SEE(')') || ISERR()); - NEXT(); - m = newlacon(v, s, s2, m); - freert(v, rt1); - NOERRN(); - ARCV(LACON, m); - constraint = 1; - break; - case PREFER: /* length preference */ - sub.prefer = (v->nextvalue) ? LONGER : SHORTER; - NEXT(); - sub.begin = lp; /* NB, substructure seen */ - sub.end = rp; - /* use now->tree as temporary, so */ - /* things get freed on error returns */ - assert(now->tree == NULL); - now->tree = parse(v, ')', PLAIN, lp, rp, - sub.prefer); - assert(SEE(')') || ISERR()); - NEXT(); - NOERRN(); - if (now->prefer == NONEYET) - now->prefer = sub.prefer; - if (sub.prefer == now->prefer && - now->tree == NULL) { - /* actually no relevant substructure */ - sub.begin = NULL; - } - break; - case '[': - if (v->nextvalue == 1) - bracket(v, lp, rp); - else - cbracket(v, lp, rp); - assert(SEE(']') || ISERR()); - NEXT(); - break; - case '.': - rainbow(v->nfa, v->cm, PLAIN, - (v->cflags®_NLSTOP) ? - nlcolor(v) : COLORLESS, - lp, rp); - NEXT(); - break; - case '^': - ARCV('^', 1); - if (v->cflags®_NLANCH) - ARCV(BEHIND, nlcolor(v)); - NEXT(); - constraint = 1; - break; - case '$': - ARCV('$', 1); - if (v->cflags®_NLANCH) - ARCV(AHEAD, nlcolor(v)); - NEXT(); - constraint = 1; - break; - case SBEGIN: - ARCV('^', 1); /* BOL */ - ARCV('^', 0); /* or BOS */ - NEXT(); - constraint = 1; - break; - case SEND: - ARCV('$', 1); /* EOL */ - ARCV('$', 0); /* or EOS */ - NEXT(); - constraint = 1; - break; - case '<': - wordchrs(v); /* does NEXT() */ - s = newstate(v->nfa); - NOERRN(); - /* needs BOL, BOS, or nonword to left... */ - newarc(v->nfa, '^', 1, lp, s); - newarc(v->nfa, '^', 0, lp, s); - colorcomplement(v->nfa, v->cm, BEHIND, - v->wordchrs, lp, s); - /* ... and word to right */ - cloneouts(v->nfa, v->wordchrs, s, rp, AHEAD); - /* (no need for special attention to \n) */ - constraint = 1; - break; - case '>': - wordchrs(v); /* does NEXT() */ - s = newstate(v->nfa); - NOERRN(); - /* needs word to left... */ - cloneouts(v->nfa, v->wordchrs, lp, s, BEHIND); - /* ... and EOL, EOS, or nonword to right */ - newarc(v->nfa, '$', 1, s, rp); - newarc(v->nfa, '$', 0, s, rp); - colorcomplement(v->nfa, v->cm, AHEAD, - v->wordchrs, s, rp); - /* (no need for special attention to \n) */ - constraint = 1; - break; - case WBDRY: - wordchrs(v); /* does NEXT() */ - s = newstate(v->nfa); - NOERRN(); - /* needs BOL, BOS, or nonword to left... */ - newarc(v->nfa, '^', 1, lp, s); - newarc(v->nfa, '^', 0, lp, s); - colorcomplement(v->nfa, v->cm, BEHIND, - v->wordchrs, lp, s); - /* ... and word to right... */ - cloneouts(v->nfa, v->wordchrs, s, rp, AHEAD); - /* ...or... */ - s = newstate(v->nfa); - NOERRN(); - /* ...needs word to left... */ - cloneouts(v->nfa, v->wordchrs, lp, s, BEHIND); - /* ... and EOL, EOS, or nonword to right */ - newarc(v->nfa, '$', 1, s, rp); - newarc(v->nfa, '$', 0, s, rp); - colorcomplement(v->nfa, v->cm, AHEAD, - v->wordchrs, s, rp); - /* (no need for special attention to \n) */ - constraint = 1; - break; - case NWBDRY: - wordchrs(v); /* does NEXT() */ - s = newstate(v->nfa); - NOERRN(); - /* needs word to both left and right... */ - cloneouts(v->nfa, v->wordchrs, lp, s, BEHIND); - cloneouts(v->nfa, v->wordchrs, s, rp, AHEAD); - /* ...or... */ - s = newstate(v->nfa); - NOERRN(); - /* ...BOL, BOS, or nonword to left... */ - newarc(v->nfa, '^', 1, lp, s); - newarc(v->nfa, '^', 0, lp, s); - colorcomplement(v->nfa, v->cm, BEHIND, - v->wordchrs, lp, s); - /* ... and EOL, EOS, or nonword to right */ - newarc(v->nfa, '$', 1, s, rp); - newarc(v->nfa, '$', 0, s, rp); - colorcomplement(v->nfa, v->cm, AHEAD, - v->wordchrs, s, rp); - /* (no need for special attention to \n) */ - constraint = 1; - break; - case ')': /* unbalanced paren */ -#ifdef POSIX_MISTAKE - if (!(v->cflags®_EXTENDED) || - (v->cflags®_ADVF)) { - ERR(REG_EPAREN); - return NULL; - } - NOTE(REG_UPBOTCH); - /* fallthrough into case PLAIN */ -#else - ERR(REG_EPAREN); - return NULL; - break; -#endif - case PLAIN: - onechr(v, v->nextvalue, lp, rp); - okcolors(v->nfa, v->cm); - NOERRN(); - NEXT(); - break; - case '*': - case '+': - case '?': - case '{': - ERR(REG_BADRPT); - return NULL; - break; - default: - ERR(REG_ASSERT); - return NULL; - break; - } - - /* ...possibly followed by a quantifier */ - switch (v->nexttype) { - case '*': - m = 0; - n = INFINITY; - sub.prefer = (v->nextvalue) ? LONGER : SHORTER; - NEXT(); - break; - case '+': - m = 1; - n = INFINITY; - sub.prefer = (v->nextvalue) ? LONGER : SHORTER; - NEXT(); - break; - case '?': - m = 0; - n = 1; - sub.prefer = (v->nextvalue) ? LONGER : SHORTER; - NEXT(); - break; - case '{': - NEXT(); - m = scannum(v); - if (EAT(',')) { - if (SEE(DIGIT)) - n = scannum(v); - else - n = INFINITY; - if (m > n) { - ERR(REG_BADBR); - return NULL; - } - } else - n = m; - if (!SEE('}')) { /* gets errors too */ - ERR(REG_BADBR); - return NULL; - } - if (m != n) - sub.prefer = (v->nextvalue) ? LONGER : - SHORTER; - NEXT(); - break; - default: /* no quantifier */ - m = n = 1; - constraint = 0; - break; - } + if (!SEE(stopper)) { + assert(stopper == ')' && SEE(EOS)); + ERR(REG_EPAREN); + } - /* constraints may not be quantified */ - if (constraint) { - ERR(REG_BADRPT); - return NULL; - } + /* optimize out simple cases */ + if (branch == branches) { /* only one branch */ + assert(branch->right == NULL); + t = branch->left; + branch->left = NULL; + freesubre(v, branches); + branches = t; + } else if (!MESSY(branches->flags)) { /* no interesting innards */ + freesubre(v, branches->left); + branches->left = NULL; + freesubre(v, branches->right); + branches->right = NULL; + branches->op = '='; + } - /* annoying special case: {0,0} cancels everything */ - if (m == 0 && n == 0 && sub.begin != NULL) { - freert(v, now->tree); - now->tree = NULL; - sub.begin = NULL; /* no substructure */ - sub.prefer = NONEYET; - /* the repeat() below will do the rest */ - } + return branches; +} - /* if no substructure, avoid hard part */ - if (now->prefer == NONEYET) - now->prefer = sub.prefer; - if (sub.begin == NULL && (sub.prefer == NONEYET || - sub.prefer == now->prefer)) { - assert(sub.subno >= 0 || (m == 0 && n == 0)); - if (!(m == 1 && n == 1)) - repeat(v, lp, rp, m, n); - continue; /* NOTE CONTINUE */ - } +/* + - parsebranch - parse one branch of an RE + * This mostly manages concatenation, working closely with parseqatom(). + * Concatenated things are bundled up as much as possible, with separate + * ',' nodes introduced only when necessary due to substructure. + ^ static struct subre *parsebranch(struct vars *, int, int, struct state *, + ^ struct state *, int); + */ +static struct subre * +parsebranch(v, stopper, type, left, right, partial) +struct vars *v; +int stopper; /* EOS or ')' */ +int type; /* LACON (lookahead subRE) or PLAIN */ +struct state *left; /* leftmost state */ +struct state *right; /* rightmost state */ +int partial; /* is this only part of a branch? */ +{ + struct state *lp; /* left end of current construct */ + int seencontent; /* is there anything in this branch yet? */ + struct subre *t; - /* hard part: something messy seen */ - /* break subRE into pre, x{...}, post-to-be */ - capture = 1; /* upper levels will care */ - rt1 = newrt(v); - rt2 = newrt(v); - s = newstate(v->nfa); /* between x and post-to-be */ + lp = left; + seencontent = 0; + t = subre(v, '=', 0, left, right); /* op '=' is tentative */ + NOERRN(); + while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) { + if (seencontent) { /* implicit concat operator */ + lp = newstate(v->nfa); NOERRN(); - moveins(v->nfa, rp, s); - EMPTYARC(s, rp); - rt1->op = ','; - rt1->left = subre(now->begin, lp, now->prefer, 0, - (struct rtree *)NULL); - assert(now->end == rp); - rt1->right = subre(lp, rp, sub.prefer, 0, rt2); - rt2->op = ','; - rt2->left = subre(lp, s, sub.prefer, 0, now->tree); - rt2->right = subre(s, rp, NONEYET, 0, - (struct rtree *)NULL); - now->tree = rt1; - now = &rt2->right; /* future elaborations here */ - t = &rt2->left; /* current activity here */ - - /* if it's a backref, time to replicate the subNFA */ - if (sub.subno < 0) { - assert(lp->nouts == 1); /* just the EMPTY */ - delsub(v->nfa, lp, s); - assert(v->subs[-sub.subno] != NULL); - dupnfa(v->nfa, v->subs[-sub.subno]->begin, - v->subs[-sub.subno]->end, lp, s); - NOERRN(); - } + moveins(v->nfa, right, lp); + } + seencontent = 1; - /* if no/vacuous quantifier and not backref, done */ - if (m == 1 && n == 1 && sub.subno >= 0) { - t->subno = sub.subno; - if (sub.subno > 0) - v->subs[sub.subno] = t; - continue; /* NOTE CONTINUE */ - } + /* NB, recursion in parseqatom() may swallow rest of branch */ + parseqatom(v, stopper, type, lp, right, t); + } - /* really sticky part, quantified capturer/backref */ - /* first, turn x{0,...} into x{1,...}| */ - if (m == 0) { - s = newstate(v->nfa); - s2 = newstate(v->nfa); - rt1 = newrt(v); - rt2 = newrt(v); - NOERRN(); - moveouts(v->nfa, t->begin, s); - EMPTYARC(t->begin, s); - EMPTYARC(t->begin, s2); - EMPTYARC(s2, t->end); - rt1->op = rt2->op = '|'; - rt1->left = subre(s, t->end, sub.prefer, 0, - t->tree); - rt1->next = rt2; - rt2->left = subre(s2, t->end, sub.prefer, 0, - (struct rtree *)NULL); - t->tree = rt1; - t = &rt1->left; - m = 1; - } + if (!seencontent) { /* empty branch */ + if (!partial) + NOTE(REG_UUNSPEC); + assert(lp == left); + EMPTYARC(left, right); + } - /* second, x{1,1} is just x */ - if (m == 1 && n == 1 && sub.subno >= 0) { - t->subno = sub.subno; - if (sub.subno > 0) - v->subs[sub.subno] = t; - continue; /* NOTE CONTINUE */ - } + return t; +} - /* backrefs get special treatment */ - if (sub.subno < 0) { - repeat(v, t->begin, t->end, m, n); - rt1 = newrt(v); - NOERRN(); - assert(t->tree == NULL); - t->tree = rt1; - rt1->op = 'b'; - rt1->left.subno = sub.subno; - rt1->left.min = (short)m; - rt1->left.max = (short)n; - rt1->left.prefer = sub.prefer; - continue; /* NOTE CONTINUE */ - } +/* + - parseqatom - parse one quantified atom or constraint of an RE + * The bookkeeping near the end cooperates very closely with parsebranch(); + * in particular, it contains a recursion that can involve parsing the rest + * of the branch, making this function's name somewhat inaccurate. + ^ static VOID parseqatom(struct vars *, int, int, struct state *, + ^ struct state *, struct subre *); + */ +static VOID +parseqatom(v, stopper, type, lp, rp, top) +struct vars *v; +int stopper; /* EOS or ')' */ +int type; /* LACON (lookahead subRE) or PLAIN */ +struct state *lp; /* left state to hang it on */ +struct state *rp; /* right state to hang it on */ +struct subre *top; /* subtree top */ +{ + struct state *s; /* temporaries for new states */ + struct state *s2; +# define ARCV(t, val) newarc(v->nfa, t, val, lp, rp) + int m, n; + struct subre *atom; /* atom's subtree */ + struct subre *t; + int cap; /* capturing parens? */ + int pos; /* positive lookahead? */ + int subno; /* capturing-parens or backref number */ + int atomtype; + int qprefer; /* quantifier short/long preference */ + int f; + struct subre **atomp; /* where the pointer to atom is */ + + /* initial bookkeeping */ + atom = NULL; + assert(lp->nouts == 0); /* must string new code */ + assert(rp->nins == 0); /* between lp and rp */ + subno = 0; /* just to shut lint up */ + + /* an atom or constraint... */ + atomtype = v->nexttype; + switch (atomtype) { + /* first, constraints, which end by returning */ + case '^': + ARCV('^', 1); + if (v->cflags®_NLANCH) + ARCV(BEHIND, v->nlcolor); + NEXT(); + return; + break; + case '$': + ARCV('$', 1); + if (v->cflags®_NLANCH) + ARCV(AHEAD, v->nlcolor); + NEXT(); + return; + break; + case SBEGIN: + ARCV('^', 1); /* BOL */ + ARCV('^', 0); /* or BOS */ + NEXT(); + return; + break; + case SEND: + ARCV('$', 1); /* EOL */ + ARCV('$', 0); /* or EOS */ + NEXT(); + return; + break; + case '<': + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + return; + break; + case '>': + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + break; + case WBDRY: + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + break; + case NWBDRY: + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + break; + case LACON: /* lookahead constraint */ + pos = v->nextvalue; + NEXT(); + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + t = parse(v, ')', LACON, s, s2); + freesubre(v, t); /* internal structure irrelevant */ + assert(SEE(')') || ISERR()); + NEXT(); + n = newlacon(v, s, s2, pos); + NOERR(); + ARCV(LACON, n); + return; + break; + /* then errors, to get them out of the way */ + case '*': + case '+': + case '?': + case '{': + ERR(REG_BADRPT); + return; + break; + default: + ERR(REG_ASSERT); + return; + break; + /* then plain characters, and minor variants on that theme */ + case ')': /* unbalanced paren */ + if ((v->cflags®_ADVANCED) != REG_EXTENDED) { + ERR(REG_EPAREN); + return; + } + /* legal in EREs due to specification botch */ + NOTE(REG_UPBOTCH); + /* fallthrough into case PLAIN */ + case PLAIN: + onechr(v, v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NOERR(); + NEXT(); + break; + case '[': + if (v->nextvalue == 1) + bracket(v, lp, rp); + else + cbracket(v, lp, rp); + assert(SEE(']') || ISERR()); + NEXT(); + break; + case '.': + rainbow(v->nfa, v->cm, PLAIN, + (v->cflags®_NLSTOP) ? v->nlcolor : COLORLESS, + lp, rp); + NEXT(); + break; + /* and finally the ugly stuff */ + case '(': /* value flags as capturing or non */ + cap = (type == LACON) ? 0 : v->nextvalue; + if (cap) { + v->nsubexp++; + subno = v->nsubexp; + if ((size_t)subno >= v->nsubs) + moresubs(v, subno); + assert((size_t)subno < v->nsubs); + } else + atomtype = PLAIN; /* something that's not '(' */ + NEXT(); + /* need new endpoints because tree will contain pointers */ + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + EMPTYARC(lp, s); + EMPTYARC(s2, rp); + NOERR(); + atom = parse(v, ')', PLAIN, s, s2); + assert(SEE(')') || ISERR()); + NEXT(); + NOERR(); + if (cap) { + v->subs[subno] = atom; + t = subre(v, '(', atom->flags|CAP, lp, rp); + NOERR(); + t->subno = subno; + t->left = atom; + atom = t; + } + /* postpone everything else pending possible {0} */ + break; + case BACKREF: /* the Feature From The Black Lagoon */ + INSIST(type != LACON, REG_ESUBREG); + INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); + INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); + NOERR(); + assert(v->nextvalue > 0); + atom = subre(v, 'b', BACKR, lp, rp); + subno = v->nextvalue; + atom->subno = subno; + EMPTYARC(lp, rp); /* temporarily, so there's something */ + NEXT(); + break; + } - /* turn x{m,n} into x{m-1,n-1}x, with capturing */ - /* parens in only second x */ - s = newstate(v->nfa); - NOERRN(); - moveouts(v->nfa, t->begin, s); - dupnfa(v->nfa, s, t->end, t->begin, s); - assert(m >= 1 && m != INFINITY && n >= 1); - repeat(v, t->begin, s, m-1, (n == INFINITY) ? n : n-1); - rt1 = newrt(v); - NOERRN(); - rt1->op = ','; - rt1->left = subre(t->begin, s, sub.prefer, 0, - (struct rtree *)NULL); - /* sub.prefer not really right, but doesn't matter */ - rt1->right = subre(s, t->end, sub.prefer, sub.subno, - t->tree); - if (sub.subno > 0) - v->subs[sub.subno] = &rt1->right; - t->tree = rt1; + /* ...and an atom may be followed by a quantifier */ + switch (v->nexttype) { + case '*': + m = 0; + n = INFINITY; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '+': + m = 1; + n = INFINITY; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '?': + m = 0; + n = 1; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '{': + NEXT(); + m = scannum(v); + if (EAT(',')) { + if (SEE(DIGIT)) + n = scannum(v); + else + n = INFINITY; + if (m > n) { + ERR(REG_BADBR); + return; + } + /* {m,n} exercises preference, even if it's {m,m} */ + qprefer = (v->nextvalue) ? LONGER : SHORTER; + } else { + n = m; + /* {m} passes operand's preference through */ + qprefer = 0; } - if (emptybranch) { - NOTE(REG_UUNSPEC); - EMPTYARC(lp, rp); + if (!SEE('}')) { /* catches errors too */ + ERR(REG_BADBR); + return; } - } while (EAT('|')); - assert(SEE(stopper) || SEE(EOS)); + NEXT(); + break; + default: /* no quantifier */ + m = n = 1; + qprefer = 0; + break; + } - if (!SEE(stopper)) { - assert(stopper == ')' && SEE(EOS)); - ERR(REG_EPAREN); + /* annoying special case: {0} or {0,0} cancels everything */ + if (m == 0 && n == 0) { + if (atom != NULL) + freesubre(v, atom); + if (atomtype == '(') + v->subs[subno] = NULL; + delsub(v->nfa, lp, rp); + EMPTYARC(lp, rp); + return; } - /* higher levels care about our preference in certain situations */ - if (branch != branches) { /* >1 branch */ - if (pprefer != LONGER) - capture = 1; - } else if (branches->left.prefer != pprefer) - capture = 1; - - /* optimize out vacuous alternation */ - if (branch == branches) { - assert(branch->next == NULL && branch->right.begin == NULL); - assert(branch->left.subno == 0); - if (capture && branch->left.tree == NULL) - branch->op = ','; - else { - branches = branch->left.tree; /* might be NULL */ - freertnode(v, branch); - } + /* if not a messy case, avoid hard part */ + assert(!MESSY(top->flags)); + f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0); + if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) { + if (!(m == 1 && n == 1)) + repeat(v, lp, rp, m, n); + if (atom != NULL) + freesubre(v, atom); + top->flags = f; + return; + } + + /* + * hard part: something messy + * That is, capturing parens, back reference, short/long clash, or + * an atom with substructure containing one of those. + */ + + /* now we'll need a subre for the contents even if they're boring */ + if (atom == NULL) { + atom = subre(v, '=', 0, lp, rp); + NOERR(); + } + + /* + * prepare a general-purpose state skeleton + * + * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] + * / / + * [lp] ----> [s2] ----bypass--------------------- + * + * where bypass is an empty, and prefix is some repetitions of atom + */ + s = newstate(v->nfa); /* first, new endpoints for the atom */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s2); + NOERR(); + atom->begin = s; + atom->end = s2; + s = newstate(v->nfa); /* and spots for prefix and bypass */ + s2 = newstate(v->nfa); + NOERR(); + EMPTYARC(lp, s); + EMPTYARC(lp, s2); + NOERR(); + + /* break remaining subRE into x{...} and what follows */ + t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); + t->left = atom; + atomp = &t->left; + /* here we should recurse... but we must postpone that to the end */ + + /* split top into prefix and remaining */ + assert(top->op == '=' && top->left == NULL && top->right == NULL); + top->left = subre(v, '=', top->flags, top->begin, lp); + top->op = '.'; + top->right = t; + + /* if it's a backref, now is the time to replicate the subNFA */ + if (atomtype == BACKREF) { + assert(atom->begin->nouts == 1); /* just the EMPTY */ + delsub(v->nfa, atom->begin, atom->end); + assert(v->subs[subno] != NULL); + /* and here's why the recursion got postponed: it must */ + /* wait until the skeleton is filled in, because it may */ + /* hit a backref that wants to copy the filled-in skeleton */ + dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, + atom->begin, atom->end); + NOERR(); + } + + /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */ + if (m == 0) { + EMPTYARC(s2, atom->end); /* the bypass */ + assert(PREF(qprefer) != 0); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '|', f, lp, atom->end); + NOERR(); + t->left = atom; + t->right = subre(v, '|', PREF(f), s2, atom->end); + NOERR(); + t->right->left = subre(v, '=', 0, s2, atom->end); + NOERR(); + *atomp = t; + atomp = &t->left; + m = 1; } - if (capture) /* actually a catchall flag */ - return branches; - freert(v, branches); - return NULL; + /* deal with the rest of the quantifier */ + if (atomtype == BACKREF) { + /* special case: backrefs have internal quantifiers */ + EMPTYARC(s, atom->begin); /* empty prefix */ + /* just stuff everything into atom */ + repeat(v, atom->begin, atom->end, m, n); + atom->min = (short)m; + atom->max = (short)n; + atom->flags |= COMBINE(qprefer, atom->flags); + } else if (m == 1 && n == 1) { + /* no/vacuous quantifier: done */ + EMPTYARC(s, atom->begin); /* empty prefix */ + } else { + /* turn x{m,n} into x{m-1,n-1}x, with capturing */ + /* parens in only second x */ + dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); + assert(m >= 1 && m != INFINITY && n >= 1); + repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '.', f, s, atom->end); /* prefix and atom */ + NOERR(); + t->left = subre(v, '=', PREF(f), s, atom->begin); + NOERR(); + t->right = atom; + *atomp = t; + } + + /* and finally, look after that postponed recursion */ + t = top->right; + if (!(SEE('|') || SEE(stopper) || SEE(EOS))) + t->right = parsebranch(v, stopper, type, atom->end, rp, 1); + else { + EMPTYARC(atom->end, rp); + t->right = subre(v, '=', 0, atom->end, rp); + } + assert(SEE('|') || SEE(stopper) || SEE(EOS)); + t->flags |= COMBINE(t->flags, t->right->flags); + top->flags |= COMBINE(top->flags, t->flags); +} + +/* + - nonword - generate arcs for non-word-character ahead or behind + ^ static VOID nonword(struct vars *, int, struct state *, struct state *); + */ +static VOID +nonword(v, dir, lp, rp) +struct vars *v; +int dir; /* AHEAD or BEHIND */ +struct state *lp; +struct state *rp; +{ + int anchor = (dir == AHEAD) ? '$' : '^'; + + assert(dir == AHEAD || dir == BEHIND); + newarc(v->nfa, anchor, 1, lp, rp); + newarc(v->nfa, anchor, 0, lp, rp); + colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp); + /* (no need for special attention to \n) */ +} + +/* + - word - generate arcs for word character ahead or behind + ^ static VOID word(struct vars *, int, struct state *, struct state *); + */ +static VOID +word(v, dir, lp, rp) +struct vars *v; +int dir; /* AHEAD or BEHIND */ +struct state *lp; +struct state *rp; +{ + assert(dir == AHEAD || dir == BEHIND); + cloneouts(v->nfa, v->wordchrs, lp, rp, dir); + /* (no need for special attention to \n) */ } /* @@ -1163,7 +1187,7 @@ struct state *rp; NOERR(); bracket(v, left, right); if (v->cflags®_NLSTOP) - newarc(v->nfa, PLAIN, nlcolor(v), left, right); + newarc(v->nfa, PLAIN, v->nlcolor, left, right); NOERR(); assert(lp->nouts == 0); /* all outarcs will be ours */ @@ -1181,7 +1205,7 @@ struct state *rp; /* but complementing gets messy in the presence of MCCEs... */ NOTE(REG_ULOCALE); for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) { - co = getcolor(v->cm, *p); + co = GETCOLOR(v->cm, *p); a = findarc(lp, PLAIN, co); ba = findarc(left, PLAIN, co); if (ba == NULL) { @@ -1391,7 +1415,7 @@ struct cvec *cv; okcolors(v->nfa, v->cm); } else { a = findarc(v->mccepbegin, PLAIN, - getcolor(v->cm, leader)); + GETCOLOR(v->cm, leader)); assert(a != NULL); s = a->to; assert(s != v->mccepend); @@ -1438,6 +1462,7 @@ struct state *lp; struct state *rp; { chr ch, from, to; + celt ce; chr *p; int i; color co; @@ -1478,16 +1503,17 @@ struct state *rp; for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) { from = *p; to = *(p+1); - for (ch = from; ch <= to; ch++) - if (!ISCELEADER(v, ch)) - newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, - rp); - else { - assert(singleton(v->cm, ch)); - assert(leads != NULL); - if (!haschr(leads, ch)) - addchr(leads, ch); - } + while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) { + if (from < ce) + subrange(v, from, ce - 1, lp, rp); + assert(singleton(v->cm, ce)); + assert(leads != NULL); + if (!haschr(leads, ce)) + addchr(leads, ce); + from = ce + 1; + } + if (from <= to) + subrange(v, from, to, lp, rp); } if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0) @@ -1496,7 +1522,7 @@ struct state *rp; /* deal with the MCCE leaders */ NOTE(REG_ULOCALE); for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) { - co = getcolor(v->cm, *p); + co = GETCOLOR(v->cm, *p); a = findarc(lp, PLAIN, co); if (a != NULL) s = a->to; @@ -1519,7 +1545,8 @@ struct state *rp; for (i = 0; i < cv->nmcces; i++) { p = cv->mcces[i]; assert(singleton(v->cm, *p)); - co = getcolor(v->cm, *p++); + ch = *p++; + co = GETCOLOR(v->cm, ch); a = findarc(lp, PLAIN, co); if (a != NULL) s = a->to; @@ -1531,7 +1558,8 @@ struct state *rp; } assert(*p != 0); /* at least two chars */ assert(singleton(v->cm, *p)); - co = getcolor(v->cm, *p++); + ch = *p++; + co = GETCOLOR(v->cm, ch); assert(*p == 0); /* and only two, for now */ newarc(v->nfa, PLAIN, co, s, rp); NOERR(); @@ -1539,20 +1567,30 @@ struct state *rp; } /* - - nlcolor - assign newline a unique color, if it doesn't have one already - * Restriction: can't be called when there are subcolors open. (Maybe - * this should be enforced...) - ^ static color nlcolor(struct vars *); + - nextleader - find next MCCE leader within range + ^ static celt nextleader(struct vars *, pchr, pchr); */ -static color -nlcolor(v) +static celt /* NOCELT means none */ +nextleader(v, from, to) struct vars *v; +pchr from; +pchr to; { - if (v->nlcolor == COLORLESS) { - v->nlcolor = subcolor(v->cm, newline()); - okcolors(v->nfa, v->cm); + int i; + chr *p; + chr ch; + celt it = NOCELT; + + if (v->mcces == NULL) + return it; + + for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) { + ch = *p; + if (from <= ch && ch <= to) + if (it == NOCELT || ch < it) + it = ch; } - return v->nlcolor; + return it; } /* @@ -1579,6 +1617,7 @@ struct vars *v; left = newstate(v->nfa); right = newstate(v->nfa); NOERR(); + /* fine point: implemented with [::], and lexer will set REG_ULOCALE */ lexword(v); NEXT(); assert(v->savenow != NULL && SEE('[')); @@ -1590,273 +1629,169 @@ struct vars *v; } /* - - subre - construct a subre struct - ^ static struct subre subre(struct state *, struct state *, int, int, - ^ struct rtree *); + - subre - allocate a subre + ^ static struct subre *subre(struct vars *, int, int, struct state *, + ^ struct state *); */ -static struct subre -subre(begin, end, prefer, subno, tree) +static struct subre * +subre(v, op, flags, begin, end) +struct vars *v; +int op; +int flags; struct state *begin; struct state *end; -int prefer; -int subno; -struct rtree *tree; -{ - struct subre ret; - - ret.begin = begin; - ret.end = end; - ret.prefer = prefer; - ret.subno = subno; - ret.min = ret.max = 1; - ret.tree = tree; - ZAPCNFA(ret.cnfa); - return ret; -} - -/* - - newrt - allocate subRE-tree node - ^ static struct rtree *newrt(struct vars *); - */ -static struct rtree * -newrt(v) -struct vars *v; { - struct rtree *rt; + struct subre *ret; - rt = v->treefree; - if (rt != NULL) - v->treefree = rt->next; + ret = v->treefree; + if (ret != NULL) + v->treefree = ret->left; else { - rt = (struct rtree *)MALLOC(sizeof(struct rtree)); - if (rt == NULL) { + ret = (struct subre *)MALLOC(sizeof(struct subre)); + if (ret == NULL) { ERR(REG_ESPACE); return NULL; } - rt->chain = v->treechain; - v->treechain = rt; + ret->chain = v->treechain; + v->treechain = ret; } - rt->op = '?'; /* invalid */ - rt->flags = 0; - rt->no = 0; - rt->left.begin = NULL; - rt->left.end = NULL; - rt->left.prefer = NONEYET; - rt->left.subno = 0; - rt->left.min = rt->left.max = 1; - rt->left.tree = NULL; - ZAPCNFA(rt->left.cnfa); - rt->right.begin = NULL; - rt->right.end = NULL; - rt->right.prefer = NONEYET; - rt->right.subno = 0; - rt->right.min = rt->right.max = 1; - rt->right.tree = NULL; - ZAPCNFA(rt->right.cnfa); - rt->next = NULL; - - return rt; + assert(strchr("|.b(=", op) != NULL); + + ret->op = op; + ret->flags = flags; + ret->retry = 0; + ret->subno = 0; + ret->min = ret->max = 1; + ret->left = NULL; + ret->right = NULL; + ret->begin = begin; + ret->end = end; + ZAPCNFA(ret->cnfa); + + return ret; } /* - - freert - free a subRE subtree - ^ static VOID freert(struct vars *, struct rtree *); + - freesubre - free a subRE subtree + ^ static VOID freesubre(struct vars *, struct subre *); */ static VOID -freert(v, rt) +freesubre(v, sr) struct vars *v; /* might be NULL */ -struct rtree *rt; +struct subre *sr; { - if (rt == NULL) + if (sr == NULL) return; - if (rt->left.tree != NULL) - freert(v, rt->left.tree); - if (rt->right.tree != NULL) - freert(v, rt->right.tree); - if (rt->next != NULL) - freert(v, rt->next); + if (sr->left != NULL) + freesubre(v, sr->left); + if (sr->right != NULL) + freesubre(v, sr->right); - freertnode(v, rt); + freesrnode(v, sr); } /* - - freertnode - free one node in a subRE subtree - ^ static VOID freertnode(struct vars *, struct rtree *); + - freesrnode - free one node in a subRE subtree + ^ static VOID freesrnode(struct vars *, struct subre *); */ static VOID -freertnode(v, rt) +freesrnode(v, sr) struct vars *v; /* might be NULL */ -struct rtree *rt; +struct subre *sr; { - if (rt == NULL) + if (sr == NULL) return; - if (!NULLCNFA(rt->left.cnfa)) - freecnfa(&rt->left.cnfa, 0); - if (!NULLCNFA(rt->right.cnfa)) - freecnfa(&rt->right.cnfa, 0); - rt->flags = 0; + if (!NULLCNFA(sr->cnfa)) + freecnfa(&sr->cnfa); + sr->flags = 0; if (v != NULL) { - rt->next = v->treefree; - v->treefree = rt; + sr->left = v->treefree; + v->treefree = sr; } else - FREE(rt); + FREE(sr); } /* - - optrt - optimize a subRE subtree - ^ static VOID optrt(struct vars *, struct rtree *); + - optst - optimize a subRE subtree + ^ static VOID optst(struct vars *, struct subre *); */ static VOID -optrt(v, rt) +optst(v, t) struct vars *v; -struct rtree *rt; +struct subre *t; { - struct rtree *t; - int subno; - - if (rt == NULL) + if (t == NULL) return; - assert(rt->op != 'b'); - - /* pull up subtrees if possible */ - if (rt->left.begin != NULL && rt->left.tree != NULL && - rt->left.tree->op != 'b') { - t = rt->left.tree; - optrt(v, t); - if (t->right.begin == NULL && t->next == NULL && - (rt->left.prefer == NONEYET || - t->left.prefer == rt->left.prefer) && - (rt->left.subno == 0 || t->left.subno == 0)) { - subno = rt->left.subno; - rt->left = t->left; - assert(NULLCNFA(t->left.cnfa)); - freertnode(v, t); - if (subno != 0) { - assert(rt->left.subno == 0 && subno > 0); - rt->left.subno = subno; - } - } - } - if (rt->right.begin != NULL && rt->right.tree != NULL && - rt->right.tree->op != 'b') { - t = rt->right.tree; - optrt(v, t); - if (t->right.begin == NULL && t->next == NULL && - (rt->right.prefer == NONEYET || - t->right.prefer == rt->right.prefer) && - (rt->right.subno == 0 || t->right.subno == 0)) { - subno = rt->right.subno; - rt->right = t->left; - assert(NULLCNFA(t->right.cnfa)); - freertnode(v, t); - if (subno != 0) { - assert(rt->right.subno == 0 && subno > 0); - rt->right.subno = subno; - } - } - } - - /* simplify empties */ - if (rt->left.begin != NULL && isempty(rt->left.begin, rt->left.end)) - rt->left.end = rt->left.begin; - if (rt->right.begin != NULL && isempty(rt->right.begin, rt->right.end)) - rt->right.end = rt->right.begin; - - /* if left subtree vacuous and right non-empty, move right over */ - if (rt->left.begin != NULL && rt->left.begin == rt->left.end && - rt->left.subno == 0 && rt->left.tree == NULL && - rt->right.begin != NULL) { - rt->left = rt->right; - rt->right.begin = NULL; - rt->right.tree = NULL; - } - - /* if right subtree vacuous, clear it out */ - if (rt->right.begin != NULL && rt->right.begin == rt->right.end && - rt->right.subno == 0 && rt->right.tree == NULL) { - rt->right.begin = NULL; - rt->right.tree = NULL; - } /* preference cleanup and analysis */ - if (rt->left.prefer == NONEYET) - rt->left.prefer = LONGER; - if (rt->left.prefer == SHORTER) + if (t->flags&SHORTER) v->usedshorter = 1; - if (rt->right.begin != NULL) { - if (rt->right.prefer == NONEYET) - rt->right.prefer = LONGER; - if (rt->right.prefer == SHORTER) - v->usedshorter = 1; - } - /* recurse through alternatives */ - if (rt->next != NULL) - optrt(v, rt->next); + /* recurse through children */ + if (t->left != NULL) + optst(v, t->left); + if (t->right != NULL) + optst(v, t->right); } /* - - numrt - number tree nodes - ^ static int numrt(struct rtree *, int); + - numst - number tree nodes (assigning retry indexes) + ^ static int numst(struct subre *, int); */ static int /* next number */ -numrt(rt, start) -struct rtree *rt; +numst(t, start) +struct subre *t; int start; /* starting point for subtree numbers */ { int i; - assert(rt != NULL); + assert(t != NULL); i = start; - rt->no = (short)i++; - if (rt->left.tree != NULL) - i = numrt(rt->left.tree, i); - if (rt->right.tree != NULL) - i = numrt(rt->right.tree, i); - if (rt->next != NULL) - i = numrt(rt->next, i); + t->retry = (short)i++; + if (t->left != NULL) + i = numst(t->left, i); + if (t->right != NULL) + i = numst(t->right, i); return i; } /* - - markrt - mark tree nodes as INUSE - ^ static VOID markrt(struct rtree *); + - markst - mark tree nodes as INUSE + ^ static VOID markst(struct subre *); */ static VOID -markrt(rt) -struct rtree *rt; +markst(t) +struct subre *t; { - assert(rt != NULL); - - rt->flags |= INUSE; - if (rt->left.tree != NULL) - markrt(rt->left.tree); - if (rt->right.tree != NULL) - markrt(rt->right.tree); - if (rt->next != NULL) - markrt(rt->next); + assert(t != NULL); + + t->flags |= INUSE; + if (t->left != NULL) + markst(t->left); + if (t->right != NULL) + markst(t->right); } /* - - cleanrt - free any tree nodes not marked INUSE - ^ static VOID cleanrt(struct vars *); + - cleanst - free any tree nodes not marked INUSE + ^ static VOID cleanst(struct vars *); */ static VOID -cleanrt(v) +cleanst(v) struct vars *v; { - struct rtree *rt; - struct rtree *next; + struct subre *t; + struct subre *next; - for (rt = v->treechain; rt != NULL; rt = next) { - next = rt->next; - if (!(rt->flags&INUSE)) - FREE(rt); + for (t = v->treechain; t != NULL; t = next) { + next = t->chain; + if (!(t->flags&INUSE)) + FREE(t); } v->treechain = NULL; v->treefree = NULL; /* just on general principles */ @@ -1864,56 +1799,51 @@ struct vars *v; /* - nfatree - turn a subRE subtree into a tree of compacted NFAs - ^ static VOID nfatree(struct vars *, struct rtree *, FILE *); + ^ static int nfatree(struct vars *, struct subre *, FILE *); */ -static VOID -nfatree(v, rt, f) +static int /* optimize results from top node */ +nfatree(v, t, f) struct vars *v; -struct rtree *rt; +struct subre *t; FILE *f; /* for debug output */ { - if (rt == NULL) - return; + assert(t != NULL && t->begin != NULL); - if (rt->left.begin != NULL) - nfanode(v, &rt->left, f); - if (rt->left.tree != NULL) - nfatree(v, rt->left.tree, f); + if (t->left != NULL) + (DISCARD)nfatree(v, t->left, f); + if (t->right != NULL) + (DISCARD)nfatree(v, t->right, f); - if (rt->right.begin != NULL) - nfanode(v, &rt->right, f); - if (rt->right.tree != NULL) - nfatree(v, rt->right.tree, f); - - if (rt->next != NULL) - nfatree(v, rt->next, f); + return nfanode(v, t, f); } /* - nfanode - do one NFA for nfatree - ^ static VOID nfanode(struct vars *, struct subre *, FILE *); + ^ static int nfanode(struct vars *, struct subre *, FILE *); */ -static VOID -nfanode(v, sub, f) +static int /* optimize results */ +nfanode(v, t, f) struct vars *v; -struct subre *sub; +struct subre *t; FILE *f; /* for debug output */ { struct nfa *nfa; + int ret = 0; - if (sub->begin == NULL) - return; + assert(t->begin != NULL); nfa = newnfa(v, v->cm, v->nfa); - NOERR(); - dupnfa(nfa, sub->begin, sub->end, nfa->init, nfa->final); + NOERRZ(); + dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); if (!ISERR()) { specialcolors(nfa); - (DISCARD) optimize(nfa, f); + ret = optimize(nfa, f); } if (!ISERR()) - compact(nfa, &sub->cnfa); + compact(nfa, &t->cnfa); + freenfa(nfa); + return ret; } /* @@ -1964,9 +1894,9 @@ int n; int i; assert(n > 0); - for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) + for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */ if (!NULLCNFA(sub->cnfa)) - freecnfa(&sub->cnfa, 0); + freecnfa(&sub->cnfa); FREE(subs); } @@ -1988,14 +1918,13 @@ regex_t *re; re->re_guts = NULL; re->re_fns = NULL; g->magic = 0; - if (!NULLCNFA(g->cnfa)) - freecnfa(&g->cnfa, 0); - if (g->cm != NULL) - freecm(g->cm); + freecm(&g->cmap); if (g->tree != NULL) - freert((struct vars *)NULL, g->tree); + freesubre((struct vars *)NULL, g->tree); if (g->lacons != NULL) freelacons(g->lacons, g->nlacons); + if (!NULLCNFA(g->search)) + freecnfa(&g->search); FREE(g); } @@ -2028,41 +1957,44 @@ FILE *f; re->re_nsub, re->re_info, re->re_csize, g->ntree, g->usedshorter); - dumpcolors(g->cm, f); - dumpcnfa(&g->cnfa, f); + dumpcolors(&g->cmap, f); + if (!NULLCNFA(g->search)) { + printf("search:\n"); + dumpcnfa(&g->search, f); + } for (i = 1; i < g->nlacons; i++) { fprintf(f, "la%d (%s):\n", i, (g->lacons[i].subno) ? "positive" : "negative"); dumpcnfa(&g->lacons[i].cnfa, f); } - dumprt(g->tree, f, 0); + dumpst(g->tree, f, 0); #endif } /* - - dumprt - dump a subRE tree - ^ static VOID dumprt(struct rtree *, FILE *, int); + - dumpst - dump a subRE tree + ^ static VOID dumpst(struct subre *, FILE *, int); */ static VOID -dumprt(rt, f, nfapresent) -struct rtree *rt; +dumpst(t, f, nfapresent) +struct subre *t; FILE *f; int nfapresent; /* is the original NFA still around? */ { - if (rt == NULL) + if (t == NULL) fprintf(f, "null tree\n"); else - rtdump(rt, f, nfapresent, 0); + stdump(t, f, nfapresent, 0); fflush(f); } /* - - rtdump - recursive guts of dumprt - ^ static VOID rtdump(struct rtree *, FILE *, int, int); + - stdump - recursive guts of dumpst + ^ static VOID stdump(struct subre *, FILE *, int, int); */ static VOID -rtdump(rt, f, nfapresent, level) -struct rtree *rt; +stdump(t, f, nfapresent, level) +struct subre *t; FILE *f; int nfapresent; /* is the original NFA still around? */ int level; @@ -2072,102 +2004,41 @@ int level; for (i = 0; i < level; i++) fprintf(f, RTSEP); - fprintf(f, "%c (n%d) {\n", rt->op, rt->no); - if (rt->left.begin != NULL) { - for (i = 0; i < level+1; i++) - fprintf(f, RTSEP); - fprintf(f, "L"); - fprintf(f, "%s", (rt->left.prefer == NONEYET) ? "-" : - ((rt->left.prefer == LONGER) ? ">" : "<")); - if (nfapresent) - fprintf(f, "%ld-%ld", (long)rt->left.begin->no, - (long)rt->left.end->no); - if (rt->left.subno > 0) - fprintf(f, " (%d)", rt->left.subno); - else if (rt->left.subno < 0) { - fprintf(f, " \\%d", -rt->left.subno); - if (rt->left.min != 1 || rt->left.max != 1) { - fprintf(f, "{%d-", (int)rt->left.min); - if (rt->left.max != INFINITY) - fprintf(f, "%d", (int)rt->left.max); - fprintf(f, "}"); - } - if (rt->left.tree != NULL) - fprintf(f, "(nonNULL tree!!)"); - } - if (rt->left.tree != NULL || !NULLCNFA(rt->left.cnfa)) - fprintf(f, ":"); - fprintf(f, "\n"); - if (!NULLCNFA(rt->left.cnfa)) - dumpcnfa(&rt->left.cnfa, f); - if (rt->left.tree != NULL) - rtdump(rt->left.tree, f, nfapresent, level+1); - } else if (rt->op == 'b') { - for (i = 0; i < level+1; i++) - fprintf(f, RTSEP); + fprintf(f, "%c (", t->op); + if (t->flags&LONGER) fprintf(f, "L"); - fprintf(f, "%s", (rt->left.prefer == NONEYET) ? "-" : - ((rt->left.prefer == LONGER) ? ">" : "<")); - assert(rt->left.subno < 0); - fprintf(f, " \\%d", -rt->left.subno); - if (rt->left.min != 1 || rt->left.max != 1) { - fprintf(f, "{%d-", (int)rt->left.min); - if (rt->left.max != INFINITY) - fprintf(f, "%d", (int)rt->left.max); - fprintf(f, "}"); - } - if (rt->left.tree != NULL) - fprintf(f, "(nonNULL tree!!)"); - fprintf(f, "\n"); - } - - if (rt->right.begin != NULL) { - if (rt->op != ',') - fprintf(f, "op %c has non-NULL right tree\n", rt->op); - for (i = 0; i < level+1; i++) - fprintf(f, RTSEP); - fprintf(f, "R"); - fprintf(f, "%s", (rt->right.prefer == NONEYET) ? "-" : - ((rt->right.prefer == LONGER) ? ">" : "<")); - if (nfapresent) - fprintf(f, "%ld-%ld", (long)rt->right.begin->no, - (long)rt->right.end->no); - if (rt->right.subno > 0) - fprintf(f, " (%d)", rt->right.subno); - else if (rt->right.subno < 0) { - fprintf(f, " \\%d", -rt->right.subno); - if (rt->right.min != 1 || rt->right.max != 1) { - fprintf(f, "{%d-", (int)rt->right.min); - if (rt->right.max != INFINITY) - fprintf(f, "%d", (int)rt->right.max); - fprintf(f, "}"); - } - if (rt->right.tree != NULL) - fprintf(f, "(nonNULL tree!!)"); - } - if (rt->right.tree != NULL || !NULLCNFA(rt->right.cnfa)) - fprintf(f, ":"); - fprintf(f, "\n"); - if (!NULLCNFA(rt->right.cnfa)) - dumpcnfa(&rt->right.cnfa, f); - if (rt->right.tree != NULL) - rtdump(rt->right.tree, f, nfapresent, level+1); - } - for (i = 0; i < level; i++) - fprintf(f, RTSEP); - fprintf(f, "}\n"); - - if (rt->next != NULL) { - if (rt->op != '|') - fprintf(f, "op %c has non-NULL next\n", rt->op); - if (rt->next->op != rt->op) - fprintf(f, "next op %c, expecting %c\n", rt->next->op, - rt->op); - rtdump(rt->next, f, nfapresent, level); + if (t->flags&SHORTER) + fprintf(f, "S"); + if (t->flags&MIXED) + fprintf(f, "M"); + if (t->flags&CAP) + fprintf(f, "c"); + if (t->flags&BACKR) + fprintf(f, "b"); + if (!(t->flags&INUSE)) + fprintf(f, "!u"); + fprintf(f, ") r%d", t->retry); + if (t->subno != 0) + fprintf(f, " #%d", t->subno); + if (t->min != 1 || t->max != 1) { + fprintf(f, "{%d,", t->min); + if (t->max != INFINITY) + fprintf(f, "%d", t->max); + fprintf(f, "}"); } + if (nfapresent) + fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no); + if (!NULLCNFA(t->cnfa)) + fprintf(f, ":"); + fprintf(f, "\n"); + if (t->left != NULL) + stdump(t->left, f, nfapresent, level+1); + if (!NULLCNFA(t->cnfa)) + dumpcnfa(&t->cnfa, f); + if (t->right != NULL) + stdump(t->right, f, nfapresent, level+1); } -#define COMPILE 1 #include "regc_lex.c" #include "regc_color.c" #include "regc_nfa.c" diff --git a/generic/regcustom.h b/generic/regcustom.h index 0fda25f..eeaafa0 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -37,17 +37,20 @@ #endif /* interface types */ #define __REG_WIDE_T Tcl_UniChar -#define __REG_WIDE_COMPILE re_ucomp -#define __REG_WIDE_EXEC re_uexec #define __REG_REGOFF_T long /* not really right, but good enough... */ #define __REG_VOID_T VOID #define __REG_CONST CONST +/* names and declarations */ +#define __REG_WIDE_COMPILE re_ucomp +#define __REG_WIDE_EXEC re_uexec #ifndef __REG_NOFRONT #define __REG_NOFRONT /* don't want regcomp() and regexec() */ #endif #ifndef __REG_NOCHAR #define __REG_NOCHAR /* or the char versions */ #endif +#define regfree re_ufree +#define regerror re_uerror /* --- end --- */ @@ -73,18 +76,11 @@ typedef int celt; /* type to hold chr, MCCE number, or NOCELT */ /* name the external functions */ #define compile re_ucomp #define exec re_uexec + +/* enable/disable debugging code (by whether REG_DEBUG is defined or not) */ #ifdef notdef -#define regfree re_ufree -#define regerror re_uerror +#define REG_DEBUG /* */ #endif -/* - * Implement a mistake in the original POSIX.2: in EREs, and only in EREs - * (AREs do not support this botch), an unbalanced right parenthesis is an - * ordinary character rather than an error. This was unintentional, and - * will be fixed someday. - */ -#define POSIX_MISTAKE /* sigh */ - /* and pick up the standard header */ #include "regex.h" diff --git a/generic/rege_dfa.c b/generic/rege_dfa.c new file mode 100644 index 0000000..6e4b941 --- /dev/null +++ b/generic/rege_dfa.c @@ -0,0 +1,583 @@ +/* + * DFA routines + * This file is #included by regexec.c. + */ + +/* + - longest - longest-preferred matching engine + ^ static chr *longest(struct vars *, struct dfa *, chr *, chr *); + */ +static chr * /* endpoint, or NULL */ +longest(v, d, start, stop) +struct vars *v; /* used only for debug and exec flags */ +struct dfa *d; +chr *start; /* where the match should start */ +chr *stop; /* match must end at or before here */ +{ + chr *cp; + chr *realstop = (stop == v->stop) ? stop : stop + 1; + color co; + struct sset *css; + struct sset *ss; + chr *post; + int i; + struct colormap *cm = d->cm; + + /* initialize */ + css = initialize(v, d, start); + cp = start; + + /* startup */ + FDEBUG(("+++ startup +++\n")); + if (cp == v->start) { + co = d->cnfa->bos[(v->eflags®_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + } else { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) + return NULL; + css->lastseen = cp; + + /* main loop */ + if (v->eflags®_FTRACE) + while (cp < realstop) { + FDEBUG(("+++ at c%d +++\n", css - d->ssets)); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + else + while (cp < realstop) { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + + /* shutdown */ + FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets)); + if (cp == v->stop && stop == v->stop) { + co = d->cnfa->eos[(v->eflags®_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + ss = miss(v, d, css, co, cp, start); + /* special case: match ended at eol? */ + if (ss != NULL && (ss->flags&POSTSTATE)) + return cp; + else if (ss != NULL) + ss->lastseen = cp; /* to be tidy */ + } + + /* find last match, if any */ + post = d->lastpost; + for (ss = d->ssets, i = 0; i < d->nssused; ss++, i++) + if ((ss->flags&POSTSTATE) && post != ss->lastseen && + (post == NULL || post < ss->lastseen)) + post = ss->lastseen; + if (post != NULL) /* found one */ + return post - 1; + + return NULL; +} + +/* + - shortest - shortest-preferred matching engine + ^ static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, + ^ chr **); + */ +static chr * /* endpoint, or NULL */ +shortest(v, d, start, min, max, coldp) +struct vars *v; /* used only for debug and exec flags */ +struct dfa *d; +chr *start; /* where the match should start */ +chr *min; /* match must end at or after here */ +chr *max; /* match must end at or before here */ +chr **coldp; /* store coldstart pointer here, if nonNULL */ +{ + chr *cp; + chr *realmin = (min == v->stop) ? min : min + 1; + chr *realmax = (max == v->stop) ? max : max + 1; + color co; + struct sset *css; + struct sset *firstss; + struct sset *ss; + struct colormap *cm = d->cm; + + /* initialize */ + css = initialize(v, d, start); + firstss = css; + cp = start; + + /* startup */ + FDEBUG(("--- startup ---\n")); + if (cp == v->start) { + co = d->cnfa->bos[(v->eflags®_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + } else { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) + return NULL; + css->lastseen = cp; + ss = css; + + /* main loop */ + if (v->eflags®_FTRACE) + while (cp < realmax) { + FDEBUG(("--- at c%d ---\n", css - d->ssets)); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags&POSTSTATE) && cp >= realmin) + break; /* NOTE BREAK OUT */ + } + else + while (cp < realmax) { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) { + ss = miss(v, d, css, co, cp+1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags&POSTSTATE) && cp >= realmin) + break; /* NOTE BREAK OUT */ + } + + if (ss == NULL) + return NULL; + if (ss->flags&POSTSTATE) { + assert(firstss->flags&STARTER); + assert(firstss->lastseen != NULL); + if (coldp != NULL) + *coldp = firstss->lastseen; + assert(cp >= realmin); + return cp - 1; + } + + /* shutdown */ + FDEBUG(("--- shutdown at c%d ---\n", css - d->ssets)); + if (cp == v->stop && max == v->stop) { + co = d->cnfa->eos[(v->eflags®_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long)co)); + ss = miss(v, d, css, co, cp, start); + /* special case: match ended at eol? */ + if (ss != NULL && (ss->flags&POSTSTATE)) { + assert(firstss->flags&STARTER); + assert(firstss->lastseen != NULL); + if (coldp != NULL) + *coldp = firstss->lastseen; + return cp; + } + } + + return NULL; +} + +/* + - newdfa - set up a fresh DFA + ^ static struct dfa *newdfa(struct vars *, struct cnfa *, + ^ struct colormap *); + */ +static struct dfa * +newdfa(v, cnfa, cm) +struct vars *v; +struct cnfa *cnfa; +struct colormap *cm; +{ + struct dfa *d = (struct dfa *)MALLOC(sizeof(struct dfa)); + int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; + struct sset *ss; + int i; + + assert(cnfa != NULL && cnfa->nstates != 0); + if (d == NULL) { + ERR(REG_ESPACE); + return NULL; + } + + d->ssets = (struct sset *)MALLOC(CACHE * sizeof(struct sset)); + d->statesarea = (unsigned *)MALLOC((CACHE+WORK) * wordsper * + sizeof(unsigned)); + d->work = &d->statesarea[CACHE * wordsper]; + d->outsarea = (struct sset **)MALLOC(CACHE * cnfa->ncolors * + sizeof(struct sset *)); + d->incarea = (struct arcp *)MALLOC(CACHE * cnfa->ncolors * + sizeof(struct arcp)); + if (d->ssets == NULL || d->statesarea == NULL || d->outsarea == NULL || + d->incarea == NULL) { + freedfa(d); + ERR(REG_ESPACE); + return NULL; + } + + d->nssets = (v->eflags®_SMALL) ? 7 : CACHE; + d->nssused = 0; + d->nstates = cnfa->nstates; + d->ncolors = cnfa->ncolors; + d->wordsper = wordsper; + d->cnfa = cnfa; + d->cm = cm; + d->lastpost = NULL; + d->search = d->ssets; + + for (ss = d->ssets, i = 0; i < d->nssets; ss++, i++) { + /* initialization of most fields is done as needed */ + ss->states = &d->statesarea[i * d->wordsper]; + ss->outs = &d->outsarea[i * d->ncolors]; + ss->inchain = &d->incarea[i * d->ncolors]; + } + + return d; +} + +/* + - freedfa - free a DFA + ^ static VOID freedfa(struct dfa *); + */ +static VOID +freedfa(d) +struct dfa *d; +{ + if (d->ssets != NULL) + FREE(d->ssets); + if (d->statesarea != NULL) + FREE(d->statesarea); + if (d->outsarea != NULL) + FREE(d->outsarea); + if (d->incarea != NULL) + FREE(d->incarea); + FREE(d); +} + +/* + - hash - construct a hash code for a bitvector + * There are probably better ways, but they're more expensive. + ^ static unsigned hash(unsigned *, int); + */ +static unsigned +hash(uv, n) +unsigned *uv; +int n; +{ + int i; + unsigned h; + + h = 0; + for (i = 0; i < n; i++) + h ^= uv[i]; + return h; +} + +/* + - initialize - hand-craft a cache entry for startup, otherwise get ready + ^ static struct sset *initialize(struct vars *, struct dfa *, chr *); + */ +static struct sset * +initialize(v, d, start) +struct vars *v; /* used only for debug flags */ +struct dfa *d; +chr *start; +{ + struct sset *ss; + int i; + + /* is previous one still there? */ + if (d->nssused > 0 && (d->ssets[0].flags&STARTER)) + ss = &d->ssets[0]; + else { /* no, must (re)build it */ + ss = getvacant(v, d, start, start); + for (i = 0; i < d->wordsper; i++) + ss->states[i] = 0; + BSET(ss->states, d->cnfa->pre); + ss->hash = hash(ss->states, d->wordsper); + assert(d->cnfa->pre != d->cnfa->post); + ss->flags = STARTER|LOCKED; + /* lastseen dealt with below */ + } + + for (i = 0; i < d->nssused; i++) + d->ssets[i].lastseen = NULL; + ss->lastseen = start; /* maybe untrue, but harmless */ + d->lastpost = NULL; + return ss; +} + +/* + - miss - handle a cache miss + ^ static struct sset *miss(struct vars *, struct dfa *, struct sset *, + ^ pcolor, chr *, chr *); + */ +static struct sset * /* NULL if goes to empty set */ +miss(v, d, css, co, cp, start) +struct vars *v; /* used only for debug flags */ +struct dfa *d; +struct sset *css; +pcolor co; +chr *cp; /* next chr */ +chr *start; /* where the attempt got started */ +{ + struct cnfa *cnfa = d->cnfa; + int i; + unsigned h; + struct carc *ca; + struct sset *p; + int ispost; + int gotstate; + int dolacons; + int didlacons; + + /* for convenience, we can be called even if it might not be a miss */ + if (css->outs[co] != NULL) { + FDEBUG(("hit\n")); + return css->outs[co]; + } + FDEBUG(("miss\n")); + + /* first, what set of states would we end up in? */ + for (i = 0; i < d->wordsper; i++) + d->work[i] = 0; + ispost = 0; + gotstate = 0; + for (i = 0; i < d->nstates; i++) + if (ISBSET(css->states, i)) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) + if (ca->co == co) { + BSET(d->work, ca->to); + gotstate = 1; + if (ca->to == cnfa->post) + ispost = 1; + FDEBUG(("%d -> %d\n", i, ca->to)); + } + dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0; + didlacons = 0; + while (dolacons) { /* transitive closure */ + dolacons = 0; + for (i = 0; i < d->nstates; i++) + if (ISBSET(d->work, i)) + for (ca = cnfa->states[i]; ca->co != COLORLESS; + ca++) + if (ca->co > cnfa->ncolors && + !ISBSET(d->work, ca->to) && + lacon(v, cnfa, cp, + ca->co)) { + BSET(d->work, ca->to); + dolacons = 1; + didlacons = 1; + if (ca->to == cnfa->post) + ispost = 1; + FDEBUG(("%d :> %d\n",i,ca->to)); + } + } + if (!gotstate) + return NULL; + h = hash(d->work, d->wordsper); + + /* next, is that in the cache? */ + for (p = d->ssets, i = d->nssused; i > 0; p++, i--) + if (p->hash == h && memcmp(VS(d->work), VS(p->states), + d->wordsper*sizeof(unsigned)) == 0) { + FDEBUG(("cached c%d\n", p - d->ssets)); + break; /* NOTE BREAK OUT */ + } + if (i == 0) { /* nope, need a new cache entry */ + p = getvacant(v, d, cp, start); + assert(p != css); + for (i = 0; i < d->wordsper; i++) + p->states[i] = d->work[i]; + p->hash = h; + p->flags = (ispost) ? POSTSTATE : 0; + /* lastseen to be dealt with by caller */ + } + + if (!didlacons) { /* lookahead conds. always cache miss */ + css->outs[co] = p; + css->inchain[co] = p->ins; + p->ins.ss = css; + p->ins.co = (color)co; + } + return p; +} + +/* + - lacon - lookahead-constraint checker for miss() + ^ static int lacon(struct vars *, struct cnfa *, chr *, pcolor); + */ +static int /* predicate: constraint satisfied? */ +lacon(v, pcnfa, cp, co) +struct vars *v; +struct cnfa *pcnfa; /* parent cnfa */ +chr *cp; +pcolor co; /* "color" of the lookahead constraint */ +{ + int n; + struct subre *sub; + struct dfa *d; + chr *end; + + n = co - pcnfa->ncolors; + assert(n < v->g->nlacons && v->g->lacons != NULL); + FDEBUG(("=== testing lacon %d\n", n)); + sub = &v->g->lacons[n]; + d = newdfa(v, &sub->cnfa, &v->g->cmap); + if (d == NULL) { + ERR(REG_ESPACE); + return 0; + } + end = longest(v, d, cp, v->stop); + freedfa(d); + FDEBUG(("=== lacon %d match %d\n", n, (end != NULL))); + return (sub->subno) ? (end != NULL) : (end == NULL); +} + +/* + - getvacant - get a vacant state set + * This routine clears out the inarcs and outarcs, but does not otherwise + * clear the innards of the state set -- that's up to the caller. + ^ static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); + */ +static struct sset * +getvacant(v, d, cp, start) +struct vars *v; /* used only for debug flags */ +struct dfa *d; +chr *cp; +chr *start; +{ + int i; + struct sset *ss; + struct sset *p; + struct arcp ap; + struct arcp lastap; + color co; + + ss = pickss(v, d, cp, start); + assert(!(ss->flags&LOCKED)); + + /* clear out its inarcs, including self-referential ones */ + ap = ss->ins; + while ((p = ap.ss) != NULL) { + co = ap.co; + FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co)); + p->outs[co] = NULL; + ap = p->inchain[co]; + p->inchain[co].ss = NULL; /* paranoia */ + } + ss->ins.ss = NULL; + + /* take it off the inarc chains of the ssets reached by its outarcs */ + for (i = 0; i < d->ncolors; i++) { + p = ss->outs[i]; + assert(p != ss); /* not self-referential */ + if (p == NULL) + continue; /* NOTE CONTINUE */ + FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets)); + if (p->ins.ss == ss && p->ins.co == i) + p->ins = ss->inchain[i]; + else { + assert(p->ins.ss != NULL); + for (ap = p->ins; ap.ss != NULL && + !(ap.ss == ss && ap.co == i); + ap = ap.ss->inchain[ap.co]) + lastap = ap; + assert(ap.ss != NULL); + lastap.ss->inchain[lastap.co] = ss->inchain[i]; + } + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + + /* if ss was a success state, may need to remember location */ + if ((ss->flags&POSTSTATE) && ss->lastseen != d->lastpost && + (d->lastpost == NULL || d->lastpost < ss->lastseen)) + d->lastpost = ss->lastseen; + + return ss; +} + +/* + - pickss - pick the next stateset to be used + ^ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); + */ +static struct sset * +pickss(v, d, cp, start) +struct vars *v; /* used only for debug flags */ +struct dfa *d; +chr *cp; +chr *start; +{ + int i; + struct sset *ss; + struct sset *end; + chr *ancient; + + /* shortcut for cases where cache isn't full */ + if (d->nssused < d->nssets) { + ss = &d->ssets[d->nssused]; + d->nssused++; + FDEBUG(("new c%d\n", ss - d->ssets)); + /* must make innards consistent */ + ss->ins.ss = NULL; + ss->ins.co = WHITE; /* give it some value */ + for (i = 0; i < d->ncolors; i++) { + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + ss->flags = 0; + return ss; + } + + /* look for oldest, or old enough anyway */ + if (cp - start > d->nssets*2/3) /* oldest 33% are expendable */ + ancient = cp - d->nssets*2/3; + else + ancient = start; + for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++) + if ((ss->lastseen == NULL || ss->lastseen < ancient) && + !(ss->flags&LOCKED)) { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", ss - d->ssets)); + return ss; + } + for (ss = d->ssets, end = d->search; ss < end; ss++) + if ((ss->lastseen == NULL || ss->lastseen < ancient) && + !(ss->flags&LOCKED)) { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", ss - d->ssets)); + return ss; + } + + /* nobody's old enough?!? -- something's really wrong */ + FDEBUG(("can't find victim to replace!\n")); + assert(NOTREACHED); + ERR(REG_ASSERT); + return d->ssets; +} diff --git a/generic/regerror.c b/generic/regerror.c index 5eb67a7..7fb2ef3 100644 --- a/generic/regerror.c +++ b/generic/regerror.c @@ -15,7 +15,7 @@ static struct rerr { } rerrs[] = { /* the actual table is built from regex.h */ # include "regerrs.h" - -1, "", "oops", /* explanation special-cased in code */ + { -1, "", "oops" }, /* explanation special-cased in code */ }; /* diff --git a/generic/regerrs.h b/generic/regerrs.h index 8298597..af2805c 100644 --- a/generic/regerrs.h +++ b/generic/regerrs.h @@ -1,19 +1,18 @@ -REG_OKAY, "REG_OKAY", "no errors detected", -REG_NOMATCH, "REG_NOMATCH", "failed to match", -REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.1)", -REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element", -REG_ECTYPE, "REG_ECTYPE", "invalid character class", -REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence", -REG_ESUBREG, "REG_ESUBREG", "invalid backreference number", -REG_EBRACK, "REG_EBRACK", "brackets [] not balanced", -REG_EPAREN, "REG_EPAREN", "parentheses () not balanced", -REG_EBRACE, "REG_EBRACE", "braces {} not balanced", -REG_BADBR, "REG_BADBR", "invalid repetition count(s)", -REG_ERANGE, "REG_ERANGE", "invalid character range", -REG_ESPACE, "REG_ESPACE", "out of memory", -REG_BADRPT, "REG_BADRPT", "quantifier operand invalid", -REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug", -REG_INVARG, "REG_INVARG", "invalid argument to regex function", -REG_MIXED, "REG_MIXED", "character widths of regex and string differ", -REG_BADOPT, "REG_BADOPT", "invalid embedded option", -REG_IMPOSS, "REG_IMPOSS", "can never match", +{ REG_OKAY, "REG_OKAY", "no errors detected" }, +{ REG_NOMATCH, "REG_NOMATCH", "failed to match" }, +{ REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.1)" }, +{ REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" }, +{ REG_ECTYPE, "REG_ECTYPE", "invalid character class" }, +{ REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence" }, +{ REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" }, +{ REG_EBRACK, "REG_EBRACK", "brackets [] not balanced" }, +{ REG_EPAREN, "REG_EPAREN", "parentheses () not balanced" }, +{ REG_EBRACE, "REG_EBRACE", "braces {} not balanced" }, +{ REG_BADBR, "REG_BADBR", "invalid repetition count(s)" }, +{ REG_ERANGE, "REG_ERANGE", "invalid character range" }, +{ REG_ESPACE, "REG_ESPACE", "out of memory" }, +{ REG_BADRPT, "REG_BADRPT", "quantifier operand invalid" }, +{ REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" }, +{ REG_INVARG, "REG_INVARG", "invalid argument to regex function" }, +{ REG_MIXED, "REG_MIXED", "character widths of regex and string differ" }, +{ REG_BADOPT, "REG_BADOPT", "invalid embedded option" }, diff --git a/generic/regex.h b/generic/regex.h index 6f61dd3..7ca6ad0 100644 --- a/generic/regex.h +++ b/generic/regex.h @@ -16,7 +16,7 @@ * suggestive of the wide type, e.g. re_ucomp and re_uexec for Unicode). * For cranky old compilers, it may be necessary to do something like: * #define __REG_WIDE_COMPILE(a,b,c,d) re_Xcomp(a,b,c,d) - * #define __REG_WIDE_EXEC(a,b,c,d,e,f) re_Xexec(a,b,c,d,e,f) + * #define __REG_WIDE_EXEC(a,b,c,d,e,f,g) re_Xexec(a,b,c,d,e,f,g) * rather than just #defining the names as parameterless macros. * * For some specialized purposes, it may be desirable to suppress the @@ -44,8 +44,14 @@ extern "C" { /* - * Add your own defines, if needed, here. The --- stuff is for automatic - * generation of this file from regproto.h and regcustom.h. + * Add your own defines, if needed, here. + */ + + + +/* + * Location where a chunk of regcustom.h is automatically spliced into + * this file (working from its prototype, regproto.h). */ /* --- begin --- */ /* ensure certain things don't sneak in from system headers */ @@ -69,17 +75,20 @@ extern "C" { #endif /* interface types */ #define __REG_WIDE_T Tcl_UniChar -#define __REG_WIDE_COMPILE re_ucomp -#define __REG_WIDE_EXEC re_uexec #define __REG_REGOFF_T long /* not really right, but good enough... */ #define __REG_VOID_T VOID #define __REG_CONST CONST +/* names and declarations */ +#define __REG_WIDE_COMPILE re_ucomp +#define __REG_WIDE_EXEC re_uexec #ifndef __REG_NOFRONT #define __REG_NOFRONT /* don't want regcomp() and regexec() */ #endif #ifndef __REG_NOCHAR #define __REG_NOCHAR /* or the char versions */ #endif +#define regfree re_ufree +#define regerror re_uerror /* --- end --- */ @@ -139,6 +148,7 @@ typedef struct { # define REG_UUNPORT 001000 # define REG_ULOCALE 002000 # define REG_UEMPTYMATCH 004000 +# define REG_UIMPOSSIBLE 010000 int re_csize; /* sizeof(character) */ char *re_endp; /* backward compatibility kludge */ /* the rest is opaque pointers to hidden innards */ @@ -249,7 +259,6 @@ typedef struct { #define REG_INVARG 16 /* invalid argument to regex function */ #define REG_MIXED 17 /* character widths of regex and string differ */ #define REG_BADOPT 18 /* invalid embedded option */ -#define REG_IMPOSS 19 /* can never match */ /* two specials for debugging and testing */ #define REG_ATOI 101 /* convert error-code name to number */ #define REG_ITOA 102 /* convert error-code number to name */ @@ -280,8 +289,8 @@ int regexec _ANSI_ARGS_((regex_t *, __REG_CONST char *, size_t, regmatch_t [], i #ifdef __REG_WIDE_T int __REG_WIDE_EXEC _ANSI_ARGS_((regex_t *, __REG_CONST __REG_WIDE_T *, size_t, rm_detail_t *, size_t, regmatch_t [], int)); #endif -re_void regfree _ANSI_ARGS_((regex_t *)); -extern size_t regerror _ANSI_ARGS_((int, __REG_CONST regex_t *, char *, size_t)); +re_void re_ufree _ANSI_ARGS_((regex_t *)); +extern size_t re_uerror _ANSI_ARGS_((int, __REG_CONST regex_t *, char *, size_t)); /* automatically gathered by fwd; do not hand-edit */ /* =====^!^===== end forwards =====^!^===== */ diff --git a/generic/regexec.c b/generic/regexec.c index 4220062..1671866 100644 --- a/generic/regexec.c +++ b/generic/regexec.c @@ -17,8 +17,6 @@ struct vars { chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ regoff_t *mem; /* memory vector for backtracking */ - regoff_t *mem1; /* localizer vector */ - regoff_t *mem2; /* dissector vector */ }; #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ #define ISERR() VISERR(v) @@ -79,21 +77,20 @@ struct dfa { int exec _ANSI_ARGS_((regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int)); static int find _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); static int cfind _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); -static VOID zapmatches _ANSI_ARGS_((regmatch_t *, size_t)); -static VOID zapmem _ANSI_ARGS_((struct vars *, struct rtree *)); +static VOID zapsubs _ANSI_ARGS_((regmatch_t *, size_t)); +static VOID zapmem _ANSI_ARGS_((struct vars *, struct subre *)); static VOID subset _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); -static int dissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static int altdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static int cdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static int crevdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static int csindissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static int cbrdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static int caltdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static chr *dismatch _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static chr *dismrev _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); -static chr *dismsin _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *)); +static int dissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +static int condissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +static int altdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +static int cdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +static int ccondissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +static int crevdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +static int cbrdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +static int caltdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); +/* === rege_dfa.c === */ static chr *longest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); -static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *)); +static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *, chr **)); static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); static VOID freedfa _ANSI_ARGS_((struct dfa *)); static unsigned hash _ANSI_ARGS_((unsigned *, int)); @@ -137,6 +134,8 @@ int flags; /* setup */ v->re = re; v->g = (struct guts *)re->re_guts; + if (v->g->unmatchable) + return REG_NOMATCH; complications = (v->g->info®_UBACKREF) ? 1 : 0; if (v->g->usedshorter) complications = 1; @@ -157,30 +156,30 @@ int flags; v->stop = (chr *)string + len; v->err = 0; if (complications) { - v->mem1 = (regoff_t *)MALLOC(2*v->g->ntree*sizeof(regoff_t)); - if (v->mem1 == NULL) { + v->mem = (regoff_t *)MALLOC(v->g->ntree*sizeof(regoff_t)); + if (v->mem == NULL) { if (v->pmatch != pmatch) FREE(v->pmatch); return REG_ESPACE; } - v->mem2 = v->mem1 + v->g->ntree; } else - v->mem1 = NULL; + v->mem = NULL; /* do it */ + assert(v->g->tree != NULL); if (complications) - st = cfind(v, &v->g->cnfa, v->g->cm); + st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); else - st = find(v, &v->g->cnfa, v->g->cm); + st = find(v, &v->g->tree->cnfa, &v->g->cmap); if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { - zapmatches(pmatch, nmatch); + zapsubs(pmatch, nmatch); n = (nmatch < v->nmatch) ? nmatch : v->nmatch; memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t)); } if (v->pmatch != pmatch) FREE(v->pmatch); - if (v->mem1 != NULL) - FREE(v->mem1); + if (v->mem != NULL) + FREE(v->mem); return st; } @@ -195,33 +194,57 @@ struct cnfa *cnfa; struct colormap *cm; { struct dfa *d = newdfa(v, cnfa, cm); + struct dfa *s = newdfa(v, &v->g->search, cm); chr *begin; chr *end; +#ifdef notdef + chr *open; /* open and close of range of possible starts */ + chr *close; +#endif chr *stop = (cnfa->flags&LEFTANCH) ? v->start : v->stop; if (d == NULL) return v->err; + if (s == NULL) { + freedfa(d); + return v->err; + } - for (begin = v->start; begin <= stop; begin++) { - MDEBUG(("\ntrying at %ld\n", (long)OFF(begin))); - end = longest(v, d, begin, v->stop); - if (end != NULL) { - if (v->nmatch > 0) { - v->pmatch[0].rm_so = OFF(begin); - v->pmatch[0].rm_eo = OFF(end); - } - freedfa(d); - if (v->nmatch > 1) { - zapmatches(v->pmatch, v->nmatch); - return dissect(v, v->g->tree, begin, end); +#ifdef notdef + close = v->start; + while (close < stop) { + MDEBUG(("\nsearch at %ld\n", (long)OFF(close))); + close = shortest(v, s, close, close, v->stop, &open); + if (close == NULL) + break; /* NOTE BREAK */ + MDEBUG(("between %ld and %ld\n", (long)OFF(open), + (long)OFF(close))); +#endif + for (begin = v->start; begin <= stop; begin++) { + end = longest(v, d, begin, v->stop); + if (end != NULL) { + if (v->nmatch > 0) { + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + } + freedfa(d); + freedfa(s); + if (v->nmatch > 1) { + zapsubs(v->pmatch, v->nmatch); + return dissect(v, v->g->tree, begin, + end); + } + if (ISERR()) + return v->err; + return REG_OKAY; } - if (ISERR()) - return v->err; - return REG_OKAY; } +#ifdef notdef } +#endif freedfa(d); + freedfa(s); if (ISERR()) return v->err; return REG_NOMATCH; @@ -241,33 +264,28 @@ struct colormap *cm; chr *begin; chr *end; chr *stop = (cnfa->flags&LEFTANCH) ? v->start : v->stop; + chr *estart; chr *estop; int er; - int usedis = (v->g->tree == NULL || v->g->tree->op == '|') ? 0 : 1; + int shorter = v->g->tree->flags&SHORTER; if (d == NULL) return v->err; - if (!v->g->usedshorter) - usedis = 0; for (begin = v->start; begin <= stop; begin++) { - MDEBUG(("\ntrying at %ld\n", (long)OFF(begin))); - if (usedis) { - v->mem = v->mem1; - zapmem(v, v->g->tree); - } + MDEBUG(("\ncfind trying at %ld\n", (long)OFF(begin))); + estart = begin; estop = v->stop; for (;;) { - if (usedis) { - v->mem = v->mem1; - end = dismatch(v, v->g->tree, begin, v->stop); - } else + if (shorter) + end = shortest(v, d, begin, estart, estop, + (chr **)NULL); + else end = longest(v, d, begin, estop); if (end == NULL) break; /* NOTE BREAK OUT */ MDEBUG(("tentative end %ld\n", (long)OFF(end))); - zapmatches(v->pmatch, v->nmatch); - v->mem = v->mem2; + zapsubs(v->pmatch, v->nmatch); zapmem(v, v->g->tree); er = cdissect(v, v->g->tree, begin, end); switch (er) { @@ -283,14 +301,15 @@ struct colormap *cm; break; case REG_NOMATCH: /* go around and try again */ - if (!usedis) { - if (end == begin) { - /* no point in trying again */ - freedfa(d); - return REG_NOMATCH; - } - estop = end - 1; + if ((shorter) ? end == estop : end == begin) { + /* no point in trying again */ + freedfa(d); + return REG_NOMATCH; } + if (shorter) + estart = end + 1; + else + estop = end - 1; break; default: freedfa(d); @@ -307,11 +326,11 @@ struct colormap *cm; } /* - - zapmatches - initialize the subexpression matches to "no match" - ^ static VOID zapmatches(regmatch_t *, size_t); + - zapsubs - initialize the subexpression matches to "no match" + ^ static VOID zapsubs(regmatch_t *, size_t); */ static VOID -zapmatches(p, n) +zapsubs(p, n) regmatch_t *p; size_t n; { @@ -325,33 +344,28 @@ size_t n; /* - zapmem - initialize the retry memory of a subtree to zeros - ^ static VOID zapmem(struct vars *, struct rtree *); + ^ static VOID zapmem(struct vars *, struct subre *); */ static VOID -zapmem(v, rt) +zapmem(v, t) struct vars *v; -struct rtree *rt; +struct subre *t; { - if (rt == NULL) + if (t == NULL) return; assert(v->mem != NULL); - v->mem[rt->no] = 0; - - if (rt->left.tree != NULL) - zapmem(v, rt->left.tree); - if (rt->left.subno > 0) { - v->pmatch[rt->left.subno].rm_so = -1; - v->pmatch[rt->left.subno].rm_eo = -1; + v->mem[t->retry] = 0; + if (t->op == '(') { + assert(t->subno > 0); + v->pmatch[t->subno].rm_so = -1; + v->pmatch[t->subno].rm_eo = -1; } - if (rt->right.tree != NULL) - zapmem(v, rt->right.tree); - if (rt->right.subno > 0) { - v->pmatch[rt->right.subno].rm_so = -1; - v->pmatch[rt->right.subno].rm_eo = -1; - } - if (rt->next != NULL) - zapmem(v, rt->next); + + if (t->left != NULL) + zapmem(v, t->left); + if (t->right != NULL) + zapmem(v, t->right); } /* @@ -367,8 +381,6 @@ chr *end; { int n = sub->subno; - if (n == 0) - return; assert(n > 0); if ((size_t)n >= v->nmatch) return; @@ -380,12 +392,54 @@ chr *end; /* - dissect - determine subexpression matches (uncomplicated case) - ^ static int dissect(struct vars *, struct rtree *, chr *, chr *); + ^ static int dissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +dissect(v, t, begin, end) +struct vars *v; +struct subre *t; +chr *begin; /* beginning of relevant substring */ +chr *end; /* end of same */ +{ + assert(t != NULL); + MDEBUG(("dissect %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); + + switch (t->op) { + case '=': /* terminal node */ + assert(t->left == NULL && t->right == NULL); + return REG_OKAY; /* no action, parent did the work */ + break; + case '|': /* alternation */ + assert(t->left != NULL); + return altdissect(v, t, begin, end); + break; + case 'b': /* back ref -- shouldn't be calling us! */ + return REG_ASSERT; + break; + case '.': /* concatenation */ + assert(t->left != NULL && t->right != NULL); + return condissect(v, t, begin, end); + break; + case '(': /* capturing */ + assert(t->left != NULL && t->right == NULL); + assert(t->subno > 0); + subset(v, t, begin, end); + return dissect(v, t->left, begin, end); + break; + default: + return REG_ASSERT; + break; + } +} + +/* + - condissect - determine concatenation subexpression matches (uncomplicated) + ^ static int condissect(struct vars *, struct subre *, chr *, chr *); */ static int /* regexec return code */ -dissect(v, rt, begin, end) +condissect(v, t, begin, end) struct vars *v; -struct rtree *rt; +struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { @@ -394,37 +448,14 @@ chr *end; /* end of same */ chr *mid; int i; - if (rt == NULL) - return REG_OKAY; - MDEBUG(("substring %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); - - /* alternatives -- punt to auxiliary */ - if (rt->op == '|') - return altdissect(v, rt, begin, end); + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); - /* concatenation -- need to split the substring between parts */ - assert(rt->op == ','); - assert(rt->left.cnfa.nstates > 0); - d = newdfa(v, &rt->left.cnfa, v->g->cm); + d = newdfa(v, &t->left->cnfa, &v->g->cmap); if (ISERR()) return v->err; - - /* in some cases, there may be no right side... */ - if (rt->right.cnfa.nstates == 0) { - MDEBUG(("singleton\n")); - if (longest(v, d, begin, end) != end) { - freedfa(d); - return REG_ASSERT; - } - freedfa(d); - assert(rt->left.subno >= 0); - subset(v, &rt->left, begin, end); - return dissect(v, rt->left.tree, begin, end); - } - - /* general case */ - assert(rt->right.cnfa.nstates > 0); - d2 = newdfa(v, &rt->right.cnfa, v->g->cm); + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap); if (ISERR()) { freedfa(d); return v->err; @@ -464,45 +495,39 @@ chr *end; /* end of same */ MDEBUG(("successful\n")); freedfa(d); freedfa(d2); - assert(rt->left.subno >= 0); - subset(v, &rt->left, begin, mid); - assert(rt->right.subno >= 0); - subset(v, &rt->right, mid, end); - i = dissect(v, rt->left.tree, begin, mid); + i = dissect(v, t->left, begin, mid); if (i != REG_OKAY) return i; - return dissect(v, rt->right.tree, mid, end); + return dissect(v, t->right, mid, end); } /* - altdissect - determine alternative subexpression matches (uncomplicated) - ^ static int altdissect(struct vars *, struct rtree *, chr *, chr *); + ^ static int altdissect(struct vars *, struct subre *, chr *, chr *); */ static int /* regexec return code */ -altdissect(v, rt, begin, end) +altdissect(v, t, begin, end) struct vars *v; -struct rtree *rt; +struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { struct dfa *d; int i; - assert(rt != NULL); - assert(rt->op == '|'); + assert(t != NULL); + assert(t->op == '|'); - for (i = 0; rt != NULL; rt = rt->next, i++) { + for (i = 0; t != NULL; t = t->right, i++) { MDEBUG(("trying %dth\n", i)); - assert(rt->left.begin != NULL); - d = newdfa(v, &rt->left.cnfa, v->g->cm); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + d = newdfa(v, &t->left->cnfa, &v->g->cmap); if (ISERR()) return v->err; if (longest(v, d, begin, end) == end) { MDEBUG(("success\n")); freedfa(d); - assert(rt->left.subno >= 0); - subset(v, &rt->left, begin, end); - return dissect(v, rt->left.tree, begin, end); + return dissect(v, t->left, begin, end); } freedfa(d); } @@ -513,12 +538,61 @@ chr *end; /* end of same */ - cdissect - determine subexpression matches (with complications) * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". - ^ static int cdissect(struct vars *, struct rtree *, chr *, chr *); + ^ static int cdissect(struct vars *, struct subre *, chr *, chr *); */ static int /* regexec return code */ -cdissect(v, rt, begin, end) +cdissect(v, t, begin, end) struct vars *v; -struct rtree *rt; +struct subre *t; +chr *begin; /* beginning of relevant substring */ +chr *end; /* end of same */ +{ + int er; + + assert(t != NULL); + MDEBUG(("cdissect %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); + + switch (t->op) { + case '=': /* terminal node */ + assert(t->left == NULL && t->right == NULL); + return REG_OKAY; /* no action, parent did the work */ + break; + case '|': /* alternation */ + assert(t->left != NULL); + return caltdissect(v, t, begin, end); + break; + case 'b': /* back ref -- shouldn't be calling us! */ + assert(t->left == NULL && t->right == NULL); + return cbrdissect(v, t, begin, end); + break; + case '.': /* concatenation */ + assert(t->left != NULL && t->right != NULL); + return ccondissect(v, t, begin, end); + break; + case '(': /* capturing */ + assert(t->left != NULL && t->right == NULL); + assert(t->subno > 0); + er = cdissect(v, t->left, begin, end); + if (er == REG_OKAY) + subset(v, t, begin, end); + return er; + break; + default: + return REG_ASSERT; + break; + } +} + +/* + - ccondissect - concatenation subexpression matches (with complications) + * The retry memory stores the offset of the trial midpoint from begin, + * plus 1 so that 0 uniquely means "clean slate". + ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *); + */ +static int /* regexec return code */ +ccondissect(v, t, begin, end) +struct vars *v; +struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { @@ -527,36 +601,25 @@ chr *end; /* end of same */ chr *mid; int er; - if (rt == NULL) - return REG_OKAY; - MDEBUG(("csubstr %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); - - /* punt various cases to auxiliaries */ - if (rt->op == '|') /* alternatives */ - return caltdissect(v, rt, begin, end); - if (rt->op == 'b') /* backref */ - return cbrdissect(v, rt, begin, end); - if (rt->right.cnfa.nstates == 0) /* no RHS */ - return csindissect(v, rt, begin, end); - if (rt->left.prefer == SHORTER) /* reverse scan */ - return crevdissect(v, rt, begin, end); + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); - /* concatenation -- need to split the substring between parts */ - assert(rt->op == ','); - assert(rt->left.cnfa.nstates > 0); - assert(rt->right.cnfa.nstates > 0); - d = newdfa(v, &rt->left.cnfa, v->g->cm); + if (t->left->flags&SHORTER) /* reverse scan */ + return crevdissect(v, t, begin, end); + + d = newdfa(v, &t->left->cnfa, &v->g->cmap); if (ISERR()) return v->err; - d2 = newdfa(v, &rt->right.cnfa, v->g->cm); + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap); if (ISERR()) { freedfa(d); return v->err; } - MDEBUG(("cconcat %d\n", rt->no)); + MDEBUG(("cconcat %d\n", t->retry)); /* pick a tentative midpoint */ - if (v->mem[rt->no] == 0) { + if (v->mem[t->retry] == 0) { mid = longest(v, d, begin, end); if (mid == NULL) { freedfa(d); @@ -564,19 +627,18 @@ chr *end; /* end of same */ return REG_NOMATCH; } MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid))); - subset(v, &rt->left, begin, mid); - v->mem[rt->no] = (mid - begin) + 1; + v->mem[t->retry] = (mid - begin) + 1; } else { - mid = begin + (v->mem[rt->no] - 1); + mid = begin + (v->mem[t->retry] - 1); MDEBUG(("working midpoint %ld\n", (long)OFF(mid))); } /* iterate until satisfaction or failure */ for (;;) { /* try this midpoint on for size */ - er = cdissect(v, rt->left.tree, begin, mid); + er = cdissect(v, t->left, begin, mid); if (er == REG_OKAY && longest(v, d2, mid, end) == end && - (er = cdissect(v, rt->right.tree, mid, end)) == + (er = cdissect(v, t->right, mid, end)) == REG_OKAY) break; /* NOTE BREAK OUT */ if (er != REG_OKAY && er != REG_NOMATCH) { @@ -588,7 +650,7 @@ chr *end; /* end of same */ /* that midpoint didn't work, find a new one */ if (mid == begin) { /* all possibilities exhausted */ - MDEBUG(("%d no midpoint\n", rt->no)); + MDEBUG(("%d no midpoint\n", t->retry)); freedfa(d); freedfa(d2); return REG_NOMATCH; @@ -596,23 +658,21 @@ chr *end; /* end of same */ mid = longest(v, d, begin, mid-1); if (mid == NULL) { /* failed to find a new one */ - MDEBUG(("%d failed midpoint\n", rt->no)); + MDEBUG(("%d failed midpoint\n", t->retry)); freedfa(d); freedfa(d2); return REG_NOMATCH; } - MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid))); - subset(v, &rt->left, begin, mid); - v->mem[rt->no] = (mid - begin) + 1; - zapmem(v, rt->left.tree); - zapmem(v, rt->right.tree); + MDEBUG(("%d: new midpoint %ld\n", t->retry, (long)OFF(mid))); + v->mem[t->retry] = (mid - begin) + 1; + zapmem(v, t->left); + zapmem(v, t->right); } /* satisfaction */ MDEBUG(("successful\n")); freedfa(d); freedfa(d2); - subset(v, &rt->right, mid, end); return REG_OKAY; } @@ -620,12 +680,12 @@ chr *end; /* end of same */ - crevdissect - determine shortest-first subexpression matches * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". - ^ static int crevdissect(struct vars *, struct rtree *, chr *, chr *); + ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *); */ static int /* regexec return code */ -crevdissect(v, rt, begin, end) +crevdissect(v, t, begin, end) struct vars *v; -struct rtree *rt; +struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { @@ -634,45 +694,43 @@ chr *end; /* end of same */ chr *mid; int er; - if (rt == NULL) - return REG_OKAY; - assert(rt->op == ',' && rt->left.prefer == SHORTER); + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); + assert(t->left->flags&SHORTER); /* concatenation -- need to split the substring between parts */ - assert(rt->left.cnfa.nstates > 0); - assert(rt->right.cnfa.nstates > 0); - d = newdfa(v, &rt->left.cnfa, v->g->cm); + d = newdfa(v, &t->left->cnfa, &v->g->cmap); if (ISERR()) return v->err; - d2 = newdfa(v, &rt->right.cnfa, v->g->cm); + d2 = newdfa(v, &t->right->cnfa, &v->g->cmap); if (ISERR()) { freedfa(d); return v->err; } - MDEBUG(("crev %d\n", rt->no)); + MDEBUG(("crev %d\n", t->retry)); /* pick a tentative midpoint */ - if (v->mem[rt->no] == 0) { - mid = shortest(v, d, begin, begin, end); + if (v->mem[t->retry] == 0) { + mid = shortest(v, d, begin, begin, end, (chr **)NULL); if (mid == NULL) { freedfa(d); freedfa(d2); return REG_NOMATCH; } MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid))); - subset(v, &rt->left, begin, mid); - v->mem[rt->no] = (mid - begin) + 1; + v->mem[t->retry] = (mid - begin) + 1; } else { - mid = begin + (v->mem[rt->no] - 1); + mid = begin + (v->mem[t->retry] - 1); MDEBUG(("working midpoint %ld\n", (long)OFF(mid))); } /* iterate until satisfaction or failure */ for (;;) { /* try this midpoint on for size */ - er = cdissect(v, rt->left.tree, begin, mid); + er = cdissect(v, t->left, begin, mid); if (er == REG_OKAY && longest(v, d2, mid, end) == end && - (er = cdissect(v, rt->right.tree, mid, end)) == + (er = cdissect(v, t->right, mid, end)) == REG_OKAY) break; /* NOTE BREAK OUT */ if (er != REG_OKAY && er != REG_NOMATCH) { @@ -684,101 +742,58 @@ chr *end; /* end of same */ /* that midpoint didn't work, find a new one */ if (mid == end) { /* all possibilities exhausted */ - MDEBUG(("%d no midpoint\n", rt->no)); + MDEBUG(("%d no midpoint\n", t->retry)); freedfa(d); freedfa(d2); return REG_NOMATCH; } - mid = shortest(v, d, begin, mid+1, end); + mid = shortest(v, d, begin, mid+1, end, (chr **)NULL); if (mid == NULL) { /* failed to find a new one */ - MDEBUG(("%d failed midpoint\n", rt->no)); + MDEBUG(("%d failed midpoint\n", t->retry)); freedfa(d); freedfa(d2); return REG_NOMATCH; } - MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid))); - subset(v, &rt->left, begin, mid); - v->mem[rt->no] = (mid - begin) + 1; - zapmem(v, rt->left.tree); - zapmem(v, rt->right.tree); + MDEBUG(("%d: new midpoint %ld\n", t->retry, (long)OFF(mid))); + v->mem[t->retry] = (mid - begin) + 1; + zapmem(v, t->left); + zapmem(v, t->right); } /* satisfaction */ MDEBUG(("successful\n")); freedfa(d); freedfa(d2); - subset(v, &rt->right, mid, end); - return REG_OKAY; -} - -/* - - csindissect - determine singleton subexpression matches (with complications) - ^ static int csindissect(struct vars *, struct rtree *, chr *, chr *); - */ -static int /* regexec return code */ -csindissect(v, rt, begin, end) -struct vars *v; -struct rtree *rt; -chr *begin; /* beginning of relevant substring */ -chr *end; /* end of same */ -{ - struct dfa *d; - int er; - - assert(rt != NULL); - assert(rt->op == ','); - assert(rt->right.cnfa.nstates == 0); - MDEBUG(("csingleton %d\n", rt->no)); - - assert(rt->left.cnfa.nstates > 0); - - /* exploit memory only to suppress repeated work in retries */ - if (!v->mem[rt->no]) { - d = newdfa(v, &rt->left.cnfa, v->g->cm); - if (longest(v, d, begin, end) != end) { - freedfa(d); - return REG_NOMATCH; - } - freedfa(d); - v->mem[rt->no] = 1; - MDEBUG(("csingleton matched\n")); - } - - er = cdissect(v, rt->left.tree, begin, end); - if (er != REG_OKAY) - return er; - subset(v, &rt->left, begin, end); return REG_OKAY; } /* - cbrdissect - determine backref subexpression matches - ^ static int cbrdissect(struct vars *, struct rtree *, chr *, chr *); + ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *); */ static int /* regexec return code */ -cbrdissect(v, rt, begin, end) +cbrdissect(v, t, begin, end) struct vars *v; -struct rtree *rt; +struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { int i; - int n = -rt->left.subno; + int n = t->subno; size_t len; chr *paren; chr *p; chr *stop; - int min = rt->left.min; - int max = rt->left.max; + int min = t->min; + int max = t->max; - assert(rt != NULL); - assert(rt->op == 'b'); - assert(rt->right.cnfa.nstates == 0); + assert(t != NULL); + assert(t->op == 'b'); assert(n >= 0); assert((size_t)n < v->nmatch); - MDEBUG(("cbackref n%d %d{%d-%d}\n", rt->no, n, min, max)); + MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); if (v->pmatch[n].rm_so == -1) return REG_NOMATCH; @@ -786,9 +801,9 @@ chr *end; /* end of same */ len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; /* no room to maneuver -- retries are pointless */ - if (v->mem[rt->no]) + if (v->mem[t->retry]) return REG_NOMATCH; - v->mem[rt->no] = 1; + v->mem[t->retry] = 1; /* special-case zero-length string */ if (len == 0) { @@ -822,12 +837,12 @@ chr *end; /* end of same */ /* - caltdissect - determine alternative subexpression matches (w. complications) - ^ static int caltdissect(struct vars *, struct rtree *, chr *, chr *); + ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *); */ static int /* regexec return code */ -caltdissect(v, rt, begin, end) +caltdissect(v, t, begin, end) struct vars *v; -struct rtree *rt; +struct subre *t; chr *begin; /* beginning of relevant substring */ chr *end; /* end of same */ { @@ -837,839 +852,37 @@ chr *end; /* end of same */ # define TRYING 1 /* top matched, trying submatches */ # define TRIED 2 /* top didn't match or submatches exhausted */ - if (rt == NULL) + if (t == NULL) return REG_NOMATCH; - assert(rt->op == '|'); - if (v->mem[rt->no] == TRIED) - return caltdissect(v, rt->next, begin, end); + assert(t->op == '|'); + if (v->mem[t->retry] == TRIED) + return caltdissect(v, t->right, begin, end); - MDEBUG(("calt n%d\n", rt->no)); - assert(rt->left.begin != NULL); + MDEBUG(("calt n%d\n", t->retry)); + assert(t->left != NULL); - if (v->mem[rt->no] == UNTRIED) { - d = newdfa(v, &rt->left.cnfa, v->g->cm); + if (v->mem[t->retry] == UNTRIED) { + d = newdfa(v, &t->left->cnfa, &v->g->cmap); if (ISERR()) return v->err; if (longest(v, d, begin, end) != end) { freedfa(d); - v->mem[rt->no] = TRIED; - return caltdissect(v, rt->next, begin, end); + v->mem[t->retry] = TRIED; + return caltdissect(v, t->right, begin, end); } freedfa(d); MDEBUG(("calt matched\n")); - v->mem[rt->no] = TRYING; + v->mem[t->retry] = TRYING; } - er = cdissect(v, rt->left.tree, begin, end); - if (er == REG_OKAY) { - subset(v, &rt->left, begin, end); - return REG_OKAY; - } + er = cdissect(v, t->left, begin, end); if (er != REG_NOMATCH) return er; - v->mem[rt->no] = TRIED; - return caltdissect(v, rt->next, begin, end); -} - -/* - - dismatch - determine overall match using top-level dissection - * The retry memory stores the offset of the trial midpoint from begin, - * plus 1 so that 0 uniquely means "clean slate". - ^ static chr *dismatch(struct vars *, struct rtree *, chr *, chr *); - */ -static chr * /* endpoint, or NULL */ -dismatch(v, rt, begin, end) -struct vars *v; -struct rtree *rt; -chr *begin; /* beginning of relevant substring */ -chr *end; /* end of same */ -{ - struct dfa *d; - struct dfa *d2; - chr *mid; - chr *ret; - - if (rt == NULL) - return begin; - MDEBUG(("dsubstr %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); - - /* punt various cases to auxiliaries */ - if (rt->right.cnfa.nstates == 0) /* no RHS */ - return dismsin(v, rt, begin, end); - if (rt->left.prefer == SHORTER) /* reverse scan */ - return dismrev(v, rt, begin, end); - - /* concatenation -- need to split the substring between parts */ - assert(rt->op == ','); - assert(rt->left.cnfa.nstates > 0); - assert(rt->right.cnfa.nstates > 0); - d = newdfa(v, &rt->left.cnfa, v->g->cm); - if (ISERR()) - return NULL; - d2 = newdfa(v, &rt->right.cnfa, v->g->cm); - if (ISERR()) { - freedfa(d); - return NULL; - } - MDEBUG(("dconcat %d\n", rt->no)); - - /* pick a tentative midpoint */ - if (v->mem[rt->no] == 0) { - mid = longest(v, d, begin, end); - if (mid == NULL) { - freedfa(d); - freedfa(d2); - return NULL; - } - MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid))); - v->mem[rt->no] = (mid - begin) + 1; - } else { - mid = begin + (v->mem[rt->no] - 1); - MDEBUG(("working midpoint %ld\n", (long)OFF(mid))); - } - - /* iterate until satisfaction or failure */ - for (;;) { - /* try this midpoint on for size */ - if (rt->right.tree == NULL || rt->right.tree->op == 'b') { - if (rt->right.prefer == LONGER) - ret = longest(v, d2, mid, end); - else - ret = shortest(v, d2, mid, mid, end); - } else { - if (longest(v, d2, mid, end) != NULL) - ret = dismatch(v, rt->right.tree, mid, end); - else - ret = NULL; - } - if (ret != NULL) - break; /* NOTE BREAK OUT */ - - /* that midpoint didn't work, find a new one */ - if (mid == begin) { - /* all possibilities exhausted */ - MDEBUG(("%d no midpoint\n", rt->no)); - freedfa(d); - freedfa(d2); - return NULL; - } - mid = longest(v, d, begin, mid-1); - if (mid == NULL) { - /* failed to find a new one */ - MDEBUG(("%d failed midpoint\n", rt->no)); - freedfa(d); - freedfa(d2); - return NULL; - } - MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid))); - v->mem[rt->no] = (mid - begin) + 1; - zapmem(v, rt->right.tree); - } - - /* satisfaction */ - MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); - return ret; -} - -/* - - dismrev - determine overall match using top-level dissection - * The retry memory stores the offset of the trial midpoint from begin, - * plus 1 so that 0 uniquely means "clean slate". - ^ static chr *dismrev(struct vars *, struct rtree *, chr *, chr *); - */ -static chr * /* endpoint, or NULL */ -dismrev(v, rt, begin, end) -struct vars *v; -struct rtree *rt; -chr *begin; /* beginning of relevant substring */ -chr *end; /* end of same */ -{ - struct dfa *d; - struct dfa *d2; - chr *mid; - chr *ret; - - if (rt == NULL) - return begin; - MDEBUG(("rsubstr %ld-%ld\n", (long)OFF(begin), (long)OFF(end))); - - /* concatenation -- need to split the substring between parts */ - assert(rt->op == ','); - assert(rt->left.cnfa.nstates > 0); - assert(rt->right.cnfa.nstates > 0); - d = newdfa(v, &rt->left.cnfa, v->g->cm); - if (ISERR()) - return NULL; - d2 = newdfa(v, &rt->right.cnfa, v->g->cm); - if (ISERR()) { - freedfa(d); - return NULL; - } - MDEBUG(("dconcat %d\n", rt->no)); - - /* pick a tentative midpoint */ - if (v->mem[rt->no] == 0) { - mid = shortest(v, d, begin, begin, end); - if (mid == NULL) { - freedfa(d); - freedfa(d2); - return NULL; - } - MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid))); - v->mem[rt->no] = (mid - begin) + 1; - } else { - mid = begin + (v->mem[rt->no] - 1); - MDEBUG(("working midpoint %ld\n", (long)OFF(mid))); - } - - /* iterate until satisfaction or failure */ - for (;;) { - /* try this midpoint on for size */ - if (rt->right.tree == NULL || rt->right.tree->op == 'b') { - if (rt->right.prefer == LONGER) - ret = longest(v, d2, mid, end); - else - ret = shortest(v, d2, mid, mid, end); - } else { - if (longest(v, d2, mid, end) != NULL) - ret = dismatch(v, rt->right.tree, mid, end); - else - ret = NULL; - } - if (ret != NULL) - break; /* NOTE BREAK OUT */ - - /* that midpoint didn't work, find a new one */ - if (mid == end) { - /* all possibilities exhausted */ - MDEBUG(("%d no midpoint\n", rt->no)); - freedfa(d); - freedfa(d2); - return NULL; - } - mid = shortest(v, d, begin, mid+1, end); - if (mid == NULL) { - /* failed to find a new one */ - MDEBUG(("%d failed midpoint\n", rt->no)); - freedfa(d); - freedfa(d2); - return NULL; - } - MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid))); - v->mem[rt->no] = (mid - begin) + 1; - zapmem(v, rt->right.tree); - } - - /* satisfaction */ - MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); - return ret; -} - -/* - - dismsin - determine singleton subexpression matches (with complications) - ^ static chr *dismsin(struct vars *, struct rtree *, chr *, chr *); - */ -static chr * -dismsin(v, rt, begin, end) -struct vars *v; -struct rtree *rt; -chr *begin; /* beginning of relevant substring */ -chr *end; /* end of same */ -{ - struct dfa *d; - chr *ret; - - assert(rt != NULL); - assert(rt->op == ','); - assert(rt->right.cnfa.nstates == 0); - MDEBUG(("dsingleton %d\n", rt->no)); - - assert(rt->left.cnfa.nstates > 0); - - /* retries are pointless */ - if (v->mem[rt->no]) - return NULL; - v->mem[rt->no] = 1; - - d = newdfa(v, &rt->left.cnfa, v->g->cm); - if (d == NULL) - return NULL; - if (rt->left.prefer == LONGER) - ret = longest(v, d, begin, end); - else - ret = shortest(v, d, begin, begin, end); - freedfa(d); - if (ret != NULL) - MDEBUG(("dsingleton matched\n")); - return ret; -} - -/* - - longest - longest-preferred matching engine - ^ static chr *longest(struct vars *, struct dfa *, chr *, chr *); - */ -static chr * /* endpoint, or NULL */ -longest(v, d, start, stop) -struct vars *v; /* used only for debug and exec flags */ -struct dfa *d; -chr *start; /* where the match should start */ -chr *stop; /* match must end at or before here */ -{ - chr *cp; - chr *realstop = (stop == v->stop) ? stop : stop + 1; - color co; - struct sset *css; - struct sset *ss; - chr *post; - int i; - struct colormap *cm = d->cm; - - /* initialize */ - css = initialize(v, d, start); - cp = start; - - /* startup */ - FDEBUG(("+++ startup +++\n")); - if (cp == v->start) { - co = d->cnfa->bos[(v->eflags®_NOTBOL) ? 0 : 1]; - FDEBUG(("color %ld\n", (long)co)); - } else { - co = GETCOLOR(cm, *(cp - 1)); - FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); - } - css = miss(v, d, css, co, cp, start); - if (css == NULL) - return NULL; - css->lastseen = cp; - - /* main loop */ - if (v->eflags®_FTRACE) - while (cp < realstop) { - FDEBUG(("+++ at c%d +++\n", css - d->ssets)); - co = GETCOLOR(cm, *cp); - FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); - ss = css->outs[co]; - if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); - if (ss == NULL) - break; /* NOTE BREAK OUT */ - } - cp++; - ss->lastseen = cp; - css = ss; - } - else - while (cp < realstop) { - co = GETCOLOR(cm, *cp); - ss = css->outs[co]; - if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); - if (ss == NULL) - break; /* NOTE BREAK OUT */ - } - cp++; - ss->lastseen = cp; - css = ss; - } - - /* shutdown */ - FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets)); - if (cp == v->stop && stop == v->stop) { - co = d->cnfa->eos[(v->eflags®_NOTEOL) ? 0 : 1]; - FDEBUG(("color %ld\n", (long)co)); - ss = miss(v, d, css, co, cp, start); - /* special case: match ended at eol? */ - if (ss != NULL && (ss->flags&POSTSTATE)) - return cp; - else if (ss != NULL) - ss->lastseen = cp; /* to be tidy */ - } - - /* find last match, if any */ - post = d->lastpost; - for (ss = d->ssets, i = 0; i < d->nssused; ss++, i++) - if ((ss->flags&POSTSTATE) && post != ss->lastseen && - (post == NULL || post < ss->lastseen)) - post = ss->lastseen; - if (post != NULL) /* found one */ - return post - 1; - - return NULL; -} - -/* - - shortest - shortest-preferred matching engine - ^ static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *); - */ -static chr * /* endpoint, or NULL */ -shortest(v, d, start, min, max) -struct vars *v; /* used only for debug and exec flags */ -struct dfa *d; -chr *start; /* where the match should start */ -chr *min; /* match must end at or after here */ -chr *max; /* match must end at or before here */ -{ - chr *cp; - chr *realmin = (min == v->stop) ? min : min + 1; - chr *realmax = (max == v->stop) ? max : max + 1; - color co; - struct sset *css; - struct sset *ss; - struct colormap *cm = d->cm; - - /* initialize */ - css = initialize(v, d, start); - cp = start; - - /* startup */ - FDEBUG(("--- startup ---\n")); - if (cp == v->start) { - co = d->cnfa->bos[(v->eflags®_NOTBOL) ? 0 : 1]; - FDEBUG(("color %ld\n", (long)co)); - } else { - co = GETCOLOR(cm, *(cp - 1)); - FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co)); - } - css = miss(v, d, css, co, cp, start); - if (css == NULL) - return NULL; - css->lastseen = cp; - ss = css; - - /* main loop */ - if (v->eflags®_FTRACE) - while (cp < realmax) { - FDEBUG(("--- at c%d ---\n", css - d->ssets)); - co = GETCOLOR(cm, *cp); - FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co)); - ss = css->outs[co]; - if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); - if (ss == NULL) - break; /* NOTE BREAK OUT */ - } - cp++; - ss->lastseen = cp; - css = ss; - if ((ss->flags&POSTSTATE) && cp >= realmin) - break; /* NOTE BREAK OUT */ - } - else - while (cp < realmax) { - co = GETCOLOR(cm, *cp); - ss = css->outs[co]; - if (ss == NULL) { - ss = miss(v, d, css, co, cp+1, start); - if (ss == NULL) - break; /* NOTE BREAK OUT */ - } - cp++; - ss->lastseen = cp; - css = ss; - if ((ss->flags&POSTSTATE) && cp >= realmin) - break; /* NOTE BREAK OUT */ - } - - if (ss == NULL) - return NULL; - if (ss->flags&POSTSTATE) { - assert(cp >= realmin); - return cp - 1; - } - - /* shutdown */ - FDEBUG(("--- shutdown at c%d ---\n", css - d->ssets)); - if (cp == v->stop && max == v->stop) { - co = d->cnfa->eos[(v->eflags®_NOTEOL) ? 0 : 1]; - FDEBUG(("color %ld\n", (long)co)); - ss = miss(v, d, css, co, cp, start); - /* special case: match ended at eol? */ - if (ss != NULL && (ss->flags&POSTSTATE)) - return cp; - } - - return NULL; -} - -/* - - newdfa - set up a fresh DFA - ^ static struct dfa *newdfa(struct vars *, struct cnfa *, - ^ struct colormap *); - */ -static struct dfa * -newdfa(v, cnfa, cm) -struct vars *v; -struct cnfa *cnfa; -struct colormap *cm; -{ - struct dfa *d = (struct dfa *)MALLOC(sizeof(struct dfa)); - int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; - struct sset *ss; - int i; - - assert(cnfa != NULL && cnfa->nstates != 0); - if (d == NULL) { - ERR(REG_ESPACE); - return NULL; - } - - d->ssets = (struct sset *)MALLOC(CACHE * sizeof(struct sset)); - d->statesarea = (unsigned *)MALLOC((CACHE+WORK) * wordsper * - sizeof(unsigned)); - d->work = &d->statesarea[CACHE * wordsper]; - d->outsarea = (struct sset **)MALLOC(CACHE * cnfa->ncolors * - sizeof(struct sset *)); - d->incarea = (struct arcp *)MALLOC(CACHE * cnfa->ncolors * - sizeof(struct arcp)); - if (d->ssets == NULL || d->statesarea == NULL || d->outsarea == NULL || - d->incarea == NULL) { - freedfa(d); - ERR(REG_ESPACE); - return NULL; - } - - d->nssets = (v->eflags®_SMALL) ? 5 : CACHE; - d->nssused = 0; - d->nstates = cnfa->nstates; - d->ncolors = cnfa->ncolors; - d->wordsper = wordsper; - d->cnfa = cnfa; - d->cm = cm; - d->lastpost = NULL; - d->search = d->ssets; - - for (ss = d->ssets, i = 0; i < d->nssets; ss++, i++) { - /* initialization of most fields is done as needed */ - ss->states = &d->statesarea[i * d->wordsper]; - ss->outs = &d->outsarea[i * d->ncolors]; - ss->inchain = &d->incarea[i * d->ncolors]; - } - - return d; -} - -/* - - freedfa - free a DFA - ^ static VOID freedfa(struct dfa *); - */ -static VOID -freedfa(d) -struct dfa *d; -{ - if (d->ssets != NULL) - FREE(d->ssets); - if (d->statesarea != NULL) - FREE(d->statesarea); - if (d->outsarea != NULL) - FREE(d->outsarea); - if (d->incarea != NULL) - FREE(d->incarea); - FREE(d); -} - -/* - - hash - construct a hash code for a bitvector - * There are probably better ways, but they're more expensive. - ^ static unsigned hash(unsigned *, int); - */ -static unsigned -hash(uv, n) -unsigned *uv; -int n; -{ - int i; - unsigned h; - - h = 0; - for (i = 0; i < n; i++) - h ^= uv[i]; - return h; -} - -/* - - initialize - hand-craft a cache entry for startup, otherwise get ready - ^ static struct sset *initialize(struct vars *, struct dfa *, chr *); - */ -static struct sset * -initialize(v, d, start) -struct vars *v; /* used only for debug flags */ -struct dfa *d; -chr *start; -{ - struct sset *ss; - int i; - - /* is previous one still there? */ - if (d->nssused > 0 && (d->ssets[0].flags&STARTER)) - ss = &d->ssets[0]; - else { /* no, must (re)build it */ - ss = getvacant(v, d, start, start); - for (i = 0; i < d->wordsper; i++) - ss->states[i] = 0; - BSET(ss->states, d->cnfa->pre); - ss->hash = hash(ss->states, d->wordsper); - assert(d->cnfa->pre != d->cnfa->post); - ss->flags = STARTER; - /* lastseen dealt with below */ - } - - for (i = 0; i < d->nssused; i++) - d->ssets[i].lastseen = NULL; - ss->lastseen = start; /* maybe untrue, but harmless */ - d->lastpost = NULL; - return ss; -} - -/* - - miss - handle a cache miss - ^ static struct sset *miss(struct vars *, struct dfa *, struct sset *, - ^ pcolor, chr *, chr *); - */ -static struct sset * /* NULL if goes to empty set */ -miss(v, d, css, co, cp, start) -struct vars *v; /* used only for debug flags */ -struct dfa *d; -struct sset *css; -pcolor co; -chr *cp; /* next chr */ -chr *start; /* where the attempt got started */ -{ - struct cnfa *cnfa = d->cnfa; - int i; - unsigned h; - struct carc *ca; - struct sset *p; - int ispost; - int gotstate; - int dolacons; - int didlacons; - - /* for convenience, we can be called even if it might not be a miss */ - if (css->outs[co] != NULL) { - FDEBUG(("hit\n")); - return css->outs[co]; - } - FDEBUG(("miss\n")); - - /* first, what set of states would we end up in? */ - for (i = 0; i < d->wordsper; i++) - d->work[i] = 0; - ispost = 0; - gotstate = 0; - for (i = 0; i < d->nstates; i++) - if (ISBSET(css->states, i)) - for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) - if (ca->co == co) { - BSET(d->work, ca->to); - gotstate = 1; - if (ca->to == cnfa->post) - ispost = 1; - FDEBUG(("%d -> %d\n", i, ca->to)); - } - dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0; - didlacons = 0; - while (dolacons) { /* transitive closure */ - dolacons = 0; - for (i = 0; i < d->nstates; i++) - if (ISBSET(d->work, i)) - for (ca = cnfa->states[i]; ca->co != COLORLESS; - ca++) - if (ca->co > cnfa->ncolors && - !ISBSET(d->work, ca->to) && - lacon(v, cnfa, cp, - ca->co)) { - BSET(d->work, ca->to); - dolacons = 1; - didlacons = 1; - if (ca->to == cnfa->post) - ispost = 1; - FDEBUG(("%d :> %d\n",i,ca->to)); - } - } - if (!gotstate) - return NULL; - h = hash(d->work, d->wordsper); - - /* next, is that in the cache? */ - for (p = d->ssets, i = d->nssused; i > 0; p++, i--) - if (p->hash == h && memcmp(VS(d->work), VS(p->states), - d->wordsper*sizeof(unsigned)) == 0) { - FDEBUG(("cached c%d\n", p - d->ssets)); - break; /* NOTE BREAK OUT */ - } - if (i == 0) { /* nope, need a new cache entry */ - p = getvacant(v, d, cp, start); - assert(p != css); - for (i = 0; i < d->wordsper; i++) - p->states[i] = d->work[i]; - p->hash = h; - p->flags = (ispost) ? POSTSTATE : 0; - /* lastseen to be dealt with by caller */ - } - - if (!didlacons) { /* lookahead conds. always cache miss */ - css->outs[co] = p; - css->inchain[co] = p->ins; - p->ins.ss = css; - p->ins.co = (color)co; - } - return p; + v->mem[t->retry] = TRIED; + return caltdissect(v, t->right, begin, end); } -/* - - lacon - lookahead-constraint checker for miss() - ^ static int lacon(struct vars *, struct cnfa *, chr *, pcolor); - */ -static int /* predicate: constraint satisfied? */ -lacon(v, pcnfa, cp, co) -struct vars *v; -struct cnfa *pcnfa; /* parent cnfa */ -chr *cp; -pcolor co; /* "color" of the lookahead constraint */ -{ - int n; - struct subre *sub; - struct dfa *d; - chr *end; - n = co - pcnfa->ncolors; - assert(n < v->g->nlacons && v->g->lacons != NULL); - FDEBUG(("=== testing lacon %d\n", n)); - sub = &v->g->lacons[n]; - d = newdfa(v, &sub->cnfa, v->g->cm); - if (d == NULL) { - ERR(REG_ESPACE); - return 0; - } - end = longest(v, d, cp, v->stop); - freedfa(d); - FDEBUG(("=== lacon %d match %d\n", n, (end != NULL))); - return (sub->subno) ? (end != NULL) : (end == NULL); -} -/* - - getvacant - get a vacant state set - * This routine clears out the inarcs and outarcs, but does not otherwise - * clear the innards of the state set -- that's up to the caller. - ^ static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); - */ -static struct sset * -getvacant(v, d, cp, start) -struct vars *v; /* used only for debug flags */ -struct dfa *d; -chr *cp; -chr *start; -{ - int i; - struct sset *ss; - struct sset *p; - struct arcp ap; - struct arcp lastap; - color co; - - ss = pickss(v, d, cp, start); - assert(!(ss->flags&LOCKED)); - - /* clear out its inarcs, including self-referential ones */ - ap = ss->ins; - while ((p = ap.ss) != NULL) { - co = ap.co; - FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co)); - p->outs[co] = NULL; - ap = p->inchain[co]; - p->inchain[co].ss = NULL; /* paranoia */ - } - ss->ins.ss = NULL; - - /* take it off the inarc chains of the ssets reached by its outarcs */ - for (i = 0; i < d->ncolors; i++) { - p = ss->outs[i]; - assert(p != ss); /* not self-referential */ - if (p == NULL) - continue; /* NOTE CONTINUE */ - FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets)); - if (p->ins.ss == ss && p->ins.co == i) - p->ins = ss->inchain[i]; - else { - assert(p->ins.ss != NULL); - for (ap = p->ins; ap.ss != NULL && - !(ap.ss == ss && ap.co == i); - ap = ap.ss->inchain[ap.co]) - lastap = ap; - assert(ap.ss != NULL); - lastap.ss->inchain[lastap.co] = ss->inchain[i]; - } - ss->outs[i] = NULL; - ss->inchain[i].ss = NULL; - } - - /* if ss was a success state, may need to remember location */ - if ((ss->flags&POSTSTATE) && ss->lastseen != d->lastpost && - (d->lastpost == NULL || d->lastpost < ss->lastseen)) - d->lastpost = ss->lastseen; - - return ss; -} - -/* - - pickss - pick the next stateset to be used - ^ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); - */ -static struct sset * -pickss(v, d, cp, start) -struct vars *v; /* used only for debug flags */ -struct dfa *d; -chr *cp; -chr *start; -{ - int i; - struct sset *ss; - struct sset *end; - chr *ancient; - - /* shortcut for cases where cache isn't full */ - if (d->nssused < d->nssets) { - ss = &d->ssets[d->nssused]; - d->nssused++; - FDEBUG(("new c%d\n", ss - d->ssets)); - /* must make innards consistent */ - ss->ins.ss = NULL; - for (i = 0; i < d->ncolors; i++) { - ss->outs[i] = NULL; - ss->inchain[i].ss = NULL; - } - ss->flags = 0; - return ss; - } - - /* look for oldest, or old enough anyway */ - if (cp - start > d->nssets*3/4) /* oldest 25% are expendable */ - ancient = cp - d->nssets*3/4; - else - ancient = start; - for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++) - if ((ss->lastseen == NULL || ss->lastseen < ancient) && - !(ss->flags&LOCKED)) { - d->search = ss + 1; - FDEBUG(("replacing c%d\n", ss - d->ssets)); - return ss; - } - for (ss = d->ssets, end = d->search; ss < end; ss++) - if ((ss->lastseen == NULL || ss->lastseen < ancient) && - !(ss->flags&LOCKED)) { - d->search = ss + 1; - FDEBUG(("replacing c%d\n", ss - d->ssets)); - return ss; - } - - /* nobody's old enough?!? -- something's really wrong */ - FDEBUG(("can't find victim to replace!\n")); - assert(NOTREACHED); - ERR(REG_ASSERT); - return d->ssets; -} +#include "rege_dfa.c" diff --git a/generic/regguts.h b/generic/regguts.h index 93d0eec..5f9a0ab 100644 --- a/generic/regguts.h +++ b/generic/regguts.h @@ -96,7 +96,9 @@ * debugging facilities */ #ifdef REG_DEBUG +/* FDEBUG does finite-state tracing */ #define FDEBUG(arglist) { if (v->eflags®_FTRACE) printf arglist; } +/* MDEBUG does higher-level tracing */ #define MDEBUG(arglist) { if (v->eflags®_MTRACE) printf arglist; } #else #define FDEBUG(arglist) {} @@ -115,15 +117,6 @@ /* - * Map a truth value into -1 for false, 1 for true. This is so it is - * possible to write compile-time assertions by declaring a dummy array - * of this size. (Why not #if? Because sizeof is not available there.) - */ -#define NEGIFNOT(x) (2*!!(x) - 1) /* !! ensures 0 or 1 */ - - - -/* * We dissect a chr into byts for colormap table indexing. Here we define * a byt, which will be the same as a byte on most machines... The exact * size of a byt is not critical, but about 8 bits is good, and extraction @@ -154,10 +147,10 @@ typedef int pcolor; /* what color promotes to */ * A colormap is a tree -- more precisely, a DAG -- indexed at each level * by a byt of the chr, to map the chr to a color efficiently. Because * lower sections of the tree can be shared, it can exploit the usual - * sparseness of such a mapping table. The final tree is always NBYTS - * levels deep (at present it may be shallower during construction, but - * it is always "filled" to full depth at the end of that, using pointers - * to "fill blocks" which are entirely WHITE in color). + * sparseness of such a mapping table. The tree is always NBYTS levels + * deep (in the past it was shallower during construction but was "filled" + * to full depth at the end of that); areas that are unaltered as yet point + * to "fill blocks" which are entirely WHITE in color. */ /* the tree itself */ @@ -177,12 +170,14 @@ union tree { /* internal per-color structure for the color machinery */ struct colordesc { uchr nchrs; /* number of chars of this color */ - color sub; /* open subcolor of this one, or NOSUB */ + color sub; /* open subcolor (if any); free chain ptr */ # define NOSUB COLORLESS struct arc *arcs; /* color chain */ -# define UNUSEDCOLOR(cd) ((cd)->nchrs == 0 && (cd)->sub == NOSUB) int flags; -# define PSEUDO 1 /* pseudocolor, no real chars */ +# define FREECOL 01 /* currently free */ +# define PSEUDO 02 /* pseudocolor, no real chars */ +# define UNUSEDCOLOR(cd) ((cd)->flags&FREECOL) + union tree *block; /* block of solid color, if any */ }; /* the color map itself */ @@ -190,13 +185,13 @@ struct colormap { int magic; # define CMMAGIC 0x876 struct vars *v; /* for compile error reporting */ - color rest; - int filled; /* has it been filled? */ size_t ncds; /* number of colordescs */ + size_t max; /* highest in use */ + color free; /* beginning of free chain (if non-0) */ struct colordesc *cd; -# define CDEND(cm) (&(cm)->cd[(cm)->ncds]) +# define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) # define NINLINECDS ((size_t)10) - struct colordesc cds[NINLINECDS]; + struct colordesc cdspace[NINLINECDS]; union tree tree[NBYTS]; /* tree top, plus fill blocks */ }; @@ -208,6 +203,7 @@ struct colormap { #if NBYTS == 1 #define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)]) #endif +/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */ #if NBYTS == 2 #define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)]) #endif @@ -326,34 +322,35 @@ struct cnfa { /* - * definitions for subexpression tree - * The intrepid code-reader is hereby warned that the subexpression tree - * is kludge piled upon kludge, and is badly in need of rethinking. Do - * not expect it to look clean and sensible. + * subexpression tree */ struct subre { - struct state *begin; /* outarcs from here... */ - struct state *end; /* ...ending in inarcs here */ - int prefer; /* match preference */ -# define NONEYET 00 -# define LONGER 01 -# define SHORTER 02 - int subno; /* subexpression number (0 none, <0 backref) */ + char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */ + char flags; +# define LONGER 01 /* prefers longer match */ +# define SHORTER 02 /* prefers shorter match */ +# define MIXED 04 /* mixed preference below */ +# define CAP 010 /* capturing parens below */ +# define BACKR 020 /* back reference below */ +# define INUSE 0100 /* in use in final tree */ +# define LOCAL 03 /* bits which may not propagate up */ +# define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ +# define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */ +# define UP(f) (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED)) +# define MESSY(f) ((f)&(MIXED|CAP|BACKR)) +# define PREF(f) ((f)&LOCAL) +# define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) +# define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) + short retry; /* index into retry memory */ + int subno; /* subexpression number (for 'b' and '(') */ short min; /* min repetitions, for backref only */ short max; /* max repetitions, for backref only */ - struct rtree *tree; /* substructure, if any */ + struct subre *left; /* left child, if any (also freelist chain) */ + struct subre *right; /* right child, if any */ + struct state *begin; /* outarcs from here... */ + struct state *end; /* ...ending in inarcs here */ struct cnfa cnfa; /* compacted NFA, if any */ -}; - -struct rtree { - char op; /* operator: '|', ',' */ - char flags; -# define INUSE 01 /* in use in the tree */ - short no; /* index into retry memory */ - struct subre left; - struct rtree *next; /* for '|' */ - struct subre right; /* for ',' */ - struct rtree *chain; /* for bookkeeping and error cleanup */ + struct subre *chain; /* for bookkeeping and error cleanup */ }; @@ -377,12 +374,13 @@ struct guts { int cflags; /* copy of compile flags */ int info; /* copy of re_info */ size_t nsub; /* copy of re_nsub */ - struct cnfa cnfa; - struct rtree *tree; + struct subre *tree; + struct cnfa search; /* for fast preliminary search */ int ntree; - struct colormap *cm; + struct colormap cmap; int FUNCPTR(compare, (CONST chr *, CONST chr *, size_t)); struct subre *lacons; /* lookahead-constraint vector */ int nlacons; /* size of lacons */ int usedshorter; /* used non-greedy quantifiers? */ + int unmatchable; /* cannot match anything? */ }; diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 9f46efc..1a1cf2a 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -12,7 +12,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.1.2.4 1998/10/21 20:40:05 stanton Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.1.2.5 1998/11/11 01:44:52 stanton Exp $ */ #include "tclInt.h" @@ -90,7 +90,9 @@ Tcl_PwdObjCmd(dummy, interp, objc, objv) * Tcl_RegexpObjCmd -- * * This procedure is invoked to process the "regexp" Tcl command. - * See the user documentation for details on what it does. + * See the user documentation for details on what it does. The + * REGEXP_TEST stuff is to minimize code differences between this + * and the "testregexp" command. * * Results: * A standard Tcl result. @@ -117,12 +119,24 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) Tcl_UniChar *wStart; static char *options[] = { "-indices", "-nocase", "-about", "-expanded", - "-unsupported0", "--", (char *) NULL + "-line", "-linestop", "-lineanchor", +#ifdef REGEXP_TEST + "-xflags", +#endif + "--", (char *) NULL }; enum options { REGEXP_INDICES, REGEXP_NOCASE, REGEXP_ABOUT, REGEXP_EXPANDED, - REGEXP_XFLAGS, REGEXP_LAST + REGEXP_MULTI, REGEXP_NOCROSS, REGEXP_NEWL, +#ifdef REGEXP_TEST + REGEXP_XFLAGS, +#endif + REGEXP_LAST }; +#ifndef REGEXP_TEST +# define REGEXP_XFLAGS -1 /* impossible value */ +# define TclRegXflags(a,b,c,d) /* do nothing */ +#endif indices = 0; about = 0; @@ -159,6 +173,18 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) cflags |= REG_EXPANDED; break; } + case REGEXP_MULTI: { + cflags |= REG_NEWLINE; + break; + } + case REGEXP_NOCROSS: { + cflags |= REG_NLSTOP; + break; + } + case REGEXP_NEWL: { + cflags |= REG_NLANCH; + break; + } case REGEXP_XFLAGS: { hasxflags = 1; break; @@ -207,7 +233,7 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) wStart = TclUtfToUniCharDString(string, stringLength, &stringBuffer); wLen = Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar); - match = TclRegExpExecUniChar(interp, regExpr, wStart, wLen, eflags); + match = TclRegExpExecUniChar(interp, regExpr, wStart, wLen, objc-2, eflags); if (match < 0) { result = TCL_ERROR; goto done; @@ -384,7 +410,7 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) * so that "^" won't match. */ - match = TclRegExpExecUniChar(interp, regExpr, w, wEnd - w, + match = TclRegExpExecUniChar(interp, regExpr, w, wEnd - w, 10, ((w > wStart) ? REG_NOTBOL : 0)); if (match < 0) { result = TCL_ERROR; diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index d65b19a..a4c9c37 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -10,7 +10,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclRegexp.c,v 1.1.2.3 1998/10/21 20:40:06 stanton Exp $ + * RCS: @(#) $Id: tclRegexp.c,v 1.1.2.4 1998/11/11 01:44:53 stanton Exp $ */ #include "tclInt.h" @@ -22,60 +22,42 @@ * The routines in this file use Henry Spencer's regular expression * package contained in the following additional source files: * - * chr.h tclRegexp.h lex.c - * guts.h color.c locale.c - * wchar.h compile.c nfa.c - * wctype.h exec.c - * - * Copyright (c) 1986 by University of Toronto. - * Written by Henry Spencer. Not derived from licensed software. - * - * Permission is granted to anyone to use this software for any - * purpose on any computer system, and to redistribute it freely, - * subject to the following restrictions: - * - * 1. The author is not responsible for the consequences of use of - * this software, no matter how awful, even if they arise - * from defects in it. - * - * 2. The origin of this software must not be misrepresented, either - * by explicit claim or by omission. - * - * 3. Altered versions must be plainly marked as such, and must not - * be misrepresented as being the original software. - * - * Beware that some of this code is subtly aware of the way operator - * precedence is structured in regular expressions. Serious changes in - * regular-expression syntax might require a total rethink. + * regc_color.c regc_cvec.c regc_lex.c + * regc_nfa.c regcomp.c regcustom.h + * rege_dfa.c regerror.c regerrs.h + * regex.h regexec.c regfree.c + * regfronts.c regguts.h + * + * Copyright (c) 1998 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * *** NOTE: this code has been altered slightly for use in Tcl: *** - * *** 1. Use ckalloc, ckfree, and ckrealloc instead of malloc, *** - * *** free, and realloc. *** - * *** 2. Add extra argument to regexp to specify the real *** - * *** start of the string separately from the start of the *** - * *** current search. This is needed to search for multiple *** - * *** matches within a string. *** - * *** 3. Names have been changed, e.g. from re_comp to *** + * *** 1. Names have been changed, e.g. from re_comp to *** * *** TclRegComp, to avoid clashes with other *** * *** regexp implementations used by applications. *** - * *** 4. Various lint-like things, such as casting arguments *** - * *** in procedure calls, removing debugging code and *** - * *** unused vars and procs. *** - * *** 5. Removed "backward-compatibility kludges" such as *** - * *** REG_PEND and REG_STARTEND flags, the re_endp field in *** - * *** the regex_t struct, and the fronts.c layer. *** - * *** 6. Changed char to Tcl_UniChar. *** - * *** 7. Removed -DPOSIX_MISTAKE compile-time flag. *** - * *** This is now the default. *** - * *** 8. For performance considerations, created new Unicode *** - * *** interfaces to avoid having to convert between UTF and *** - * *** Unicode when we already had the Unicode string. For *** - * *** example, in a "regsub -all" we were converting the *** - * *** match string to Unicode N times, where N is the number *** - * *** of bytes in the source string. *** - * *** 9. Changed/widened some of the interfaces to allow *** - * *** explicit passing of flags so that tclTest.c could *** - * *** test the full range of acceptable flags. *** */ /* @@ -181,7 +163,7 @@ Tcl_RegExpCompile(interp, string) if (iPtr->patterns[NUM_REGEXPS-1] != NULL) { ckfree(iPtr->patterns[NUM_REGEXPS-1]); - regfree(&(iPtr->regexps[NUM_REGEXPS-1]->re)); + re_ufree(&(iPtr->regexps[NUM_REGEXPS-1]->re)); ckfree((char *) iPtr->regexps[NUM_REGEXPS-1]); } for (i = NUM_REGEXPS - 2; i >= 0; i--) { @@ -249,7 +231,7 @@ Tcl_RegExpExec(interp, re, string, start) * Perform the regexp match. */ - result = TclRegExpExecUniChar(interp, re, uniString, numChars, + result = TclRegExpExecUniChar(interp, re, uniString, numChars, -1, ((string > start) ? REG_NOTBOL : 0)); Tcl_DStringFree(&stringBuffer); @@ -324,7 +306,7 @@ Tcl_RegExpRange(re, index, startPtr, endPtr) */ int -TclRegExpExecUniChar(interp, re, wString, numChars, flags) +TclRegExpExecUniChar(interp, re, wString, numChars, nmatches, flags) Tcl_Interp *interp; /* Interpreter to use for error reporting. */ Tcl_RegExp re; /* Compiled regular expression; returned by * a previous call to Tcl_RegExpCompile() or @@ -332,14 +314,20 @@ TclRegExpExecUniChar(interp, re, wString, numChars, flags) CONST Tcl_UniChar *wString; /* String against which to match re. */ int numChars; /* Length of string in Tcl_UniChars (must * be >= 0). */ + int nmatches; /* How many subexpression matches (counting + * the whole match as subexpression 0) are + * of interest. -1 means "don't know". */ int flags; /* Regular expression flags. */ { int status; TclRegexp *regexpPtr = (TclRegexp *) re; + size_t nm = regexpPtr->re.re_nsub + 1; + + if (nmatches >= 0 && (size_t) nmatches < nm) + nm = (size_t) nmatches; status = re_uexec(®expPtr->re, wString, (size_t) numChars, - (rm_detail_t *)NULL, - regexpPtr->re.re_nsub + 1, regexpPtr->matches, flags); + (rm_detail_t *)NULL, nm, regexpPtr->matches, flags); /* * Check for errors. @@ -569,6 +557,7 @@ TclRegAbout(interp, re) REG_UUNPORT, "REG_UUNPORT", REG_ULOCALE, "REG_ULOCALE", REG_UEMPTYMATCH, "REG_UEMPTYMATCH", + REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE", 0, "", }; struct infoname *inf; @@ -632,12 +621,12 @@ TclRegError(interp, msg, status) char *p; Tcl_ResetResult(interp); - n = regerror(status, (regex_t *)NULL, buf, sizeof(buf)); + n = re_uerror(status, (regex_t *)NULL, buf, sizeof(buf)); p = (n > sizeof(buf)) ? "..." : ""; Tcl_AppendResult(interp, msg, buf, p, NULL); sprintf(cbuf, "%d", status); - (VOID) regerror(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf)); + (VOID) re_uerror(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf)); Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL); } @@ -665,7 +654,7 @@ FreeRegexpInternalRep(objPtr) { TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; - regfree(®expRepPtr->re); + re_ufree(®expRepPtr->re); if (regexpRepPtr->matches) { ckfree((char *) regexpRepPtr->matches); } @@ -677,7 +666,7 @@ FreeRegexpInternalRep(objPtr) * * DupRegexpInternalRep -- * - * It is way to hairy to copy a regular expression, so we punt + * It is way too hairy to copy a regular expression, so we punt * and revert the object back to a vanilla string. * * Results: @@ -802,100 +791,3 @@ CompileRegexp(interp, string, length, flags) return regexpPtr; } - -/* - *--------------------------------------------------------------------------- - * - * TclRegXflags -- - * - * Parse a string of extended regexp flag letters, for testing. - * - * Results: - * No return value (you're on your own for errors here). - * - * Side effects: - * Modifies *cflagsPtr, a regcomp flags word, and *eflagsPtr, a - * regexec flags word, as appropriate. - * - *---------------------------------------------------------------------- - */ - -VOID -TclRegXflags(string, length, cflagsPtr, eflagsPtr) - char *string; /* The string of flags. */ - int length; /* The length of the string in bytes. */ - int *cflagsPtr; /* compile flags word */ - int *eflagsPtr; /* exec flags word */ -{ - int i; - int cflags; - int eflags; - - cflags = *cflagsPtr; - eflags = *eflagsPtr; - for (i = 0; i < length; i++) { - switch (string[i]) { - case 'a': { - cflags |= REG_ADVF; - break; - } - case 'b': { - cflags &= ~REG_ADVANCED; - break; - } - case 'e': { - cflags &= ~REG_ADVANCED; - cflags |= REG_EXTENDED; - break; - } - case 'q': { - cflags &= ~REG_ADVANCED; - cflags |= REG_QUOTE; - break; - } - case 'i': { - cflags |= REG_ICASE; - break; - } - case 'o': { /* o for opaque */ - cflags |= REG_NOSUB; - break; - } - case 'x': { - cflags |= REG_EXPANDED; - break; - } - case 'p': { - cflags |= REG_NLSTOP; - break; - } - case 'w': { - cflags |= REG_NLANCH; - break; - } - case 'n': { - cflags |= REG_NEWLINE; - break; - } - case '+': { - cflags |= REG_FAKEEC; - break; - } - case '^': { - eflags |= REG_NOTBOL; - break; - } - case '$': { - eflags |= REG_NOTEOL; - break; - } - case '%': { - eflags |= REG_SMALL; - break; - } - } - } - - *cflagsPtr = cflags; - *eflagsPtr = eflags; -} diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index 9e56730..cd275f2 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -33,13 +33,13 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclRegexp.h,v 1.1.2.3 1998/10/21 20:40:06 stanton Exp $ + * RCS: @(#) $Id: tclRegexp.h,v 1.1.2.4 1998/11/11 01:44:54 stanton Exp $ */ #ifndef _TCLREGEXP #define _TCLREGEXP -#include "regcustom.h" +#include "regex.h" #ifdef BUILD_tcl # undef TCL_STORAGE_CLASS @@ -78,7 +78,7 @@ EXTERN VOID TclRegXflags _ANSI_ARGS_((char *string, int length, int *cflagsPtr, int *eflagsPtr)); EXTERN int TclRegExpExecUniChar _ANSI_ARGS_((Tcl_Interp *interp, Tcl_RegExp re, CONST Tcl_UniChar *uniString, - int numChars, int flags)); + int numChars, int nmatches, int flags)); EXTERN int TclRegExpMatchObj _ANSI_ARGS_((Tcl_Interp *interp, char *string, Tcl_Obj *patObj)); EXTERN void TclRegExpRangeUniChar _ANSI_ARGS_((Tcl_RegExp re, diff --git a/generic/tclTest.c b/generic/tclTest.c index 2136b7c..b408fee 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -12,13 +12,14 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclTest.c,v 1.1.2.3 1998/10/21 20:40:07 stanton Exp $ + * RCS: @(#) $Id: tclTest.c,v 1.1.2.4 1998/11/11 01:44:54 stanton Exp $ */ #define TCL_TEST #include "tclInt.h" #include "tclPort.h" +#include "tclRegexp.h" #include /* @@ -244,6 +245,11 @@ static int TestparsevarObjCmd _ANSI_ARGS_((ClientData dummy, static int TestparsevarnameObjCmd _ANSI_ARGS_((ClientData dummy, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[])); +static int TestregexpObjCmd _ANSI_ARGS_((ClientData dummy, + Tcl_Interp *interp, int objc, + Tcl_Obj *CONST objv[])); +static void TestregexpXflags _ANSI_ARGS_((char *string, + int length, int *cflagsPtr, int *eflagsPtr)); static int TestsaveresultCmd _ANSI_ARGS_((ClientData dummy, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[])); @@ -394,6 +400,8 @@ Tcltest_Init(interp) (ClientData) 0, (Tcl_CmdDeleteProc *) NULL); Tcl_CreateObjCommand(interp, "testparsevarname", TestparsevarnameObjCmd, (ClientData) 0, (Tcl_CmdDeleteProc *) NULL); + Tcl_CreateObjCommand(interp, "testregexp", TestregexpObjCmd, + (ClientData) 0, (Tcl_CmdDeleteProc *) NULL); Tcl_CreateObjCommand(interp, "testsaveresult", TestsaveresultCmd, (ClientData) 0, (Tcl_CmdDeleteProc *) NULL); Tcl_CreateCommand(interp, "testsetassocdata", TestsetassocdataCmd, @@ -2501,6 +2509,317 @@ TestparsevarnameObjCmd(clientData, interp, objc, objv) /* *---------------------------------------------------------------------- * + * TestregexpObjCmd -- + * + * This procedure implements the "testregexp" command. It is + * used to give a direct interface for regexp flags. It's identical + * to Tcl_RegexpObjCmd except for the REGEXP_TEST define, which + * enables the -xflags option. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + + /* ARGSUSED */ +int +TestregexpObjCmd(dummy, interp, objc, objv) + ClientData dummy; /* Not used. */ + Tcl_Interp *interp; /* Current interpreter. */ + int objc; /* Number of arguments. */ + Tcl_Obj *CONST objv[]; /* Argument objects. */ +{ + int i, result, indices, stringLength, wLen, match, about; + int hasxflags, cflags, eflags; + Tcl_RegExp regExpr; + char *string; + Tcl_DString stringBuffer, valueBuffer; + Tcl_UniChar *wStart; +# define REGEXP_TEST /* yes */ + static char *options[] = { + "-indices", "-nocase", "-about", "-expanded", + "-line", "-linestop", "-lineanchor", +#ifdef REGEXP_TEST + "-xflags", +#endif + "--", (char *) NULL + }; + enum options { + REGEXP_INDICES, REGEXP_NOCASE, REGEXP_ABOUT, REGEXP_EXPANDED, + REGEXP_MULTI, REGEXP_NOCROSS, REGEXP_NEWL, +#ifdef REGEXP_TEST + REGEXP_XFLAGS, +#endif + REGEXP_LAST + }; +#ifndef REGEXP_TEST +# define REGEXP_XFLAGS -1 /* impossible value */ +# define TestregexpXflags(a,b,c,d) /* do nothing */ +#endif + + indices = 0; + about = 0; + cflags = REG_ADVANCED; + eflags = 0; + hasxflags = 0; + + for (i = 1; i < objc; i++) { + char *name; + int index; + + name = Tcl_GetString(objv[i]); + if (name[0] != '-') { + break; + } + if (Tcl_GetIndexFromObj(interp, objv[i], options, "switch", TCL_EXACT, + &index) != TCL_OK) { + return TCL_ERROR; + } + switch ((enum options) index) { + case REGEXP_INDICES: { + indices = 1; + break; + } + case REGEXP_NOCASE: { + cflags |= REG_ICASE; + break; + } + case REGEXP_ABOUT: { + about = 1; + break; + } + case REGEXP_EXPANDED: { + cflags |= REG_EXPANDED; + break; + } + case REGEXP_MULTI: { + cflags |= REG_NEWLINE; + break; + } + case REGEXP_NOCROSS: { + cflags |= REG_NLSTOP; + break; + } + case REGEXP_NEWL: { + cflags |= REG_NLANCH; + break; + } + case REGEXP_XFLAGS: { + hasxflags = 1; + break; + } + case REGEXP_LAST: { + i++; + goto endOfForLoop; + } + } + } + + endOfForLoop: + if (objc - i < hasxflags + 2 - about) { + Tcl_WrongNumArgs(interp, 1, objv, + "?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?"); + return TCL_ERROR; + } + objc -= i; + objv += i; + + if (hasxflags) { + string = Tcl_GetStringFromObj(objv[0], &stringLength); + TestregexpXflags(string, stringLength, &cflags, &eflags); + objc--; + objv++; + } + + regExpr = TclRegCompObj(interp, objv[0], cflags); + if (regExpr == NULL) { + return TCL_ERROR; + } + + if (about) { + if (TclRegAbout(interp, regExpr) < 0) { + return TCL_ERROR; + } + return TCL_OK; + } + + result = TCL_OK; + string = Tcl_GetStringFromObj(objv[1], &stringLength); + + Tcl_DStringInit(&valueBuffer); + + Tcl_DStringInit(&stringBuffer); + wStart = TclUtfToUniCharDString(string, stringLength, &stringBuffer); + wLen = Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar); + + match = TclRegExpExecUniChar(interp, regExpr, wStart, wLen, objc-2, eflags); + if (match < 0) { + result = TCL_ERROR; + goto done; + } + if (match == 0) { + /* + * Set the interpreter's object result to an integer object w/ value 0. + */ + + Tcl_SetIntObj(Tcl_GetObjResult(interp), 0); + goto done; + } + + /* + * If additional variable names have been specified, return + * index information in those variables. + */ + + objc -= 2; + objv += 2; + + for (i = 0; i < objc; i++) { + char *varName, *value; + int start, end; + + varName = Tcl_GetString(objv[i]); + + TclRegExpRangeUniChar(regExpr, i, &start, &end); + if (start < 0) { + if (indices) { + value = Tcl_SetVar(interp, varName, "-1 -1", 0); + } else { + value = Tcl_SetVar(interp, varName, "", 0); + } + } else { + if (indices) { + char info[TCL_INTEGER_SPACE * 2]; + + sprintf(info, "%d %d", start, end - 1); + value = Tcl_SetVar(interp, varName, info, 0); + } else { + value = TclUniCharToUtfDString(wStart + start, end - start, + &valueBuffer); + value = Tcl_SetVar(interp, varName, value, 0); + Tcl_DStringSetLength(&valueBuffer, 0); + } + } + if (value == NULL) { + Tcl_AppendResult(interp, "couldn't set variable \"", + varName, "\"", (char *) NULL); + result = TCL_ERROR; + goto done; + } + } + + /* + * Set the interpreter's object result to an integer object w/ value 1. + */ + + Tcl_SetIntObj(Tcl_GetObjResult(interp), 1); + + done: + Tcl_DStringFree(&stringBuffer); + Tcl_DStringFree(&valueBuffer); + return result; +} + +/* + *--------------------------------------------------------------------------- + * + * TestregexpXflags -- + * + * Parse a string of extended regexp flag letters, for testing. + * + * Results: + * No return value (you're on your own for errors here). + * + * Side effects: + * Modifies *cflagsPtr, a regcomp flags word, and *eflagsPtr, a + * regexec flags word, as appropriate. + * + *---------------------------------------------------------------------- + */ + +VOID +TestregexpXflags(string, length, cflagsPtr, eflagsPtr) + char *string; /* The string of flags. */ + int length; /* The length of the string in bytes. */ + int *cflagsPtr; /* compile flags word */ + int *eflagsPtr; /* exec flags word */ +{ + int i; + int cflags; + int eflags; + + cflags = *cflagsPtr; + eflags = *eflagsPtr; + for (i = 0; i < length; i++) { + switch (string[i]) { + case 'a': { + cflags |= REG_ADVF; + break; + } + case 'b': { + cflags &= ~REG_ADVANCED; + break; + } + case 'e': { + cflags &= ~REG_ADVANCED; + cflags |= REG_EXTENDED; + break; + } + case 'q': { + cflags &= ~REG_ADVANCED; + cflags |= REG_QUOTE; + break; + } + case 'o': { /* o for opaque */ + cflags |= REG_NOSUB; + break; + } + case '+': { + cflags |= REG_FAKEEC; + break; + } + case ',': { + cflags |= REG_PROGRESS; + break; + } + case '.': { + cflags |= REG_DUMP; + break; + } + case ':': { + eflags |= REG_MTRACE; + break; + } + case ';': { + eflags |= REG_FTRACE; + break; + } + case '^': { + eflags |= REG_NOTBOL; + break; + } + case '$': { + eflags |= REG_NOTEOL; + break; + } + case '%': { + eflags |= REG_SMALL; + break; + } + } + } + + *cflagsPtr = cflags; + *eflagsPtr = eflags; +} + +/* + *---------------------------------------------------------------------- + * * TestsetassocdataCmd -- * * This procedure implements the "testsetassocdata" command. It is used diff --git a/tests/regexp.test b/tests/regexp.test index a97a258..8e52abf 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -10,7 +10,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: regexp.test,v 1.1.2.3 1998/10/21 20:40:08 stanton Exp $ +# RCS: @(#) $Id: regexp.test,v 1.1.2.4 1998/11/11 01:44:56 stanton Exp $ if {[string compare test [info procs test]] == 1} then {source defs} @@ -183,7 +183,7 @@ test regexp-6.2 {regexp errors} { } {1 {wrong # args: should be "regexp ?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?"}} test regexp-6.3 {regexp errors} { list [catch {regexp -gorp a} msg] $msg -} {1 {bad switch "-gorp": must be -indices, -nocase, -about, -expanded, or --}} +} {1 {bad switch "-gorp": must be -indices, -nocase, -about, -expanded, -line, -linestop, -lineanchor, or --}} test regexp-6.4 {regexp errors} { list [catch {regexp a( b} msg] $msg } {1 {couldn't compile regular expression pattern: parentheses () not balanced}} diff --git a/tests/regtests.test b/tests/regtests.test index 02efa79..d4b93e2 100644 --- a/tests/regtests.test +++ b/tests/regtests.test @@ -56,6 +56,7 @@ if {[string compare test [info procs test]] == 1} then {source defs} # B ERE/ARE literal-_b_race heuristic used # E backslash (_e_scape) seen within [] # H looka_h_ead constraint seen +# I _i_mpossible to match # L _l_ocale-specific construct seen # M unportable (_m_achine-specific) construct seen # N RE can match empty (_n_ull) string @@ -74,7 +75,7 @@ if {[string compare test [info procs test]] == 1} then {source defs} # test procedures and related set ask "about" -set xflags "unsupported0" +set xflags "xflags" set testbypassed 0 # re_info abbreviation mapping table @@ -82,6 +83,7 @@ set infonames(A) "REG_UBSALNUM" set infonames(B) "REG_UBRACES" set infonames(E) "REG_UBBS" set infonames(H) "REG_ULOOKAHEAD" +set infonames(I) "REG_UIMPOSSIBLE" set infonames(L) "REG_ULOCALE" set infonames(M) "REG_UUNPORT" set infonames(N) "REG_UEMPTYMATCH" @@ -90,7 +92,7 @@ set infonames(Q) "REG_UBOUNDS" set infonames(R) "REG_UBACKREF" set infonames(S) "REG_UUNSPEC" set infonames(U) "REG_UPBOTCH" -set infonameorder "RHQBAUEPSMLN" ;# must match bit order, lsb first +set infonameorder "RHQBAUEPSMLNI" ;# must match bit order, lsb first # set major test number and description proc doing {major desc} { @@ -132,6 +134,9 @@ proc flags {fl} { switch -exact -- $f { "i" { lappend args "-nocase" } "x" { lappend args "-expanded" } + "n" { lappend args "-line" } + "p" { lappend args "-linestop" } + "w" { lappend args "-lineanchor" } "-" { } default { append flags $f } } @@ -169,7 +174,7 @@ proc e {testid flags re err} { return } - set cmd [concat [list regexp -$ask] [flags $flags] [list $re]] + set cmd [concat [list testregexp -$ask] [flags $flags] [list $re]] set run "list \[catch \{$cmd\}\] \[lindex \$errorCode 1\]" test $prefix.[tno $testid] [desc $testid] $run [list 1 REG_$err] } @@ -192,7 +197,7 @@ proc f {testid flags re target args} { set f [flags $flags] set infoflags [infoflags $flags] - set ccmd [concat [list regexp -$ask] $f [list $re]] + set ccmd [concat [list testregexp -$ask] $f [list $re]] set nsub [expr [llength $args] - 1] if {$nsub == -1} { # didn't tell us number of subexps @@ -205,7 +210,7 @@ proc f {testid flags re target args} { test $prefix.[tno $testid] [desc $testid] $ccmd $info set testid [lreplace $testid end end "execute"] - set ecmd [concat [list regexp] $f [list $re $target]] + set ecmd [concat [list testregexp] $f [list $re $target]] test $prefix.[tno $testid] [desc $testid] $ecmd 0 } @@ -229,8 +234,8 @@ proc matchexpected {opts testid flags re target args} { set f [flags $flags] set infoflags [infoflags $flags] - set ccmd [concat [list regexp -$ask] $f [list $re]] - set ecmd [concat [list regexp] $opts $f [list $re $target]] + set ccmd [concat [list testregexp -$ask] $f [list $re]] + set ecmd [concat [list testregexp] $opts $f [list $re $target]] set nsub [expr [llength $args] - 1] set names [list] @@ -364,7 +369,7 @@ m 5 b ^* * * e 6 - ^* BADRPT f 7 & ^b ^b m 8 b x^ x^ x^ -e 9 - x^ IMPOSS +f 9 I x^ x m 10 n "\n^" "x\nb" "\n" f 11 bS {\(^b\)} ^b m 12 - (^b) b b b @@ -372,7 +377,7 @@ m 13 & {x$} x x m 14 bS {\(x$\)} x x x m 15 - {(x$)} x x x m 16 b {x$y} "x\$y" "x\$y" -e 17 - {x$y} IMPOSS +f 17 I {x$y} xy m 18 n "x\$\n" "x\n" "x\n" e 19 - + BADRPT e 20 - ? BADRPT @@ -473,6 +478,8 @@ m 40 eE {a[\\]b} "a\\b" "a\\b" m 41 bE {a[\\]b} "a\\b" "a\\b" e 42 - {a[\Z]b} EESCAPE m 43 & {a[[b]c} "a\[c" "a\[c" +m 44 EMP {a[\u00fe-\u0507][\u00ff-\u0300]b} \ + "a\u0102\u02ffb" "a\u0102\u02ffb" @@ -539,6 +546,12 @@ e 23 b {\<*} BADRPT e 24 b {\>*} BADRPT e 25 - {\y*} BADRPT e 26 - {\Y*} BADRPT +m 27 LP {\ma} a a +f 28 LP {\ma} ba +m 29 LP {a\M} a a +f 30 LP {a\M} ab +f 31 ILP {\Ma} a +f 32 ILP {a\m} a @@ -587,7 +600,7 @@ m 19 P "a\\U00000008x" "a\bx" "a\bx" e 20 - {a\U0000008x} EESCAPE m 21 P "a\\vb" "a\vb" "a\vb" m 22 MP "a\\x08x" "a\bx" "a\bx" -e 23 - {a\xx} EESCAPE +e 23 - {a\xq} EESCAPE m 24 MP "a\\x0008x" "a\bx" "a\bx" e 25 - {a\z} EESCAPE m 26 MP "a\\010b" "a\bb" "a\bb" @@ -663,7 +676,7 @@ m 1 P a(?#comment)b ab ab doing 18 "unmatchable REs" -e 1 - a^b IMPOSS +f 1 I a^b ab @@ -757,7 +770,7 @@ doing 22 "multicharacter collating elements" # again ugh # currently disabled because the fake MCCE we use for testing is unavailable xx m 1 &+L {a[c]e} ace ace -xx e 2 &+ {a[c]h} IMPOSS +xx f 2 &+I {a[c]h} ach xx m 3 &+L {a[[.ch.]]} ach ach xx f 4 &+L {a[[.ch.]]} ace xx m 5 &+L {a[c[.ch.]]} ac ac diff --git a/unix/Makefile.in b/unix/Makefile.in index fd2bdbc..cbcd510 100644 --- a/unix/Makefile.in +++ b/unix/Makefile.in @@ -5,7 +5,7 @@ # "autoconf" program (constructs like "@foo@" will get replaced in the # actual Makefile. # -# RCS: @(#) $Id: Makefile.in,v 1.1.2.4 1998/10/21 20:40:12 stanton Exp $ +# RCS: @(#) $Id: Makefile.in,v 1.1.2.5 1998/11/11 01:44:57 stanton Exp $ # Current Tcl version; used in various names. @@ -618,7 +618,7 @@ regcomp.o: $(REGHDRS) $(GENERIC_DIR)/regcomp.c $(GENERIC_DIR)/regc_lex.c \ $(GENERIC_DIR)/regc_nfa.c $(GENERIC_DIR)/regc_cvec.c $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/regcomp.c -regexec.o: $(REGHDRS) $(GENERIC_DIR)/regexec.c +regexec.o: $(REGHDRS) $(GENERIC_DIR)/regexec.c $(GENERIC_DIR)/rege_dfa.c $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/regexec.c regfree.o: $(REGHDRS) $(GENERIC_DIR)/regfree.c diff --git a/win/makefile.vc b/win/makefile.vc index ac73808..de9f892 100644 --- a/win/makefile.vc +++ b/win/makefile.vc @@ -4,7 +4,7 @@ # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # # Copyright (c) 1995-1996 Sun Microsystems, Inc. -# RCS: @(#) $Id: makefile.vc,v 1.1.2.7 1998/11/04 04:41:26 stanton Exp $ +# RCS: @(#) $Id: makefile.vc,v 1.1.2.8 1998/11/11 01:44:57 stanton Exp $ # Does not depend on the presence of any environment variables in # order to compile tcl; all needed information is derived from @@ -466,12 +466,23 @@ $(TMPDIR)\tclAppInit.obj : $(WINDIR)\tclAppInit.c # Dedependency rules $(GENERICDIR)\regcomp.c: \ - $(GENERICDIR)\regc_locale.c \ $(GENERICDIR)\regguts.h \ $(GENERICDIR)\regc_lex.c \ $(GENERICDIR)\regc_color.c \ $(GENERICDIR)\regc_nfa.c \ - $(GENERICDIR)\regc_cvec.c + $(GENERICDIR)\regc_cvec.c \ + $(GENERICDIR)\regc_locale.c +$(GENERICDIR)\regcustom.h: \ + $(GENERICDIR)\tclInt.h \ + $(GENERICDIR)\tclPort.h \ + $(GENERICDIR)\regex.h +$(GENERICDIR)\regexec.c: \ + $(GENERICDIR)\rege_dfa.c \ + $(GENERICDIR)\regguts.h +$(GENERICDIR)\regerror.c: $(GENERICDIR)\regguts.h +$(GENERICDIR)\regfree.c: $(GENERICDIR)\regguts.h +$(GENERICDIR)\regfronts.c: $(GENERICDIR)\regguts.h +$(GENERICDIR)\regguts.h: $(GENERICDIR)\regcustom.h # # Implicit rules -- cgit v0.12