summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorstanton <stanton>1998-11-11 01:44:46 (GMT)
committerstanton <stanton>1998-11-11 01:44:46 (GMT)
commit0a41c61107c36da0a8e4ca0fc259149e3bc1956d (patch)
tree37f7fe5f0b8a64e08aae1446bb8cdd4516256a01
parent3774776e7bc507091c0793c14cfd8fb45484e054 (diff)
downloadtcl-0a41c61107c36da0a8e4ca0fc259149e3bc1956d.zip
tcl-0a41c61107c36da0a8e4ca0fc259149e3bc1956d.tar.gz
tcl-0a41c61107c36da0a8e4ca0fc259149e3bc1956d.tar.bz2
integrated latest regexp updates from Henry Spencer, includes new
regexp switches "-line", "-lineanchor", and "-linestop" for controlling newline behavior
-rw-r--r--changes10
-rw-r--r--generic/regc_color.c499
-rw-r--r--generic/regc_lex.c25
-rw-r--r--generic/regc_locale.c16
-rw-r--r--generic/regc_nfa.c35
-rw-r--r--generic/regcomp.c1705
-rw-r--r--generic/regcustom.h20
-rw-r--r--generic/rege_dfa.c583
-rw-r--r--generic/regerror.c2
-rw-r--r--generic/regerrs.h37
-rw-r--r--generic/regex.h25
-rw-r--r--generic/regexec.c1349
-rw-r--r--generic/regguts.h90
-rw-r--r--generic/tclCmdMZ.c38
-rw-r--r--generic/tclRegexp.c206
-rw-r--r--generic/tclRegexp.h6
-rw-r--r--generic/tclTest.c321
-rw-r--r--tests/regexp.test4
-rw-r--r--tests/regtests.test37
-rw-r--r--unix/Makefile.in4
-rw-r--r--win/makefile.vc17
21 files changed, 2525 insertions, 2504 deletions
diff --git a/changes b/changes
index 7d7851b..5e8f335 100644
--- a/changes
+++ b/changes
@@ -1,6 +1,6 @@
Recent user-visible changes to Tcl:
-RCS: @(#) $Id: changes,v 1.1.2.8 1998/11/04 04:47:50 stanton Exp $
+RCS: @(#) $Id: changes,v 1.1.2.9 1998/11/11 01:44:46 stanton Exp $
1. No more [command1] [command2] construct for grouping multiple
commands on a single command line.
@@ -3933,3 +3933,11 @@ the encoding.n manual entry for more details. (stanton)
11/3/98 (bug fix) The regular expression character classification
syntax now includes Unicode characters in the supported
classes. (stanton)
+
+11/9/98 (bug fix) "format" now correctly handles multibyte characters
+in %s format strings. (stanton)
+
+11/10/98 (new feature) "regexp" now accepts three new switches
+("-line", "-lineanchor", and "-linestop") that control how regular
+expressions treat line breaks. See the regexp manual entry for more
+details. (stanton)
diff --git a/generic/regc_color.c b/generic/regc_color.c
index 4a8a87c..843c05e 100644
--- a/generic/regc_color.c
+++ b/generic/regc_color.c
@@ -8,90 +8,75 @@
-/*
- * If this declaration draws a complaint about a negative array size,
- * then CHRBITS is defined incorrectly for the chr type.
- */
-static char isCHRBITSright[NEGIFNOT(sizeof(chr)*CHAR_BIT == CHRBITS)];
-
-
-
#define CISERR() VISERR(cm->v)
#define CERR(e) VERR(cm->v, (e))
/*
- - newcm - get new colormap
- ^ static struct colormap *newcm(struct vars *);
+ - initcm - set up new colormap
+ ^ static VOID initcm(struct vars *, struct colormap *);
*/
-static struct colormap * /* NULL for allocation failure */
-newcm(v)
+static VOID
+initcm(v, cm)
struct vars *v;
+struct colormap *cm;
{
- struct colormap *cm;
int i;
int j;
union tree *t;
union tree *nextt;
struct colordesc *cd;
- cm = (struct colormap *)MALLOC(sizeof(struct colormap));
- if (cm == NULL) {
- ERR(REG_ESPACE);
- return NULL;
- }
cm->magic = CMMAGIC;
cm->v = v;
- cm->rest = WHITE;
- cm->filled = 0;
cm->ncds = NINLINECDS;
- cm->cd = cm->cds;
- for (cd = cm->cd; cd < CDEND(cm); cd++) {
- cd->nchrs = 0;
- cd->sub = NOSUB;
- cd->arcs = NULL;
- cd->flags = 0;
- }
- cm->cd[WHITE].nchrs = CHR_MAX - CHR_MIN + 1;
-
- /* treetop starts as NULLs if there are lower levels */
- t = cm->tree;
- if (NBYTS > 1)
- for (i = BYTTAB-1; i >= 0; i--)
- t->tptr[i] = NULL;
- /* if no lower levels, treetop and last fill block are the same */
-
- /* fill blocks point to next fill block... */
- for (t = &cm->tree[1], j = NBYTS-2; j > 0; t = nextt, j--) {
+ cm->cd = cm->cdspace;
+ cm->max = 0;
+ cm->free = 0;
+
+ cd = cm->cd; /* cm->cd[WHITE] */
+ cd->sub = NOSUB;
+ cd->arcs = NULL;
+ cd->flags = 0;
+ cd->nchrs = CHR_MAX - CHR_MIN + 1;
+
+ /* upper levels of tree */
+ for (t = &cm->tree[0], j = NBYTS-1; j > 0; t = nextt, j--) {
nextt = t + 1;
for (i = BYTTAB-1; i >= 0; i--)
- t->tptr[i] = t + 1;
+ t->tptr[i] = nextt;
}
- /* ...except last which is solid white */
+ /* bottom level is solid white */
t = &cm->tree[NBYTS-1];
for (i = BYTTAB-1; i >= 0; i--)
t->tcolor[i] = WHITE;
-
-
- return cm;
+ cd->block = t;
}
/*
- - freecm - free a colormap
+ - freecm - free dynamically-allocated things in a colormap
^ static VOID freecm(struct colormap *);
*/
static VOID
freecm(cm)
struct colormap *cm;
{
+ size_t i;
+ union tree *cb;
+
cm->magic = 0;
if (NBYTS > 1)
cmtreefree(cm, cm->tree, 0);
- if (cm->cd != cm->cds)
+ for (i = 1; i < cm->max; i++) /* skip WHITE */
+ if (!UNUSEDCOLOR(&cm->cd[i])) {
+ cb = cm->cd[i].block;
+ if (cb != NULL)
+ FREE(cb);
+ }
+ if (cm->cd != cm->cdspace)
FREE(cm->cd);
- FREE(cm);
}
/*
@@ -107,92 +92,26 @@ int level; /* level number (top == 0) of this block */
int i;
union tree *t;
union tree *fillt = &cm->tree[level+1];
+ union tree *cb;
assert(level < NBYTS-1); /* this level has pointers */
for (i = BYTTAB-1; i >= 0; i--) {
t = tree->tptr[i];
- if (t != NULL && t != fillt) {
- if (level < NBYTS-2) /* more pointer blocks below */
+ assert(t != NULL);
+ if (t != fillt) {
+ if (level < NBYTS-2) { /* more pointer blocks below */
cmtreefree(cm, t, level+1);
- FREE(t);
+ FREE(t);
+ } else { /* color block below */
+ cb = cm->cd[t->tcolor[0]].block;
+ if (t != cb) /* not a solid block */
+ FREE(t);
+ }
}
}
}
/*
- - fillcm - fill in a colormap, so no NULLs remain
- * The point of this is that the tree traversal can then be a fixed set
- * of table lookups with no conditional branching. It might be better
- * to do reallocation for a more compacted structure, on the order of
- * what's done for NFAs, but the colormap can be quite large and a total
- * rebuild of it could be costly.
- ^ static VOID fillcm(struct colormap *);
- */
-static VOID
-fillcm(cm)
-struct colormap *cm;
-{
- if (!cm->filled && NBYTS > 1)
- cmtreefill(cm, cm->tree, 0);
- cm->filled = 1;
-}
-
-/*
- - cmtreefill - fill a non-terminal part of a colormap tree
- ^ static VOID cmtreefill(struct colormap *, union tree *, int);
- */
-static VOID
-cmtreefill(cm, tree, level)
-struct colormap *cm;
-union tree *tree;
-int level; /* level number (top == 0) of this block */
-{
- int i;
- union tree *t;
- union tree *fillt = &cm->tree[level+1];
-
- assert(level < NBYTS-1); /* this level has pointers */
- for (i = BYTTAB-1; i >= 0; i--) {
- t = tree->tptr[i];
- if (t == fillt) /* oops */
- {}
- else if (t == NULL)
- tree->tptr[i] = fillt;
- else if (level < NBYTS-2) /* more pointer blocks below */
- cmtreefill(cm, t, level+1);
- }
-}
-
-/*
- - getcolor - get the color of a character from a colormap
- ^ static color getcolor(struct colormap *, pchr);
- */
-static color
-getcolor(cm, c)
-struct colormap *cm;
-pchr c;
-{
- uchr uc = c;
- int shift;
- int b;
- union tree *t;
-
- assert(cm->magic == CMMAGIC);
-
- t = cm->tree;
- for (shift = BYTBITS * (NBYTS - 1); t != NULL; shift -= BYTBITS) {
- b = (uc >> shift) & BYTMASK;
- if (shift == 0) /* reached the bottom */
- return t->tcolor[b];
- t = t->tptr[b];
- }
-
- /* we fell off an incomplete part of the tree */
- assert(!cm->filled);
- return cm->rest;
-}
-
-/*
- setcolor - set the color of a character in a colormap
^ static color setcolor(struct colormap *, pchr, pcolor);
*/
@@ -204,11 +123,14 @@ pcolor co;
{
uchr uc = c;
int shift;
- int i;
+ int level;
int b;
int bottom;
union tree *t;
+ union tree *newt;
+ union tree *fillt;
union tree *lastt;
+ union tree *cb;
color prev;
assert(cm->magic == CMMAGIC);
@@ -216,28 +138,32 @@ pcolor co;
return COLORLESS;
t = cm->tree;
- for (shift = BYTBITS * (NBYTS - 1); shift > 0; shift -= BYTBITS) {
+ for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0;
+ level++, shift -= BYTBITS) {
b = (uc >> shift) & BYTMASK;
lastt = t;
- t = t->tptr[b];
- if (t == NULL) { /* fell off an incomplete part */
- bottom = (shift <= BYTBITS) ? 1 : 0;
- t = (union tree *)MALLOC((bottom) ?
+ t = lastt->tptr[b];
+ assert(t != NULL);
+ fillt = &cm->tree[level+1];
+ bottom = (shift <= BYTBITS) ? 1 : 0;
+ cb = (bottom) ? cm->cd[t->tcolor[0]].block : fillt;
+ if (t == fillt || t == cb) { /* must allocate a new block */
+ newt = (union tree *)MALLOC((bottom) ?
sizeof(struct colors) : sizeof(struct ptrs));
- if (t == NULL) {
+ if (newt == NULL) {
CERR(REG_ESPACE);
return COLORLESS;
}
if (bottom)
- for (i = BYTTAB-1; i >= 0; i--)
- t->tcolor[i] = cm->rest;
+ memcpy(VS(newt->tcolor), VS(t->tcolor),
+ BYTTAB*sizeof(color));
else
- for (i = BYTTAB-1; i >= 0; i--)
- t->tptr[i] = NULL;
+ memcpy(VS(newt->tptr), VS(t->tptr),
+ BYTTAB*sizeof(union tree *));
+ t = newt;
lastt->tptr[b] = t;
}
}
- assert(shift == 0 && t != NULL); /* we hit bottom; it's there */
b = uc & BYTMASK;
prev = t->tcolor[b];
@@ -253,20 +179,10 @@ static color
maxcolor(cm)
struct colormap *cm;
{
- struct colordesc *cd;
- struct colordesc *end;
- struct colordesc *lastused;
-
if (CISERR())
return COLORLESS;
- lastused = NULL;
- end = CDEND(cm);
- for (cd = cm->cd; cd < end; cd++)
- if (!UNUSEDCOLOR(cd))
- lastused = cd;
- assert(lastused != NULL);
- return (color)(lastused - cm->cd);
+ return (color)cm->max;
}
/*
@@ -279,47 +195,85 @@ newcolor(cm)
struct colormap *cm;
{
struct colordesc *cd;
- struct colordesc *end;
- struct colordesc *firstnew;
+ struct colordesc *new;
size_t n;
if (CISERR())
return COLORLESS;
- end = CDEND(cm);
- for (cd = cm->cd; cd < end; cd++)
- if (UNUSEDCOLOR(cd)) {
- assert(cd->arcs == NULL);
- return (color)(cd - cm->cd);
- }
-
- /* oops, must allocate more */
- n = cm->ncds * 2;
- if (cm->cd == cm->cds) {
- cd = (struct colordesc *)MALLOC(sizeof(struct colordesc) * n);
- if (cd != NULL)
- memcpy(VS(cd), VS(cm->cds), cm->ncds *
- sizeof(struct colordesc));
+ if (cm->free != 0) {
+ assert(cm->free > 0);
+ assert((size_t)cm->free < cm->ncds);
+ cd = &cm->cd[cm->free];
+ assert(UNUSEDCOLOR(cd));
+ assert(cd->arcs == NULL);
+ cm->free = cd->sub;
+ } else if (cm->max < cm->ncds - 1) {
+ cm->max++;
+ cd = &cm->cd[cm->max];
} else {
- cd = (struct colordesc *)REALLOC(cm->cd,
+ /* oops, must allocate more */
+ n = cm->ncds * 2;
+ if (cm->cd == cm->cdspace) {
+ new = (struct colordesc *)MALLOC(n *
+ sizeof(struct colordesc));
+ if (new != NULL)
+ memcpy(VS(new), VS(cm->cdspace), cm->ncds *
+ sizeof(struct colordesc));
+ } else
+ new = (struct colordesc *)REALLOC(cm->cd,
n * sizeof(struct colordesc));
+ if (new == NULL) {
+ CERR(REG_ESPACE);
+ return COLORLESS;
+ }
+ cm->cd = new;
+ cm->ncds = n;
+ assert(cm->max < cm->ncds - 1);
+ cm->max++;
+ cd = &cm->cd[cm->max];
}
- if (cd == NULL) {
- CERR(REG_ESPACE);
- return COLORLESS;
+
+ cd->nchrs = 0;
+ cd->sub = NOSUB;
+ cd->arcs = NULL;
+ cd->flags = 0;
+ cd->block = NULL;
+
+ return (color)(cd - cm->cd);
+}
+
+/*
+ - freecolor - free a color (must have no arcs or subcolor)
+ ^ static VOID freecolor(struct colormap *, pcolor);
+ */
+static VOID
+freecolor(cm, co)
+struct colormap *cm;
+pcolor co;
+{
+ struct colordesc *cd = &cm->cd[co];
+
+ assert(co >= 0);
+ if (co == WHITE)
+ return;
+
+ assert(cd->arcs == NULL);
+ assert(cd->sub == NOSUB);
+ assert(cd->nchrs == 0);
+ cd->flags = FREECOL;
+ if (cd->block != NULL) {
+ FREE(cd->block);
+ cd->block = NULL; /* just paranoia */
}
- cm->cd = cd;
- firstnew = CDEND(cm);
- cm->ncds = n;
- end = CDEND(cm);
- for (cd = firstnew; cd < end; cd++) {
- cd->nchrs = 0;
- cd->sub = NOSUB;
- cd->arcs = NULL;
- cd->flags = 0;
+
+ if ((size_t)co == cm->max)
+ while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max]))
+ cm->max--;
+ else {
+ cd->sub = cm->free;
+ cm->free = (color)(cd - cm->cd);
}
- assert(firstnew < CDEND(cm) && UNUSEDCOLOR(firstnew));
- return (color)(firstnew - cm->cd);
}
/*
@@ -352,27 +306,170 @@ pchr c;
color co; /* current color of c */
color sco; /* new subcolor */
- co = getcolor(cm, c);
+ co = GETCOLOR(cm, c);
+ sco = newsub(cm, co);
+
+ if (co == sco) /* already in an open subcolor */
+ return co; /* rest is redundant */
+ cm->cd[co].nchrs--;
+ cm->cd[sco].nchrs++;
+ setcolor(cm, c, sco);
+ return sco;
+}
+
+/*
+ - newsub - allocate a new subcolor (if necessary) for a color
+ ^ static color newsub(struct colormap *, pcolor);
+ */
+static color
+newsub(cm, co)
+struct colormap *cm;
+pcolor co;
+{
+ color sco; /* new subcolor */
+
sco = cm->cd[co].sub;
- if (sco == NOSUB) { /* must create subcolor */
- if (cm->cd[co].nchrs == 1) /* shortcut */
+ if (sco == NOSUB) { /* color has no open subcolor */
+ if (cm->cd[co].nchrs == 1) /* optimization */
return co;
- sco = newcolor(cm);
+ sco = newcolor(cm); /* must create subcolor */
if (sco == COLORLESS)
return COLORLESS;
cm->cd[co].sub = sco;
- cm->cd[sco].sub = sco; /* self-referential subcolor ptr */
+ cm->cd[sco].sub = sco; /* open subcolor points to self */
}
+ assert(sco != NOSUB);
- if (co == sco) /* repeated character */
- return co; /* no further action needed */
- cm->cd[co].nchrs--;
- cm->cd[sco].nchrs++;
- setcolor(cm, c, sco);
return sco;
}
/*
+ - subrange - allocate new subcolors to this range of chrs, fill in arcs
+ ^ static VOID subrange(struct vars *, pchr, pchr, struct state *,
+ ^ struct state *);
+ */
+static VOID
+subrange(v, from, to, lp, rp)
+struct vars *v;
+pchr from;
+pchr to;
+struct state *lp;
+struct state *rp;
+{
+ uchr uf;
+ int i;
+
+ assert(from <= to);
+
+ /* first, align "from" on a tree-block boundary */
+ uf = (uchr)from;
+ i = (int)( ((uf + BYTTAB-1) & (uchr)~BYTMASK) - uf );
+ for (; from <= to && i > 0; i--, from++)
+ newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
+ if (from > to) /* didn't reach a boundary */
+ return;
+
+ /* deal with whole blocks */
+ for (; to - from >= BYTTAB; from += BYTTAB)
+ subblock(v, from, lp, rp);
+
+ /* clean up any remaining partial table */
+ for (; from <= to; from++)
+ newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
+}
+
+/*
+ - subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ ^ static VOID subblock(struct vars *, pchr, struct state *, struct state *);
+ */
+static VOID
+subblock(v, start, lp, rp)
+struct vars *v;
+pchr start; /* first of BYTTAB chrs */
+struct state *lp;
+struct state *rp;
+{
+ uchr uc = start;
+ struct colormap *cm = v->cm;
+ int shift;
+ int level;
+ int i;
+ int b;
+ union tree *t;
+ union tree *cb;
+ union tree *fillt;
+ union tree *lastt;
+ int previ;
+ int ndone;
+ color co;
+ color sco;
+
+ assert((uc & BYTMASK) == 0);
+
+ /* find its color block, making new pointer blocks as needed */
+ t = cm->tree;
+ fillt = NULL;
+ for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0;
+ level++, shift -= BYTBITS) {
+ b = (uc >> shift) & BYTMASK;
+ lastt = t;
+ t = lastt->tptr[b];
+ assert(t != NULL);
+ fillt = &cm->tree[level+1];
+ if (t == fillt && shift > BYTBITS) { /* need new ptr block */
+ t = (union tree *)MALLOC(sizeof(struct ptrs));
+ if (t == NULL) {
+ CERR(REG_ESPACE);
+ return;
+ }
+ memcpy(VS(t->tptr), VS(fillt->tptr),
+ BYTTAB*sizeof(union tree *));
+ lastt->tptr[b] = t;
+ }
+ }
+
+ /* special cases: fill block or solid block */
+ co = t->tcolor[0];
+ cb = cm->cd[co].block;
+ if (t == fillt || t == cb) {
+ /* either way, we want a subcolor solid block */
+ sco = newsub(cm, co);
+ t = cm->cd[sco].block;
+ if (t == NULL) { /* must set it up */
+ t = (union tree *)MALLOC(sizeof(struct colors));
+ if (t == NULL) {
+ CERR(REG_ESPACE);
+ return;
+ }
+ for (i = 0; i < BYTTAB; i++)
+ t->tcolor[i] = sco;
+ cm->cd[sco].block = t;
+ }
+ /* find loop must have run at least once */
+ lastt->tptr[b] = t;
+ newarc(v->nfa, PLAIN, sco, lp, rp);
+ cm->cd[co].nchrs -= BYTTAB;
+ cm->cd[sco].nchrs += BYTTAB;
+ return;
+ }
+
+ /* general case, a mixed block to be altered */
+ i = 0;
+ while (i < BYTTAB) {
+ co = t->tcolor[i];
+ sco = newsub(cm, co);
+ newarc(v->nfa, PLAIN, sco, lp, rp);
+ previ = i;
+ do {
+ t->tcolor[i++] = sco;
+ } while (i < BYTTAB && t->tcolor[i] == co);
+ ndone = i - previ;
+ cm->cd[co].nchrs -= ndone;
+ cm->cd[sco].nchrs += ndone;
+ }
+}
+
+/*
- okcolors - promote subcolors to full colors
^ static VOID okcolors(struct nfa *, struct colormap *);
*/
@@ -390,7 +487,7 @@ struct colormap *cm;
for (cd = cm->cd, co = 0; cd < end; cd++, co++) {
sco = cd->sub;
- if (sco == NOSUB) {
+ if (UNUSEDCOLOR(cd) || sco == NOSUB) {
/* has no subcolor, no further action */
} else if (sco == co) {
/* is subcolor, let parent deal with it */
@@ -410,6 +507,7 @@ struct colormap *cm;
a->colorchain = scd->arcs;
scd->arcs = a;
}
+ freecolor(cm, co);
} else {
/* parent's arcs must gain parallel subcolor arcs */
cd->sub = NOSUB;
@@ -475,7 +573,7 @@ pchr c;
{
color co; /* color of c */
- co = getcolor(cm, c);
+ co = GETCOLOR(cm, c);
if (cm->cd[co].nchrs == 1 && cm->cd[co].sub == NOSUB)
return 1;
return 0;
@@ -548,24 +646,27 @@ FILE *f;
struct colordesc *end;
color co;
chr c;
+ char *has;
- if (cm->filled) {
- fprintf(f, "filled\n");
- if (NBYTS > 1)
- fillcheck(cm, cm->tree, 0, f);
- }
+ fprintf(f, "max %ld\n", (long)cm->max);
+ if (NBYTS > 1)
+ fillcheck(cm, cm->tree, 0, f);
end = CDEND(cm);
for (cd = cm->cd + 1, co = 1; cd < end; cd++, co++) /* skip 0 */
- if (cd->nchrs > 0) {
+ if (!UNUSEDCOLOR(cd)) {
+ assert(cd->nchrs > 0);
+ has = (cd->block != NULL) ? "#" : "";
if (cd->flags&PSEUDO)
- fprintf(f, "#%2ld(ps): ", (long)co);
+ fprintf(f, "#%2ld%s(ps): ", (long)co, has);
else
- fprintf(f, "#%2ld(%2d): ", (long)co, cd->nchrs);
+ fprintf(f, "#%2ld%s(%2d): ", (long)co,
+ has, cd->nchrs);
+ /* it's hard to do this more efficiently */
for (c = CHR_MIN; c < CHR_MAX; c++)
- if (getcolor(cm, c) == co)
+ if (GETCOLOR(cm, c) == co)
dumpchr(c, f);
assert(c == CHR_MAX);
- if (getcolor(cm, c) == co)
+ if (GETCOLOR(cm, c) == co)
dumpchr(c, f);
fprintf(f, "\n");
}
@@ -613,7 +714,7 @@ FILE *f;
else if (c > ' ' && c <= '~')
putc((char)c, f);
else
- fprintf(f, "\\0%lo", (long)c);
+ fprintf(f, "\\u%04lx", (long)c);
}
#endif /* ifdef REG_DEBUG */
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index 820b404..0eb6c5d 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -364,7 +364,7 @@ struct vars *v;
NOTE(REG_UNONPOSIX);
if (ATEOS())
FAILW(REG_EESCAPE);
- (DISCARD) lexescape(v);
+ (DISCARD)lexescape(v);
switch (v->nexttype) { /* not all escapes okay here */
case PLAIN:
return 1;
@@ -459,10 +459,10 @@ struct vars *v;
break;
}
- /* that got rid of everything except EREs */
+ /* that got rid of everything except EREs and AREs */
assert(INCON(L_ERE));
- /* deal with EREs, except for backslashes */
+ /* deal with EREs and AREs, except for backslashes */
switch (c) {
case CHR('|'):
RET('|');
@@ -529,12 +529,6 @@ struct vars *v;
NOTE(REG_ULOOKAHEAD);
RETV(LACON, 0);
break;
- case CHR('<'): /* prefer short */
- RETV(PREFER, 0);
- break;
- case CHR('>'): /* prefer long */
- RETV(PREFER, 1);
- break;
default:
FAILW(REG_BADRPT);
break;
@@ -547,8 +541,9 @@ struct vars *v;
RETV('(', 1);
break;
case CHR(')'):
- if (LASTTYPE('('))
+ if (LASTTYPE('(')) {
NOTE(REG_UUNSPEC);
+ }
RETV(')', c);
break;
case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
@@ -589,7 +584,7 @@ struct vars *v;
break;
}
- /* ERE backslash handling; backslash already eaten */
+ /* ERE/ARE backslash handling; backslash already eaten */
assert(!ATEOS());
if (!(v->cflags&REG_ADVF)) { /* only AREs have non-trivial escapes */
if (iscalnum(*v->now)) {
@@ -598,7 +593,7 @@ struct vars *v;
}
RETV(PLAIN, *v->now++);
}
- (DISCARD) lexescape(v);
+ (DISCARD)lexescape(v);
if (ISERR())
FAILW(REG_EESCAPE);
if (v->nexttype == CCLASS) { /* fudge at lexical level */
@@ -682,6 +677,12 @@ struct vars *v;
case CHR('f'):
RETV(PLAIN, CHR('\f'));
break;
+ case CHR('m'):
+ RET('<');
+ break;
+ case CHR('M'):
+ RET('>');
+ break;
case CHR('n'):
RETV(PLAIN, CHR('\n'));
break;
diff --git a/generic/regc_locale.c b/generic/regc_locale.c
index 7d504c8..5930378 100644
--- a/generic/regc_locale.c
+++ b/generic/regc_locale.c
@@ -1,9 +1,15 @@
-/*
- * locale-specific stuff, including MCCE handling
- * This file is #included by regcomp.c.
+/*
+ * regc_locale.c --
+ *
+ * This file contains the Unicode locale specific regexp routines.
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ *
+ * See the file "license.terms" for information on usage and redistribution
+ * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * No MCCEs for Tcl. The handling of character names and classes is
- * still ASCII-centric, and needs to be extended to handle full Unicode.
+ * RCS: @(#) $Id: regc_locale.c,v 1.1.2.3 1998/11/11 01:44:49 stanton Exp $
*/
/* ASCII character-name table */
diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c
index 14ee077..8143181 100644
--- a/generic/regc_nfa.c
+++ b/generic/regc_nfa.c
@@ -575,7 +575,7 @@ struct state *s;
static VOID
dupnfa(nfa, start, stop, from, to)
struct nfa *nfa;
-struct state *start; /* duplicate starting here */
+struct state *start; /* duplicate of subNFA starting here */
struct state *stop; /* and stopping here */
struct state *from; /* stringing duplicate from here */
struct state *to; /* to here */
@@ -1161,7 +1161,8 @@ struct nfa *nfa;
struct arc *a;
struct arc *aa;
- /* can the NFA match the empty string? */
+ if (nfa->pre->outs == NULL)
+ return REG_UIMPOSSIBLE;
for (a = nfa->pre->outs; a != NULL; a = a->outchain)
for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
if (aa->to == nfa->post)
@@ -1170,29 +1171,6 @@ struct nfa *nfa;
}
/*
- - isempty - is a sub-NFA composed only of EMPTY arcs?
- * Somewhat limited implementation, makes assumptions.
- ^ static int isempty(struct state *, struct state *);
- */
-static int
-isempty(begin, end)
-struct state *begin;
-struct state *end;
-{
- struct state *s;
-
- for (s = begin; s != end; s = s->outs->to) {
- if (s->nouts != 1)
- return 0;
- assert(s->outs != NULL);
- if (s->outs->type != EMPTY)
- return 0;
- }
-
- return 1;
-}
-
-/*
- compact - compact an NFA
^ static VOID compact(struct nfa *, struct cnfa *);
*/
@@ -1305,19 +1283,16 @@ struct carc *last;
/*
- freecnfa - free a compacted NFA
- ^ static VOID freecnfa(struct cnfa *, int);
+ ^ static VOID freecnfa(struct cnfa *);
*/
static VOID
-freecnfa(cnfa, dynalloc)
+freecnfa(cnfa)
struct cnfa *cnfa;
-int dynalloc; /* is the cnfa struct itself dynamic? */
{
assert(cnfa->nstates != 0); /* not empty already */
cnfa->nstates = 0;
FREE(cnfa->states);
FREE(cnfa->arcs);
- if (dynalloc)
- FREE(cnfa);
}
/*
diff --git a/generic/regcomp.c b/generic/regcomp.c
index b8c2119..620dc1c 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -14,7 +14,11 @@
int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
static VOID moresubs _ANSI_ARGS_((struct vars *, int));
static int freev _ANSI_ARGS_((struct vars *, int));
-static struct rtree *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
+static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
+static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
+static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
+static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
+static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
static int scannum _ANSI_ARGS_((struct vars *));
static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
@@ -24,24 +28,23 @@ static chr *scanplain _ANSI_ARGS_((struct vars *));
static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
-static color nlcolor _ANSI_ARGS_((struct vars *));
+static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
static VOID wordchrs _ANSI_ARGS_((struct vars *));
-static struct subre subre _ANSI_ARGS_((struct state *, struct state *, int, int, struct rtree *));
-static struct rtree *newrt _ANSI_ARGS_((struct vars *));
-static VOID freert _ANSI_ARGS_((struct vars *, struct rtree *));
-static VOID freertnode _ANSI_ARGS_((struct vars *, struct rtree *));
-static VOID optrt _ANSI_ARGS_((struct vars *, struct rtree *));
-static int numrt _ANSI_ARGS_((struct rtree *, int));
-static VOID markrt _ANSI_ARGS_((struct rtree *));
-static VOID cleanrt _ANSI_ARGS_((struct vars *));
-static VOID nfatree _ANSI_ARGS_((struct vars *, struct rtree *, FILE *));
-static VOID nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
+static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
+static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
+static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
+static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
+static int numst _ANSI_ARGS_((struct subre *, int));
+static VOID markst _ANSI_ARGS_((struct subre *));
+static VOID cleanst _ANSI_ARGS_((struct vars *));
+static int nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
+static int nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
static VOID freelacons _ANSI_ARGS_((struct subre *, int));
static VOID rfree _ANSI_ARGS_((regex_t *));
static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
-static VOID dumprt _ANSI_ARGS_((struct rtree *, FILE *, int));
-static VOID rtdump _ANSI_ARGS_((struct rtree *, FILE *, int, int));
+static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
+static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int, int));
/* === regc_lex.c === */
static VOID lexstart _ANSI_ARGS_((struct vars *));
static VOID prefixes _ANSI_ARGS_((struct vars *));
@@ -56,17 +59,18 @@ static chr newline _ANSI_ARGS_((NOPARMS));
static chr *ch _ANSI_ARGS_((NOPARMS));
static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
/* === regc_color.c === */
-static struct colormap *newcm _ANSI_ARGS_((struct vars *));
+static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
static VOID freecm _ANSI_ARGS_((struct colormap *));
static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
-static VOID fillcm _ANSI_ARGS_((struct colormap *));
-static VOID cmtreefill _ANSI_ARGS_((struct colormap *, union tree *, int));
-static color getcolor _ANSI_ARGS_((struct colormap *, pchr));
static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
static color maxcolor _ANSI_ARGS_((struct colormap *));
static color newcolor _ANSI_ARGS_((struct colormap *));
+static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
static color pseudocolor _ANSI_ARGS_((struct colormap *));
static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
+static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
+static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
+static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
@@ -115,10 +119,9 @@ static VOID cleanup _ANSI_ARGS_((struct nfa *));
static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
static int analyze _ANSI_ARGS_((struct nfa *));
-static int isempty _ANSI_ARGS_((struct state *, struct state *));
static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
-static VOID freecnfa _ANSI_ARGS_((struct cnfa *, int));
+static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
@@ -173,9 +176,9 @@ struct vars {
struct colormap *cm; /* character color map */
color nlcolor; /* color of newline */
struct state *wordchrs; /* state in nfa holding word-char outarcs */
- struct rtree *tree; /* subexpression tree */
- struct rtree *treechain; /* all tree nodes allocated */
- struct rtree *treefree; /* any free tree nodes */
+ struct subre *tree; /* subexpression tree */
+ struct subre *treechain; /* all tree nodes allocated */
+ struct subre *treefree; /* any free tree nodes */
int ntree; /* number of tree nodes */
struct cvec *cv; /* interface cvec */
struct cvec *cv2; /* utility cvec */
@@ -186,6 +189,7 @@ struct vars {
struct subre *lacons; /* lookahead-constraint vector */
int nlacons; /* size of lacons */
int usedshorter; /* used short-preferring quantifiers */
+ int unmatchable; /* can never match */
};
/* parsing macros; most know that `v' is the struct vars pointer */
@@ -199,6 +203,7 @@ struct vars {
#define ERR(e) VERR(v, e) /* record an error */
#define NOERR() {if (ISERR()) return;} /* if error seen, return */
#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
+#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
@@ -259,7 +264,6 @@ int flags;
if (re == NULL || string == NULL)
return REG_INVARG;
- assert(REG_ADVANCED == (REG_EXTENDED|REG_ADVF));
if ((flags&REG_QUOTE) &&
(flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
return REG_INVARG;
@@ -297,19 +301,18 @@ int flags;
re->re_fns = VS(&functions);
/* more complex setup, malloced things */
- v->cm = newcm(v);
- CNOERR();
- v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
- CNOERR();
re->re_guts = VS(MALLOC(sizeof(struct guts)));
if (re->re_guts == NULL)
return freev(v, REG_ESPACE);
g = (struct guts *)re->re_guts;
- ZAPCNFA(g->cnfa);
g->tree = NULL;
- g->cm = NULL;
+ initcm(v, &g->cmap);
+ v->cm = &g->cmap;
g->lacons = NULL;
g->nlacons = 0;
+ ZAPCNFA(g->search);
+ v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
+ CNOERR();
v->cv = newcvec(100, 20, 10);
if (v->cv == NULL)
return freev(v, REG_ESPACE);
@@ -324,53 +327,56 @@ int flags;
/* parsing */
lexstart(v); /* also handles prefixes */
- v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final, NONEYET);
+ if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
+ /* assign newline a unique color */
+ v->nlcolor = subcolor(v->cm, newline());
+ okcolors(v->nfa, v->cm);
+ }
+ CNOERR();
+ v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
CNOERR();
+ assert(v->tree != NULL);
/* finish setup of nfa and its subre tree */
specialcolors(v->nfa);
CNOERR();
if (debug != NULL) {
dumpnfa(v->nfa, debug);
- dumprt(v->tree, debug, 1);
+ dumpst(v->tree, debug, 1);
}
v->usedshorter = 0;
- optrt(v, v->tree);
- if (v->tree != NULL) {
- v->ntree = numrt(v->tree, 1);
- markrt(v->tree);
- } else
- v->ntree = 0;
- cleanrt(v);
+ v->unmatchable = 0;
+ optst(v, v->tree);
+ v->ntree = numst(v->tree, 1);
+ markst(v->tree);
+ cleanst(v);
if (debug != NULL) {
fprintf(debug, "-->\n");
- dumprt(v->tree, debug, 1);
+ dumpst(v->tree, debug, 1);
}
- /* build compacted NFAs for tree, lacons, main nfa */
- nfatree(v, v->tree, debug);
+ /* build compacted NFAs for tree, lacons, fast search */
+ re->re_info |= nfatree(v, v->tree, debug);
if (debug != NULL) {
fprintf(debug, "---->\n");
- dumprt(v->tree, debug, 1);
+ dumpst(v->tree, debug, 1);
}
CNOERR();
+ if (re->re_info&REG_UIMPOSSIBLE)
+ v->unmatchable = 1;
assert(v->nlacons == 0 || v->lacons != NULL);
for (i = 1; i < v->nlacons; i++)
nfanode(v, &v->lacons[i], debug);
CNOERR();
- re->re_info |= optimize(v->nfa, debug);
+ rainbow(v->nfa, v->cm, PLAIN, COLORLESS, v->nfa->pre, v->nfa->pre);
+ newarc(v->nfa, PLAIN, v->nfa->bos[0], v->nfa->pre, v->nfa->pre);
+ newarc(v->nfa, PLAIN, v->nfa->bos[1], v->nfa->pre, v->nfa->pre);
+ newarc(v->nfa, PLAIN, v->nfa->eos[0], v->nfa->pre, v->nfa->pre);
+ newarc(v->nfa, PLAIN, v->nfa->eos[1], v->nfa->pre, v->nfa->pre);
+ (DISCARD)optimize(v->nfa, debug);
CNOERR();
- if (v->nfa->post->nins <= 0)
- return freev(v, REG_IMPOSS); /* end unreachable! */
- assert(v->nfa->pre->nouts > 0);
- compact(v->nfa, &g->cnfa);
- CNOERR();
- freenfa(v->nfa);
- v->nfa = NULL;
-
- /* fill color map */
- fillcm(v->cm);
+ compact(v->nfa, &g->search);
CNOERR();
/* looks okay, package it up */
@@ -380,8 +386,6 @@ int flags;
g->cflags = v->cflags;
g->info = re->re_info;
g->nsub = re->re_nsub;
- g->cm = v->cm;
- v->cm = NULL;
g->tree = v->tree;
v->tree = NULL;
g->ntree = v->ntree;
@@ -390,6 +394,7 @@ int flags;
v->lacons = NULL;
g->nlacons = v->nlacons;
g->usedshorter = v->usedshorter;
+ g->unmatchable = v->unmatchable;
if (flags&REG_DUMP)
dump(re, stdout);
@@ -447,12 +452,10 @@ int err;
FREE(v->subs);
if (v->nfa != NULL)
freenfa(v->nfa);
- if (v->cm != NULL)
- freecm(v->cm);
if (v->tree != NULL)
- freert(v, v->tree);
+ freesubre(v, v->tree);
if (v->treechain != NULL)
- cleanrt(v);
+ cleanst(v);
if (v->cv != NULL)
freecvec(v->cv);
if (v->cv2 != NULL)
@@ -468,541 +471,562 @@ int err;
/*
- parse - parse an RE
- * Arguably this is too big and too complex and ought to be divided up.
- * However, the code is somewhat intertwined...
- *
- * Note that it is no longer necessary to be rigorous about freeing tree
- * nodes on error exits, as the tree machinery keeps track of them.
- ^ static struct rtree *parse(struct vars *, int, int, struct state *,
- ^ struct state *, int);
+ * This is actually just the top level, which parses a bunch of branches
+ * tied together with '|'. They appear in the tree as the left children
+ * of a chain of '|' subres.
+ ^ static struct subre *parse(struct vars *, int, int, struct state *,
+ ^ struct state *);
*/
-static struct rtree * /* NULL if no interesting substructure */
-parse(v, stopper, type, init, final, pprefer)
+static struct subre *
+parse(v, stopper, type, init, final)
struct vars *v;
int stopper; /* EOS or ')' */
int type; /* LACON (lookahead subRE) or PLAIN */
struct state *init; /* initial state */
struct state *final; /* final state */
-int pprefer; /* parent's short/long preference */
{
struct state *left; /* scaffolding for branch */
struct state *right;
- struct state *lp; /* scaffolding for current construct */
- struct state *rp;
- struct state *s; /* temporaries for new states */
- struct state *s2;
-# define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
- int m, n;
- int emptybranch; /* is there anything in this branch yet? */
- struct rtree *branches; /* top level */
- struct rtree *branch; /* current branch */
- struct subre *now; /* current subtree's top */
- struct subre sub; /* communication variable */
- struct rtree *rt1; /* temporaries */
- struct rtree *rt2;
- struct subre *t; /* work pointer, top of interesting subtree */
+ struct subre *branches; /* top level */
+ struct subre *branch; /* current branch */
+ struct subre *t; /* temporary */
int firstbranch; /* is this the first branch? */
- int capture; /* any capturing parens within this? */
- int constraint; /* is the current atom a constraint? */
assert(stopper == ')' || stopper == EOS);
- capture = 0;
- branches = newrt(v);
+ branches = subre(v, '|', LONGER, init, final);
+ NOERRN();
branch = branches;
- rt1 = NULL; /* shut up lint */
firstbranch = 1;
- NOERRN();
- do {
- /* a branch */
- emptybranch = 1; /* tentatively */
- left = newstate(v->nfa);
- right = newstate(v->nfa);
- NOERRN();
+ do { /* a branch */
if (!firstbranch) {
- rt1 = newrt(v);
+ /* need a place to hang it */
+ branch->right = subre(v, '|', LONGER, init, final);
NOERRN();
- branch->next = rt1;
- branch = rt1;
+ branch = branch->right;
}
+ firstbranch = 0;
+ left = newstate(v->nfa);
+ right = newstate(v->nfa);
+ NOERRN();
EMPTYARC(init, left);
EMPTYARC(right, final);
- lp = left;
- rp = right;
- branch->op = '|';
- now = &branch->left;
- *now = subre(left, right, NONEYET, 0, (struct rtree *)NULL);
- firstbranch = 0;
NOERRN();
+ branch->left = parsebranch(v, stopper, type, left, right, 0);
+ NOERRN();
+ branch->flags |= UP(branch->flags | branch->left->flags);
+ if ((branch->flags &~ branches->flags) != 0) /* new flags */
+ for (t = branches; t != branch; t = t->right)
+ t->flags |= branch->flags;
+ } while (EAT('|'));
+ assert(SEE(stopper) || SEE(EOS));
- while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
- /* initial bookkeeping */
- sub.begin = NULL; /* no substructure seen yet */
- sub.subno = 0;
- sub.prefer = NONEYET;
- constraint = 0;
- if (emptybranch) /* first of the branch */
- emptybranch = 0;
- else { /* implicit concat operator */
- lp = newstate(v->nfa);
- NOERRN();
- moveins(v->nfa, rp, lp);
- }
- assert(lp->nouts == 0); /* must string new code */
- assert(rp->nins == 0); /* between lp and rp */
-
- /* an atom... */
- switch (v->nexttype) {
- case '(': /* value flags as capturing or non */
- m = (type == LACON) ? 0 : v->nextvalue;
- if (m) {
- v->nsubexp++;
- sub.subno = v->nsubexp;
- if ((size_t)sub.subno >= v->nsubs)
- moresubs(v, sub.subno);
- assert((size_t)sub.subno < v->nsubs);
- } else
- sub.subno = 0;
- NEXT();
- sub.begin = lp; /* NB, substructure seen */
- sub.end = rp;
- /* use now->tree as temporary, so */
- /* things get freed on error returns */
- assert(now->tree == NULL);
- now->tree = parse(v, ')', PLAIN, lp, rp,
- now->prefer);
- assert(SEE(')') || ISERR());
- NEXT();
- NOERRN();
- if (!m && now->tree == NULL) {
- /* actually no relevant substructure */
- sub.begin = NULL;
- }
- if (now->tree != NULL) {
- if (now->tree->op == '|')
- sub.prefer = LONGER;
- else
- sub.prefer =
- now->tree->left.prefer;
- }
- /* must postpone other processing until we */
- /* know about any {0,0} quantifier */
- break;
- case BACKREF: /* the Feature From The Black Lagoon */
- INSIST(type != LACON, REG_ESUBREG);
- INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
- INSIST(v->subs[v->nextvalue] != NULL,
- REG_ESUBREG);
- NOERRN();
- assert(v->nextvalue > 0);
- sub.subno = -v->nextvalue;
- sub.begin = lp; /* NB, substructure seen */
- sub.end = rp;
- EMPTYARC(lp, rp); /* temporarily */
- assert(now->tree == NULL);
- NEXT();
- break;
- case LACON: /* lookahead constraint */
- m = v->nextvalue; /* is positive? */
- NEXT();
- s = newstate(v->nfa);
- s2 = newstate(v->nfa);
- NOERRN();
- rt1 = parse(v, ')', LACON, s, s2, NONEYET);
- assert(SEE(')') || ISERR());
- NEXT();
- m = newlacon(v, s, s2, m);
- freert(v, rt1);
- NOERRN();
- ARCV(LACON, m);
- constraint = 1;
- break;
- case PREFER: /* length preference */
- sub.prefer = (v->nextvalue) ? LONGER : SHORTER;
- NEXT();
- sub.begin = lp; /* NB, substructure seen */
- sub.end = rp;
- /* use now->tree as temporary, so */
- /* things get freed on error returns */
- assert(now->tree == NULL);
- now->tree = parse(v, ')', PLAIN, lp, rp,
- sub.prefer);
- assert(SEE(')') || ISERR());
- NEXT();
- NOERRN();
- if (now->prefer == NONEYET)
- now->prefer = sub.prefer;
- if (sub.prefer == now->prefer &&
- now->tree == NULL) {
- /* actually no relevant substructure */
- sub.begin = NULL;
- }
- break;
- case '[':
- if (v->nextvalue == 1)
- bracket(v, lp, rp);
- else
- cbracket(v, lp, rp);
- assert(SEE(']') || ISERR());
- NEXT();
- break;
- case '.':
- rainbow(v->nfa, v->cm, PLAIN,
- (v->cflags&REG_NLSTOP) ?
- nlcolor(v) : COLORLESS,
- lp, rp);
- NEXT();
- break;
- case '^':
- ARCV('^', 1);
- if (v->cflags&REG_NLANCH)
- ARCV(BEHIND, nlcolor(v));
- NEXT();
- constraint = 1;
- break;
- case '$':
- ARCV('$', 1);
- if (v->cflags&REG_NLANCH)
- ARCV(AHEAD, nlcolor(v));
- NEXT();
- constraint = 1;
- break;
- case SBEGIN:
- ARCV('^', 1); /* BOL */
- ARCV('^', 0); /* or BOS */
- NEXT();
- constraint = 1;
- break;
- case SEND:
- ARCV('$', 1); /* EOL */
- ARCV('$', 0); /* or EOS */
- NEXT();
- constraint = 1;
- break;
- case '<':
- wordchrs(v); /* does NEXT() */
- s = newstate(v->nfa);
- NOERRN();
- /* needs BOL, BOS, or nonword to left... */
- newarc(v->nfa, '^', 1, lp, s);
- newarc(v->nfa, '^', 0, lp, s);
- colorcomplement(v->nfa, v->cm, BEHIND,
- v->wordchrs, lp, s);
- /* ... and word to right */
- cloneouts(v->nfa, v->wordchrs, s, rp, AHEAD);
- /* (no need for special attention to \n) */
- constraint = 1;
- break;
- case '>':
- wordchrs(v); /* does NEXT() */
- s = newstate(v->nfa);
- NOERRN();
- /* needs word to left... */
- cloneouts(v->nfa, v->wordchrs, lp, s, BEHIND);
- /* ... and EOL, EOS, or nonword to right */
- newarc(v->nfa, '$', 1, s, rp);
- newarc(v->nfa, '$', 0, s, rp);
- colorcomplement(v->nfa, v->cm, AHEAD,
- v->wordchrs, s, rp);
- /* (no need for special attention to \n) */
- constraint = 1;
- break;
- case WBDRY:
- wordchrs(v); /* does NEXT() */
- s = newstate(v->nfa);
- NOERRN();
- /* needs BOL, BOS, or nonword to left... */
- newarc(v->nfa, '^', 1, lp, s);
- newarc(v->nfa, '^', 0, lp, s);
- colorcomplement(v->nfa, v->cm, BEHIND,
- v->wordchrs, lp, s);
- /* ... and word to right... */
- cloneouts(v->nfa, v->wordchrs, s, rp, AHEAD);
- /* ...or... */
- s = newstate(v->nfa);
- NOERRN();
- /* ...needs word to left... */
- cloneouts(v->nfa, v->wordchrs, lp, s, BEHIND);
- /* ... and EOL, EOS, or nonword to right */
- newarc(v->nfa, '$', 1, s, rp);
- newarc(v->nfa, '$', 0, s, rp);
- colorcomplement(v->nfa, v->cm, AHEAD,
- v->wordchrs, s, rp);
- /* (no need for special attention to \n) */
- constraint = 1;
- break;
- case NWBDRY:
- wordchrs(v); /* does NEXT() */
- s = newstate(v->nfa);
- NOERRN();
- /* needs word to both left and right... */
- cloneouts(v->nfa, v->wordchrs, lp, s, BEHIND);
- cloneouts(v->nfa, v->wordchrs, s, rp, AHEAD);
- /* ...or... */
- s = newstate(v->nfa);
- NOERRN();
- /* ...BOL, BOS, or nonword to left... */
- newarc(v->nfa, '^', 1, lp, s);
- newarc(v->nfa, '^', 0, lp, s);
- colorcomplement(v->nfa, v->cm, BEHIND,
- v->wordchrs, lp, s);
- /* ... and EOL, EOS, or nonword to right */
- newarc(v->nfa, '$', 1, s, rp);
- newarc(v->nfa, '$', 0, s, rp);
- colorcomplement(v->nfa, v->cm, AHEAD,
- v->wordchrs, s, rp);
- /* (no need for special attention to \n) */
- constraint = 1;
- break;
- case ')': /* unbalanced paren */
-#ifdef POSIX_MISTAKE
- if (!(v->cflags&REG_EXTENDED) ||
- (v->cflags&REG_ADVF)) {
- ERR(REG_EPAREN);
- return NULL;
- }
- NOTE(REG_UPBOTCH);
- /* fallthrough into case PLAIN */
-#else
- ERR(REG_EPAREN);
- return NULL;
- break;
-#endif
- case PLAIN:
- onechr(v, v->nextvalue, lp, rp);
- okcolors(v->nfa, v->cm);
- NOERRN();
- NEXT();
- break;
- case '*':
- case '+':
- case '?':
- case '{':
- ERR(REG_BADRPT);
- return NULL;
- break;
- default:
- ERR(REG_ASSERT);
- return NULL;
- break;
- }
-
- /* ...possibly followed by a quantifier */
- switch (v->nexttype) {
- case '*':
- m = 0;
- n = INFINITY;
- sub.prefer = (v->nextvalue) ? LONGER : SHORTER;
- NEXT();
- break;
- case '+':
- m = 1;
- n = INFINITY;
- sub.prefer = (v->nextvalue) ? LONGER : SHORTER;
- NEXT();
- break;
- case '?':
- m = 0;
- n = 1;
- sub.prefer = (v->nextvalue) ? LONGER : SHORTER;
- NEXT();
- break;
- case '{':
- NEXT();
- m = scannum(v);
- if (EAT(',')) {
- if (SEE(DIGIT))
- n = scannum(v);
- else
- n = INFINITY;
- if (m > n) {
- ERR(REG_BADBR);
- return NULL;
- }
- } else
- n = m;
- if (!SEE('}')) { /* gets errors too */
- ERR(REG_BADBR);
- return NULL;
- }
- if (m != n)
- sub.prefer = (v->nextvalue) ? LONGER :
- SHORTER;
- NEXT();
- break;
- default: /* no quantifier */
- m = n = 1;
- constraint = 0;
- break;
- }
+ if (!SEE(stopper)) {
+ assert(stopper == ')' && SEE(EOS));
+ ERR(REG_EPAREN);
+ }
- /* constraints may not be quantified */
- if (constraint) {
- ERR(REG_BADRPT);
- return NULL;
- }
+ /* optimize out simple cases */
+ if (branch == branches) { /* only one branch */
+ assert(branch->right == NULL);
+ t = branch->left;
+ branch->left = NULL;
+ freesubre(v, branches);
+ branches = t;
+ } else if (!MESSY(branches->flags)) { /* no interesting innards */
+ freesubre(v, branches->left);
+ branches->left = NULL;
+ freesubre(v, branches->right);
+ branches->right = NULL;
+ branches->op = '=';
+ }
- /* annoying special case: {0,0} cancels everything */
- if (m == 0 && n == 0 && sub.begin != NULL) {
- freert(v, now->tree);
- now->tree = NULL;
- sub.begin = NULL; /* no substructure */
- sub.prefer = NONEYET;
- /* the repeat() below will do the rest */
- }
+ return branches;
+}
- /* if no substructure, avoid hard part */
- if (now->prefer == NONEYET)
- now->prefer = sub.prefer;
- if (sub.begin == NULL && (sub.prefer == NONEYET ||
- sub.prefer == now->prefer)) {
- assert(sub.subno >= 0 || (m == 0 && n == 0));
- if (!(m == 1 && n == 1))
- repeat(v, lp, rp, m, n);
- continue; /* NOTE CONTINUE */
- }
+/*
+ - parsebranch - parse one branch of an RE
+ * This mostly manages concatenation, working closely with parseqatom().
+ * Concatenated things are bundled up as much as possible, with separate
+ * ',' nodes introduced only when necessary due to substructure.
+ ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
+ ^ struct state *, int);
+ */
+static struct subre *
+parsebranch(v, stopper, type, left, right, partial)
+struct vars *v;
+int stopper; /* EOS or ')' */
+int type; /* LACON (lookahead subRE) or PLAIN */
+struct state *left; /* leftmost state */
+struct state *right; /* rightmost state */
+int partial; /* is this only part of a branch? */
+{
+ struct state *lp; /* left end of current construct */
+ int seencontent; /* is there anything in this branch yet? */
+ struct subre *t;
- /* hard part: something messy seen */
- /* break subRE into pre, x{...}, post-to-be */
- capture = 1; /* upper levels will care */
- rt1 = newrt(v);
- rt2 = newrt(v);
- s = newstate(v->nfa); /* between x and post-to-be */
+ lp = left;
+ seencontent = 0;
+ t = subre(v, '=', 0, left, right); /* op '=' is tentative */
+ NOERRN();
+ while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
+ if (seencontent) { /* implicit concat operator */
+ lp = newstate(v->nfa);
NOERRN();
- moveins(v->nfa, rp, s);
- EMPTYARC(s, rp);
- rt1->op = ',';
- rt1->left = subre(now->begin, lp, now->prefer, 0,
- (struct rtree *)NULL);
- assert(now->end == rp);
- rt1->right = subre(lp, rp, sub.prefer, 0, rt2);
- rt2->op = ',';
- rt2->left = subre(lp, s, sub.prefer, 0, now->tree);
- rt2->right = subre(s, rp, NONEYET, 0,
- (struct rtree *)NULL);
- now->tree = rt1;
- now = &rt2->right; /* future elaborations here */
- t = &rt2->left; /* current activity here */
-
- /* if it's a backref, time to replicate the subNFA */
- if (sub.subno < 0) {
- assert(lp->nouts == 1); /* just the EMPTY */
- delsub(v->nfa, lp, s);
- assert(v->subs[-sub.subno] != NULL);
- dupnfa(v->nfa, v->subs[-sub.subno]->begin,
- v->subs[-sub.subno]->end, lp, s);
- NOERRN();
- }
+ moveins(v->nfa, right, lp);
+ }
+ seencontent = 1;
- /* if no/vacuous quantifier and not backref, done */
- if (m == 1 && n == 1 && sub.subno >= 0) {
- t->subno = sub.subno;
- if (sub.subno > 0)
- v->subs[sub.subno] = t;
- continue; /* NOTE CONTINUE */
- }
+ /* NB, recursion in parseqatom() may swallow rest of branch */
+ parseqatom(v, stopper, type, lp, right, t);
+ }
- /* really sticky part, quantified capturer/backref */
- /* first, turn x{0,...} into x{1,...}| */
- if (m == 0) {
- s = newstate(v->nfa);
- s2 = newstate(v->nfa);
- rt1 = newrt(v);
- rt2 = newrt(v);
- NOERRN();
- moveouts(v->nfa, t->begin, s);
- EMPTYARC(t->begin, s);
- EMPTYARC(t->begin, s2);
- EMPTYARC(s2, t->end);
- rt1->op = rt2->op = '|';
- rt1->left = subre(s, t->end, sub.prefer, 0,
- t->tree);
- rt1->next = rt2;
- rt2->left = subre(s2, t->end, sub.prefer, 0,
- (struct rtree *)NULL);
- t->tree = rt1;
- t = &rt1->left;
- m = 1;
- }
+ if (!seencontent) { /* empty branch */
+ if (!partial)
+ NOTE(REG_UUNSPEC);
+ assert(lp == left);
+ EMPTYARC(left, right);
+ }
- /* second, x{1,1} is just x */
- if (m == 1 && n == 1 && sub.subno >= 0) {
- t->subno = sub.subno;
- if (sub.subno > 0)
- v->subs[sub.subno] = t;
- continue; /* NOTE CONTINUE */
- }
+ return t;
+}
- /* backrefs get special treatment */
- if (sub.subno < 0) {
- repeat(v, t->begin, t->end, m, n);
- rt1 = newrt(v);
- NOERRN();
- assert(t->tree == NULL);
- t->tree = rt1;
- rt1->op = 'b';
- rt1->left.subno = sub.subno;
- rt1->left.min = (short)m;
- rt1->left.max = (short)n;
- rt1->left.prefer = sub.prefer;
- continue; /* NOTE CONTINUE */
- }
+/*
+ - parseqatom - parse one quantified atom or constraint of an RE
+ * The bookkeeping near the end cooperates very closely with parsebranch();
+ * in particular, it contains a recursion that can involve parsing the rest
+ * of the branch, making this function's name somewhat inaccurate.
+ ^ static VOID parseqatom(struct vars *, int, int, struct state *,
+ ^ struct state *, struct subre *);
+ */
+static VOID
+parseqatom(v, stopper, type, lp, rp, top)
+struct vars *v;
+int stopper; /* EOS or ')' */
+int type; /* LACON (lookahead subRE) or PLAIN */
+struct state *lp; /* left state to hang it on */
+struct state *rp; /* right state to hang it on */
+struct subre *top; /* subtree top */
+{
+ struct state *s; /* temporaries for new states */
+ struct state *s2;
+# define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
+ int m, n;
+ struct subre *atom; /* atom's subtree */
+ struct subre *t;
+ int cap; /* capturing parens? */
+ int pos; /* positive lookahead? */
+ int subno; /* capturing-parens or backref number */
+ int atomtype;
+ int qprefer; /* quantifier short/long preference */
+ int f;
+ struct subre **atomp; /* where the pointer to atom is */
+
+ /* initial bookkeeping */
+ atom = NULL;
+ assert(lp->nouts == 0); /* must string new code */
+ assert(rp->nins == 0); /* between lp and rp */
+ subno = 0; /* just to shut lint up */
+
+ /* an atom or constraint... */
+ atomtype = v->nexttype;
+ switch (atomtype) {
+ /* first, constraints, which end by returning */
+ case '^':
+ ARCV('^', 1);
+ if (v->cflags&REG_NLANCH)
+ ARCV(BEHIND, v->nlcolor);
+ NEXT();
+ return;
+ break;
+ case '$':
+ ARCV('$', 1);
+ if (v->cflags&REG_NLANCH)
+ ARCV(AHEAD, v->nlcolor);
+ NEXT();
+ return;
+ break;
+ case SBEGIN:
+ ARCV('^', 1); /* BOL */
+ ARCV('^', 0); /* or BOS */
+ NEXT();
+ return;
+ break;
+ case SEND:
+ ARCV('$', 1); /* EOL */
+ ARCV('$', 0); /* or EOS */
+ NEXT();
+ return;
+ break;
+ case '<':
+ wordchrs(v); /* does NEXT() */
+ s = newstate(v->nfa);
+ NOERR();
+ nonword(v, BEHIND, lp, s);
+ word(v, AHEAD, s, rp);
+ return;
+ break;
+ case '>':
+ wordchrs(v); /* does NEXT() */
+ s = newstate(v->nfa);
+ NOERR();
+ word(v, BEHIND, lp, s);
+ nonword(v, AHEAD, s, rp);
+ return;
+ break;
+ case WBDRY:
+ wordchrs(v); /* does NEXT() */
+ s = newstate(v->nfa);
+ NOERR();
+ nonword(v, BEHIND, lp, s);
+ word(v, AHEAD, s, rp);
+ s = newstate(v->nfa);
+ NOERR();
+ word(v, BEHIND, lp, s);
+ nonword(v, AHEAD, s, rp);
+ return;
+ break;
+ case NWBDRY:
+ wordchrs(v); /* does NEXT() */
+ s = newstate(v->nfa);
+ NOERR();
+ word(v, BEHIND, lp, s);
+ word(v, AHEAD, s, rp);
+ s = newstate(v->nfa);
+ NOERR();
+ nonword(v, BEHIND, lp, s);
+ nonword(v, AHEAD, s, rp);
+ return;
+ break;
+ case LACON: /* lookahead constraint */
+ pos = v->nextvalue;
+ NEXT();
+ s = newstate(v->nfa);
+ s2 = newstate(v->nfa);
+ NOERR();
+ t = parse(v, ')', LACON, s, s2);
+ freesubre(v, t); /* internal structure irrelevant */
+ assert(SEE(')') || ISERR());
+ NEXT();
+ n = newlacon(v, s, s2, pos);
+ NOERR();
+ ARCV(LACON, n);
+ return;
+ break;
+ /* then errors, to get them out of the way */
+ case '*':
+ case '+':
+ case '?':
+ case '{':
+ ERR(REG_BADRPT);
+ return;
+ break;
+ default:
+ ERR(REG_ASSERT);
+ return;
+ break;
+ /* then plain characters, and minor variants on that theme */
+ case ')': /* unbalanced paren */
+ if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
+ ERR(REG_EPAREN);
+ return;
+ }
+ /* legal in EREs due to specification botch */
+ NOTE(REG_UPBOTCH);
+ /* fallthrough into case PLAIN */
+ case PLAIN:
+ onechr(v, v->nextvalue, lp, rp);
+ okcolors(v->nfa, v->cm);
+ NOERR();
+ NEXT();
+ break;
+ case '[':
+ if (v->nextvalue == 1)
+ bracket(v, lp, rp);
+ else
+ cbracket(v, lp, rp);
+ assert(SEE(']') || ISERR());
+ NEXT();
+ break;
+ case '.':
+ rainbow(v->nfa, v->cm, PLAIN,
+ (v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS,
+ lp, rp);
+ NEXT();
+ break;
+ /* and finally the ugly stuff */
+ case '(': /* value flags as capturing or non */
+ cap = (type == LACON) ? 0 : v->nextvalue;
+ if (cap) {
+ v->nsubexp++;
+ subno = v->nsubexp;
+ if ((size_t)subno >= v->nsubs)
+ moresubs(v, subno);
+ assert((size_t)subno < v->nsubs);
+ } else
+ atomtype = PLAIN; /* something that's not '(' */
+ NEXT();
+ /* need new endpoints because tree will contain pointers */
+ s = newstate(v->nfa);
+ s2 = newstate(v->nfa);
+ NOERR();
+ EMPTYARC(lp, s);
+ EMPTYARC(s2, rp);
+ NOERR();
+ atom = parse(v, ')', PLAIN, s, s2);
+ assert(SEE(')') || ISERR());
+ NEXT();
+ NOERR();
+ if (cap) {
+ v->subs[subno] = atom;
+ t = subre(v, '(', atom->flags|CAP, lp, rp);
+ NOERR();
+ t->subno = subno;
+ t->left = atom;
+ atom = t;
+ }
+ /* postpone everything else pending possible {0} */
+ break;
+ case BACKREF: /* the Feature From The Black Lagoon */
+ INSIST(type != LACON, REG_ESUBREG);
+ INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
+ INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+ NOERR();
+ assert(v->nextvalue > 0);
+ atom = subre(v, 'b', BACKR, lp, rp);
+ subno = v->nextvalue;
+ atom->subno = subno;
+ EMPTYARC(lp, rp); /* temporarily, so there's something */
+ NEXT();
+ break;
+ }
- /* turn x{m,n} into x{m-1,n-1}x, with capturing */
- /* parens in only second x */
- s = newstate(v->nfa);
- NOERRN();
- moveouts(v->nfa, t->begin, s);
- dupnfa(v->nfa, s, t->end, t->begin, s);
- assert(m >= 1 && m != INFINITY && n >= 1);
- repeat(v, t->begin, s, m-1, (n == INFINITY) ? n : n-1);
- rt1 = newrt(v);
- NOERRN();
- rt1->op = ',';
- rt1->left = subre(t->begin, s, sub.prefer, 0,
- (struct rtree *)NULL);
- /* sub.prefer not really right, but doesn't matter */
- rt1->right = subre(s, t->end, sub.prefer, sub.subno,
- t->tree);
- if (sub.subno > 0)
- v->subs[sub.subno] = &rt1->right;
- t->tree = rt1;
+ /* ...and an atom may be followed by a quantifier */
+ switch (v->nexttype) {
+ case '*':
+ m = 0;
+ n = INFINITY;
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ NEXT();
+ break;
+ case '+':
+ m = 1;
+ n = INFINITY;
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ NEXT();
+ break;
+ case '?':
+ m = 0;
+ n = 1;
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ NEXT();
+ break;
+ case '{':
+ NEXT();
+ m = scannum(v);
+ if (EAT(',')) {
+ if (SEE(DIGIT))
+ n = scannum(v);
+ else
+ n = INFINITY;
+ if (m > n) {
+ ERR(REG_BADBR);
+ return;
+ }
+ /* {m,n} exercises preference, even if it's {m,m} */
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ } else {
+ n = m;
+ /* {m} passes operand's preference through */
+ qprefer = 0;
}
- if (emptybranch) {
- NOTE(REG_UUNSPEC);
- EMPTYARC(lp, rp);
+ if (!SEE('}')) { /* catches errors too */
+ ERR(REG_BADBR);
+ return;
}
- } while (EAT('|'));
- assert(SEE(stopper) || SEE(EOS));
+ NEXT();
+ break;
+ default: /* no quantifier */
+ m = n = 1;
+ qprefer = 0;
+ break;
+ }
- if (!SEE(stopper)) {
- assert(stopper == ')' && SEE(EOS));
- ERR(REG_EPAREN);
+ /* annoying special case: {0} or {0,0} cancels everything */
+ if (m == 0 && n == 0) {
+ if (atom != NULL)
+ freesubre(v, atom);
+ if (atomtype == '(')
+ v->subs[subno] = NULL;
+ delsub(v->nfa, lp, rp);
+ EMPTYARC(lp, rp);
+ return;
}
- /* higher levels care about our preference in certain situations */
- if (branch != branches) { /* >1 branch */
- if (pprefer != LONGER)
- capture = 1;
- } else if (branches->left.prefer != pprefer)
- capture = 1;
-
- /* optimize out vacuous alternation */
- if (branch == branches) {
- assert(branch->next == NULL && branch->right.begin == NULL);
- assert(branch->left.subno == 0);
- if (capture && branch->left.tree == NULL)
- branch->op = ',';
- else {
- branches = branch->left.tree; /* might be NULL */
- freertnode(v, branch);
- }
+ /* if not a messy case, avoid hard part */
+ assert(!MESSY(top->flags));
+ f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
+ if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
+ if (!(m == 1 && n == 1))
+ repeat(v, lp, rp, m, n);
+ if (atom != NULL)
+ freesubre(v, atom);
+ top->flags = f;
+ return;
+ }
+
+ /*
+ * hard part: something messy
+ * That is, capturing parens, back reference, short/long clash, or
+ * an atom with substructure containing one of those.
+ */
+
+ /* now we'll need a subre for the contents even if they're boring */
+ if (atom == NULL) {
+ atom = subre(v, '=', 0, lp, rp);
+ NOERR();
+ }
+
+ /*
+ * prepare a general-purpose state skeleton
+ *
+ * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
+ * / /
+ * [lp] ----> [s2] ----bypass---------------------
+ *
+ * where bypass is an empty, and prefix is some repetitions of atom
+ */
+ s = newstate(v->nfa); /* first, new endpoints for the atom */
+ s2 = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, lp, s);
+ moveins(v->nfa, rp, s2);
+ NOERR();
+ atom->begin = s;
+ atom->end = s2;
+ s = newstate(v->nfa); /* and spots for prefix and bypass */
+ s2 = newstate(v->nfa);
+ NOERR();
+ EMPTYARC(lp, s);
+ EMPTYARC(lp, s2);
+ NOERR();
+
+ /* break remaining subRE into x{...} and what follows */
+ t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
+ t->left = atom;
+ atomp = &t->left;
+ /* here we should recurse... but we must postpone that to the end */
+
+ /* split top into prefix and remaining */
+ assert(top->op == '=' && top->left == NULL && top->right == NULL);
+ top->left = subre(v, '=', top->flags, top->begin, lp);
+ top->op = '.';
+ top->right = t;
+
+ /* if it's a backref, now is the time to replicate the subNFA */
+ if (atomtype == BACKREF) {
+ assert(atom->begin->nouts == 1); /* just the EMPTY */
+ delsub(v->nfa, atom->begin, atom->end);
+ assert(v->subs[subno] != NULL);
+ /* and here's why the recursion got postponed: it must */
+ /* wait until the skeleton is filled in, because it may */
+ /* hit a backref that wants to copy the filled-in skeleton */
+ dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
+ atom->begin, atom->end);
+ NOERR();
+ }
+
+ /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
+ if (m == 0) {
+ EMPTYARC(s2, atom->end); /* the bypass */
+ assert(PREF(qprefer) != 0);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '|', f, lp, atom->end);
+ NOERR();
+ t->left = atom;
+ t->right = subre(v, '|', PREF(f), s2, atom->end);
+ NOERR();
+ t->right->left = subre(v, '=', 0, s2, atom->end);
+ NOERR();
+ *atomp = t;
+ atomp = &t->left;
+ m = 1;
}
- if (capture) /* actually a catchall flag */
- return branches;
- freert(v, branches);
- return NULL;
+ /* deal with the rest of the quantifier */
+ if (atomtype == BACKREF) {
+ /* special case: backrefs have internal quantifiers */
+ EMPTYARC(s, atom->begin); /* empty prefix */
+ /* just stuff everything into atom */
+ repeat(v, atom->begin, atom->end, m, n);
+ atom->min = (short)m;
+ atom->max = (short)n;
+ atom->flags |= COMBINE(qprefer, atom->flags);
+ } else if (m == 1 && n == 1) {
+ /* no/vacuous quantifier: done */
+ EMPTYARC(s, atom->begin); /* empty prefix */
+ } else {
+ /* turn x{m,n} into x{m-1,n-1}x, with capturing */
+ /* parens in only second x */
+ dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
+ assert(m >= 1 && m != INFINITY && n >= 1);
+ repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '.', f, s, atom->end); /* prefix and atom */
+ NOERR();
+ t->left = subre(v, '=', PREF(f), s, atom->begin);
+ NOERR();
+ t->right = atom;
+ *atomp = t;
+ }
+
+ /* and finally, look after that postponed recursion */
+ t = top->right;
+ if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
+ t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+ else {
+ EMPTYARC(atom->end, rp);
+ t->right = subre(v, '=', 0, atom->end, rp);
+ }
+ assert(SEE('|') || SEE(stopper) || SEE(EOS));
+ t->flags |= COMBINE(t->flags, t->right->flags);
+ top->flags |= COMBINE(top->flags, t->flags);
+}
+
+/*
+ - nonword - generate arcs for non-word-character ahead or behind
+ ^ static VOID nonword(struct vars *, int, struct state *, struct state *);
+ */
+static VOID
+nonword(v, dir, lp, rp)
+struct vars *v;
+int dir; /* AHEAD or BEHIND */
+struct state *lp;
+struct state *rp;
+{
+ int anchor = (dir == AHEAD) ? '$' : '^';
+
+ assert(dir == AHEAD || dir == BEHIND);
+ newarc(v->nfa, anchor, 1, lp, rp);
+ newarc(v->nfa, anchor, 0, lp, rp);
+ colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
+ /* (no need for special attention to \n) */
+}
+
+/*
+ - word - generate arcs for word character ahead or behind
+ ^ static VOID word(struct vars *, int, struct state *, struct state *);
+ */
+static VOID
+word(v, dir, lp, rp)
+struct vars *v;
+int dir; /* AHEAD or BEHIND */
+struct state *lp;
+struct state *rp;
+{
+ assert(dir == AHEAD || dir == BEHIND);
+ cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
+ /* (no need for special attention to \n) */
}
/*
@@ -1163,7 +1187,7 @@ struct state *rp;
NOERR();
bracket(v, left, right);
if (v->cflags&REG_NLSTOP)
- newarc(v->nfa, PLAIN, nlcolor(v), left, right);
+ newarc(v->nfa, PLAIN, v->nlcolor, left, right);
NOERR();
assert(lp->nouts == 0); /* all outarcs will be ours */
@@ -1181,7 +1205,7 @@ struct state *rp;
/* but complementing gets messy in the presence of MCCEs... */
NOTE(REG_ULOCALE);
for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) {
- co = getcolor(v->cm, *p);
+ co = GETCOLOR(v->cm, *p);
a = findarc(lp, PLAIN, co);
ba = findarc(left, PLAIN, co);
if (ba == NULL) {
@@ -1391,7 +1415,7 @@ struct cvec *cv;
okcolors(v->nfa, v->cm);
} else {
a = findarc(v->mccepbegin, PLAIN,
- getcolor(v->cm, leader));
+ GETCOLOR(v->cm, leader));
assert(a != NULL);
s = a->to;
assert(s != v->mccepend);
@@ -1438,6 +1462,7 @@ struct state *lp;
struct state *rp;
{
chr ch, from, to;
+ celt ce;
chr *p;
int i;
color co;
@@ -1478,16 +1503,17 @@ struct state *rp;
for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
from = *p;
to = *(p+1);
- for (ch = from; ch <= to; ch++)
- if (!ISCELEADER(v, ch))
- newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp,
- rp);
- else {
- assert(singleton(v->cm, ch));
- assert(leads != NULL);
- if (!haschr(leads, ch))
- addchr(leads, ch);
- }
+ while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) {
+ if (from < ce)
+ subrange(v, from, ce - 1, lp, rp);
+ assert(singleton(v->cm, ce));
+ assert(leads != NULL);
+ if (!haschr(leads, ce))
+ addchr(leads, ce);
+ from = ce + 1;
+ }
+ if (from <= to)
+ subrange(v, from, to, lp, rp);
}
if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
@@ -1496,7 +1522,7 @@ struct state *rp;
/* deal with the MCCE leaders */
NOTE(REG_ULOCALE);
for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) {
- co = getcolor(v->cm, *p);
+ co = GETCOLOR(v->cm, *p);
a = findarc(lp, PLAIN, co);
if (a != NULL)
s = a->to;
@@ -1519,7 +1545,8 @@ struct state *rp;
for (i = 0; i < cv->nmcces; i++) {
p = cv->mcces[i];
assert(singleton(v->cm, *p));
- co = getcolor(v->cm, *p++);
+ ch = *p++;
+ co = GETCOLOR(v->cm, ch);
a = findarc(lp, PLAIN, co);
if (a != NULL)
s = a->to;
@@ -1531,7 +1558,8 @@ struct state *rp;
}
assert(*p != 0); /* at least two chars */
assert(singleton(v->cm, *p));
- co = getcolor(v->cm, *p++);
+ ch = *p++;
+ co = GETCOLOR(v->cm, ch);
assert(*p == 0); /* and only two, for now */
newarc(v->nfa, PLAIN, co, s, rp);
NOERR();
@@ -1539,20 +1567,30 @@ struct state *rp;
}
/*
- - nlcolor - assign newline a unique color, if it doesn't have one already
- * Restriction: can't be called when there are subcolors open. (Maybe
- * this should be enforced...)
- ^ static color nlcolor(struct vars *);
+ - nextleader - find next MCCE leader within range
+ ^ static celt nextleader(struct vars *, pchr, pchr);
*/
-static color
-nlcolor(v)
+static celt /* NOCELT means none */
+nextleader(v, from, to)
struct vars *v;
+pchr from;
+pchr to;
{
- if (v->nlcolor == COLORLESS) {
- v->nlcolor = subcolor(v->cm, newline());
- okcolors(v->nfa, v->cm);
+ int i;
+ chr *p;
+ chr ch;
+ celt it = NOCELT;
+
+ if (v->mcces == NULL)
+ return it;
+
+ for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) {
+ ch = *p;
+ if (from <= ch && ch <= to)
+ if (it == NOCELT || ch < it)
+ it = ch;
}
- return v->nlcolor;
+ return it;
}
/*
@@ -1579,6 +1617,7 @@ struct vars *v;
left = newstate(v->nfa);
right = newstate(v->nfa);
NOERR();
+ /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
lexword(v);
NEXT();
assert(v->savenow != NULL && SEE('['));
@@ -1590,273 +1629,169 @@ struct vars *v;
}
/*
- - subre - construct a subre struct
- ^ static struct subre subre(struct state *, struct state *, int, int,
- ^ struct rtree *);
+ - subre - allocate a subre
+ ^ static struct subre *subre(struct vars *, int, int, struct state *,
+ ^ struct state *);
*/
-static struct subre
-subre(begin, end, prefer, subno, tree)
+static struct subre *
+subre(v, op, flags, begin, end)
+struct vars *v;
+int op;
+int flags;
struct state *begin;
struct state *end;
-int prefer;
-int subno;
-struct rtree *tree;
-{
- struct subre ret;
-
- ret.begin = begin;
- ret.end = end;
- ret.prefer = prefer;
- ret.subno = subno;
- ret.min = ret.max = 1;
- ret.tree = tree;
- ZAPCNFA(ret.cnfa);
- return ret;
-}
-
-/*
- - newrt - allocate subRE-tree node
- ^ static struct rtree *newrt(struct vars *);
- */
-static struct rtree *
-newrt(v)
-struct vars *v;
{
- struct rtree *rt;
+ struct subre *ret;
- rt = v->treefree;
- if (rt != NULL)
- v->treefree = rt->next;
+ ret = v->treefree;
+ if (ret != NULL)
+ v->treefree = ret->left;
else {
- rt = (struct rtree *)MALLOC(sizeof(struct rtree));
- if (rt == NULL) {
+ ret = (struct subre *)MALLOC(sizeof(struct subre));
+ if (ret == NULL) {
ERR(REG_ESPACE);
return NULL;
}
- rt->chain = v->treechain;
- v->treechain = rt;
+ ret->chain = v->treechain;
+ v->treechain = ret;
}
- rt->op = '?'; /* invalid */
- rt->flags = 0;
- rt->no = 0;
- rt->left.begin = NULL;
- rt->left.end = NULL;
- rt->left.prefer = NONEYET;
- rt->left.subno = 0;
- rt->left.min = rt->left.max = 1;
- rt->left.tree = NULL;
- ZAPCNFA(rt->left.cnfa);
- rt->right.begin = NULL;
- rt->right.end = NULL;
- rt->right.prefer = NONEYET;
- rt->right.subno = 0;
- rt->right.min = rt->right.max = 1;
- rt->right.tree = NULL;
- ZAPCNFA(rt->right.cnfa);
- rt->next = NULL;
-
- return rt;
+ assert(strchr("|.b(=", op) != NULL);
+
+ ret->op = op;
+ ret->flags = flags;
+ ret->retry = 0;
+ ret->subno = 0;
+ ret->min = ret->max = 1;
+ ret->left = NULL;
+ ret->right = NULL;
+ ret->begin = begin;
+ ret->end = end;
+ ZAPCNFA(ret->cnfa);
+
+ return ret;
}
/*
- - freert - free a subRE subtree
- ^ static VOID freert(struct vars *, struct rtree *);
+ - freesubre - free a subRE subtree
+ ^ static VOID freesubre(struct vars *, struct subre *);
*/
static VOID
-freert(v, rt)
+freesubre(v, sr)
struct vars *v; /* might be NULL */
-struct rtree *rt;
+struct subre *sr;
{
- if (rt == NULL)
+ if (sr == NULL)
return;
- if (rt->left.tree != NULL)
- freert(v, rt->left.tree);
- if (rt->right.tree != NULL)
- freert(v, rt->right.tree);
- if (rt->next != NULL)
- freert(v, rt->next);
+ if (sr->left != NULL)
+ freesubre(v, sr->left);
+ if (sr->right != NULL)
+ freesubre(v, sr->right);
- freertnode(v, rt);
+ freesrnode(v, sr);
}
/*
- - freertnode - free one node in a subRE subtree
- ^ static VOID freertnode(struct vars *, struct rtree *);
+ - freesrnode - free one node in a subRE subtree
+ ^ static VOID freesrnode(struct vars *, struct subre *);
*/
static VOID
-freertnode(v, rt)
+freesrnode(v, sr)
struct vars *v; /* might be NULL */
-struct rtree *rt;
+struct subre *sr;
{
- if (rt == NULL)
+ if (sr == NULL)
return;
- if (!NULLCNFA(rt->left.cnfa))
- freecnfa(&rt->left.cnfa, 0);
- if (!NULLCNFA(rt->right.cnfa))
- freecnfa(&rt->right.cnfa, 0);
- rt->flags = 0;
+ if (!NULLCNFA(sr->cnfa))
+ freecnfa(&sr->cnfa);
+ sr->flags = 0;
if (v != NULL) {
- rt->next = v->treefree;
- v->treefree = rt;
+ sr->left = v->treefree;
+ v->treefree = sr;
} else
- FREE(rt);
+ FREE(sr);
}
/*
- - optrt - optimize a subRE subtree
- ^ static VOID optrt(struct vars *, struct rtree *);
+ - optst - optimize a subRE subtree
+ ^ static VOID optst(struct vars *, struct subre *);
*/
static VOID
-optrt(v, rt)
+optst(v, t)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
{
- struct rtree *t;
- int subno;
-
- if (rt == NULL)
+ if (t == NULL)
return;
- assert(rt->op != 'b');
-
- /* pull up subtrees if possible */
- if (rt->left.begin != NULL && rt->left.tree != NULL &&
- rt->left.tree->op != 'b') {
- t = rt->left.tree;
- optrt(v, t);
- if (t->right.begin == NULL && t->next == NULL &&
- (rt->left.prefer == NONEYET ||
- t->left.prefer == rt->left.prefer) &&
- (rt->left.subno == 0 || t->left.subno == 0)) {
- subno = rt->left.subno;
- rt->left = t->left;
- assert(NULLCNFA(t->left.cnfa));
- freertnode(v, t);
- if (subno != 0) {
- assert(rt->left.subno == 0 && subno > 0);
- rt->left.subno = subno;
- }
- }
- }
- if (rt->right.begin != NULL && rt->right.tree != NULL &&
- rt->right.tree->op != 'b') {
- t = rt->right.tree;
- optrt(v, t);
- if (t->right.begin == NULL && t->next == NULL &&
- (rt->right.prefer == NONEYET ||
- t->right.prefer == rt->right.prefer) &&
- (rt->right.subno == 0 || t->right.subno == 0)) {
- subno = rt->right.subno;
- rt->right = t->left;
- assert(NULLCNFA(t->right.cnfa));
- freertnode(v, t);
- if (subno != 0) {
- assert(rt->right.subno == 0 && subno > 0);
- rt->right.subno = subno;
- }
- }
- }
-
- /* simplify empties */
- if (rt->left.begin != NULL && isempty(rt->left.begin, rt->left.end))
- rt->left.end = rt->left.begin;
- if (rt->right.begin != NULL && isempty(rt->right.begin, rt->right.end))
- rt->right.end = rt->right.begin;
-
- /* if left subtree vacuous and right non-empty, move right over */
- if (rt->left.begin != NULL && rt->left.begin == rt->left.end &&
- rt->left.subno == 0 && rt->left.tree == NULL &&
- rt->right.begin != NULL) {
- rt->left = rt->right;
- rt->right.begin = NULL;
- rt->right.tree = NULL;
- }
-
- /* if right subtree vacuous, clear it out */
- if (rt->right.begin != NULL && rt->right.begin == rt->right.end &&
- rt->right.subno == 0 && rt->right.tree == NULL) {
- rt->right.begin = NULL;
- rt->right.tree = NULL;
- }
/* preference cleanup and analysis */
- if (rt->left.prefer == NONEYET)
- rt->left.prefer = LONGER;
- if (rt->left.prefer == SHORTER)
+ if (t->flags&SHORTER)
v->usedshorter = 1;
- if (rt->right.begin != NULL) {
- if (rt->right.prefer == NONEYET)
- rt->right.prefer = LONGER;
- if (rt->right.prefer == SHORTER)
- v->usedshorter = 1;
- }
- /* recurse through alternatives */
- if (rt->next != NULL)
- optrt(v, rt->next);
+ /* recurse through children */
+ if (t->left != NULL)
+ optst(v, t->left);
+ if (t->right != NULL)
+ optst(v, t->right);
}
/*
- - numrt - number tree nodes
- ^ static int numrt(struct rtree *, int);
+ - numst - number tree nodes (assigning retry indexes)
+ ^ static int numst(struct subre *, int);
*/
static int /* next number */
-numrt(rt, start)
-struct rtree *rt;
+numst(t, start)
+struct subre *t;
int start; /* starting point for subtree numbers */
{
int i;
- assert(rt != NULL);
+ assert(t != NULL);
i = start;
- rt->no = (short)i++;
- if (rt->left.tree != NULL)
- i = numrt(rt->left.tree, i);
- if (rt->right.tree != NULL)
- i = numrt(rt->right.tree, i);
- if (rt->next != NULL)
- i = numrt(rt->next, i);
+ t->retry = (short)i++;
+ if (t->left != NULL)
+ i = numst(t->left, i);
+ if (t->right != NULL)
+ i = numst(t->right, i);
return i;
}
/*
- - markrt - mark tree nodes as INUSE
- ^ static VOID markrt(struct rtree *);
+ - markst - mark tree nodes as INUSE
+ ^ static VOID markst(struct subre *);
*/
static VOID
-markrt(rt)
-struct rtree *rt;
+markst(t)
+struct subre *t;
{
- assert(rt != NULL);
-
- rt->flags |= INUSE;
- if (rt->left.tree != NULL)
- markrt(rt->left.tree);
- if (rt->right.tree != NULL)
- markrt(rt->right.tree);
- if (rt->next != NULL)
- markrt(rt->next);
+ assert(t != NULL);
+
+ t->flags |= INUSE;
+ if (t->left != NULL)
+ markst(t->left);
+ if (t->right != NULL)
+ markst(t->right);
}
/*
- - cleanrt - free any tree nodes not marked INUSE
- ^ static VOID cleanrt(struct vars *);
+ - cleanst - free any tree nodes not marked INUSE
+ ^ static VOID cleanst(struct vars *);
*/
static VOID
-cleanrt(v)
+cleanst(v)
struct vars *v;
{
- struct rtree *rt;
- struct rtree *next;
+ struct subre *t;
+ struct subre *next;
- for (rt = v->treechain; rt != NULL; rt = next) {
- next = rt->next;
- if (!(rt->flags&INUSE))
- FREE(rt);
+ for (t = v->treechain; t != NULL; t = next) {
+ next = t->chain;
+ if (!(t->flags&INUSE))
+ FREE(t);
}
v->treechain = NULL;
v->treefree = NULL; /* just on general principles */
@@ -1864,56 +1799,51 @@ struct vars *v;
/*
- nfatree - turn a subRE subtree into a tree of compacted NFAs
- ^ static VOID nfatree(struct vars *, struct rtree *, FILE *);
+ ^ static int nfatree(struct vars *, struct subre *, FILE *);
*/
-static VOID
-nfatree(v, rt, f)
+static int /* optimize results from top node */
+nfatree(v, t, f)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
FILE *f; /* for debug output */
{
- if (rt == NULL)
- return;
+ assert(t != NULL && t->begin != NULL);
- if (rt->left.begin != NULL)
- nfanode(v, &rt->left, f);
- if (rt->left.tree != NULL)
- nfatree(v, rt->left.tree, f);
+ if (t->left != NULL)
+ (DISCARD)nfatree(v, t->left, f);
+ if (t->right != NULL)
+ (DISCARD)nfatree(v, t->right, f);
- if (rt->right.begin != NULL)
- nfanode(v, &rt->right, f);
- if (rt->right.tree != NULL)
- nfatree(v, rt->right.tree, f);
-
- if (rt->next != NULL)
- nfatree(v, rt->next, f);
+ return nfanode(v, t, f);
}
/*
- nfanode - do one NFA for nfatree
- ^ static VOID nfanode(struct vars *, struct subre *, FILE *);
+ ^ static int nfanode(struct vars *, struct subre *, FILE *);
*/
-static VOID
-nfanode(v, sub, f)
+static int /* optimize results */
+nfanode(v, t, f)
struct vars *v;
-struct subre *sub;
+struct subre *t;
FILE *f; /* for debug output */
{
struct nfa *nfa;
+ int ret = 0;
- if (sub->begin == NULL)
- return;
+ assert(t->begin != NULL);
nfa = newnfa(v, v->cm, v->nfa);
- NOERR();
- dupnfa(nfa, sub->begin, sub->end, nfa->init, nfa->final);
+ NOERRZ();
+ dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
if (!ISERR()) {
specialcolors(nfa);
- (DISCARD) optimize(nfa, f);
+ ret = optimize(nfa, f);
}
if (!ISERR())
- compact(nfa, &sub->cnfa);
+ compact(nfa, &t->cnfa);
+
freenfa(nfa);
+ return ret;
}
/*
@@ -1964,9 +1894,9 @@ int n;
int i;
assert(n > 0);
- for (sub = subs + 1, i = n - 1; i > 0; sub++, i--)
+ for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
if (!NULLCNFA(sub->cnfa))
- freecnfa(&sub->cnfa, 0);
+ freecnfa(&sub->cnfa);
FREE(subs);
}
@@ -1988,14 +1918,13 @@ regex_t *re;
re->re_guts = NULL;
re->re_fns = NULL;
g->magic = 0;
- if (!NULLCNFA(g->cnfa))
- freecnfa(&g->cnfa, 0);
- if (g->cm != NULL)
- freecm(g->cm);
+ freecm(&g->cmap);
if (g->tree != NULL)
- freert((struct vars *)NULL, g->tree);
+ freesubre((struct vars *)NULL, g->tree);
if (g->lacons != NULL)
freelacons(g->lacons, g->nlacons);
+ if (!NULLCNFA(g->search))
+ freecnfa(&g->search);
FREE(g);
}
@@ -2028,41 +1957,44 @@ FILE *f;
re->re_nsub, re->re_info, re->re_csize, g->ntree,
g->usedshorter);
- dumpcolors(g->cm, f);
- dumpcnfa(&g->cnfa, f);
+ dumpcolors(&g->cmap, f);
+ if (!NULLCNFA(g->search)) {
+ printf("search:\n");
+ dumpcnfa(&g->search, f);
+ }
for (i = 1; i < g->nlacons; i++) {
fprintf(f, "la%d (%s):\n", i,
(g->lacons[i].subno) ? "positive" : "negative");
dumpcnfa(&g->lacons[i].cnfa, f);
}
- dumprt(g->tree, f, 0);
+ dumpst(g->tree, f, 0);
#endif
}
/*
- - dumprt - dump a subRE tree
- ^ static VOID dumprt(struct rtree *, FILE *, int);
+ - dumpst - dump a subRE tree
+ ^ static VOID dumpst(struct subre *, FILE *, int);
*/
static VOID
-dumprt(rt, f, nfapresent)
-struct rtree *rt;
+dumpst(t, f, nfapresent)
+struct subre *t;
FILE *f;
int nfapresent; /* is the original NFA still around? */
{
- if (rt == NULL)
+ if (t == NULL)
fprintf(f, "null tree\n");
else
- rtdump(rt, f, nfapresent, 0);
+ stdump(t, f, nfapresent, 0);
fflush(f);
}
/*
- - rtdump - recursive guts of dumprt
- ^ static VOID rtdump(struct rtree *, FILE *, int, int);
+ - stdump - recursive guts of dumpst
+ ^ static VOID stdump(struct subre *, FILE *, int, int);
*/
static VOID
-rtdump(rt, f, nfapresent, level)
-struct rtree *rt;
+stdump(t, f, nfapresent, level)
+struct subre *t;
FILE *f;
int nfapresent; /* is the original NFA still around? */
int level;
@@ -2072,102 +2004,41 @@ int level;
for (i = 0; i < level; i++)
fprintf(f, RTSEP);
- fprintf(f, "%c (n%d) {\n", rt->op, rt->no);
- if (rt->left.begin != NULL) {
- for (i = 0; i < level+1; i++)
- fprintf(f, RTSEP);
- fprintf(f, "L");
- fprintf(f, "%s", (rt->left.prefer == NONEYET) ? "-" :
- ((rt->left.prefer == LONGER) ? ">" : "<"));
- if (nfapresent)
- fprintf(f, "%ld-%ld", (long)rt->left.begin->no,
- (long)rt->left.end->no);
- if (rt->left.subno > 0)
- fprintf(f, " (%d)", rt->left.subno);
- else if (rt->left.subno < 0) {
- fprintf(f, " \\%d", -rt->left.subno);
- if (rt->left.min != 1 || rt->left.max != 1) {
- fprintf(f, "{%d-", (int)rt->left.min);
- if (rt->left.max != INFINITY)
- fprintf(f, "%d", (int)rt->left.max);
- fprintf(f, "}");
- }
- if (rt->left.tree != NULL)
- fprintf(f, "(nonNULL tree!!)");
- }
- if (rt->left.tree != NULL || !NULLCNFA(rt->left.cnfa))
- fprintf(f, ":");
- fprintf(f, "\n");
- if (!NULLCNFA(rt->left.cnfa))
- dumpcnfa(&rt->left.cnfa, f);
- if (rt->left.tree != NULL)
- rtdump(rt->left.tree, f, nfapresent, level+1);
- } else if (rt->op == 'b') {
- for (i = 0; i < level+1; i++)
- fprintf(f, RTSEP);
+ fprintf(f, "%c (", t->op);
+ if (t->flags&LONGER)
fprintf(f, "L");
- fprintf(f, "%s", (rt->left.prefer == NONEYET) ? "-" :
- ((rt->left.prefer == LONGER) ? ">" : "<"));
- assert(rt->left.subno < 0);
- fprintf(f, " \\%d", -rt->left.subno);
- if (rt->left.min != 1 || rt->left.max != 1) {
- fprintf(f, "{%d-", (int)rt->left.min);
- if (rt->left.max != INFINITY)
- fprintf(f, "%d", (int)rt->left.max);
- fprintf(f, "}");
- }
- if (rt->left.tree != NULL)
- fprintf(f, "(nonNULL tree!!)");
- fprintf(f, "\n");
- }
-
- if (rt->right.begin != NULL) {
- if (rt->op != ',')
- fprintf(f, "op %c has non-NULL right tree\n", rt->op);
- for (i = 0; i < level+1; i++)
- fprintf(f, RTSEP);
- fprintf(f, "R");
- fprintf(f, "%s", (rt->right.prefer == NONEYET) ? "-" :
- ((rt->right.prefer == LONGER) ? ">" : "<"));
- if (nfapresent)
- fprintf(f, "%ld-%ld", (long)rt->right.begin->no,
- (long)rt->right.end->no);
- if (rt->right.subno > 0)
- fprintf(f, " (%d)", rt->right.subno);
- else if (rt->right.subno < 0) {
- fprintf(f, " \\%d", -rt->right.subno);
- if (rt->right.min != 1 || rt->right.max != 1) {
- fprintf(f, "{%d-", (int)rt->right.min);
- if (rt->right.max != INFINITY)
- fprintf(f, "%d", (int)rt->right.max);
- fprintf(f, "}");
- }
- if (rt->right.tree != NULL)
- fprintf(f, "(nonNULL tree!!)");
- }
- if (rt->right.tree != NULL || !NULLCNFA(rt->right.cnfa))
- fprintf(f, ":");
- fprintf(f, "\n");
- if (!NULLCNFA(rt->right.cnfa))
- dumpcnfa(&rt->right.cnfa, f);
- if (rt->right.tree != NULL)
- rtdump(rt->right.tree, f, nfapresent, level+1);
- }
- for (i = 0; i < level; i++)
- fprintf(f, RTSEP);
- fprintf(f, "}\n");
-
- if (rt->next != NULL) {
- if (rt->op != '|')
- fprintf(f, "op %c has non-NULL next\n", rt->op);
- if (rt->next->op != rt->op)
- fprintf(f, "next op %c, expecting %c\n", rt->next->op,
- rt->op);
- rtdump(rt->next, f, nfapresent, level);
+ if (t->flags&SHORTER)
+ fprintf(f, "S");
+ if (t->flags&MIXED)
+ fprintf(f, "M");
+ if (t->flags&CAP)
+ fprintf(f, "c");
+ if (t->flags&BACKR)
+ fprintf(f, "b");
+ if (!(t->flags&INUSE))
+ fprintf(f, "!u");
+ fprintf(f, ") r%d", t->retry);
+ if (t->subno != 0)
+ fprintf(f, " #%d", t->subno);
+ if (t->min != 1 || t->max != 1) {
+ fprintf(f, "{%d,", t->min);
+ if (t->max != INFINITY)
+ fprintf(f, "%d", t->max);
+ fprintf(f, "}");
}
+ if (nfapresent)
+ fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
+ if (!NULLCNFA(t->cnfa))
+ fprintf(f, ":");
+ fprintf(f, "\n");
+ if (t->left != NULL)
+ stdump(t->left, f, nfapresent, level+1);
+ if (!NULLCNFA(t->cnfa))
+ dumpcnfa(&t->cnfa, f);
+ if (t->right != NULL)
+ stdump(t->right, f, nfapresent, level+1);
}
-#define COMPILE 1
#include "regc_lex.c"
#include "regc_color.c"
#include "regc_nfa.c"
diff --git a/generic/regcustom.h b/generic/regcustom.h
index 0fda25f..eeaafa0 100644
--- a/generic/regcustom.h
+++ b/generic/regcustom.h
@@ -37,17 +37,20 @@
#endif
/* interface types */
#define __REG_WIDE_T Tcl_UniChar
-#define __REG_WIDE_COMPILE re_ucomp
-#define __REG_WIDE_EXEC re_uexec
#define __REG_REGOFF_T long /* not really right, but good enough... */
#define __REG_VOID_T VOID
#define __REG_CONST CONST
+/* names and declarations */
+#define __REG_WIDE_COMPILE re_ucomp
+#define __REG_WIDE_EXEC re_uexec
#ifndef __REG_NOFRONT
#define __REG_NOFRONT /* don't want regcomp() and regexec() */
#endif
#ifndef __REG_NOCHAR
#define __REG_NOCHAR /* or the char versions */
#endif
+#define regfree re_ufree
+#define regerror re_uerror
/* --- end --- */
@@ -73,18 +76,11 @@ typedef int celt; /* type to hold chr, MCCE number, or NOCELT */
/* name the external functions */
#define compile re_ucomp
#define exec re_uexec
+
+/* enable/disable debugging code (by whether REG_DEBUG is defined or not) */
#ifdef notdef
-#define regfree re_ufree
-#define regerror re_uerror
+#define REG_DEBUG /* */
#endif
-/*
- * Implement a mistake in the original POSIX.2: in EREs, and only in EREs
- * (AREs do not support this botch), an unbalanced right parenthesis is an
- * ordinary character rather than an error. This was unintentional, and
- * will be fixed someday.
- */
-#define POSIX_MISTAKE /* sigh */
-
/* and pick up the standard header */
#include "regex.h"
diff --git a/generic/rege_dfa.c b/generic/rege_dfa.c
new file mode 100644
index 0000000..6e4b941
--- /dev/null
+++ b/generic/rege_dfa.c
@@ -0,0 +1,583 @@
+/*
+ * DFA routines
+ * This file is #included by regexec.c.
+ */
+
+/*
+ - longest - longest-preferred matching engine
+ ^ static chr *longest(struct vars *, struct dfa *, chr *, chr *);
+ */
+static chr * /* endpoint, or NULL */
+longest(v, d, start, stop)
+struct vars *v; /* used only for debug and exec flags */
+struct dfa *d;
+chr *start; /* where the match should start */
+chr *stop; /* match must end at or before here */
+{
+ chr *cp;
+ chr *realstop = (stop == v->stop) ? stop : stop + 1;
+ color co;
+ struct sset *css;
+ struct sset *ss;
+ chr *post;
+ int i;
+ struct colormap *cm = d->cm;
+
+ /* initialize */
+ css = initialize(v, d, start);
+ cp = start;
+
+ /* startup */
+ FDEBUG(("+++ startup +++\n"));
+ if (cp == v->start) {
+ co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long)co));
+ } else {
+ co = GETCOLOR(cm, *(cp - 1));
+ FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+ }
+ css = miss(v, d, css, co, cp, start);
+ if (css == NULL)
+ return NULL;
+ css->lastseen = cp;
+
+ /* main loop */
+ if (v->eflags&REG_FTRACE)
+ while (cp < realstop) {
+ FDEBUG(("+++ at c%d +++\n", css - d->ssets));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+ ss = css->outs[co];
+ if (ss == NULL) {
+ ss = miss(v, d, css, co, cp+1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ else
+ while (cp < realstop) {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL) {
+ ss = miss(v, d, css, co, cp+1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+
+ /* shutdown */
+ FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
+ if (cp == v->stop && stop == v->stop) {
+ co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long)co));
+ ss = miss(v, d, css, co, cp, start);
+ /* special case: match ended at eol? */
+ if (ss != NULL && (ss->flags&POSTSTATE))
+ return cp;
+ else if (ss != NULL)
+ ss->lastseen = cp; /* to be tidy */
+ }
+
+ /* find last match, if any */
+ post = d->lastpost;
+ for (ss = d->ssets, i = 0; i < d->nssused; ss++, i++)
+ if ((ss->flags&POSTSTATE) && post != ss->lastseen &&
+ (post == NULL || post < ss->lastseen))
+ post = ss->lastseen;
+ if (post != NULL) /* found one */
+ return post - 1;
+
+ return NULL;
+}
+
+/*
+ - shortest - shortest-preferred matching engine
+ ^ static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *,
+ ^ chr **);
+ */
+static chr * /* endpoint, or NULL */
+shortest(v, d, start, min, max, coldp)
+struct vars *v; /* used only for debug and exec flags */
+struct dfa *d;
+chr *start; /* where the match should start */
+chr *min; /* match must end at or after here */
+chr *max; /* match must end at or before here */
+chr **coldp; /* store coldstart pointer here, if nonNULL */
+{
+ chr *cp;
+ chr *realmin = (min == v->stop) ? min : min + 1;
+ chr *realmax = (max == v->stop) ? max : max + 1;
+ color co;
+ struct sset *css;
+ struct sset *firstss;
+ struct sset *ss;
+ struct colormap *cm = d->cm;
+
+ /* initialize */
+ css = initialize(v, d, start);
+ firstss = css;
+ cp = start;
+
+ /* startup */
+ FDEBUG(("--- startup ---\n"));
+ if (cp == v->start) {
+ co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long)co));
+ } else {
+ co = GETCOLOR(cm, *(cp - 1));
+ FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+ }
+ css = miss(v, d, css, co, cp, start);
+ if (css == NULL)
+ return NULL;
+ css->lastseen = cp;
+ ss = css;
+
+ /* main loop */
+ if (v->eflags&REG_FTRACE)
+ while (cp < realmax) {
+ FDEBUG(("--- at c%d ---\n", css - d->ssets));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+ ss = css->outs[co];
+ if (ss == NULL) {
+ ss = miss(v, d, css, co, cp+1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ if ((ss->flags&POSTSTATE) && cp >= realmin)
+ break; /* NOTE BREAK OUT */
+ }
+ else
+ while (cp < realmax) {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL) {
+ ss = miss(v, d, css, co, cp+1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ if ((ss->flags&POSTSTATE) && cp >= realmin)
+ break; /* NOTE BREAK OUT */
+ }
+
+ if (ss == NULL)
+ return NULL;
+ if (ss->flags&POSTSTATE) {
+ assert(firstss->flags&STARTER);
+ assert(firstss->lastseen != NULL);
+ if (coldp != NULL)
+ *coldp = firstss->lastseen;
+ assert(cp >= realmin);
+ return cp - 1;
+ }
+
+ /* shutdown */
+ FDEBUG(("--- shutdown at c%d ---\n", css - d->ssets));
+ if (cp == v->stop && max == v->stop) {
+ co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long)co));
+ ss = miss(v, d, css, co, cp, start);
+ /* special case: match ended at eol? */
+ if (ss != NULL && (ss->flags&POSTSTATE)) {
+ assert(firstss->flags&STARTER);
+ assert(firstss->lastseen != NULL);
+ if (coldp != NULL)
+ *coldp = firstss->lastseen;
+ return cp;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ - newdfa - set up a fresh DFA
+ ^ static struct dfa *newdfa(struct vars *, struct cnfa *,
+ ^ struct colormap *);
+ */
+static struct dfa *
+newdfa(v, cnfa, cm)
+struct vars *v;
+struct cnfa *cnfa;
+struct colormap *cm;
+{
+ struct dfa *d = (struct dfa *)MALLOC(sizeof(struct dfa));
+ int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
+ struct sset *ss;
+ int i;
+
+ assert(cnfa != NULL && cnfa->nstates != 0);
+ if (d == NULL) {
+ ERR(REG_ESPACE);
+ return NULL;
+ }
+
+ d->ssets = (struct sset *)MALLOC(CACHE * sizeof(struct sset));
+ d->statesarea = (unsigned *)MALLOC((CACHE+WORK) * wordsper *
+ sizeof(unsigned));
+ d->work = &d->statesarea[CACHE * wordsper];
+ d->outsarea = (struct sset **)MALLOC(CACHE * cnfa->ncolors *
+ sizeof(struct sset *));
+ d->incarea = (struct arcp *)MALLOC(CACHE * cnfa->ncolors *
+ sizeof(struct arcp));
+ if (d->ssets == NULL || d->statesarea == NULL || d->outsarea == NULL ||
+ d->incarea == NULL) {
+ freedfa(d);
+ ERR(REG_ESPACE);
+ return NULL;
+ }
+
+ d->nssets = (v->eflags&REG_SMALL) ? 7 : CACHE;
+ d->nssused = 0;
+ d->nstates = cnfa->nstates;
+ d->ncolors = cnfa->ncolors;
+ d->wordsper = wordsper;
+ d->cnfa = cnfa;
+ d->cm = cm;
+ d->lastpost = NULL;
+ d->search = d->ssets;
+
+ for (ss = d->ssets, i = 0; i < d->nssets; ss++, i++) {
+ /* initialization of most fields is done as needed */
+ ss->states = &d->statesarea[i * d->wordsper];
+ ss->outs = &d->outsarea[i * d->ncolors];
+ ss->inchain = &d->incarea[i * d->ncolors];
+ }
+
+ return d;
+}
+
+/*
+ - freedfa - free a DFA
+ ^ static VOID freedfa(struct dfa *);
+ */
+static VOID
+freedfa(d)
+struct dfa *d;
+{
+ if (d->ssets != NULL)
+ FREE(d->ssets);
+ if (d->statesarea != NULL)
+ FREE(d->statesarea);
+ if (d->outsarea != NULL)
+ FREE(d->outsarea);
+ if (d->incarea != NULL)
+ FREE(d->incarea);
+ FREE(d);
+}
+
+/*
+ - hash - construct a hash code for a bitvector
+ * There are probably better ways, but they're more expensive.
+ ^ static unsigned hash(unsigned *, int);
+ */
+static unsigned
+hash(uv, n)
+unsigned *uv;
+int n;
+{
+ int i;
+ unsigned h;
+
+ h = 0;
+ for (i = 0; i < n; i++)
+ h ^= uv[i];
+ return h;
+}
+
+/*
+ - initialize - hand-craft a cache entry for startup, otherwise get ready
+ ^ static struct sset *initialize(struct vars *, struct dfa *, chr *);
+ */
+static struct sset *
+initialize(v, d, start)
+struct vars *v; /* used only for debug flags */
+struct dfa *d;
+chr *start;
+{
+ struct sset *ss;
+ int i;
+
+ /* is previous one still there? */
+ if (d->nssused > 0 && (d->ssets[0].flags&STARTER))
+ ss = &d->ssets[0];
+ else { /* no, must (re)build it */
+ ss = getvacant(v, d, start, start);
+ for (i = 0; i < d->wordsper; i++)
+ ss->states[i] = 0;
+ BSET(ss->states, d->cnfa->pre);
+ ss->hash = hash(ss->states, d->wordsper);
+ assert(d->cnfa->pre != d->cnfa->post);
+ ss->flags = STARTER|LOCKED;
+ /* lastseen dealt with below */
+ }
+
+ for (i = 0; i < d->nssused; i++)
+ d->ssets[i].lastseen = NULL;
+ ss->lastseen = start; /* maybe untrue, but harmless */
+ d->lastpost = NULL;
+ return ss;
+}
+
+/*
+ - miss - handle a cache miss
+ ^ static struct sset *miss(struct vars *, struct dfa *, struct sset *,
+ ^ pcolor, chr *, chr *);
+ */
+static struct sset * /* NULL if goes to empty set */
+miss(v, d, css, co, cp, start)
+struct vars *v; /* used only for debug flags */
+struct dfa *d;
+struct sset *css;
+pcolor co;
+chr *cp; /* next chr */
+chr *start; /* where the attempt got started */
+{
+ struct cnfa *cnfa = d->cnfa;
+ int i;
+ unsigned h;
+ struct carc *ca;
+ struct sset *p;
+ int ispost;
+ int gotstate;
+ int dolacons;
+ int didlacons;
+
+ /* for convenience, we can be called even if it might not be a miss */
+ if (css->outs[co] != NULL) {
+ FDEBUG(("hit\n"));
+ return css->outs[co];
+ }
+ FDEBUG(("miss\n"));
+
+ /* first, what set of states would we end up in? */
+ for (i = 0; i < d->wordsper; i++)
+ d->work[i] = 0;
+ ispost = 0;
+ gotstate = 0;
+ for (i = 0; i < d->nstates; i++)
+ if (ISBSET(css->states, i))
+ for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
+ if (ca->co == co) {
+ BSET(d->work, ca->to);
+ gotstate = 1;
+ if (ca->to == cnfa->post)
+ ispost = 1;
+ FDEBUG(("%d -> %d\n", i, ca->to));
+ }
+ dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0;
+ didlacons = 0;
+ while (dolacons) { /* transitive closure */
+ dolacons = 0;
+ for (i = 0; i < d->nstates; i++)
+ if (ISBSET(d->work, i))
+ for (ca = cnfa->states[i]; ca->co != COLORLESS;
+ ca++)
+ if (ca->co > cnfa->ncolors &&
+ !ISBSET(d->work, ca->to) &&
+ lacon(v, cnfa, cp,
+ ca->co)) {
+ BSET(d->work, ca->to);
+ dolacons = 1;
+ didlacons = 1;
+ if (ca->to == cnfa->post)
+ ispost = 1;
+ FDEBUG(("%d :> %d\n",i,ca->to));
+ }
+ }
+ if (!gotstate)
+ return NULL;
+ h = hash(d->work, d->wordsper);
+
+ /* next, is that in the cache? */
+ for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
+ if (p->hash == h && memcmp(VS(d->work), VS(p->states),
+ d->wordsper*sizeof(unsigned)) == 0) {
+ FDEBUG(("cached c%d\n", p - d->ssets));
+ break; /* NOTE BREAK OUT */
+ }
+ if (i == 0) { /* nope, need a new cache entry */
+ p = getvacant(v, d, cp, start);
+ assert(p != css);
+ for (i = 0; i < d->wordsper; i++)
+ p->states[i] = d->work[i];
+ p->hash = h;
+ p->flags = (ispost) ? POSTSTATE : 0;
+ /* lastseen to be dealt with by caller */
+ }
+
+ if (!didlacons) { /* lookahead conds. always cache miss */
+ css->outs[co] = p;
+ css->inchain[co] = p->ins;
+ p->ins.ss = css;
+ p->ins.co = (color)co;
+ }
+ return p;
+}
+
+/*
+ - lacon - lookahead-constraint checker for miss()
+ ^ static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
+ */
+static int /* predicate: constraint satisfied? */
+lacon(v, pcnfa, cp, co)
+struct vars *v;
+struct cnfa *pcnfa; /* parent cnfa */
+chr *cp;
+pcolor co; /* "color" of the lookahead constraint */
+{
+ int n;
+ struct subre *sub;
+ struct dfa *d;
+ chr *end;
+
+ n = co - pcnfa->ncolors;
+ assert(n < v->g->nlacons && v->g->lacons != NULL);
+ FDEBUG(("=== testing lacon %d\n", n));
+ sub = &v->g->lacons[n];
+ d = newdfa(v, &sub->cnfa, &v->g->cmap);
+ if (d == NULL) {
+ ERR(REG_ESPACE);
+ return 0;
+ }
+ end = longest(v, d, cp, v->stop);
+ freedfa(d);
+ FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
+ return (sub->subno) ? (end != NULL) : (end == NULL);
+}
+
+/*
+ - getvacant - get a vacant state set
+ * This routine clears out the inarcs and outarcs, but does not otherwise
+ * clear the innards of the state set -- that's up to the caller.
+ ^ static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
+ */
+static struct sset *
+getvacant(v, d, cp, start)
+struct vars *v; /* used only for debug flags */
+struct dfa *d;
+chr *cp;
+chr *start;
+{
+ int i;
+ struct sset *ss;
+ struct sset *p;
+ struct arcp ap;
+ struct arcp lastap;
+ color co;
+
+ ss = pickss(v, d, cp, start);
+ assert(!(ss->flags&LOCKED));
+
+ /* clear out its inarcs, including self-referential ones */
+ ap = ss->ins;
+ while ((p = ap.ss) != NULL) {
+ co = ap.co;
+ FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co));
+ p->outs[co] = NULL;
+ ap = p->inchain[co];
+ p->inchain[co].ss = NULL; /* paranoia */
+ }
+ ss->ins.ss = NULL;
+
+ /* take it off the inarc chains of the ssets reached by its outarcs */
+ for (i = 0; i < d->ncolors; i++) {
+ p = ss->outs[i];
+ assert(p != ss); /* not self-referential */
+ if (p == NULL)
+ continue; /* NOTE CONTINUE */
+ FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
+ if (p->ins.ss == ss && p->ins.co == i)
+ p->ins = ss->inchain[i];
+ else {
+ assert(p->ins.ss != NULL);
+ for (ap = p->ins; ap.ss != NULL &&
+ !(ap.ss == ss && ap.co == i);
+ ap = ap.ss->inchain[ap.co])
+ lastap = ap;
+ assert(ap.ss != NULL);
+ lastap.ss->inchain[lastap.co] = ss->inchain[i];
+ }
+ ss->outs[i] = NULL;
+ ss->inchain[i].ss = NULL;
+ }
+
+ /* if ss was a success state, may need to remember location */
+ if ((ss->flags&POSTSTATE) && ss->lastseen != d->lastpost &&
+ (d->lastpost == NULL || d->lastpost < ss->lastseen))
+ d->lastpost = ss->lastseen;
+
+ return ss;
+}
+
+/*
+ - pickss - pick the next stateset to be used
+ ^ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
+ */
+static struct sset *
+pickss(v, d, cp, start)
+struct vars *v; /* used only for debug flags */
+struct dfa *d;
+chr *cp;
+chr *start;
+{
+ int i;
+ struct sset *ss;
+ struct sset *end;
+ chr *ancient;
+
+ /* shortcut for cases where cache isn't full */
+ if (d->nssused < d->nssets) {
+ ss = &d->ssets[d->nssused];
+ d->nssused++;
+ FDEBUG(("new c%d\n", ss - d->ssets));
+ /* must make innards consistent */
+ ss->ins.ss = NULL;
+ ss->ins.co = WHITE; /* give it some value */
+ for (i = 0; i < d->ncolors; i++) {
+ ss->outs[i] = NULL;
+ ss->inchain[i].ss = NULL;
+ }
+ ss->flags = 0;
+ return ss;
+ }
+
+ /* look for oldest, or old enough anyway */
+ if (cp - start > d->nssets*2/3) /* oldest 33% are expendable */
+ ancient = cp - d->nssets*2/3;
+ else
+ ancient = start;
+ for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
+ if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+ !(ss->flags&LOCKED)) {
+ d->search = ss + 1;
+ FDEBUG(("replacing c%d\n", ss - d->ssets));
+ return ss;
+ }
+ for (ss = d->ssets, end = d->search; ss < end; ss++)
+ if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+ !(ss->flags&LOCKED)) {
+ d->search = ss + 1;
+ FDEBUG(("replacing c%d\n", ss - d->ssets));
+ return ss;
+ }
+
+ /* nobody's old enough?!? -- something's really wrong */
+ FDEBUG(("can't find victim to replace!\n"));
+ assert(NOTREACHED);
+ ERR(REG_ASSERT);
+ return d->ssets;
+}
diff --git a/generic/regerror.c b/generic/regerror.c
index 5eb67a7..7fb2ef3 100644
--- a/generic/regerror.c
+++ b/generic/regerror.c
@@ -15,7 +15,7 @@ static struct rerr {
} rerrs[] = {
/* the actual table is built from regex.h */
# include "regerrs.h"
- -1, "", "oops", /* explanation special-cased in code */
+ { -1, "", "oops" }, /* explanation special-cased in code */
};
/*
diff --git a/generic/regerrs.h b/generic/regerrs.h
index 8298597..af2805c 100644
--- a/generic/regerrs.h
+++ b/generic/regerrs.h
@@ -1,19 +1,18 @@
-REG_OKAY, "REG_OKAY", "no errors detected",
-REG_NOMATCH, "REG_NOMATCH", "failed to match",
-REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.1)",
-REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element",
-REG_ECTYPE, "REG_ECTYPE", "invalid character class",
-REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence",
-REG_ESUBREG, "REG_ESUBREG", "invalid backreference number",
-REG_EBRACK, "REG_EBRACK", "brackets [] not balanced",
-REG_EPAREN, "REG_EPAREN", "parentheses () not balanced",
-REG_EBRACE, "REG_EBRACE", "braces {} not balanced",
-REG_BADBR, "REG_BADBR", "invalid repetition count(s)",
-REG_ERANGE, "REG_ERANGE", "invalid character range",
-REG_ESPACE, "REG_ESPACE", "out of memory",
-REG_BADRPT, "REG_BADRPT", "quantifier operand invalid",
-REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug",
-REG_INVARG, "REG_INVARG", "invalid argument to regex function",
-REG_MIXED, "REG_MIXED", "character widths of regex and string differ",
-REG_BADOPT, "REG_BADOPT", "invalid embedded option",
-REG_IMPOSS, "REG_IMPOSS", "can never match",
+{ REG_OKAY, "REG_OKAY", "no errors detected" },
+{ REG_NOMATCH, "REG_NOMATCH", "failed to match" },
+{ REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.1)" },
+{ REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" },
+{ REG_ECTYPE, "REG_ECTYPE", "invalid character class" },
+{ REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence" },
+{ REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" },
+{ REG_EBRACK, "REG_EBRACK", "brackets [] not balanced" },
+{ REG_EPAREN, "REG_EPAREN", "parentheses () not balanced" },
+{ REG_EBRACE, "REG_EBRACE", "braces {} not balanced" },
+{ REG_BADBR, "REG_BADBR", "invalid repetition count(s)" },
+{ REG_ERANGE, "REG_ERANGE", "invalid character range" },
+{ REG_ESPACE, "REG_ESPACE", "out of memory" },
+{ REG_BADRPT, "REG_BADRPT", "quantifier operand invalid" },
+{ REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" },
+{ REG_INVARG, "REG_INVARG", "invalid argument to regex function" },
+{ REG_MIXED, "REG_MIXED", "character widths of regex and string differ" },
+{ REG_BADOPT, "REG_BADOPT", "invalid embedded option" },
diff --git a/generic/regex.h b/generic/regex.h
index 6f61dd3..7ca6ad0 100644
--- a/generic/regex.h
+++ b/generic/regex.h
@@ -16,7 +16,7 @@
* suggestive of the wide type, e.g. re_ucomp and re_uexec for Unicode).
* For cranky old compilers, it may be necessary to do something like:
* #define __REG_WIDE_COMPILE(a,b,c,d) re_Xcomp(a,b,c,d)
- * #define __REG_WIDE_EXEC(a,b,c,d,e,f) re_Xexec(a,b,c,d,e,f)
+ * #define __REG_WIDE_EXEC(a,b,c,d,e,f,g) re_Xexec(a,b,c,d,e,f,g)
* rather than just #defining the names as parameterless macros.
*
* For some specialized purposes, it may be desirable to suppress the
@@ -44,8 +44,14 @@ extern "C" {
/*
- * Add your own defines, if needed, here. The --- stuff is for automatic
- * generation of this file from regproto.h and regcustom.h.
+ * Add your own defines, if needed, here.
+ */
+
+
+
+/*
+ * Location where a chunk of regcustom.h is automatically spliced into
+ * this file (working from its prototype, regproto.h).
*/
/* --- begin --- */
/* ensure certain things don't sneak in from system headers */
@@ -69,17 +75,20 @@ extern "C" {
#endif
/* interface types */
#define __REG_WIDE_T Tcl_UniChar
-#define __REG_WIDE_COMPILE re_ucomp
-#define __REG_WIDE_EXEC re_uexec
#define __REG_REGOFF_T long /* not really right, but good enough... */
#define __REG_VOID_T VOID
#define __REG_CONST CONST
+/* names and declarations */
+#define __REG_WIDE_COMPILE re_ucomp
+#define __REG_WIDE_EXEC re_uexec
#ifndef __REG_NOFRONT
#define __REG_NOFRONT /* don't want regcomp() and regexec() */
#endif
#ifndef __REG_NOCHAR
#define __REG_NOCHAR /* or the char versions */
#endif
+#define regfree re_ufree
+#define regerror re_uerror
/* --- end --- */
@@ -139,6 +148,7 @@ typedef struct {
# define REG_UUNPORT 001000
# define REG_ULOCALE 002000
# define REG_UEMPTYMATCH 004000
+# define REG_UIMPOSSIBLE 010000
int re_csize; /* sizeof(character) */
char *re_endp; /* backward compatibility kludge */
/* the rest is opaque pointers to hidden innards */
@@ -249,7 +259,6 @@ typedef struct {
#define REG_INVARG 16 /* invalid argument to regex function */
#define REG_MIXED 17 /* character widths of regex and string differ */
#define REG_BADOPT 18 /* invalid embedded option */
-#define REG_IMPOSS 19 /* can never match */
/* two specials for debugging and testing */
#define REG_ATOI 101 /* convert error-code name to number */
#define REG_ITOA 102 /* convert error-code number to name */
@@ -280,8 +289,8 @@ int regexec _ANSI_ARGS_((regex_t *, __REG_CONST char *, size_t, regmatch_t [], i
#ifdef __REG_WIDE_T
int __REG_WIDE_EXEC _ANSI_ARGS_((regex_t *, __REG_CONST __REG_WIDE_T *, size_t, rm_detail_t *, size_t, regmatch_t [], int));
#endif
-re_void regfree _ANSI_ARGS_((regex_t *));
-extern size_t regerror _ANSI_ARGS_((int, __REG_CONST regex_t *, char *, size_t));
+re_void re_ufree _ANSI_ARGS_((regex_t *));
+extern size_t re_uerror _ANSI_ARGS_((int, __REG_CONST regex_t *, char *, size_t));
/* automatically gathered by fwd; do not hand-edit */
/* =====^!^===== end forwards =====^!^===== */
diff --git a/generic/regexec.c b/generic/regexec.c
index 4220062..1671866 100644
--- a/generic/regexec.c
+++ b/generic/regexec.c
@@ -17,8 +17,6 @@ struct vars {
chr *stop; /* just past end of string */
int err; /* error code if any (0 none) */
regoff_t *mem; /* memory vector for backtracking */
- regoff_t *mem1; /* localizer vector */
- regoff_t *mem2; /* dissector vector */
};
#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
#define ISERR() VISERR(v)
@@ -79,21 +77,20 @@ struct dfa {
int exec _ANSI_ARGS_((regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int));
static int find _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *));
static int cfind _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *));
-static VOID zapmatches _ANSI_ARGS_((regmatch_t *, size_t));
-static VOID zapmem _ANSI_ARGS_((struct vars *, struct rtree *));
+static VOID zapsubs _ANSI_ARGS_((regmatch_t *, size_t));
+static VOID zapmem _ANSI_ARGS_((struct vars *, struct subre *));
static VOID subset _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
-static int dissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static int altdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static int cdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static int crevdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static int csindissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static int cbrdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static int caltdissect _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static chr *dismatch _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static chr *dismrev _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
-static chr *dismsin _ANSI_ARGS_((struct vars *, struct rtree *, chr *, chr *));
+static int dissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+static int condissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+static int altdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+static int cdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+static int ccondissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+static int crevdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+static int cbrdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+static int caltdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
+/* === rege_dfa.c === */
static chr *longest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *));
-static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *));
+static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *, chr **));
static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *));
static VOID freedfa _ANSI_ARGS_((struct dfa *));
static unsigned hash _ANSI_ARGS_((unsigned *, int));
@@ -137,6 +134,8 @@ int flags;
/* setup */
v->re = re;
v->g = (struct guts *)re->re_guts;
+ if (v->g->unmatchable)
+ return REG_NOMATCH;
complications = (v->g->info&REG_UBACKREF) ? 1 : 0;
if (v->g->usedshorter)
complications = 1;
@@ -157,30 +156,30 @@ int flags;
v->stop = (chr *)string + len;
v->err = 0;
if (complications) {
- v->mem1 = (regoff_t *)MALLOC(2*v->g->ntree*sizeof(regoff_t));
- if (v->mem1 == NULL) {
+ v->mem = (regoff_t *)MALLOC(v->g->ntree*sizeof(regoff_t));
+ if (v->mem == NULL) {
if (v->pmatch != pmatch)
FREE(v->pmatch);
return REG_ESPACE;
}
- v->mem2 = v->mem1 + v->g->ntree;
} else
- v->mem1 = NULL;
+ v->mem = NULL;
/* do it */
+ assert(v->g->tree != NULL);
if (complications)
- st = cfind(v, &v->g->cnfa, v->g->cm);
+ st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
else
- st = find(v, &v->g->cnfa, v->g->cm);
+ st = find(v, &v->g->tree->cnfa, &v->g->cmap);
if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
- zapmatches(pmatch, nmatch);
+ zapsubs(pmatch, nmatch);
n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
}
if (v->pmatch != pmatch)
FREE(v->pmatch);
- if (v->mem1 != NULL)
- FREE(v->mem1);
+ if (v->mem != NULL)
+ FREE(v->mem);
return st;
}
@@ -195,33 +194,57 @@ struct cnfa *cnfa;
struct colormap *cm;
{
struct dfa *d = newdfa(v, cnfa, cm);
+ struct dfa *s = newdfa(v, &v->g->search, cm);
chr *begin;
chr *end;
+#ifdef notdef
+ chr *open; /* open and close of range of possible starts */
+ chr *close;
+#endif
chr *stop = (cnfa->flags&LEFTANCH) ? v->start : v->stop;
if (d == NULL)
return v->err;
+ if (s == NULL) {
+ freedfa(d);
+ return v->err;
+ }
- for (begin = v->start; begin <= stop; begin++) {
- MDEBUG(("\ntrying at %ld\n", (long)OFF(begin)));
- end = longest(v, d, begin, v->stop);
- if (end != NULL) {
- if (v->nmatch > 0) {
- v->pmatch[0].rm_so = OFF(begin);
- v->pmatch[0].rm_eo = OFF(end);
- }
- freedfa(d);
- if (v->nmatch > 1) {
- zapmatches(v->pmatch, v->nmatch);
- return dissect(v, v->g->tree, begin, end);
+#ifdef notdef
+ close = v->start;
+ while (close < stop) {
+ MDEBUG(("\nsearch at %ld\n", (long)OFF(close)));
+ close = shortest(v, s, close, close, v->stop, &open);
+ if (close == NULL)
+ break; /* NOTE BREAK */
+ MDEBUG(("between %ld and %ld\n", (long)OFF(open),
+ (long)OFF(close)));
+#endif
+ for (begin = v->start; begin <= stop; begin++) {
+ end = longest(v, d, begin, v->stop);
+ if (end != NULL) {
+ if (v->nmatch > 0) {
+ v->pmatch[0].rm_so = OFF(begin);
+ v->pmatch[0].rm_eo = OFF(end);
+ }
+ freedfa(d);
+ freedfa(s);
+ if (v->nmatch > 1) {
+ zapsubs(v->pmatch, v->nmatch);
+ return dissect(v, v->g->tree, begin,
+ end);
+ }
+ if (ISERR())
+ return v->err;
+ return REG_OKAY;
}
- if (ISERR())
- return v->err;
- return REG_OKAY;
}
+#ifdef notdef
}
+#endif
freedfa(d);
+ freedfa(s);
if (ISERR())
return v->err;
return REG_NOMATCH;
@@ -241,33 +264,28 @@ struct colormap *cm;
chr *begin;
chr *end;
chr *stop = (cnfa->flags&LEFTANCH) ? v->start : v->stop;
+ chr *estart;
chr *estop;
int er;
- int usedis = (v->g->tree == NULL || v->g->tree->op == '|') ? 0 : 1;
+ int shorter = v->g->tree->flags&SHORTER;
if (d == NULL)
return v->err;
- if (!v->g->usedshorter)
- usedis = 0;
for (begin = v->start; begin <= stop; begin++) {
- MDEBUG(("\ntrying at %ld\n", (long)OFF(begin)));
- if (usedis) {
- v->mem = v->mem1;
- zapmem(v, v->g->tree);
- }
+ MDEBUG(("\ncfind trying at %ld\n", (long)OFF(begin)));
+ estart = begin;
estop = v->stop;
for (;;) {
- if (usedis) {
- v->mem = v->mem1;
- end = dismatch(v, v->g->tree, begin, v->stop);
- } else
+ if (shorter)
+ end = shortest(v, d, begin, estart, estop,
+ (chr **)NULL);
+ else
end = longest(v, d, begin, estop);
if (end == NULL)
break; /* NOTE BREAK OUT */
MDEBUG(("tentative end %ld\n", (long)OFF(end)));
- zapmatches(v->pmatch, v->nmatch);
- v->mem = v->mem2;
+ zapsubs(v->pmatch, v->nmatch);
zapmem(v, v->g->tree);
er = cdissect(v, v->g->tree, begin, end);
switch (er) {
@@ -283,14 +301,15 @@ struct colormap *cm;
break;
case REG_NOMATCH:
/* go around and try again */
- if (!usedis) {
- if (end == begin) {
- /* no point in trying again */
- freedfa(d);
- return REG_NOMATCH;
- }
- estop = end - 1;
+ if ((shorter) ? end == estop : end == begin) {
+ /* no point in trying again */
+ freedfa(d);
+ return REG_NOMATCH;
}
+ if (shorter)
+ estart = end + 1;
+ else
+ estop = end - 1;
break;
default:
freedfa(d);
@@ -307,11 +326,11 @@ struct colormap *cm;
}
/*
- - zapmatches - initialize the subexpression matches to "no match"
- ^ static VOID zapmatches(regmatch_t *, size_t);
+ - zapsubs - initialize the subexpression matches to "no match"
+ ^ static VOID zapsubs(regmatch_t *, size_t);
*/
static VOID
-zapmatches(p, n)
+zapsubs(p, n)
regmatch_t *p;
size_t n;
{
@@ -325,33 +344,28 @@ size_t n;
/*
- zapmem - initialize the retry memory of a subtree to zeros
- ^ static VOID zapmem(struct vars *, struct rtree *);
+ ^ static VOID zapmem(struct vars *, struct subre *);
*/
static VOID
-zapmem(v, rt)
+zapmem(v, t)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
{
- if (rt == NULL)
+ if (t == NULL)
return;
assert(v->mem != NULL);
- v->mem[rt->no] = 0;
-
- if (rt->left.tree != NULL)
- zapmem(v, rt->left.tree);
- if (rt->left.subno > 0) {
- v->pmatch[rt->left.subno].rm_so = -1;
- v->pmatch[rt->left.subno].rm_eo = -1;
+ v->mem[t->retry] = 0;
+ if (t->op == '(') {
+ assert(t->subno > 0);
+ v->pmatch[t->subno].rm_so = -1;
+ v->pmatch[t->subno].rm_eo = -1;
}
- if (rt->right.tree != NULL)
- zapmem(v, rt->right.tree);
- if (rt->right.subno > 0) {
- v->pmatch[rt->right.subno].rm_so = -1;
- v->pmatch[rt->right.subno].rm_eo = -1;
- }
- if (rt->next != NULL)
- zapmem(v, rt->next);
+
+ if (t->left != NULL)
+ zapmem(v, t->left);
+ if (t->right != NULL)
+ zapmem(v, t->right);
}
/*
@@ -367,8 +381,6 @@ chr *end;
{
int n = sub->subno;
- if (n == 0)
- return;
assert(n > 0);
if ((size_t)n >= v->nmatch)
return;
@@ -380,12 +392,54 @@ chr *end;
/*
- dissect - determine subexpression matches (uncomplicated case)
- ^ static int dissect(struct vars *, struct rtree *, chr *, chr *);
+ ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
+ */
+static int /* regexec return code */
+dissect(v, t, begin, end)
+struct vars *v;
+struct subre *t;
+chr *begin; /* beginning of relevant substring */
+chr *end; /* end of same */
+{
+ assert(t != NULL);
+ MDEBUG(("dissect %ld-%ld\n", (long)OFF(begin), (long)OFF(end)));
+
+ switch (t->op) {
+ case '=': /* terminal node */
+ assert(t->left == NULL && t->right == NULL);
+ return REG_OKAY; /* no action, parent did the work */
+ break;
+ case '|': /* alternation */
+ assert(t->left != NULL);
+ return altdissect(v, t, begin, end);
+ break;
+ case 'b': /* back ref -- shouldn't be calling us! */
+ return REG_ASSERT;
+ break;
+ case '.': /* concatenation */
+ assert(t->left != NULL && t->right != NULL);
+ return condissect(v, t, begin, end);
+ break;
+ case '(': /* capturing */
+ assert(t->left != NULL && t->right == NULL);
+ assert(t->subno > 0);
+ subset(v, t, begin, end);
+ return dissect(v, t->left, begin, end);
+ break;
+ default:
+ return REG_ASSERT;
+ break;
+ }
+}
+
+/*
+ - condissect - determine concatenation subexpression matches (uncomplicated)
+ ^ static int condissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
-dissect(v, rt, begin, end)
+condissect(v, t, begin, end)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
chr *begin; /* beginning of relevant substring */
chr *end; /* end of same */
{
@@ -394,37 +448,14 @@ chr *end; /* end of same */
chr *mid;
int i;
- if (rt == NULL)
- return REG_OKAY;
- MDEBUG(("substring %ld-%ld\n", (long)OFF(begin), (long)OFF(end)));
-
- /* alternatives -- punt to auxiliary */
- if (rt->op == '|')
- return altdissect(v, rt, begin, end);
+ assert(t->op == '.');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(t->right != NULL && t->right->cnfa.nstates > 0);
- /* concatenation -- need to split the substring between parts */
- assert(rt->op == ',');
- assert(rt->left.cnfa.nstates > 0);
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap);
if (ISERR())
return v->err;
-
- /* in some cases, there may be no right side... */
- if (rt->right.cnfa.nstates == 0) {
- MDEBUG(("singleton\n"));
- if (longest(v, d, begin, end) != end) {
- freedfa(d);
- return REG_ASSERT;
- }
- freedfa(d);
- assert(rt->left.subno >= 0);
- subset(v, &rt->left, begin, end);
- return dissect(v, rt->left.tree, begin, end);
- }
-
- /* general case */
- assert(rt->right.cnfa.nstates > 0);
- d2 = newdfa(v, &rt->right.cnfa, v->g->cm);
+ d2 = newdfa(v, &t->right->cnfa, &v->g->cmap);
if (ISERR()) {
freedfa(d);
return v->err;
@@ -464,45 +495,39 @@ chr *end; /* end of same */
MDEBUG(("successful\n"));
freedfa(d);
freedfa(d2);
- assert(rt->left.subno >= 0);
- subset(v, &rt->left, begin, mid);
- assert(rt->right.subno >= 0);
- subset(v, &rt->right, mid, end);
- i = dissect(v, rt->left.tree, begin, mid);
+ i = dissect(v, t->left, begin, mid);
if (i != REG_OKAY)
return i;
- return dissect(v, rt->right.tree, mid, end);
+ return dissect(v, t->right, mid, end);
}
/*
- altdissect - determine alternative subexpression matches (uncomplicated)
- ^ static int altdissect(struct vars *, struct rtree *, chr *, chr *);
+ ^ static int altdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
-altdissect(v, rt, begin, end)
+altdissect(v, t, begin, end)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
chr *begin; /* beginning of relevant substring */
chr *end; /* end of same */
{
struct dfa *d;
int i;
- assert(rt != NULL);
- assert(rt->op == '|');
+ assert(t != NULL);
+ assert(t->op == '|');
- for (i = 0; rt != NULL; rt = rt->next, i++) {
+ for (i = 0; t != NULL; t = t->right, i++) {
MDEBUG(("trying %dth\n", i));
- assert(rt->left.begin != NULL);
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap);
if (ISERR())
return v->err;
if (longest(v, d, begin, end) == end) {
MDEBUG(("success\n"));
freedfa(d);
- assert(rt->left.subno >= 0);
- subset(v, &rt->left, begin, end);
- return dissect(v, rt->left.tree, begin, end);
+ return dissect(v, t->left, begin, end);
}
freedfa(d);
}
@@ -513,12 +538,61 @@ chr *end; /* end of same */
- cdissect - determine subexpression matches (with complications)
* The retry memory stores the offset of the trial midpoint from begin,
* plus 1 so that 0 uniquely means "clean slate".
- ^ static int cdissect(struct vars *, struct rtree *, chr *, chr *);
+ ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
-cdissect(v, rt, begin, end)
+cdissect(v, t, begin, end)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
+chr *begin; /* beginning of relevant substring */
+chr *end; /* end of same */
+{
+ int er;
+
+ assert(t != NULL);
+ MDEBUG(("cdissect %ld-%ld\n", (long)OFF(begin), (long)OFF(end)));
+
+ switch (t->op) {
+ case '=': /* terminal node */
+ assert(t->left == NULL && t->right == NULL);
+ return REG_OKAY; /* no action, parent did the work */
+ break;
+ case '|': /* alternation */
+ assert(t->left != NULL);
+ return caltdissect(v, t, begin, end);
+ break;
+ case 'b': /* back ref -- shouldn't be calling us! */
+ assert(t->left == NULL && t->right == NULL);
+ return cbrdissect(v, t, begin, end);
+ break;
+ case '.': /* concatenation */
+ assert(t->left != NULL && t->right != NULL);
+ return ccondissect(v, t, begin, end);
+ break;
+ case '(': /* capturing */
+ assert(t->left != NULL && t->right == NULL);
+ assert(t->subno > 0);
+ er = cdissect(v, t->left, begin, end);
+ if (er == REG_OKAY)
+ subset(v, t, begin, end);
+ return er;
+ break;
+ default:
+ return REG_ASSERT;
+ break;
+ }
+}
+
+/*
+ - ccondissect - concatenation subexpression matches (with complications)
+ * The retry memory stores the offset of the trial midpoint from begin,
+ * plus 1 so that 0 uniquely means "clean slate".
+ ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
+ */
+static int /* regexec return code */
+ccondissect(v, t, begin, end)
+struct vars *v;
+struct subre *t;
chr *begin; /* beginning of relevant substring */
chr *end; /* end of same */
{
@@ -527,36 +601,25 @@ chr *end; /* end of same */
chr *mid;
int er;
- if (rt == NULL)
- return REG_OKAY;
- MDEBUG(("csubstr %ld-%ld\n", (long)OFF(begin), (long)OFF(end)));
-
- /* punt various cases to auxiliaries */
- if (rt->op == '|') /* alternatives */
- return caltdissect(v, rt, begin, end);
- if (rt->op == 'b') /* backref */
- return cbrdissect(v, rt, begin, end);
- if (rt->right.cnfa.nstates == 0) /* no RHS */
- return csindissect(v, rt, begin, end);
- if (rt->left.prefer == SHORTER) /* reverse scan */
- return crevdissect(v, rt, begin, end);
+ assert(t->op == '.');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(t->right != NULL && t->right->cnfa.nstates > 0);
- /* concatenation -- need to split the substring between parts */
- assert(rt->op == ',');
- assert(rt->left.cnfa.nstates > 0);
- assert(rt->right.cnfa.nstates > 0);
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
+ if (t->left->flags&SHORTER) /* reverse scan */
+ return crevdissect(v, t, begin, end);
+
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap);
if (ISERR())
return v->err;
- d2 = newdfa(v, &rt->right.cnfa, v->g->cm);
+ d2 = newdfa(v, &t->right->cnfa, &v->g->cmap);
if (ISERR()) {
freedfa(d);
return v->err;
}
- MDEBUG(("cconcat %d\n", rt->no));
+ MDEBUG(("cconcat %d\n", t->retry));
/* pick a tentative midpoint */
- if (v->mem[rt->no] == 0) {
+ if (v->mem[t->retry] == 0) {
mid = longest(v, d, begin, end);
if (mid == NULL) {
freedfa(d);
@@ -564,19 +627,18 @@ chr *end; /* end of same */
return REG_NOMATCH;
}
MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid)));
- subset(v, &rt->left, begin, mid);
- v->mem[rt->no] = (mid - begin) + 1;
+ v->mem[t->retry] = (mid - begin) + 1;
} else {
- mid = begin + (v->mem[rt->no] - 1);
+ mid = begin + (v->mem[t->retry] - 1);
MDEBUG(("working midpoint %ld\n", (long)OFF(mid)));
}
/* iterate until satisfaction or failure */
for (;;) {
/* try this midpoint on for size */
- er = cdissect(v, rt->left.tree, begin, mid);
+ er = cdissect(v, t->left, begin, mid);
if (er == REG_OKAY && longest(v, d2, mid, end) == end &&
- (er = cdissect(v, rt->right.tree, mid, end)) ==
+ (er = cdissect(v, t->right, mid, end)) ==
REG_OKAY)
break; /* NOTE BREAK OUT */
if (er != REG_OKAY && er != REG_NOMATCH) {
@@ -588,7 +650,7 @@ chr *end; /* end of same */
/* that midpoint didn't work, find a new one */
if (mid == begin) {
/* all possibilities exhausted */
- MDEBUG(("%d no midpoint\n", rt->no));
+ MDEBUG(("%d no midpoint\n", t->retry));
freedfa(d);
freedfa(d2);
return REG_NOMATCH;
@@ -596,23 +658,21 @@ chr *end; /* end of same */
mid = longest(v, d, begin, mid-1);
if (mid == NULL) {
/* failed to find a new one */
- MDEBUG(("%d failed midpoint\n", rt->no));
+ MDEBUG(("%d failed midpoint\n", t->retry));
freedfa(d);
freedfa(d2);
return REG_NOMATCH;
}
- MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid)));
- subset(v, &rt->left, begin, mid);
- v->mem[rt->no] = (mid - begin) + 1;
- zapmem(v, rt->left.tree);
- zapmem(v, rt->right.tree);
+ MDEBUG(("%d: new midpoint %ld\n", t->retry, (long)OFF(mid)));
+ v->mem[t->retry] = (mid - begin) + 1;
+ zapmem(v, t->left);
+ zapmem(v, t->right);
}
/* satisfaction */
MDEBUG(("successful\n"));
freedfa(d);
freedfa(d2);
- subset(v, &rt->right, mid, end);
return REG_OKAY;
}
@@ -620,12 +680,12 @@ chr *end; /* end of same */
- crevdissect - determine shortest-first subexpression matches
* The retry memory stores the offset of the trial midpoint from begin,
* plus 1 so that 0 uniquely means "clean slate".
- ^ static int crevdissect(struct vars *, struct rtree *, chr *, chr *);
+ ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
-crevdissect(v, rt, begin, end)
+crevdissect(v, t, begin, end)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
chr *begin; /* beginning of relevant substring */
chr *end; /* end of same */
{
@@ -634,45 +694,43 @@ chr *end; /* end of same */
chr *mid;
int er;
- if (rt == NULL)
- return REG_OKAY;
- assert(rt->op == ',' && rt->left.prefer == SHORTER);
+ assert(t->op == '.');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(t->right != NULL && t->right->cnfa.nstates > 0);
+ assert(t->left->flags&SHORTER);
/* concatenation -- need to split the substring between parts */
- assert(rt->left.cnfa.nstates > 0);
- assert(rt->right.cnfa.nstates > 0);
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap);
if (ISERR())
return v->err;
- d2 = newdfa(v, &rt->right.cnfa, v->g->cm);
+ d2 = newdfa(v, &t->right->cnfa, &v->g->cmap);
if (ISERR()) {
freedfa(d);
return v->err;
}
- MDEBUG(("crev %d\n", rt->no));
+ MDEBUG(("crev %d\n", t->retry));
/* pick a tentative midpoint */
- if (v->mem[rt->no] == 0) {
- mid = shortest(v, d, begin, begin, end);
+ if (v->mem[t->retry] == 0) {
+ mid = shortest(v, d, begin, begin, end, (chr **)NULL);
if (mid == NULL) {
freedfa(d);
freedfa(d2);
return REG_NOMATCH;
}
MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid)));
- subset(v, &rt->left, begin, mid);
- v->mem[rt->no] = (mid - begin) + 1;
+ v->mem[t->retry] = (mid - begin) + 1;
} else {
- mid = begin + (v->mem[rt->no] - 1);
+ mid = begin + (v->mem[t->retry] - 1);
MDEBUG(("working midpoint %ld\n", (long)OFF(mid)));
}
/* iterate until satisfaction or failure */
for (;;) {
/* try this midpoint on for size */
- er = cdissect(v, rt->left.tree, begin, mid);
+ er = cdissect(v, t->left, begin, mid);
if (er == REG_OKAY && longest(v, d2, mid, end) == end &&
- (er = cdissect(v, rt->right.tree, mid, end)) ==
+ (er = cdissect(v, t->right, mid, end)) ==
REG_OKAY)
break; /* NOTE BREAK OUT */
if (er != REG_OKAY && er != REG_NOMATCH) {
@@ -684,101 +742,58 @@ chr *end; /* end of same */
/* that midpoint didn't work, find a new one */
if (mid == end) {
/* all possibilities exhausted */
- MDEBUG(("%d no midpoint\n", rt->no));
+ MDEBUG(("%d no midpoint\n", t->retry));
freedfa(d);
freedfa(d2);
return REG_NOMATCH;
}
- mid = shortest(v, d, begin, mid+1, end);
+ mid = shortest(v, d, begin, mid+1, end, (chr **)NULL);
if (mid == NULL) {
/* failed to find a new one */
- MDEBUG(("%d failed midpoint\n", rt->no));
+ MDEBUG(("%d failed midpoint\n", t->retry));
freedfa(d);
freedfa(d2);
return REG_NOMATCH;
}
- MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid)));
- subset(v, &rt->left, begin, mid);
- v->mem[rt->no] = (mid - begin) + 1;
- zapmem(v, rt->left.tree);
- zapmem(v, rt->right.tree);
+ MDEBUG(("%d: new midpoint %ld\n", t->retry, (long)OFF(mid)));
+ v->mem[t->retry] = (mid - begin) + 1;
+ zapmem(v, t->left);
+ zapmem(v, t->right);
}
/* satisfaction */
MDEBUG(("successful\n"));
freedfa(d);
freedfa(d2);
- subset(v, &rt->right, mid, end);
- return REG_OKAY;
-}
-
-/*
- - csindissect - determine singleton subexpression matches (with complications)
- ^ static int csindissect(struct vars *, struct rtree *, chr *, chr *);
- */
-static int /* regexec return code */
-csindissect(v, rt, begin, end)
-struct vars *v;
-struct rtree *rt;
-chr *begin; /* beginning of relevant substring */
-chr *end; /* end of same */
-{
- struct dfa *d;
- int er;
-
- assert(rt != NULL);
- assert(rt->op == ',');
- assert(rt->right.cnfa.nstates == 0);
- MDEBUG(("csingleton %d\n", rt->no));
-
- assert(rt->left.cnfa.nstates > 0);
-
- /* exploit memory only to suppress repeated work in retries */
- if (!v->mem[rt->no]) {
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
- if (longest(v, d, begin, end) != end) {
- freedfa(d);
- return REG_NOMATCH;
- }
- freedfa(d);
- v->mem[rt->no] = 1;
- MDEBUG(("csingleton matched\n"));
- }
-
- er = cdissect(v, rt->left.tree, begin, end);
- if (er != REG_OKAY)
- return er;
- subset(v, &rt->left, begin, end);
return REG_OKAY;
}
/*
- cbrdissect - determine backref subexpression matches
- ^ static int cbrdissect(struct vars *, struct rtree *, chr *, chr *);
+ ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
-cbrdissect(v, rt, begin, end)
+cbrdissect(v, t, begin, end)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
chr *begin; /* beginning of relevant substring */
chr *end; /* end of same */
{
int i;
- int n = -rt->left.subno;
+ int n = t->subno;
size_t len;
chr *paren;
chr *p;
chr *stop;
- int min = rt->left.min;
- int max = rt->left.max;
+ int min = t->min;
+ int max = t->max;
- assert(rt != NULL);
- assert(rt->op == 'b');
- assert(rt->right.cnfa.nstates == 0);
+ assert(t != NULL);
+ assert(t->op == 'b');
assert(n >= 0);
assert((size_t)n < v->nmatch);
- MDEBUG(("cbackref n%d %d{%d-%d}\n", rt->no, n, min, max));
+ MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
if (v->pmatch[n].rm_so == -1)
return REG_NOMATCH;
@@ -786,9 +801,9 @@ chr *end; /* end of same */
len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
/* no room to maneuver -- retries are pointless */
- if (v->mem[rt->no])
+ if (v->mem[t->retry])
return REG_NOMATCH;
- v->mem[rt->no] = 1;
+ v->mem[t->retry] = 1;
/* special-case zero-length string */
if (len == 0) {
@@ -822,12 +837,12 @@ chr *end; /* end of same */
/*
- caltdissect - determine alternative subexpression matches (w. complications)
- ^ static int caltdissect(struct vars *, struct rtree *, chr *, chr *);
+ ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
-caltdissect(v, rt, begin, end)
+caltdissect(v, t, begin, end)
struct vars *v;
-struct rtree *rt;
+struct subre *t;
chr *begin; /* beginning of relevant substring */
chr *end; /* end of same */
{
@@ -837,839 +852,37 @@ chr *end; /* end of same */
# define TRYING 1 /* top matched, trying submatches */
# define TRIED 2 /* top didn't match or submatches exhausted */
- if (rt == NULL)
+ if (t == NULL)
return REG_NOMATCH;
- assert(rt->op == '|');
- if (v->mem[rt->no] == TRIED)
- return caltdissect(v, rt->next, begin, end);
+ assert(t->op == '|');
+ if (v->mem[t->retry] == TRIED)
+ return caltdissect(v, t->right, begin, end);
- MDEBUG(("calt n%d\n", rt->no));
- assert(rt->left.begin != NULL);
+ MDEBUG(("calt n%d\n", t->retry));
+ assert(t->left != NULL);
- if (v->mem[rt->no] == UNTRIED) {
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
+ if (v->mem[t->retry] == UNTRIED) {
+ d = newdfa(v, &t->left->cnfa, &v->g->cmap);
if (ISERR())
return v->err;
if (longest(v, d, begin, end) != end) {
freedfa(d);
- v->mem[rt->no] = TRIED;
- return caltdissect(v, rt->next, begin, end);
+ v->mem[t->retry] = TRIED;
+ return caltdissect(v, t->right, begin, end);
}
freedfa(d);
MDEBUG(("calt matched\n"));
- v->mem[rt->no] = TRYING;
+ v->mem[t->retry] = TRYING;
}
- er = cdissect(v, rt->left.tree, begin, end);
- if (er == REG_OKAY) {
- subset(v, &rt->left, begin, end);
- return REG_OKAY;
- }
+ er = cdissect(v, t->left, begin, end);
if (er != REG_NOMATCH)
return er;
- v->mem[rt->no] = TRIED;
- return caltdissect(v, rt->next, begin, end);
-}
-
-/*
- - dismatch - determine overall match using top-level dissection
- * The retry memory stores the offset of the trial midpoint from begin,
- * plus 1 so that 0 uniquely means "clean slate".
- ^ static chr *dismatch(struct vars *, struct rtree *, chr *, chr *);
- */
-static chr * /* endpoint, or NULL */
-dismatch(v, rt, begin, end)
-struct vars *v;
-struct rtree *rt;
-chr *begin; /* beginning of relevant substring */
-chr *end; /* end of same */
-{
- struct dfa *d;
- struct dfa *d2;
- chr *mid;
- chr *ret;
-
- if (rt == NULL)
- return begin;
- MDEBUG(("dsubstr %ld-%ld\n", (long)OFF(begin), (long)OFF(end)));
-
- /* punt various cases to auxiliaries */
- if (rt->right.cnfa.nstates == 0) /* no RHS */
- return dismsin(v, rt, begin, end);
- if (rt->left.prefer == SHORTER) /* reverse scan */
- return dismrev(v, rt, begin, end);
-
- /* concatenation -- need to split the substring between parts */
- assert(rt->op == ',');
- assert(rt->left.cnfa.nstates > 0);
- assert(rt->right.cnfa.nstates > 0);
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
- if (ISERR())
- return NULL;
- d2 = newdfa(v, &rt->right.cnfa, v->g->cm);
- if (ISERR()) {
- freedfa(d);
- return NULL;
- }
- MDEBUG(("dconcat %d\n", rt->no));
-
- /* pick a tentative midpoint */
- if (v->mem[rt->no] == 0) {
- mid = longest(v, d, begin, end);
- if (mid == NULL) {
- freedfa(d);
- freedfa(d2);
- return NULL;
- }
- MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid)));
- v->mem[rt->no] = (mid - begin) + 1;
- } else {
- mid = begin + (v->mem[rt->no] - 1);
- MDEBUG(("working midpoint %ld\n", (long)OFF(mid)));
- }
-
- /* iterate until satisfaction or failure */
- for (;;) {
- /* try this midpoint on for size */
- if (rt->right.tree == NULL || rt->right.tree->op == 'b') {
- if (rt->right.prefer == LONGER)
- ret = longest(v, d2, mid, end);
- else
- ret = shortest(v, d2, mid, mid, end);
- } else {
- if (longest(v, d2, mid, end) != NULL)
- ret = dismatch(v, rt->right.tree, mid, end);
- else
- ret = NULL;
- }
- if (ret != NULL)
- break; /* NOTE BREAK OUT */
-
- /* that midpoint didn't work, find a new one */
- if (mid == begin) {
- /* all possibilities exhausted */
- MDEBUG(("%d no midpoint\n", rt->no));
- freedfa(d);
- freedfa(d2);
- return NULL;
- }
- mid = longest(v, d, begin, mid-1);
- if (mid == NULL) {
- /* failed to find a new one */
- MDEBUG(("%d failed midpoint\n", rt->no));
- freedfa(d);
- freedfa(d2);
- return NULL;
- }
- MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid)));
- v->mem[rt->no] = (mid - begin) + 1;
- zapmem(v, rt->right.tree);
- }
-
- /* satisfaction */
- MDEBUG(("successful\n"));
- freedfa(d);
- freedfa(d2);
- return ret;
-}
-
-/*
- - dismrev - determine overall match using top-level dissection
- * The retry memory stores the offset of the trial midpoint from begin,
- * plus 1 so that 0 uniquely means "clean slate".
- ^ static chr *dismrev(struct vars *, struct rtree *, chr *, chr *);
- */
-static chr * /* endpoint, or NULL */
-dismrev(v, rt, begin, end)
-struct vars *v;
-struct rtree *rt;
-chr *begin; /* beginning of relevant substring */
-chr *end; /* end of same */
-{
- struct dfa *d;
- struct dfa *d2;
- chr *mid;
- chr *ret;
-
- if (rt == NULL)
- return begin;
- MDEBUG(("rsubstr %ld-%ld\n", (long)OFF(begin), (long)OFF(end)));
-
- /* concatenation -- need to split the substring between parts */
- assert(rt->op == ',');
- assert(rt->left.cnfa.nstates > 0);
- assert(rt->right.cnfa.nstates > 0);
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
- if (ISERR())
- return NULL;
- d2 = newdfa(v, &rt->right.cnfa, v->g->cm);
- if (ISERR()) {
- freedfa(d);
- return NULL;
- }
- MDEBUG(("dconcat %d\n", rt->no));
-
- /* pick a tentative midpoint */
- if (v->mem[rt->no] == 0) {
- mid = shortest(v, d, begin, begin, end);
- if (mid == NULL) {
- freedfa(d);
- freedfa(d2);
- return NULL;
- }
- MDEBUG(("tentative midpoint %ld\n", (long)OFF(mid)));
- v->mem[rt->no] = (mid - begin) + 1;
- } else {
- mid = begin + (v->mem[rt->no] - 1);
- MDEBUG(("working midpoint %ld\n", (long)OFF(mid)));
- }
-
- /* iterate until satisfaction or failure */
- for (;;) {
- /* try this midpoint on for size */
- if (rt->right.tree == NULL || rt->right.tree->op == 'b') {
- if (rt->right.prefer == LONGER)
- ret = longest(v, d2, mid, end);
- else
- ret = shortest(v, d2, mid, mid, end);
- } else {
- if (longest(v, d2, mid, end) != NULL)
- ret = dismatch(v, rt->right.tree, mid, end);
- else
- ret = NULL;
- }
- if (ret != NULL)
- break; /* NOTE BREAK OUT */
-
- /* that midpoint didn't work, find a new one */
- if (mid == end) {
- /* all possibilities exhausted */
- MDEBUG(("%d no midpoint\n", rt->no));
- freedfa(d);
- freedfa(d2);
- return NULL;
- }
- mid = shortest(v, d, begin, mid+1, end);
- if (mid == NULL) {
- /* failed to find a new one */
- MDEBUG(("%d failed midpoint\n", rt->no));
- freedfa(d);
- freedfa(d2);
- return NULL;
- }
- MDEBUG(("%d: new midpoint %ld\n", rt->no, (long)OFF(mid)));
- v->mem[rt->no] = (mid - begin) + 1;
- zapmem(v, rt->right.tree);
- }
-
- /* satisfaction */
- MDEBUG(("successful\n"));
- freedfa(d);
- freedfa(d2);
- return ret;
-}
-
-/*
- - dismsin - determine singleton subexpression matches (with complications)
- ^ static chr *dismsin(struct vars *, struct rtree *, chr *, chr *);
- */
-static chr *
-dismsin(v, rt, begin, end)
-struct vars *v;
-struct rtree *rt;
-chr *begin; /* beginning of relevant substring */
-chr *end; /* end of same */
-{
- struct dfa *d;
- chr *ret;
-
- assert(rt != NULL);
- assert(rt->op == ',');
- assert(rt->right.cnfa.nstates == 0);
- MDEBUG(("dsingleton %d\n", rt->no));
-
- assert(rt->left.cnfa.nstates > 0);
-
- /* retries are pointless */
- if (v->mem[rt->no])
- return NULL;
- v->mem[rt->no] = 1;
-
- d = newdfa(v, &rt->left.cnfa, v->g->cm);
- if (d == NULL)
- return NULL;
- if (rt->left.prefer == LONGER)
- ret = longest(v, d, begin, end);
- else
- ret = shortest(v, d, begin, begin, end);
- freedfa(d);
- if (ret != NULL)
- MDEBUG(("dsingleton matched\n"));
- return ret;
-}
-
-/*
- - longest - longest-preferred matching engine
- ^ static chr *longest(struct vars *, struct dfa *, chr *, chr *);
- */
-static chr * /* endpoint, or NULL */
-longest(v, d, start, stop)
-struct vars *v; /* used only for debug and exec flags */
-struct dfa *d;
-chr *start; /* where the match should start */
-chr *stop; /* match must end at or before here */
-{
- chr *cp;
- chr *realstop = (stop == v->stop) ? stop : stop + 1;
- color co;
- struct sset *css;
- struct sset *ss;
- chr *post;
- int i;
- struct colormap *cm = d->cm;
-
- /* initialize */
- css = initialize(v, d, start);
- cp = start;
-
- /* startup */
- FDEBUG(("+++ startup +++\n"));
- if (cp == v->start) {
- co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
- FDEBUG(("color %ld\n", (long)co));
- } else {
- co = GETCOLOR(cm, *(cp - 1));
- FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
- }
- css = miss(v, d, css, co, cp, start);
- if (css == NULL)
- return NULL;
- css->lastseen = cp;
-
- /* main loop */
- if (v->eflags&REG_FTRACE)
- while (cp < realstop) {
- FDEBUG(("+++ at c%d +++\n", css - d->ssets));
- co = GETCOLOR(cm, *cp);
- FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
- ss = css->outs[co];
- if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
- if (ss == NULL)
- break; /* NOTE BREAK OUT */
- }
- cp++;
- ss->lastseen = cp;
- css = ss;
- }
- else
- while (cp < realstop) {
- co = GETCOLOR(cm, *cp);
- ss = css->outs[co];
- if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
- if (ss == NULL)
- break; /* NOTE BREAK OUT */
- }
- cp++;
- ss->lastseen = cp;
- css = ss;
- }
-
- /* shutdown */
- FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
- if (cp == v->stop && stop == v->stop) {
- co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
- FDEBUG(("color %ld\n", (long)co));
- ss = miss(v, d, css, co, cp, start);
- /* special case: match ended at eol? */
- if (ss != NULL && (ss->flags&POSTSTATE))
- return cp;
- else if (ss != NULL)
- ss->lastseen = cp; /* to be tidy */
- }
-
- /* find last match, if any */
- post = d->lastpost;
- for (ss = d->ssets, i = 0; i < d->nssused; ss++, i++)
- if ((ss->flags&POSTSTATE) && post != ss->lastseen &&
- (post == NULL || post < ss->lastseen))
- post = ss->lastseen;
- if (post != NULL) /* found one */
- return post - 1;
-
- return NULL;
-}
-
-/*
- - shortest - shortest-preferred matching engine
- ^ static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *);
- */
-static chr * /* endpoint, or NULL */
-shortest(v, d, start, min, max)
-struct vars *v; /* used only for debug and exec flags */
-struct dfa *d;
-chr *start; /* where the match should start */
-chr *min; /* match must end at or after here */
-chr *max; /* match must end at or before here */
-{
- chr *cp;
- chr *realmin = (min == v->stop) ? min : min + 1;
- chr *realmax = (max == v->stop) ? max : max + 1;
- color co;
- struct sset *css;
- struct sset *ss;
- struct colormap *cm = d->cm;
-
- /* initialize */
- css = initialize(v, d, start);
- cp = start;
-
- /* startup */
- FDEBUG(("--- startup ---\n"));
- if (cp == v->start) {
- co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
- FDEBUG(("color %ld\n", (long)co));
- } else {
- co = GETCOLOR(cm, *(cp - 1));
- FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
- }
- css = miss(v, d, css, co, cp, start);
- if (css == NULL)
- return NULL;
- css->lastseen = cp;
- ss = css;
-
- /* main loop */
- if (v->eflags&REG_FTRACE)
- while (cp < realmax) {
- FDEBUG(("--- at c%d ---\n", css - d->ssets));
- co = GETCOLOR(cm, *cp);
- FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
- ss = css->outs[co];
- if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
- if (ss == NULL)
- break; /* NOTE BREAK OUT */
- }
- cp++;
- ss->lastseen = cp;
- css = ss;
- if ((ss->flags&POSTSTATE) && cp >= realmin)
- break; /* NOTE BREAK OUT */
- }
- else
- while (cp < realmax) {
- co = GETCOLOR(cm, *cp);
- ss = css->outs[co];
- if (ss == NULL) {
- ss = miss(v, d, css, co, cp+1, start);
- if (ss == NULL)
- break; /* NOTE BREAK OUT */
- }
- cp++;
- ss->lastseen = cp;
- css = ss;
- if ((ss->flags&POSTSTATE) && cp >= realmin)
- break; /* NOTE BREAK OUT */
- }
-
- if (ss == NULL)
- return NULL;
- if (ss->flags&POSTSTATE) {
- assert(cp >= realmin);
- return cp - 1;
- }
-
- /* shutdown */
- FDEBUG(("--- shutdown at c%d ---\n", css - d->ssets));
- if (cp == v->stop && max == v->stop) {
- co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
- FDEBUG(("color %ld\n", (long)co));
- ss = miss(v, d, css, co, cp, start);
- /* special case: match ended at eol? */
- if (ss != NULL && (ss->flags&POSTSTATE))
- return cp;
- }
-
- return NULL;
-}
-
-/*
- - newdfa - set up a fresh DFA
- ^ static struct dfa *newdfa(struct vars *, struct cnfa *,
- ^ struct colormap *);
- */
-static struct dfa *
-newdfa(v, cnfa, cm)
-struct vars *v;
-struct cnfa *cnfa;
-struct colormap *cm;
-{
- struct dfa *d = (struct dfa *)MALLOC(sizeof(struct dfa));
- int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
- struct sset *ss;
- int i;
-
- assert(cnfa != NULL && cnfa->nstates != 0);
- if (d == NULL) {
- ERR(REG_ESPACE);
- return NULL;
- }
-
- d->ssets = (struct sset *)MALLOC(CACHE * sizeof(struct sset));
- d->statesarea = (unsigned *)MALLOC((CACHE+WORK) * wordsper *
- sizeof(unsigned));
- d->work = &d->statesarea[CACHE * wordsper];
- d->outsarea = (struct sset **)MALLOC(CACHE * cnfa->ncolors *
- sizeof(struct sset *));
- d->incarea = (struct arcp *)MALLOC(CACHE * cnfa->ncolors *
- sizeof(struct arcp));
- if (d->ssets == NULL || d->statesarea == NULL || d->outsarea == NULL ||
- d->incarea == NULL) {
- freedfa(d);
- ERR(REG_ESPACE);
- return NULL;
- }
-
- d->nssets = (v->eflags&REG_SMALL) ? 5 : CACHE;
- d->nssused = 0;
- d->nstates = cnfa->nstates;
- d->ncolors = cnfa->ncolors;
- d->wordsper = wordsper;
- d->cnfa = cnfa;
- d->cm = cm;
- d->lastpost = NULL;
- d->search = d->ssets;
-
- for (ss = d->ssets, i = 0; i < d->nssets; ss++, i++) {
- /* initialization of most fields is done as needed */
- ss->states = &d->statesarea[i * d->wordsper];
- ss->outs = &d->outsarea[i * d->ncolors];
- ss->inchain = &d->incarea[i * d->ncolors];
- }
-
- return d;
-}
-
-/*
- - freedfa - free a DFA
- ^ static VOID freedfa(struct dfa *);
- */
-static VOID
-freedfa(d)
-struct dfa *d;
-{
- if (d->ssets != NULL)
- FREE(d->ssets);
- if (d->statesarea != NULL)
- FREE(d->statesarea);
- if (d->outsarea != NULL)
- FREE(d->outsarea);
- if (d->incarea != NULL)
- FREE(d->incarea);
- FREE(d);
-}
-
-/*
- - hash - construct a hash code for a bitvector
- * There are probably better ways, but they're more expensive.
- ^ static unsigned hash(unsigned *, int);
- */
-static unsigned
-hash(uv, n)
-unsigned *uv;
-int n;
-{
- int i;
- unsigned h;
-
- h = 0;
- for (i = 0; i < n; i++)
- h ^= uv[i];
- return h;
-}
-
-/*
- - initialize - hand-craft a cache entry for startup, otherwise get ready
- ^ static struct sset *initialize(struct vars *, struct dfa *, chr *);
- */
-static struct sset *
-initialize(v, d, start)
-struct vars *v; /* used only for debug flags */
-struct dfa *d;
-chr *start;
-{
- struct sset *ss;
- int i;
-
- /* is previous one still there? */
- if (d->nssused > 0 && (d->ssets[0].flags&STARTER))
- ss = &d->ssets[0];
- else { /* no, must (re)build it */
- ss = getvacant(v, d, start, start);
- for (i = 0; i < d->wordsper; i++)
- ss->states[i] = 0;
- BSET(ss->states, d->cnfa->pre);
- ss->hash = hash(ss->states, d->wordsper);
- assert(d->cnfa->pre != d->cnfa->post);
- ss->flags = STARTER;
- /* lastseen dealt with below */
- }
-
- for (i = 0; i < d->nssused; i++)
- d->ssets[i].lastseen = NULL;
- ss->lastseen = start; /* maybe untrue, but harmless */
- d->lastpost = NULL;
- return ss;
-}
-
-/*
- - miss - handle a cache miss
- ^ static struct sset *miss(struct vars *, struct dfa *, struct sset *,
- ^ pcolor, chr *, chr *);
- */
-static struct sset * /* NULL if goes to empty set */
-miss(v, d, css, co, cp, start)
-struct vars *v; /* used only for debug flags */
-struct dfa *d;
-struct sset *css;
-pcolor co;
-chr *cp; /* next chr */
-chr *start; /* where the attempt got started */
-{
- struct cnfa *cnfa = d->cnfa;
- int i;
- unsigned h;
- struct carc *ca;
- struct sset *p;
- int ispost;
- int gotstate;
- int dolacons;
- int didlacons;
-
- /* for convenience, we can be called even if it might not be a miss */
- if (css->outs[co] != NULL) {
- FDEBUG(("hit\n"));
- return css->outs[co];
- }
- FDEBUG(("miss\n"));
-
- /* first, what set of states would we end up in? */
- for (i = 0; i < d->wordsper; i++)
- d->work[i] = 0;
- ispost = 0;
- gotstate = 0;
- for (i = 0; i < d->nstates; i++)
- if (ISBSET(css->states, i))
- for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
- if (ca->co == co) {
- BSET(d->work, ca->to);
- gotstate = 1;
- if (ca->to == cnfa->post)
- ispost = 1;
- FDEBUG(("%d -> %d\n", i, ca->to));
- }
- dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0;
- didlacons = 0;
- while (dolacons) { /* transitive closure */
- dolacons = 0;
- for (i = 0; i < d->nstates; i++)
- if (ISBSET(d->work, i))
- for (ca = cnfa->states[i]; ca->co != COLORLESS;
- ca++)
- if (ca->co > cnfa->ncolors &&
- !ISBSET(d->work, ca->to) &&
- lacon(v, cnfa, cp,
- ca->co)) {
- BSET(d->work, ca->to);
- dolacons = 1;
- didlacons = 1;
- if (ca->to == cnfa->post)
- ispost = 1;
- FDEBUG(("%d :> %d\n",i,ca->to));
- }
- }
- if (!gotstate)
- return NULL;
- h = hash(d->work, d->wordsper);
-
- /* next, is that in the cache? */
- for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
- if (p->hash == h && memcmp(VS(d->work), VS(p->states),
- d->wordsper*sizeof(unsigned)) == 0) {
- FDEBUG(("cached c%d\n", p - d->ssets));
- break; /* NOTE BREAK OUT */
- }
- if (i == 0) { /* nope, need a new cache entry */
- p = getvacant(v, d, cp, start);
- assert(p != css);
- for (i = 0; i < d->wordsper; i++)
- p->states[i] = d->work[i];
- p->hash = h;
- p->flags = (ispost) ? POSTSTATE : 0;
- /* lastseen to be dealt with by caller */
- }
-
- if (!didlacons) { /* lookahead conds. always cache miss */
- css->outs[co] = p;
- css->inchain[co] = p->ins;
- p->ins.ss = css;
- p->ins.co = (color)co;
- }
- return p;
+ v->mem[t->retry] = TRIED;
+ return caltdissect(v, t->right, begin, end);
}
-/*
- - lacon - lookahead-constraint checker for miss()
- ^ static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
- */
-static int /* predicate: constraint satisfied? */
-lacon(v, pcnfa, cp, co)
-struct vars *v;
-struct cnfa *pcnfa; /* parent cnfa */
-chr *cp;
-pcolor co; /* "color" of the lookahead constraint */
-{
- int n;
- struct subre *sub;
- struct dfa *d;
- chr *end;
- n = co - pcnfa->ncolors;
- assert(n < v->g->nlacons && v->g->lacons != NULL);
- FDEBUG(("=== testing lacon %d\n", n));
- sub = &v->g->lacons[n];
- d = newdfa(v, &sub->cnfa, v->g->cm);
- if (d == NULL) {
- ERR(REG_ESPACE);
- return 0;
- }
- end = longest(v, d, cp, v->stop);
- freedfa(d);
- FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
- return (sub->subno) ? (end != NULL) : (end == NULL);
-}
-/*
- - getvacant - get a vacant state set
- * This routine clears out the inarcs and outarcs, but does not otherwise
- * clear the innards of the state set -- that's up to the caller.
- ^ static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
- */
-static struct sset *
-getvacant(v, d, cp, start)
-struct vars *v; /* used only for debug flags */
-struct dfa *d;
-chr *cp;
-chr *start;
-{
- int i;
- struct sset *ss;
- struct sset *p;
- struct arcp ap;
- struct arcp lastap;
- color co;
-
- ss = pickss(v, d, cp, start);
- assert(!(ss->flags&LOCKED));
-
- /* clear out its inarcs, including self-referential ones */
- ap = ss->ins;
- while ((p = ap.ss) != NULL) {
- co = ap.co;
- FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co));
- p->outs[co] = NULL;
- ap = p->inchain[co];
- p->inchain[co].ss = NULL; /* paranoia */
- }
- ss->ins.ss = NULL;
-
- /* take it off the inarc chains of the ssets reached by its outarcs */
- for (i = 0; i < d->ncolors; i++) {
- p = ss->outs[i];
- assert(p != ss); /* not self-referential */
- if (p == NULL)
- continue; /* NOTE CONTINUE */
- FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
- if (p->ins.ss == ss && p->ins.co == i)
- p->ins = ss->inchain[i];
- else {
- assert(p->ins.ss != NULL);
- for (ap = p->ins; ap.ss != NULL &&
- !(ap.ss == ss && ap.co == i);
- ap = ap.ss->inchain[ap.co])
- lastap = ap;
- assert(ap.ss != NULL);
- lastap.ss->inchain[lastap.co] = ss->inchain[i];
- }
- ss->outs[i] = NULL;
- ss->inchain[i].ss = NULL;
- }
-
- /* if ss was a success state, may need to remember location */
- if ((ss->flags&POSTSTATE) && ss->lastseen != d->lastpost &&
- (d->lastpost == NULL || d->lastpost < ss->lastseen))
- d->lastpost = ss->lastseen;
-
- return ss;
-}
-
-/*
- - pickss - pick the next stateset to be used
- ^ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
- */
-static struct sset *
-pickss(v, d, cp, start)
-struct vars *v; /* used only for debug flags */
-struct dfa *d;
-chr *cp;
-chr *start;
-{
- int i;
- struct sset *ss;
- struct sset *end;
- chr *ancient;
-
- /* shortcut for cases where cache isn't full */
- if (d->nssused < d->nssets) {
- ss = &d->ssets[d->nssused];
- d->nssused++;
- FDEBUG(("new c%d\n", ss - d->ssets));
- /* must make innards consistent */
- ss->ins.ss = NULL;
- for (i = 0; i < d->ncolors; i++) {
- ss->outs[i] = NULL;
- ss->inchain[i].ss = NULL;
- }
- ss->flags = 0;
- return ss;
- }
-
- /* look for oldest, or old enough anyway */
- if (cp - start > d->nssets*3/4) /* oldest 25% are expendable */
- ancient = cp - d->nssets*3/4;
- else
- ancient = start;
- for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
- if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
- !(ss->flags&LOCKED)) {
- d->search = ss + 1;
- FDEBUG(("replacing c%d\n", ss - d->ssets));
- return ss;
- }
- for (ss = d->ssets, end = d->search; ss < end; ss++)
- if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
- !(ss->flags&LOCKED)) {
- d->search = ss + 1;
- FDEBUG(("replacing c%d\n", ss - d->ssets));
- return ss;
- }
-
- /* nobody's old enough?!? -- something's really wrong */
- FDEBUG(("can't find victim to replace!\n"));
- assert(NOTREACHED);
- ERR(REG_ASSERT);
- return d->ssets;
-}
+#include "rege_dfa.c"
diff --git a/generic/regguts.h b/generic/regguts.h
index 93d0eec..5f9a0ab 100644
--- a/generic/regguts.h
+++ b/generic/regguts.h
@@ -96,7 +96,9 @@
* debugging facilities
*/
#ifdef REG_DEBUG
+/* FDEBUG does finite-state tracing */
#define FDEBUG(arglist) { if (v->eflags&REG_FTRACE) printf arglist; }
+/* MDEBUG does higher-level tracing */
#define MDEBUG(arglist) { if (v->eflags&REG_MTRACE) printf arglist; }
#else
#define FDEBUG(arglist) {}
@@ -115,15 +117,6 @@
/*
- * Map a truth value into -1 for false, 1 for true. This is so it is
- * possible to write compile-time assertions by declaring a dummy array
- * of this size. (Why not #if? Because sizeof is not available there.)
- */
-#define NEGIFNOT(x) (2*!!(x) - 1) /* !! ensures 0 or 1 */
-
-
-
-/*
* We dissect a chr into byts for colormap table indexing. Here we define
* a byt, which will be the same as a byte on most machines... The exact
* size of a byt is not critical, but about 8 bits is good, and extraction
@@ -154,10 +147,10 @@ typedef int pcolor; /* what color promotes to */
* A colormap is a tree -- more precisely, a DAG -- indexed at each level
* by a byt of the chr, to map the chr to a color efficiently. Because
* lower sections of the tree can be shared, it can exploit the usual
- * sparseness of such a mapping table. The final tree is always NBYTS
- * levels deep (at present it may be shallower during construction, but
- * it is always "filled" to full depth at the end of that, using pointers
- * to "fill blocks" which are entirely WHITE in color).
+ * sparseness of such a mapping table. The tree is always NBYTS levels
+ * deep (in the past it was shallower during construction but was "filled"
+ * to full depth at the end of that); areas that are unaltered as yet point
+ * to "fill blocks" which are entirely WHITE in color.
*/
/* the tree itself */
@@ -177,12 +170,14 @@ union tree {
/* internal per-color structure for the color machinery */
struct colordesc {
uchr nchrs; /* number of chars of this color */
- color sub; /* open subcolor of this one, or NOSUB */
+ color sub; /* open subcolor (if any); free chain ptr */
# define NOSUB COLORLESS
struct arc *arcs; /* color chain */
-# define UNUSEDCOLOR(cd) ((cd)->nchrs == 0 && (cd)->sub == NOSUB)
int flags;
-# define PSEUDO 1 /* pseudocolor, no real chars */
+# define FREECOL 01 /* currently free */
+# define PSEUDO 02 /* pseudocolor, no real chars */
+# define UNUSEDCOLOR(cd) ((cd)->flags&FREECOL)
+ union tree *block; /* block of solid color, if any */
};
/* the color map itself */
@@ -190,13 +185,13 @@ struct colormap {
int magic;
# define CMMAGIC 0x876
struct vars *v; /* for compile error reporting */
- color rest;
- int filled; /* has it been filled? */
size_t ncds; /* number of colordescs */
+ size_t max; /* highest in use */
+ color free; /* beginning of free chain (if non-0) */
struct colordesc *cd;
-# define CDEND(cm) (&(cm)->cd[(cm)->ncds])
+# define CDEND(cm) (&(cm)->cd[(cm)->max + 1])
# define NINLINECDS ((size_t)10)
- struct colordesc cds[NINLINECDS];
+ struct colordesc cdspace[NINLINECDS];
union tree tree[NBYTS]; /* tree top, plus fill blocks */
};
@@ -208,6 +203,7 @@ struct colormap {
#if NBYTS == 1
#define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)])
#endif
+/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */
#if NBYTS == 2
#define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)])
#endif
@@ -326,34 +322,35 @@ struct cnfa {
/*
- * definitions for subexpression tree
- * The intrepid code-reader is hereby warned that the subexpression tree
- * is kludge piled upon kludge, and is badly in need of rethinking. Do
- * not expect it to look clean and sensible.
+ * subexpression tree
*/
struct subre {
- struct state *begin; /* outarcs from here... */
- struct state *end; /* ...ending in inarcs here */
- int prefer; /* match preference */
-# define NONEYET 00
-# define LONGER 01
-# define SHORTER 02
- int subno; /* subexpression number (0 none, <0 backref) */
+ char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */
+ char flags;
+# define LONGER 01 /* prefers longer match */
+# define SHORTER 02 /* prefers shorter match */
+# define MIXED 04 /* mixed preference below */
+# define CAP 010 /* capturing parens below */
+# define BACKR 020 /* back reference below */
+# define INUSE 0100 /* in use in final tree */
+# define LOCAL 03 /* bits which may not propagate up */
+# define LMIX(f) ((f)<<2) /* LONGER -> MIXED */
+# define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */
+# define UP(f) (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED))
+# define MESSY(f) ((f)&(MIXED|CAP|BACKR))
+# define PREF(f) ((f)&LOCAL)
+# define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
+# define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
+ short retry; /* index into retry memory */
+ int subno; /* subexpression number (for 'b' and '(') */
short min; /* min repetitions, for backref only */
short max; /* max repetitions, for backref only */
- struct rtree *tree; /* substructure, if any */
+ struct subre *left; /* left child, if any (also freelist chain) */
+ struct subre *right; /* right child, if any */
+ struct state *begin; /* outarcs from here... */
+ struct state *end; /* ...ending in inarcs here */
struct cnfa cnfa; /* compacted NFA, if any */
-};
-
-struct rtree {
- char op; /* operator: '|', ',' */
- char flags;
-# define INUSE 01 /* in use in the tree */
- short no; /* index into retry memory */
- struct subre left;
- struct rtree *next; /* for '|' */
- struct subre right; /* for ',' */
- struct rtree *chain; /* for bookkeeping and error cleanup */
+ struct subre *chain; /* for bookkeeping and error cleanup */
};
@@ -377,12 +374,13 @@ struct guts {
int cflags; /* copy of compile flags */
int info; /* copy of re_info */
size_t nsub; /* copy of re_nsub */
- struct cnfa cnfa;
- struct rtree *tree;
+ struct subre *tree;
+ struct cnfa search; /* for fast preliminary search */
int ntree;
- struct colormap *cm;
+ struct colormap cmap;
int FUNCPTR(compare, (CONST chr *, CONST chr *, size_t));
struct subre *lacons; /* lookahead-constraint vector */
int nlacons; /* size of lacons */
int usedshorter; /* used non-greedy quantifiers? */
+ int unmatchable; /* cannot match anything? */
};
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 9f46efc..1a1cf2a 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -12,7 +12,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclCmdMZ.c,v 1.1.2.4 1998/10/21 20:40:05 stanton Exp $
+ * RCS: @(#) $Id: tclCmdMZ.c,v 1.1.2.5 1998/11/11 01:44:52 stanton Exp $
*/
#include "tclInt.h"
@@ -90,7 +90,9 @@ Tcl_PwdObjCmd(dummy, interp, objc, objv)
* Tcl_RegexpObjCmd --
*
* This procedure is invoked to process the "regexp" Tcl command.
- * See the user documentation for details on what it does.
+ * See the user documentation for details on what it does. The
+ * REGEXP_TEST stuff is to minimize code differences between this
+ * and the "testregexp" command.
*
* Results:
* A standard Tcl result.
@@ -117,12 +119,24 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv)
Tcl_UniChar *wStart;
static char *options[] = {
"-indices", "-nocase", "-about", "-expanded",
- "-unsupported0", "--", (char *) NULL
+ "-line", "-linestop", "-lineanchor",
+#ifdef REGEXP_TEST
+ "-xflags",
+#endif
+ "--", (char *) NULL
};
enum options {
REGEXP_INDICES, REGEXP_NOCASE, REGEXP_ABOUT, REGEXP_EXPANDED,
- REGEXP_XFLAGS, REGEXP_LAST
+ REGEXP_MULTI, REGEXP_NOCROSS, REGEXP_NEWL,
+#ifdef REGEXP_TEST
+ REGEXP_XFLAGS,
+#endif
+ REGEXP_LAST
};
+#ifndef REGEXP_TEST
+# define REGEXP_XFLAGS -1 /* impossible value */
+# define TclRegXflags(a,b,c,d) /* do nothing */
+#endif
indices = 0;
about = 0;
@@ -159,6 +173,18 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv)
cflags |= REG_EXPANDED;
break;
}
+ case REGEXP_MULTI: {
+ cflags |= REG_NEWLINE;
+ break;
+ }
+ case REGEXP_NOCROSS: {
+ cflags |= REG_NLSTOP;
+ break;
+ }
+ case REGEXP_NEWL: {
+ cflags |= REG_NLANCH;
+ break;
+ }
case REGEXP_XFLAGS: {
hasxflags = 1;
break;
@@ -207,7 +233,7 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv)
wStart = TclUtfToUniCharDString(string, stringLength, &stringBuffer);
wLen = Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar);
- match = TclRegExpExecUniChar(interp, regExpr, wStart, wLen, eflags);
+ match = TclRegExpExecUniChar(interp, regExpr, wStart, wLen, objc-2, eflags);
if (match < 0) {
result = TCL_ERROR;
goto done;
@@ -384,7 +410,7 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
* so that "^" won't match.
*/
- match = TclRegExpExecUniChar(interp, regExpr, w, wEnd - w,
+ match = TclRegExpExecUniChar(interp, regExpr, w, wEnd - w, 10,
((w > wStart) ? REG_NOTBOL : 0));
if (match < 0) {
result = TCL_ERROR;
diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c
index d65b19a..a4c9c37 100644
--- a/generic/tclRegexp.c
+++ b/generic/tclRegexp.c
@@ -10,7 +10,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclRegexp.c,v 1.1.2.3 1998/10/21 20:40:06 stanton Exp $
+ * RCS: @(#) $Id: tclRegexp.c,v 1.1.2.4 1998/11/11 01:44:53 stanton Exp $
*/
#include "tclInt.h"
@@ -22,60 +22,42 @@
* The routines in this file use Henry Spencer's regular expression
* package contained in the following additional source files:
*
- * chr.h tclRegexp.h lex.c
- * guts.h color.c locale.c
- * wchar.h compile.c nfa.c
- * wctype.h exec.c
- *
- * Copyright (c) 1986 by University of Toronto.
- * Written by Henry Spencer. Not derived from licensed software.
- *
- * Permission is granted to anyone to use this software for any
- * purpose on any computer system, and to redistribute it freely,
- * subject to the following restrictions:
- *
- * 1. The author is not responsible for the consequences of use of
- * this software, no matter how awful, even if they arise
- * from defects in it.
- *
- * 2. The origin of this software must not be misrepresented, either
- * by explicit claim or by omission.
- *
- * 3. Altered versions must be plainly marked as such, and must not
- * be misrepresented as being the original software.
- *
- * Beware that some of this code is subtly aware of the way operator
- * precedence is structured in regular expressions. Serious changes in
- * regular-expression syntax might require a total rethink.
+ * regc_color.c regc_cvec.c regc_lex.c
+ * regc_nfa.c regcomp.c regcustom.h
+ * rege_dfa.c regerror.c regerrs.h
+ * regex.h regexec.c regfree.c
+ * regfronts.c regguts.h
+ *
+ * Copyright (c) 1998 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* *** NOTE: this code has been altered slightly for use in Tcl: ***
- * *** 1. Use ckalloc, ckfree, and ckrealloc instead of malloc, ***
- * *** free, and realloc. ***
- * *** 2. Add extra argument to regexp to specify the real ***
- * *** start of the string separately from the start of the ***
- * *** current search. This is needed to search for multiple ***
- * *** matches within a string. ***
- * *** 3. Names have been changed, e.g. from re_comp to ***
+ * *** 1. Names have been changed, e.g. from re_comp to ***
* *** TclRegComp, to avoid clashes with other ***
* *** regexp implementations used by applications. ***
- * *** 4. Various lint-like things, such as casting arguments ***
- * *** in procedure calls, removing debugging code and ***
- * *** unused vars and procs. ***
- * *** 5. Removed "backward-compatibility kludges" such as ***
- * *** REG_PEND and REG_STARTEND flags, the re_endp field in ***
- * *** the regex_t struct, and the fronts.c layer. ***
- * *** 6. Changed char to Tcl_UniChar. ***
- * *** 7. Removed -DPOSIX_MISTAKE compile-time flag. ***
- * *** This is now the default. ***
- * *** 8. For performance considerations, created new Unicode ***
- * *** interfaces to avoid having to convert between UTF and ***
- * *** Unicode when we already had the Unicode string. For ***
- * *** example, in a "regsub -all" we were converting the ***
- * *** match string to Unicode N times, where N is the number ***
- * *** of bytes in the source string. ***
- * *** 9. Changed/widened some of the interfaces to allow ***
- * *** explicit passing of flags so that tclTest.c could ***
- * *** test the full range of acceptable flags. ***
*/
/*
@@ -181,7 +163,7 @@ Tcl_RegExpCompile(interp, string)
if (iPtr->patterns[NUM_REGEXPS-1] != NULL) {
ckfree(iPtr->patterns[NUM_REGEXPS-1]);
- regfree(&(iPtr->regexps[NUM_REGEXPS-1]->re));
+ re_ufree(&(iPtr->regexps[NUM_REGEXPS-1]->re));
ckfree((char *) iPtr->regexps[NUM_REGEXPS-1]);
}
for (i = NUM_REGEXPS - 2; i >= 0; i--) {
@@ -249,7 +231,7 @@ Tcl_RegExpExec(interp, re, string, start)
* Perform the regexp match.
*/
- result = TclRegExpExecUniChar(interp, re, uniString, numChars,
+ result = TclRegExpExecUniChar(interp, re, uniString, numChars, -1,
((string > start) ? REG_NOTBOL : 0));
Tcl_DStringFree(&stringBuffer);
@@ -324,7 +306,7 @@ Tcl_RegExpRange(re, index, startPtr, endPtr)
*/
int
-TclRegExpExecUniChar(interp, re, wString, numChars, flags)
+TclRegExpExecUniChar(interp, re, wString, numChars, nmatches, flags)
Tcl_Interp *interp; /* Interpreter to use for error reporting. */
Tcl_RegExp re; /* Compiled regular expression; returned by
* a previous call to Tcl_RegExpCompile() or
@@ -332,14 +314,20 @@ TclRegExpExecUniChar(interp, re, wString, numChars, flags)
CONST Tcl_UniChar *wString; /* String against which to match re. */
int numChars; /* Length of string in Tcl_UniChars (must
* be >= 0). */
+ int nmatches; /* How many subexpression matches (counting
+ * the whole match as subexpression 0) are
+ * of interest. -1 means "don't know". */
int flags; /* Regular expression flags. */
{
int status;
TclRegexp *regexpPtr = (TclRegexp *) re;
+ size_t nm = regexpPtr->re.re_nsub + 1;
+
+ if (nmatches >= 0 && (size_t) nmatches < nm)
+ nm = (size_t) nmatches;
status = re_uexec(&regexpPtr->re, wString, (size_t) numChars,
- (rm_detail_t *)NULL,
- regexpPtr->re.re_nsub + 1, regexpPtr->matches, flags);
+ (rm_detail_t *)NULL, nm, regexpPtr->matches, flags);
/*
* Check for errors.
@@ -569,6 +557,7 @@ TclRegAbout(interp, re)
REG_UUNPORT, "REG_UUNPORT",
REG_ULOCALE, "REG_ULOCALE",
REG_UEMPTYMATCH, "REG_UEMPTYMATCH",
+ REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE",
0, "",
};
struct infoname *inf;
@@ -632,12 +621,12 @@ TclRegError(interp, msg, status)
char *p;
Tcl_ResetResult(interp);
- n = regerror(status, (regex_t *)NULL, buf, sizeof(buf));
+ n = re_uerror(status, (regex_t *)NULL, buf, sizeof(buf));
p = (n > sizeof(buf)) ? "..." : "";
Tcl_AppendResult(interp, msg, buf, p, NULL);
sprintf(cbuf, "%d", status);
- (VOID) regerror(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf));
+ (VOID) re_uerror(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf));
Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL);
}
@@ -665,7 +654,7 @@ FreeRegexpInternalRep(objPtr)
{
TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
- regfree(&regexpRepPtr->re);
+ re_ufree(&regexpRepPtr->re);
if (regexpRepPtr->matches) {
ckfree((char *) regexpRepPtr->matches);
}
@@ -677,7 +666,7 @@ FreeRegexpInternalRep(objPtr)
*
* DupRegexpInternalRep --
*
- * It is way to hairy to copy a regular expression, so we punt
+ * It is way too hairy to copy a regular expression, so we punt
* and revert the object back to a vanilla string.
*
* Results:
@@ -802,100 +791,3 @@ CompileRegexp(interp, string, length, flags)
return regexpPtr;
}
-
-/*
- *---------------------------------------------------------------------------
- *
- * TclRegXflags --
- *
- * Parse a string of extended regexp flag letters, for testing.
- *
- * Results:
- * No return value (you're on your own for errors here).
- *
- * Side effects:
- * Modifies *cflagsPtr, a regcomp flags word, and *eflagsPtr, a
- * regexec flags word, as appropriate.
- *
- *----------------------------------------------------------------------
- */
-
-VOID
-TclRegXflags(string, length, cflagsPtr, eflagsPtr)
- char *string; /* The string of flags. */
- int length; /* The length of the string in bytes. */
- int *cflagsPtr; /* compile flags word */
- int *eflagsPtr; /* exec flags word */
-{
- int i;
- int cflags;
- int eflags;
-
- cflags = *cflagsPtr;
- eflags = *eflagsPtr;
- for (i = 0; i < length; i++) {
- switch (string[i]) {
- case 'a': {
- cflags |= REG_ADVF;
- break;
- }
- case 'b': {
- cflags &= ~REG_ADVANCED;
- break;
- }
- case 'e': {
- cflags &= ~REG_ADVANCED;
- cflags |= REG_EXTENDED;
- break;
- }
- case 'q': {
- cflags &= ~REG_ADVANCED;
- cflags |= REG_QUOTE;
- break;
- }
- case 'i': {
- cflags |= REG_ICASE;
- break;
- }
- case 'o': { /* o for opaque */
- cflags |= REG_NOSUB;
- break;
- }
- case 'x': {
- cflags |= REG_EXPANDED;
- break;
- }
- case 'p': {
- cflags |= REG_NLSTOP;
- break;
- }
- case 'w': {
- cflags |= REG_NLANCH;
- break;
- }
- case 'n': {
- cflags |= REG_NEWLINE;
- break;
- }
- case '+': {
- cflags |= REG_FAKEEC;
- break;
- }
- case '^': {
- eflags |= REG_NOTBOL;
- break;
- }
- case '$': {
- eflags |= REG_NOTEOL;
- break;
- }
- case '%': {
- eflags |= REG_SMALL;
- break;
- }
- }
- }
-
- *cflagsPtr = cflags;
- *eflagsPtr = eflags;
-}
diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h
index 9e56730..cd275f2 100644
--- a/generic/tclRegexp.h
+++ b/generic/tclRegexp.h
@@ -33,13 +33,13 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclRegexp.h,v 1.1.2.3 1998/10/21 20:40:06 stanton Exp $
+ * RCS: @(#) $Id: tclRegexp.h,v 1.1.2.4 1998/11/11 01:44:54 stanton Exp $
*/
#ifndef _TCLREGEXP
#define _TCLREGEXP
-#include "regcustom.h"
+#include "regex.h"
#ifdef BUILD_tcl
# undef TCL_STORAGE_CLASS
@@ -78,7 +78,7 @@ EXTERN VOID TclRegXflags _ANSI_ARGS_((char *string, int length,
int *cflagsPtr, int *eflagsPtr));
EXTERN int TclRegExpExecUniChar _ANSI_ARGS_((Tcl_Interp *interp,
Tcl_RegExp re, CONST Tcl_UniChar *uniString,
- int numChars, int flags));
+ int numChars, int nmatches, int flags));
EXTERN int TclRegExpMatchObj _ANSI_ARGS_((Tcl_Interp *interp,
char *string, Tcl_Obj *patObj));
EXTERN void TclRegExpRangeUniChar _ANSI_ARGS_((Tcl_RegExp re,
diff --git a/generic/tclTest.c b/generic/tclTest.c
index 2136b7c..b408fee 100644
--- a/generic/tclTest.c
+++ b/generic/tclTest.c
@@ -12,13 +12,14 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclTest.c,v 1.1.2.3 1998/10/21 20:40:07 stanton Exp $
+ * RCS: @(#) $Id: tclTest.c,v 1.1.2.4 1998/11/11 01:44:54 stanton Exp $
*/
#define TCL_TEST
#include "tclInt.h"
#include "tclPort.h"
+#include "tclRegexp.h"
#include <locale.h>
/*
@@ -244,6 +245,11 @@ static int TestparsevarObjCmd _ANSI_ARGS_((ClientData dummy,
static int TestparsevarnameObjCmd _ANSI_ARGS_((ClientData dummy,
Tcl_Interp *interp, int objc,
Tcl_Obj *CONST objv[]));
+static int TestregexpObjCmd _ANSI_ARGS_((ClientData dummy,
+ Tcl_Interp *interp, int objc,
+ Tcl_Obj *CONST objv[]));
+static void TestregexpXflags _ANSI_ARGS_((char *string,
+ int length, int *cflagsPtr, int *eflagsPtr));
static int TestsaveresultCmd _ANSI_ARGS_((ClientData dummy,
Tcl_Interp *interp, int objc,
Tcl_Obj *CONST objv[]));
@@ -394,6 +400,8 @@ Tcltest_Init(interp)
(ClientData) 0, (Tcl_CmdDeleteProc *) NULL);
Tcl_CreateObjCommand(interp, "testparsevarname", TestparsevarnameObjCmd,
(ClientData) 0, (Tcl_CmdDeleteProc *) NULL);
+ Tcl_CreateObjCommand(interp, "testregexp", TestregexpObjCmd,
+ (ClientData) 0, (Tcl_CmdDeleteProc *) NULL);
Tcl_CreateObjCommand(interp, "testsaveresult", TestsaveresultCmd,
(ClientData) 0, (Tcl_CmdDeleteProc *) NULL);
Tcl_CreateCommand(interp, "testsetassocdata", TestsetassocdataCmd,
@@ -2501,6 +2509,317 @@ TestparsevarnameObjCmd(clientData, interp, objc, objv)
/*
*----------------------------------------------------------------------
*
+ * TestregexpObjCmd --
+ *
+ * This procedure implements the "testregexp" command. It is
+ * used to give a direct interface for regexp flags. It's identical
+ * to Tcl_RegexpObjCmd except for the REGEXP_TEST define, which
+ * enables the -xflags option.
+ *
+ * Results:
+ * A standard Tcl result.
+ *
+ * Side effects:
+ * See the user documentation.
+ *
+ *----------------------------------------------------------------------
+ */
+
+ /* ARGSUSED */
+int
+TestregexpObjCmd(dummy, interp, objc, objv)
+ ClientData dummy; /* Not used. */
+ Tcl_Interp *interp; /* Current interpreter. */
+ int objc; /* Number of arguments. */
+ Tcl_Obj *CONST objv[]; /* Argument objects. */
+{
+ int i, result, indices, stringLength, wLen, match, about;
+ int hasxflags, cflags, eflags;
+ Tcl_RegExp regExpr;
+ char *string;
+ Tcl_DString stringBuffer, valueBuffer;
+ Tcl_UniChar *wStart;
+# define REGEXP_TEST /* yes */
+ static char *options[] = {
+ "-indices", "-nocase", "-about", "-expanded",
+ "-line", "-linestop", "-lineanchor",
+#ifdef REGEXP_TEST
+ "-xflags",
+#endif
+ "--", (char *) NULL
+ };
+ enum options {
+ REGEXP_INDICES, REGEXP_NOCASE, REGEXP_ABOUT, REGEXP_EXPANDED,
+ REGEXP_MULTI, REGEXP_NOCROSS, REGEXP_NEWL,
+#ifdef REGEXP_TEST
+ REGEXP_XFLAGS,
+#endif
+ REGEXP_LAST
+ };
+#ifndef REGEXP_TEST
+# define REGEXP_XFLAGS -1 /* impossible value */
+# define TestregexpXflags(a,b,c,d) /* do nothing */
+#endif
+
+ indices = 0;
+ about = 0;
+ cflags = REG_ADVANCED;
+ eflags = 0;
+ hasxflags = 0;
+
+ for (i = 1; i < objc; i++) {
+ char *name;
+ int index;
+
+ name = Tcl_GetString(objv[i]);
+ if (name[0] != '-') {
+ break;
+ }
+ if (Tcl_GetIndexFromObj(interp, objv[i], options, "switch", TCL_EXACT,
+ &index) != TCL_OK) {
+ return TCL_ERROR;
+ }
+ switch ((enum options) index) {
+ case REGEXP_INDICES: {
+ indices = 1;
+ break;
+ }
+ case REGEXP_NOCASE: {
+ cflags |= REG_ICASE;
+ break;
+ }
+ case REGEXP_ABOUT: {
+ about = 1;
+ break;
+ }
+ case REGEXP_EXPANDED: {
+ cflags |= REG_EXPANDED;
+ break;
+ }
+ case REGEXP_MULTI: {
+ cflags |= REG_NEWLINE;
+ break;
+ }
+ case REGEXP_NOCROSS: {
+ cflags |= REG_NLSTOP;
+ break;
+ }
+ case REGEXP_NEWL: {
+ cflags |= REG_NLANCH;
+ break;
+ }
+ case REGEXP_XFLAGS: {
+ hasxflags = 1;
+ break;
+ }
+ case REGEXP_LAST: {
+ i++;
+ goto endOfForLoop;
+ }
+ }
+ }
+
+ endOfForLoop:
+ if (objc - i < hasxflags + 2 - about) {
+ Tcl_WrongNumArgs(interp, 1, objv,
+ "?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?");
+ return TCL_ERROR;
+ }
+ objc -= i;
+ objv += i;
+
+ if (hasxflags) {
+ string = Tcl_GetStringFromObj(objv[0], &stringLength);
+ TestregexpXflags(string, stringLength, &cflags, &eflags);
+ objc--;
+ objv++;
+ }
+
+ regExpr = TclRegCompObj(interp, objv[0], cflags);
+ if (regExpr == NULL) {
+ return TCL_ERROR;
+ }
+
+ if (about) {
+ if (TclRegAbout(interp, regExpr) < 0) {
+ return TCL_ERROR;
+ }
+ return TCL_OK;
+ }
+
+ result = TCL_OK;
+ string = Tcl_GetStringFromObj(objv[1], &stringLength);
+
+ Tcl_DStringInit(&valueBuffer);
+
+ Tcl_DStringInit(&stringBuffer);
+ wStart = TclUtfToUniCharDString(string, stringLength, &stringBuffer);
+ wLen = Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar);
+
+ match = TclRegExpExecUniChar(interp, regExpr, wStart, wLen, objc-2, eflags);
+ if (match < 0) {
+ result = TCL_ERROR;
+ goto done;
+ }
+ if (match == 0) {
+ /*
+ * Set the interpreter's object result to an integer object w/ value 0.
+ */
+
+ Tcl_SetIntObj(Tcl_GetObjResult(interp), 0);
+ goto done;
+ }
+
+ /*
+ * If additional variable names have been specified, return
+ * index information in those variables.
+ */
+
+ objc -= 2;
+ objv += 2;
+
+ for (i = 0; i < objc; i++) {
+ char *varName, *value;
+ int start, end;
+
+ varName = Tcl_GetString(objv[i]);
+
+ TclRegExpRangeUniChar(regExpr, i, &start, &end);
+ if (start < 0) {
+ if (indices) {
+ value = Tcl_SetVar(interp, varName, "-1 -1", 0);
+ } else {
+ value = Tcl_SetVar(interp, varName, "", 0);
+ }
+ } else {
+ if (indices) {
+ char info[TCL_INTEGER_SPACE * 2];
+
+ sprintf(info, "%d %d", start, end - 1);
+ value = Tcl_SetVar(interp, varName, info, 0);
+ } else {
+ value = TclUniCharToUtfDString(wStart + start, end - start,
+ &valueBuffer);
+ value = Tcl_SetVar(interp, varName, value, 0);
+ Tcl_DStringSetLength(&valueBuffer, 0);
+ }
+ }
+ if (value == NULL) {
+ Tcl_AppendResult(interp, "couldn't set variable \"",
+ varName, "\"", (char *) NULL);
+ result = TCL_ERROR;
+ goto done;
+ }
+ }
+
+ /*
+ * Set the interpreter's object result to an integer object w/ value 1.
+ */
+
+ Tcl_SetIntObj(Tcl_GetObjResult(interp), 1);
+
+ done:
+ Tcl_DStringFree(&stringBuffer);
+ Tcl_DStringFree(&valueBuffer);
+ return result;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * TestregexpXflags --
+ *
+ * Parse a string of extended regexp flag letters, for testing.
+ *
+ * Results:
+ * No return value (you're on your own for errors here).
+ *
+ * Side effects:
+ * Modifies *cflagsPtr, a regcomp flags word, and *eflagsPtr, a
+ * regexec flags word, as appropriate.
+ *
+ *----------------------------------------------------------------------
+ */
+
+VOID
+TestregexpXflags(string, length, cflagsPtr, eflagsPtr)
+ char *string; /* The string of flags. */
+ int length; /* The length of the string in bytes. */
+ int *cflagsPtr; /* compile flags word */
+ int *eflagsPtr; /* exec flags word */
+{
+ int i;
+ int cflags;
+ int eflags;
+
+ cflags = *cflagsPtr;
+ eflags = *eflagsPtr;
+ for (i = 0; i < length; i++) {
+ switch (string[i]) {
+ case 'a': {
+ cflags |= REG_ADVF;
+ break;
+ }
+ case 'b': {
+ cflags &= ~REG_ADVANCED;
+ break;
+ }
+ case 'e': {
+ cflags &= ~REG_ADVANCED;
+ cflags |= REG_EXTENDED;
+ break;
+ }
+ case 'q': {
+ cflags &= ~REG_ADVANCED;
+ cflags |= REG_QUOTE;
+ break;
+ }
+ case 'o': { /* o for opaque */
+ cflags |= REG_NOSUB;
+ break;
+ }
+ case '+': {
+ cflags |= REG_FAKEEC;
+ break;
+ }
+ case ',': {
+ cflags |= REG_PROGRESS;
+ break;
+ }
+ case '.': {
+ cflags |= REG_DUMP;
+ break;
+ }
+ case ':': {
+ eflags |= REG_MTRACE;
+ break;
+ }
+ case ';': {
+ eflags |= REG_FTRACE;
+ break;
+ }
+ case '^': {
+ eflags |= REG_NOTBOL;
+ break;
+ }
+ case '$': {
+ eflags |= REG_NOTEOL;
+ break;
+ }
+ case '%': {
+ eflags |= REG_SMALL;
+ break;
+ }
+ }
+ }
+
+ *cflagsPtr = cflags;
+ *eflagsPtr = eflags;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* TestsetassocdataCmd --
*
* This procedure implements the "testsetassocdata" command. It is used
diff --git a/tests/regexp.test b/tests/regexp.test
index a97a258..8e52abf 100644
--- a/tests/regexp.test
+++ b/tests/regexp.test
@@ -10,7 +10,7 @@
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
-# RCS: @(#) $Id: regexp.test,v 1.1.2.3 1998/10/21 20:40:08 stanton Exp $
+# RCS: @(#) $Id: regexp.test,v 1.1.2.4 1998/11/11 01:44:56 stanton Exp $
if {[string compare test [info procs test]] == 1} then {source defs}
@@ -183,7 +183,7 @@ test regexp-6.2 {regexp errors} {
} {1 {wrong # args: should be "regexp ?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?"}}
test regexp-6.3 {regexp errors} {
list [catch {regexp -gorp a} msg] $msg
-} {1 {bad switch "-gorp": must be -indices, -nocase, -about, -expanded, or --}}
+} {1 {bad switch "-gorp": must be -indices, -nocase, -about, -expanded, -line, -linestop, -lineanchor, or --}}
test regexp-6.4 {regexp errors} {
list [catch {regexp a( b} msg] $msg
} {1 {couldn't compile regular expression pattern: parentheses () not balanced}}
diff --git a/tests/regtests.test b/tests/regtests.test
index 02efa79..d4b93e2 100644
--- a/tests/regtests.test
+++ b/tests/regtests.test
@@ -56,6 +56,7 @@ if {[string compare test [info procs test]] == 1} then {source defs}
# B ERE/ARE literal-_b_race heuristic used
# E backslash (_e_scape) seen within []
# H looka_h_ead constraint seen
+# I _i_mpossible to match
# L _l_ocale-specific construct seen
# M unportable (_m_achine-specific) construct seen
# N RE can match empty (_n_ull) string
@@ -74,7 +75,7 @@ if {[string compare test [info procs test]] == 1} then {source defs}
# test procedures and related
set ask "about"
-set xflags "unsupported0"
+set xflags "xflags"
set testbypassed 0
# re_info abbreviation mapping table
@@ -82,6 +83,7 @@ set infonames(A) "REG_UBSALNUM"
set infonames(B) "REG_UBRACES"
set infonames(E) "REG_UBBS"
set infonames(H) "REG_ULOOKAHEAD"
+set infonames(I) "REG_UIMPOSSIBLE"
set infonames(L) "REG_ULOCALE"
set infonames(M) "REG_UUNPORT"
set infonames(N) "REG_UEMPTYMATCH"
@@ -90,7 +92,7 @@ set infonames(Q) "REG_UBOUNDS"
set infonames(R) "REG_UBACKREF"
set infonames(S) "REG_UUNSPEC"
set infonames(U) "REG_UPBOTCH"
-set infonameorder "RHQBAUEPSMLN" ;# must match bit order, lsb first
+set infonameorder "RHQBAUEPSMLNI" ;# must match bit order, lsb first
# set major test number and description
proc doing {major desc} {
@@ -132,6 +134,9 @@ proc flags {fl} {
switch -exact -- $f {
"i" { lappend args "-nocase" }
"x" { lappend args "-expanded" }
+ "n" { lappend args "-line" }
+ "p" { lappend args "-linestop" }
+ "w" { lappend args "-lineanchor" }
"-" { }
default { append flags $f }
}
@@ -169,7 +174,7 @@ proc e {testid flags re err} {
return
}
- set cmd [concat [list regexp -$ask] [flags $flags] [list $re]]
+ set cmd [concat [list testregexp -$ask] [flags $flags] [list $re]]
set run "list \[catch \{$cmd\}\] \[lindex \$errorCode 1\]"
test $prefix.[tno $testid] [desc $testid] $run [list 1 REG_$err]
}
@@ -192,7 +197,7 @@ proc f {testid flags re target args} {
set f [flags $flags]
set infoflags [infoflags $flags]
- set ccmd [concat [list regexp -$ask] $f [list $re]]
+ set ccmd [concat [list testregexp -$ask] $f [list $re]]
set nsub [expr [llength $args] - 1]
if {$nsub == -1} {
# didn't tell us number of subexps
@@ -205,7 +210,7 @@ proc f {testid flags re target args} {
test $prefix.[tno $testid] [desc $testid] $ccmd $info
set testid [lreplace $testid end end "execute"]
- set ecmd [concat [list regexp] $f [list $re $target]]
+ set ecmd [concat [list testregexp] $f [list $re $target]]
test $prefix.[tno $testid] [desc $testid] $ecmd 0
}
@@ -229,8 +234,8 @@ proc matchexpected {opts testid flags re target args} {
set f [flags $flags]
set infoflags [infoflags $flags]
- set ccmd [concat [list regexp -$ask] $f [list $re]]
- set ecmd [concat [list regexp] $opts $f [list $re $target]]
+ set ccmd [concat [list testregexp -$ask] $f [list $re]]
+ set ecmd [concat [list testregexp] $opts $f [list $re $target]]
set nsub [expr [llength $args] - 1]
set names [list]
@@ -364,7 +369,7 @@ m 5 b ^* * *
e 6 - ^* BADRPT
f 7 & ^b ^b
m 8 b x^ x^ x^
-e 9 - x^ IMPOSS
+f 9 I x^ x
m 10 n "\n^" "x\nb" "\n"
f 11 bS {\(^b\)} ^b
m 12 - (^b) b b b
@@ -372,7 +377,7 @@ m 13 & {x$} x x
m 14 bS {\(x$\)} x x x
m 15 - {(x$)} x x x
m 16 b {x$y} "x\$y" "x\$y"
-e 17 - {x$y} IMPOSS
+f 17 I {x$y} xy
m 18 n "x\$\n" "x\n" "x\n"
e 19 - + BADRPT
e 20 - ? BADRPT
@@ -473,6 +478,8 @@ m 40 eE {a[\\]b} "a\\b" "a\\b"
m 41 bE {a[\\]b} "a\\b" "a\\b"
e 42 - {a[\Z]b} EESCAPE
m 43 & {a[[b]c} "a\[c" "a\[c"
+m 44 EMP {a[\u00fe-\u0507][\u00ff-\u0300]b} \
+ "a\u0102\u02ffb" "a\u0102\u02ffb"
@@ -539,6 +546,12 @@ e 23 b {\<*} BADRPT
e 24 b {\>*} BADRPT
e 25 - {\y*} BADRPT
e 26 - {\Y*} BADRPT
+m 27 LP {\ma} a a
+f 28 LP {\ma} ba
+m 29 LP {a\M} a a
+f 30 LP {a\M} ab
+f 31 ILP {\Ma} a
+f 32 ILP {a\m} a
@@ -587,7 +600,7 @@ m 19 P "a\\U00000008x" "a\bx" "a\bx"
e 20 - {a\U0000008x} EESCAPE
m 21 P "a\\vb" "a\vb" "a\vb"
m 22 MP "a\\x08x" "a\bx" "a\bx"
-e 23 - {a\xx} EESCAPE
+e 23 - {a\xq} EESCAPE
m 24 MP "a\\x0008x" "a\bx" "a\bx"
e 25 - {a\z} EESCAPE
m 26 MP "a\\010b" "a\bb" "a\bb"
@@ -663,7 +676,7 @@ m 1 P a(?#comment)b ab ab
doing 18 "unmatchable REs"
-e 1 - a^b IMPOSS
+f 1 I a^b ab
@@ -757,7 +770,7 @@ doing 22 "multicharacter collating elements"
# again ugh
# currently disabled because the fake MCCE we use for testing is unavailable
xx m 1 &+L {a[c]e} ace ace
-xx e 2 &+ {a[c]h} IMPOSS
+xx f 2 &+I {a[c]h} ach
xx m 3 &+L {a[[.ch.]]} ach ach
xx f 4 &+L {a[[.ch.]]} ace
xx m 5 &+L {a[c[.ch.]]} ac ac
diff --git a/unix/Makefile.in b/unix/Makefile.in
index fd2bdbc..cbcd510 100644
--- a/unix/Makefile.in
+++ b/unix/Makefile.in
@@ -5,7 +5,7 @@
# "autoconf" program (constructs like "@foo@" will get replaced in the
# actual Makefile.
#
-# RCS: @(#) $Id: Makefile.in,v 1.1.2.4 1998/10/21 20:40:12 stanton Exp $
+# RCS: @(#) $Id: Makefile.in,v 1.1.2.5 1998/11/11 01:44:57 stanton Exp $
# Current Tcl version; used in various names.
@@ -618,7 +618,7 @@ regcomp.o: $(REGHDRS) $(GENERIC_DIR)/regcomp.c $(GENERIC_DIR)/regc_lex.c \
$(GENERIC_DIR)/regc_nfa.c $(GENERIC_DIR)/regc_cvec.c
$(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/regcomp.c
-regexec.o: $(REGHDRS) $(GENERIC_DIR)/regexec.c
+regexec.o: $(REGHDRS) $(GENERIC_DIR)/regexec.c $(GENERIC_DIR)/rege_dfa.c
$(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/regexec.c
regfree.o: $(REGHDRS) $(GENERIC_DIR)/regfree.c
diff --git a/win/makefile.vc b/win/makefile.vc
index ac73808..de9f892 100644
--- a/win/makefile.vc
+++ b/win/makefile.vc
@@ -4,7 +4,7 @@
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
# Copyright (c) 1995-1996 Sun Microsystems, Inc.
-# RCS: @(#) $Id: makefile.vc,v 1.1.2.7 1998/11/04 04:41:26 stanton Exp $
+# RCS: @(#) $Id: makefile.vc,v 1.1.2.8 1998/11/11 01:44:57 stanton Exp $
# Does not depend on the presence of any environment variables in
# order to compile tcl; all needed information is derived from
@@ -466,12 +466,23 @@ $(TMPDIR)\tclAppInit.obj : $(WINDIR)\tclAppInit.c
# Dedependency rules
$(GENERICDIR)\regcomp.c: \
- $(GENERICDIR)\regc_locale.c \
$(GENERICDIR)\regguts.h \
$(GENERICDIR)\regc_lex.c \
$(GENERICDIR)\regc_color.c \
$(GENERICDIR)\regc_nfa.c \
- $(GENERICDIR)\regc_cvec.c
+ $(GENERICDIR)\regc_cvec.c \
+ $(GENERICDIR)\regc_locale.c
+$(GENERICDIR)\regcustom.h: \
+ $(GENERICDIR)\tclInt.h \
+ $(GENERICDIR)\tclPort.h \
+ $(GENERICDIR)\regex.h
+$(GENERICDIR)\regexec.c: \
+ $(GENERICDIR)\rege_dfa.c \
+ $(GENERICDIR)\regguts.h
+$(GENERICDIR)\regerror.c: $(GENERICDIR)\regguts.h
+$(GENERICDIR)\regfree.c: $(GENERICDIR)\regguts.h
+$(GENERICDIR)\regfronts.c: $(GENERICDIR)\regguts.h
+$(GENERICDIR)\regguts.h: $(GENERICDIR)\regcustom.h
#
# Implicit rules