summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorstanton <stanton>1999-08-05 01:16:57 (GMT)
committerstanton <stanton>1999-08-05 01:16:57 (GMT)
commit144c26987f8a6f6285efd2f4e878e844b9c13d33 (patch)
tree6ba27c1270de462a013ecbff3d320551d574c2c5
parent97320151634cf8e22126e5ab7c59b1aa9931c002 (diff)
downloadtcl-144c26987f8a6f6285efd2f4e878e844b9c13d33.zip
tcl-144c26987f8a6f6285efd2f4e878e844b9c13d33.tar.gz
tcl-144c26987f8a6f6285efd2f4e878e844b9c13d33.tar.bz2
* generic/regc_nfa.c:
* generic/regcomp.c: * generic/rege_dfa.c: * generic/regexec.c: * generic/regguts.h: Applied patches supplied by Henry Spencer to greatly enhance the performance of certain classes of regular expressions. [Bug: 2440, 2447]
-rw-r--r--generic/regc_nfa.c8
-rw-r--r--generic/regcomp.c128
-rw-r--r--generic/rege_dfa.c2
-rw-r--r--generic/regexec.c42
-rw-r--r--generic/regguts.h3
5 files changed, 111 insertions, 72 deletions
diff --git a/generic/regc_nfa.c b/generic/regc_nfa.c
index 897b3eb..9881cd4 100644
--- a/generic/regc_nfa.c
+++ b/generic/regc_nfa.c
@@ -703,9 +703,9 @@ struct nfa *nfa;
/*
- optimize - optimize an NFA
- ^ static int optimize(struct nfa *, FILE *);
+ ^ static long optimize(struct nfa *, FILE *);
*/
-static int /* re_info bits */
+static long /* re_info bits */
optimize(nfa, f)
struct nfa *nfa;
FILE *f; /* for debug output; NULL none */
@@ -1187,9 +1187,9 @@ struct state *mark; /* the value to mark with */
/*
- analyze - ascertain potentially-useful facts about an optimized NFA
- ^ static int analyze(struct nfa *);
+ ^ static long analyze(struct nfa *);
*/
-static int /* re_info bits to be ORed in */
+static long /* re_info bits to be ORed in */
analyze(nfa)
struct nfa *nfa;
{
diff --git a/generic/regcomp.c b/generic/regcomp.c
index b292012..85639a1 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -41,7 +41,7 @@
int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
static VOID moresubs _ANSI_ARGS_((struct vars *, int));
static int freev _ANSI_ARGS_((struct vars *, int));
-static VOID makescan _ANSI_ARGS_((struct vars *, struct nfa *));
+static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
@@ -65,14 +65,15 @@ static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
static int numst _ANSI_ARGS_((struct subre *, int));
static VOID markst _ANSI_ARGS_((struct subre *));
static VOID cleanst _ANSI_ARGS_((struct vars *));
-static int nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
-static int nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
+static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
+static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
static VOID freelacons _ANSI_ARGS_((struct subre *, int));
static VOID rfree _ANSI_ARGS_((regex_t *));
static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
-static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int, int));
+static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
+static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
/* === regc_lex.c === */
static VOID lexstart _ANSI_ARGS_((struct vars *));
static VOID prefixes _ANSI_ARGS_((struct vars *));
@@ -138,7 +139,7 @@ static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, st
static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
static VOID specialcolors _ANSI_ARGS_((struct nfa *));
-static int optimize _ANSI_ARGS_((struct nfa *, FILE *));
+static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
@@ -152,7 +153,7 @@ static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
static VOID cleanup _ANSI_ARGS_((struct nfa *));
static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
-static int analyze _ANSI_ARGS_((struct nfa *));
+static long analyze _ANSI_ARGS_((struct nfa *));
static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
@@ -229,7 +230,6 @@ struct vars {
struct subre *lacons; /* lookahead-constraint vector */
int nlacons; /* size of lacons */
int usedshorter; /* used short-preferring quantifiers */
- int unmatchable; /* can never match */
};
/* parsing macros; most know that `v' is the struct vars pointer */
@@ -382,36 +382,37 @@ int flags;
specialcolors(v->nfa);
CNOERR();
if (debug != NULL) {
+ fprintf(debug, "\n\n\n========= RAW ==========\n");
dumpnfa(v->nfa, debug);
dumpst(v->tree, debug, 1);
}
v->usedshorter = 0;
- v->unmatchable = 0;
optst(v, v->tree);
v->ntree = numst(v->tree, 1);
markst(v->tree);
cleanst(v);
if (debug != NULL) {
+ fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
fprintf(debug, "-->\n");
dumpst(v->tree, debug, 1);
}
/* build compacted NFAs for tree, lacons, fast search */
re->re_info |= nfatree(v, v->tree, debug);
- if (debug != NULL) {
- fprintf(debug, "---->\n");
- dumpst(v->tree, debug, 1);
- }
CNOERR();
- if (re->re_info&REG_UIMPOSSIBLE)
- v->unmatchable = 1;
assert(v->nlacons == 0 || v->lacons != NULL);
- for (i = 1; i < v->nlacons; i++)
+ for (i = 1; i < v->nlacons; i++) {
+ if (debug != NULL)
+ fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
nfanode(v, &v->lacons[i], debug);
+ }
CNOERR();
+ if (debug != NULL)
+ fprintf(debug, "\n\n\n========= SEARCH ==========\n");
+ /* can sacrifice main NFA now, so use it as work area */
(DISCARD)optimize(v->nfa, debug);
CNOERR();
- makescan(v, v->nfa);
+ makesearch(v, v->nfa);
CNOERR();
compact(v->nfa, &g->search);
CNOERR();
@@ -431,7 +432,6 @@ int flags;
v->lacons = NULL;
g->nlacons = v->nlacons;
g->usedshorter = v->usedshorter;
- g->unmatchable = v->unmatchable;
if (flags&REG_DUMP)
dump(re, stdout);
@@ -507,12 +507,12 @@ int err;
}
/*
- - makescan - turn an NFA into a fast-scan NFA (implicit prepend of .*?)
+ - makesearch - turn an NFA into a fast-scan NFA (implicit prepend of .*?)
* NFA must have been optimize()d already.
- ^ static VOID makescan(struct vars *, struct nfa *);
+ ^ static VOID makesearch(struct vars *, struct nfa *);
*/
static VOID
-makescan(v, nfa)
+makesearch(v, nfa)
struct vars *v;
struct nfa *nfa;
{
@@ -1907,9 +1907,9 @@ struct vars *v;
/*
- nfatree - turn a subRE subtree into a tree of compacted NFAs
- ^ static int nfatree(struct vars *, struct subre *, FILE *);
+ ^ static long nfatree(struct vars *, struct subre *, FILE *);
*/
-static int /* optimize results from top node */
+static long /* optimize results from top node */
nfatree(v, t, f)
struct vars *v;
struct subre *t;
@@ -1927,19 +1927,23 @@ FILE *f; /* for debug output */
/*
- nfanode - do one NFA for nfatree
- ^ static int nfanode(struct vars *, struct subre *, FILE *);
+ ^ static long nfanode(struct vars *, struct subre *, FILE *);
*/
-static int /* optimize results */
+static long /* optimize results */
nfanode(v, t, f)
struct vars *v;
struct subre *t;
FILE *f; /* for debug output */
{
struct nfa *nfa;
- int ret = 0;
+ long ret = 0;
+ char idbuf[50];
assert(t->begin != NULL);
+ if (f != NULL)
+ fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
+ stid(t, idbuf, sizeof(idbuf)));
nfa = newnfa(v, v->cm, v->nfa);
NOERRZ();
dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
@@ -2061,20 +2065,22 @@ FILE *f;
fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
GUTSMAGIC);
- fprintf(f, "nsub %d, info 0%o, csize %d, ntree %d, usedshort %d\n",
+ fprintf(f, "\n\n\n========= DUMP ==========\n");
+ fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d, usedshort %d\n",
re->re_nsub, re->re_info, re->re_csize, g->ntree,
g->usedshorter);
dumpcolors(&g->cmap, f);
if (!NULLCNFA(g->search)) {
- printf("search:\n");
+ printf("\nsearch:\n");
dumpcnfa(&g->search, f);
}
for (i = 1; i < g->nlacons; i++) {
- fprintf(f, "la%d (%s):\n", i,
+ fprintf(f, "\nla%d (%s):\n", i,
(g->lacons[i].subno) ? "positive" : "negative");
dumpcnfa(&g->lacons[i].cnfa, f);
}
+ fprintf(f, "\n");
dumpst(g->tree, f, 0);
#endif
}
@@ -2092,59 +2098,79 @@ int nfapresent; /* is the original NFA still around? */
if (t == NULL)
fprintf(f, "null tree\n");
else
- stdump(t, f, nfapresent, 0);
+ stdump(t, f, nfapresent);
fflush(f);
}
/*
- stdump - recursive guts of dumpst
- ^ static VOID stdump(struct subre *, FILE *, int, int);
+ ^ static VOID stdump(struct subre *, FILE *, int);
*/
static VOID
-stdump(t, f, nfapresent, level)
+stdump(t, f, nfapresent)
struct subre *t;
FILE *f;
int nfapresent; /* is the original NFA still around? */
-int level;
{
int i;
-# define RTSEP " "
+ char idbuf[50];
- for (i = 0; i < level; i++)
- fprintf(f, RTSEP);
- fprintf(f, "%c (", t->op);
+ fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
if (t->flags&LONGER)
- fprintf(f, "L");
+ fprintf(f, " longest");
if (t->flags&SHORTER)
- fprintf(f, "S");
+ fprintf(f, " shortest");
if (t->flags&MIXED)
- fprintf(f, "M");
+ fprintf(f, " hasmixed");
if (t->flags&CAP)
- fprintf(f, "c");
+ fprintf(f, " hascapture");
if (t->flags&BACKR)
- fprintf(f, "b");
+ fprintf(f, " hasbackref");
if (!(t->flags&INUSE))
- fprintf(f, "!u");
- fprintf(f, ") r%d", t->retry);
+ fprintf(f, " UNUSED");
if (t->subno != 0)
- fprintf(f, " #%d", t->subno);
+ fprintf(f, " (#%d)", t->subno);
if (t->min != 1 || t->max != 1) {
- fprintf(f, "{%d,", t->min);
+ fprintf(f, " {%d,", t->min);
if (t->max != INFINITY)
fprintf(f, "%d", t->max);
fprintf(f, "}");
}
if (nfapresent)
fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
- if (!NULLCNFA(t->cnfa))
- fprintf(f, ":");
- fprintf(f, "\n");
if (t->left != NULL)
- stdump(t->left, f, nfapresent, level+1);
- if (!NULLCNFA(t->cnfa))
+ fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
+ if (t->right != NULL)
+ fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
+ if (!NULLCNFA(t->cnfa)) {
+ fprintf(f, "\n");
dumpcnfa(&t->cnfa, f);
+ fprintf(f, "\n");
+ }
+ if (t->left != NULL)
+ stdump(t->left, f, nfapresent);
if (t->right != NULL)
- stdump(t->right, f, nfapresent, level+1);
+ stdump(t->right, f, nfapresent);
+}
+
+/*
+ - stid - identify a subtree node for dumping
+ ^ static char *stid(struct subre *, char *, size_t);
+ */
+static char * /* points to buf or constant string */
+stid(t, buf, bufsize)
+struct subre *t;
+char *buf;
+size_t bufsize;
+{
+ /* big enough for hex int or decimal t->retry? */
+ if (bufsize < sizeof(int)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1)
+ return "unable";
+ if (t->retry != 0)
+ sprintf(buf, "%d", t->retry);
+ else
+ sprintf(buf, "0x%x", (int)t); /* may lose bits, that's okay */
+ return buf;
}
#include "regc_lex.c"
diff --git a/generic/rege_dfa.c b/generic/rege_dfa.c
index 1628150..313892c 100644
--- a/generic/rege_dfa.c
+++ b/generic/rege_dfa.c
@@ -503,7 +503,7 @@ chr *start; /* where the attempt got started */
}
if (!sawlacons) { /* lookahead conds. always cache miss */
- FDEBUG(("c%d[%d]->%d\n", css - d->ssets, co, p - d->ssets));
+ FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets));
css->outs[co] = p;
css->inchain[co] = p->ins;
p->ins.ss = css;
diff --git a/generic/regexec.c b/generic/regexec.c
index ab9dc21..3d216db 100644
--- a/generic/regexec.c
+++ b/generic/regexec.c
@@ -172,7 +172,7 @@ int flags;
register struct vars *v = &var;
int st;
size_t n;
- int complications;
+ int backref;
# define LOCALMAT 20
regmatch_t mat[LOCALMAT];
# define LOCALMEM 40
@@ -189,16 +189,14 @@ int flags;
v->g = (struct guts *)re->re_guts;
if ((v->g->cflags&REG_EXPECT) && details == NULL)
return REG_INVARG;
- if (v->g->unmatchable)
+ if (v->g->info&REG_UIMPOSSIBLE)
return REG_NOMATCH;
- complications = (v->g->info&REG_UBACKREF) ? 1 : 0;
- if (v->g->usedshorter)
- complications = 1;
+ backref = (v->g->info&REG_UBACKREF) ? 1 : 0;
v->eflags = flags;
if (v->g->cflags&REG_NOSUB)
nmatch = 0; /* override client */
v->nmatch = nmatch;
- if (complications && v->nmatch < v->g->nsub + 1) {
+ if (backref && v->nmatch < v->g->nsub + 1) {
/* need work area bigger than what user gave us */
if (v->g->nsub + 1 <= LOCALMAT)
v->pmatch = mat;
@@ -214,7 +212,8 @@ int flags;
v->start = (chr *)string;
v->stop = (chr *)string + len;
v->err = 0;
- if (complications) {
+ if (backref) {
+ /* need retry memory */
assert(v->g->ntree >= 0);
n = (size_t)v->g->ntree;
if (n <= LOCALMEM)
@@ -231,7 +230,7 @@ int flags;
/* do it */
assert(v->g->tree != NULL);
- if (complications)
+ if (backref)
st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
else
st = find(v, &v->g->tree->cnfa, &v->g->cmap);
@@ -266,11 +265,12 @@ struct colormap *cm;
struct smalldfa sa;
struct dfa *s = newdfa(v, &v->g->search, cm, &sa);
chr *begin;
- chr *end = NULL;
+ chr *end;
chr *cold;
chr *open; /* open and close of range of possible starts */
chr *close;
int hitend;
+ int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
if (ISERR()) { /* should be newdfa() failure */
if (d != NULL)
@@ -311,7 +311,11 @@ struct colormap *cm;
MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
for (begin = open; begin <= close; begin++) {
MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
- end = longest(v, d, begin, v->stop, &hitend);
+ if (shorter)
+ end = shortest(v, d, begin, begin, v->stop,
+ (chr **)NULL, &hitend);
+ else
+ end = longest(v, d, begin, v->stop, &hitend);
if (hitend && cold == NULL)
cold = begin;
if (end != NULL)
@@ -328,7 +332,7 @@ struct colormap *cm;
if (cold != NULL)
v->details->rm_extend.rm_so = OFF(cold);
else
- v->details->rm_extend.rm_eo = OFF(v->stop);
+ v->details->rm_extend.rm_so = OFF(v->stop);
v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
}
if (v->nmatch > 1) {
@@ -591,6 +595,8 @@ chr *end; /* end of same */
struct dfa *d2;
chr *mid;
int i;
+ int shorter = (t->left->flags&SHORTER) ? 1 : 0;
+ chr *stop = (shorter) ? end : begin;
assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
@@ -606,7 +612,11 @@ chr *end; /* end of same */
}
/* pick a tentative midpoint */
- mid = longest(v, d, begin, end, (int *)NULL);
+ if (shorter)
+ mid = shortest(v, d, begin, begin, end, (chr **)NULL,
+ (int *)NULL);
+ else
+ mid = longest(v, d, begin, end, (int *)NULL);
if (mid == NULL) {
freedfa(d);
freedfa(d2);
@@ -617,14 +627,18 @@ chr *end; /* end of same */
/* iterate until satisfaction or failure */
while (longest(v, d2, mid, end, (int *)NULL) != end) {
/* that midpoint didn't work, find a new one */
- if (mid == begin) {
+ if (mid == stop) {
/* all possibilities exhausted! */
MDEBUG(("no midpoint!\n"));
freedfa(d);
freedfa(d2);
return REG_ASSERT;
}
- mid = longest(v, d, begin, mid-1, (int *)NULL);
+ if (shorter)
+ mid = shortest(v, d, begin, mid+1, end, (chr **)NULL,
+ (int *)NULL);
+ else
+ mid = longest(v, d, begin, mid-1, (int *)NULL);
if (mid == NULL) {
/* failed to find a new one! */
MDEBUG(("failed midpoint!\n"));
diff --git a/generic/regguts.h b/generic/regguts.h
index 29b1721..45c47f6 100644
--- a/generic/regguts.h
+++ b/generic/regguts.h
@@ -401,7 +401,7 @@ struct guts {
int magic;
# define GUTSMAGIC 0xfed9
int cflags; /* copy of compile flags */
- int info; /* copy of re_info */
+ long info; /* copy of re_info */
size_t nsub; /* copy of re_nsub */
struct subre *tree;
struct cnfa search; /* for fast preliminary search */
@@ -411,5 +411,4 @@ struct guts {
struct subre *lacons; /* lookahead-constraint vector */
int nlacons; /* size of lacons */
int usedshorter; /* used non-greedy quantifiers? */
- int unmatchable; /* cannot match anything? */
};