summaryrefslogtreecommitdiffstats
path: root/generic/regguts.h
diff options
context:
space:
mode:
authordgp <dgp@users.sourceforge.net>2015-09-21 19:04:40 (GMT)
committerdgp <dgp@users.sourceforge.net>2015-09-21 19:04:40 (GMT)
commitd9db840088cdabd2863a7bd92ca051cda3f56c46 (patch)
tree0a529f6ed9e006d0521d481343a52d9f8f2818bd /generic/regguts.h
parent42b210de3d1f3c3e38df2ee20bba91a796324108 (diff)
downloadtcl-d9db840088cdabd2863a7bd92ca051cda3f56c46.zip
tcl-d9db840088cdabd2863a7bd92ca051cda3f56c46.tar.gz
tcl-d9db840088cdabd2863a7bd92ca051cda3f56c46.tar.bz2
[1115587][0e0e150e49] Major fix for regexp handling of quantified backrefs.
Contributed by Tom Lane from the Postgres project.
Diffstat (limited to 'generic/regguts.h')
-rw-r--r--generic/regguts.h27
1 files changed, 22 insertions, 5 deletions
diff --git a/generic/regguts.h b/generic/regguts.h
index 1b6abe6..e1c60ce 100644
--- a/generic/regguts.h
+++ b/generic/regguts.h
@@ -329,11 +329,28 @@ struct cnfa {
/*
* subexpression tree
+ *
+ * "op" is one of:
+ * '=' plain regex without interesting substructure (implemented as DFA)
+ * 'b' back-reference (has no substructure either)
+ * '(' capture node: captures the match of its single child
+ * '.' concatenation: matches a match for left, then a match for right
+ * '|' alternation: matches a match for left or a match for right
+ * '*' iteration: matches some number of matches of its single child
+ *
+ * Note: the right child of an alternation must be another alternation or
+ * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
+ * might expect. This could stand to be changed. Actually I'd rather see
+ * a single alternation node with N children, but that will take revising
+ * the representation of struct subre.
+ *
+ * Note: when a backref is directly quantified, we stick the min/max counts
+ * into the backref rather than plastering an iteration node on top. This is
+ * for efficiency: there is no need to search for possible division points.
*/
struct subre {
- char op; /* '|', '.' (concat), 'b' (backref), '(',
- * '=' */
+ char op; /* see type codes above */
char flags;
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
@@ -349,10 +366,10 @@ struct subre {
#define PREF(f) ((f)&NOPROP)
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
- short retry; /* index into retry memory */
+ short id; /* ID of subre (1..ntree) */
int subno; /* subexpression number (for 'b' and '(') */
- short min; /* min repetitions, for backref only */
- short max; /* max repetitions, for backref only */
+ short min; /* min repetitions for iteration or backref */
+ short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */