diff options
Diffstat (limited to 'generic/regguts.h')
| -rw-r--r-- | generic/regguts.h | 170 | 
1 files changed, 94 insertions, 76 deletions
| diff --git a/generic/regguts.h b/generic/regguts.h index 190d40b..ad9d5b9 100644 --- a/generic/regguts.h +++ b/generic/regguts.h @@ -39,15 +39,6 @@   * Things that regcustom.h might override.   */ -/* standard header files (NULL is a reasonable indicator for them) */ -#ifndef NULL -#include <stdio.h> -#include <stdlib.h> -#include <ctype.h> -#include <limits.h> -#include <string.h> -#endif -  /* assertions */  #ifndef assert  #ifndef REG_DEBUG @@ -58,54 +49,20 @@  #include <assert.h>  #endif -/* voids */ -#ifndef VOID -#define	VOID	void			/* for function return values */ -#endif -#ifndef DISCARD -#define	DISCARD	void			/* for throwing values away */ -#endif -#ifndef PVOID -#define	PVOID	void *			/* generic pointer */ -#endif -#ifndef VS -#define	VS(x)	((void*)(x))		/* cast something to generic ptr */ -#endif -#ifndef NOPARMS -#define	NOPARMS	void			/* for empty parm lists */ -#endif - -/* const */ -#ifndef CONST -#define	CONST	const			/* for old compilers, might be empty */ -#endif - -/* function-pointer declarator */ -#ifndef FUNCPTR -#if __STDC__ >= 1 -#define	FUNCPTR(name, args)	(*name)args -#else -#define	FUNCPTR(name, args)	(*name)() -#endif -#endif -  /* memory allocation */  #ifndef MALLOC  #define	MALLOC(n)	malloc(n)  #endif  #ifndef REALLOC -#define	REALLOC(p, n)	realloc(VS(p), n) +#define	REALLOC(p, n)	realloc(p, n)  #endif  #ifndef FREE -#define	FREE(p)		free(VS(p)) +#define	FREE(p)		free(p)  #endif  /* want size of a char in bits, and max value in bounded quantifiers */ -#ifndef CHAR_BIT -#include <limits.h> -#endif  #ifndef _POSIX2_RE_DUP_MAX -#define	_POSIX2_RE_DUP_MAX	255	/* normally from <limits.h> */ +#define	_POSIX2_RE_DUP_MAX 255	/* normally from <limits.h> */  #endif  /* @@ -116,7 +73,7 @@  #define	xxx		1  #define	DUPMAX	_POSIX2_RE_DUP_MAX -#define	INFINITY	(DUPMAX+1) +#define	DUPINF	(DUPMAX+1)  #define	REMAGIC	0xfed7		/* magic number for main struct */ @@ -162,6 +119,7 @@  typedef short color;		/* colors of characters */  typedef int pcolor;		/* what color promotes to */ +#define MAX_COLOR	SHRT_MAX /* max color value */  #define	COLORLESS	(-1)	/* impossible color */  #define	WHITE		0	/* default color, parent of all others */ @@ -189,7 +147,7 @@ union tree {  #define	tcolor	colors.ccolor  #define	tptr	ptrs.pptr -/* internal per-color structure for the color machinery */ +/* Internal per-color descriptor structure for the color machinery */  struct colordesc {      uchr nchrs;			/* number of chars of this color */      color sub;			/* open subcolor (if any); free chain ptr */ @@ -202,7 +160,14 @@ struct colordesc {      union tree *block;		/* block of solid color, if any */  }; -/* the color map itself */ +/* + * The color map itself + * + * Much of the data in the colormap struct is only used at compile time. + * However, the bulk of the space usage is in the "tree" structure, so it's + * not clear that there's much point in converting the rest to a more compact + * form when compilation is finished. + */  struct colormap {      int magic;  #define	CMMAGIC	0x876 @@ -235,9 +200,9 @@ struct colormap {  /*   * Interface definitions for locale-interface functions in locale.c. - * Multi-character collating elements (MCCEs) cause most of the trouble.   */ +/* Representation of a set of characters. */  struct cvec {      int nchrs;			/* number of chrs */      int chrspace;		/* number of chrs possible */ @@ -245,18 +210,11 @@ struct cvec {      int nranges;		/* number of ranges (chr pairs) */      int rangespace;		/* number of chrs possible */      chr *ranges;		/* pointer to vector of chr pairs */ -    int nmcces;			/* number of MCCEs */ -    int mccespace;		/* number of MCCEs possible */ -    int nmccechrs;		/* number of chrs used for MCCEs */ -    chr *mcces[1];		/* pointers to 0-terminated MCCEs */ -				/* and both batches of chrs are on the end */  }; -/* caution:  this value cannot be changed easily */ -#define	MAXMCCE	2		/* length of longest MCCE */ -  /* - * definitions for NFA internal representation + * definitions for non-deterministic finite autmaton (NFA) internal + * representation   *   * Having a "from" pointer within each arc may seem redundant, but it saves a   * lot of hassle. @@ -265,15 +223,17 @@ struct cvec {  struct state;  struct arc { -    int type; -#define	ARCFREE	'\0' +    int type;			/* 0 if free, else an NFA arc type code */      color co;      struct state *from;		/* where it's from (and contained within) */      struct state *to;		/* where it's to */ -    struct arc *outchain;	/* *from's outs chain or free chain */ -#define	freechain	outchain +    struct arc *outchain;	/* link in *from's outs chain or free chain */ +    struct arc *outchainRev;	/* back-link in *from's outs chain */ +#define	freechain outchain	/* we do not maintain "freechainRev" */      struct arc *inchain;	/* *to's ins chain */ +    struct arc *inchainRev;	/* back-link in *to's ins chain */      struct arc *colorchain;	/* color's arc chain */ +    struct arc *colorchainRev;	/* back-link in color's arc chain */  };  struct arcbatch {		/* for bulk allocation of arcs */ @@ -284,7 +244,7 @@ struct arcbatch {		/* for bulk allocation of arcs */  struct state {      int no; -#		define	FREESTATE	(-1) +#define	FREESTATE	(-1)      char flag;			/* marks special states */      int nins;			/* number of inarcs */      struct arc *ins;		/* chain of inarcs */ @@ -316,11 +276,22 @@ struct nfa {  /*   * definitions for compacted NFA + * + * The main space savings in a compacted NFA is from making the arcs as small + * as possible.  We store only the transition color and next-state number for + * each arc.  The list of out arcs for each state is an array beginning at + * cnfa.states[statenumber], and terminated by a dummy carc struct with + * co == COLORLESS. + * + * The non-dummy carc structs are of two types: plain arcs and LACON arcs. + * Plain arcs just store the transition color number as "co".  LACON arcs + * store the lookahead constraint number plus cnfa.ncolors as "co".  LACON + * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.   */  struct carc {      color co;			/* COLORLESS is list terminator */ -    int to;			/* state number */ +    int to;			/* next-state number */  };  struct cnfa { @@ -332,19 +303,52 @@ struct cnfa {      int post;			/* teardown state number */      color bos[2];		/* colors, if any, assigned to BOS and BOL */      color eos[2];		/* colors, if any, assigned to EOS and EOL */ +    char *stflags;		/* vector of per-state flags bytes */ +#define CNFA_NOPROGRESS	01	/* flag bit for a no-progress state */      struct carc **states;	/* vector of pointers to outarc lists */ +    /* states[n] are pointers into a single malloc'd array of arcs */      struct carc *arcs;		/* the area for the lists */  };  #define	ZAPCNFA(cnfa)	((cnfa).nstates = 0)  #define	NULLCNFA(cnfa)	((cnfa).nstates == 0)  /* + * This symbol limits the transient heap space used by the regex compiler, + * and thereby also the maximum complexity of NFAs that we'll deal with. + * Currently we only count NFA states and arcs against this; the other + * transient data is generally not large enough to notice compared to those. + * Note that we do not charge anything for the final output data structures + * (the compacted NFA and the colormap). + */ +#ifndef REG_MAX_COMPILE_SPACE +#define REG_MAX_COMPILE_SPACE  \ +	(100000 * sizeof(struct state) + 100000 * sizeof(struct arcbatch)) +#endif + +/*   * subexpression tree + * + * "op" is one of: + *	'='  plain regex without interesting substructure (implemented as DFA) + *	'b'  back-reference (has no substructure either) + *	'('  capture node: captures the match of its single child + *	'.'  concatenation: matches a match for left, then a match for right + *	'|'  alternation: matches a match for left or a match for right + *	'*'  iteration: matches some number of matches of its single child + * + * Note: the right child of an alternation must be another alternation or + * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you + * might expect.  This could stand to be changed.  Actually I'd rather see + * a single alternation node with N children, but that will take revising + * the representation of struct subre. + * + * Note: when a backref is directly quantified, we stick the min/max counts + * into the backref rather than plastering an iteration node on top.  This is + * for efficiency: there is no need to search for possible division points.   */  struct subre { -    char op;			/* '|', '.' (concat), 'b' (backref), '(', -				 * '=' */ +    char op;			/* see type codes above */      char flags;  #define	LONGER	01		/* prefers longer match */  #define	SHORTER	02		/* prefers shorter match */ @@ -352,18 +356,18 @@ struct subre {  #define	CAP	010		/* capturing parens below */  #define	BACKR	020		/* back reference below */  #define	INUSE	0100		/* in use in final tree */ -#define	LOCAL	03		/* bits which may not propagate up */ +#define	NOPROP	03		/* bits which may not propagate up */  #define	LMIX(f)	((f)<<2)	/* LONGER -> MIXED */  #define	SMIX(f)	((f)<<1)	/* SHORTER -> MIXED */ -#define	UP(f)	(((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED)) +#define	UP(f)	(((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))  #define	MESSY(f)	((f)&(MIXED|CAP|BACKR)) -#define	PREF(f)	((f)&LOCAL) +#define	PREF(f)	((f)&NOPROP)  #define	PREF2(f1, f2)	((PREF(f1) != 0) ? PREF(f1) : PREF(f2))  #define	COMBINE(f1, f2)	(UP((f1)|(f2)) | PREF2(f1, f2)) -    short retry;		/* index into retry memory */ +    short id;			/* ID of subre (1..ntree-1) */      int subno;			/* subexpression number (for 'b' and '(') */ -    short min;			/* min repetitions, for backref only */ -    short max;			/* max repetitions, for backref only */ +    short min;			/* min repetitions for iteration or backref */ +    short max;			/* max repetitions for iteration or backref */      struct subre *left;		/* left child, if any (also freelist chain) */      struct subre *right;	/* right child, if any */      struct state *begin;	/* outarcs from here... */ @@ -378,7 +382,7 @@ struct subre {   */  struct fns { -    VOID FUNCPTR(free, (regex_t *)); +    void (*free) (regex_t *);  };  /* @@ -393,12 +397,26 @@ struct guts {      size_t nsub;		/* copy of re_nsub */      struct subre *tree;      struct cnfa search;		/* for fast preliminary search */ -    int ntree; +    int ntree;			/* number of subre's, plus one */      struct colormap cmap; -    int FUNCPTR(compare, (CONST chr *, CONST chr *, size_t)); +    int (*compare) (const chr *, const chr *, size_t);      struct subre *lacons;	/* lookahead-constraint vector */      int nlacons;		/* size of lacons */  }; + +/* + * Magic for allocating a variable workspace. This default version is + * stack-hungry. + */ + +#ifndef AllocVars +#define AllocVars(vPtr) \ +    struct vars var; \ +    register struct vars *vPtr = &var +#endif +#ifndef FreeVars +#define FreeVars(vPtr) ((void) 0) +#endif  /*   * Local Variables: | 
