NEW: move from tools and config to create CMake

author: Bill Hoffman <bill.hoffman@kitware.com> 2000-08-29 19:26:29 (GMT)
committer: Bill Hoffman <bill.hoffman@kitware.com> 2000-08-29 19:26:29 (GMT)
commit: 1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88 (patch)
tree: a7b88a2e4397167c0d6c72c9cf0fcf8f91dcd024 /Source/cmRegularExpression.cxx
parent: d6bdba1096f36268e414f32829e22159e983031a (diff)
download: CMake-1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88.zip
CMake-1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88.tar.gz
CMake-1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88.tar.bz2
1 files changed, 1279 insertions, 0 deletions
diff --git a/Source/cmRegularExpression.cxx b/Source/cmRegularExpression.cxx
new file mode 100644
index 0000000..0867e97
--- /dev/null
+++ b/Source/cmRegularExpression.cxx
@@ -0,0 +1,1279 @@
+//
+// Copyright (C) 1991 Texas Instruments Incorporated.
+//
+// Permission is granted to any individual or institution to use, copy, modify,
+// and distribute this software, provided that this complete copyright and
+// permission notice is maintained, intact, in all copies and supporting
+// documentation.
+//
+// Texas Instruments Incorporated provides this software "as is" without
+// express or implied warranty.
+//
+//
+// Created: MNF 06/13/89  Initial Design and Implementation
+// Updated: LGO 08/09/89  Inherit from Generic
+// Updated: MBN 09/07/89  Added conditional exception handling
+// Updated: MBN 12/15/89  Sprinkled "const" qualifiers all over the place!
+// Updated: DLS 03/22/91  New lite version
+//
+// This  is the header file  for the regular  expression class.   An object of
+// this class contains a regular expression,  in  a special "compiled" format.
+// This  compiled format consists  of  several slots   all kept as the objects
+// private data.  The cmRegularExpression class  provides a convenient  way  to  represent
+// regular  expressions.  It makes it easy   to search  for  the  same regular
+// expression in many different strings without having to  compile a string to
+// regular expression format more than necessary.
+//
+// A regular  expression allows a programmer to  specify complex patterns that
+// can be searched for  and  matched against the  character string of a String
+// object.  In  its  simplest case, a   regular expression  is a  sequence  of
+// characters with which you can search for exact character matches.  However,
+// many times you may not know the exact sequence you want to find, or you may
+// only want to find a match at the beginning or end of  a String.  The cmRegularExpression
+// object  allows specification of  such patterns by  utilizing the  following
+// regular  expression  meta-characters   (note   that  more  one  of    these
+// meta-characters  can  be used in a single  regular  expression in  order to
+// create complex search patterns):
+//
+//         ^    Match at beginning of line
+//         $    Match at end of line
+//         .    Match any single character
+//         [ ]  Match any one character inside the brackets
+//         [^ ] Match any character NOT inside the brackets
+//         -    Match any character in range on either side of dash
+//         *    Match preceding pattern zero or more times
+//         +    Match preceding pattern one or more times
+//         ?    Match preceding pattern zero or once only
+//         ()   Save a matched expression and use it in a further match.
+//
+// There are three constructors for cmRegularExpression.  One  just creates an empty cmRegularExpression
+// object.  Another creates a cmRegularExpression object  and initializes it with a regular
+// expression  that is given  in  the form of a   char*.   The  third  takes a
+// reference  to  a cmRegularExpression  object    as an  argument    and creates an object
+// initialized with the information from the given cmRegularExpression object.
+//
+// The  find  member function  finds   the  first  occurence   of  the regualr
+// expression of that object in the string given to find as an argument.  Find
+// returns a boolean, and  if true,  mutates  the private  data appropriately.
+// Find sets pointers to the beginning and end of  the thing last  found, they
+// are pointers into the actual string  that was searched.   The start and end
+// member functions return indicies  into the searched string that  correspond
+// to the beginning   and  end pointers  respectively.   The    compile member
+// function takes a char* and puts the  compiled version of the char* argument
+// into the object's private data fields.  The == and  != operators only check
+// the  to see  if   the compiled  regular  expression   is the same, and  the
+// deep_equal functions also checks  to see if the  start and end pointers are
+// the same.  The is_valid  function returns false if  program is set to NULL,
+// (i.e. there is no valid compiled exression).  The set_invalid function sets
+// the  program to NULL  (Warning: this deletes the compiled  expression). The
+// following examples may help clarify regular expression usage:
+//
+//   *  The regular expression  "^hello" matches  a "hello"  only at  the
+//      beginning of a  line.  It would match "hello  there" but not "hi,
+//      hello there".
+//
+//   *  The regular expression "long$" matches a  "long"  only at the end
+//      of a line. It would match "so long\0", but not "long ago".
+//
+//   *  The regular expression "t..t..g"  will match anything that  has a
+//      "t" then any two characters, another "t", any  two characters and
+//      then a "g".   It will match  "testing", or "test again" but would
+//      not match "toasting"
+//
+//   *  The regular  expression "[1-9ab]" matches any  number one through
+//      nine, and the characters  "a" and  "b".  It would match "hello 1"
+//      or "begin", but would not match "no-match".
+//
+//   *  The  regular expression "[^1-9ab]"  matches any character that is
+//      not a number one  through nine, or  an "a" or "b".   It would NOT
+//      match "hello 1" or "begin", but would match "no-match".
+//
+//   *  The regular expression "br* " matches  something that begins with
+//      a "b", is followed by zero or more "r"s, and ends in a space.  It
+//      would match "brrrrr ", and "b ", but would not match "brrh ".
+//
+//   *  The regular expression "br+ " matches something  that begins with
+//      a "b", is followed by one or more "r"s, and ends in  a space.  It
+//      would match "brrrrr ",  and  "br ", but would not  match "b  " or
+//      "brrh ".
+//
+//   *  The regular expression "br? " matches  something that begins with
+//      a "b", is followed by zero or one "r"s, and ends in  a space.  It
+//      would  match  "br ", and "b  ", but would not match  "brrrr "  or
+//      "brrh ".
+//
+//   *  The regular expression "(..p)b" matches  something ending with pb
+//      and beginning with whatever the two characters before the first p
+//      encounterd in the line were.  It would find  "repb" in "rep drepa
+//      qrepb".  The regular expression "(..p)a"  would find "repa qrepb"
+//      in "rep drepa qrepb"
+//
+//   *  The regular expression "d(..p)" matches something ending  with p,
+//      beginning with d, and having  two characters  in between that are
+//      the same as the two characters before  the first p  encounterd in
+//      the line.  It would match "drepa qrepb" in "rep drepa qrepb".
+//
+
+#include "cmRegularExpression.h"	// Include class specification 
+#include <stdio.h>
+#include <string>
+
+// cmRegularExpression -- Copies the given regular expression.
+
+cmRegularExpression::cmRegularExpression (const cmRegularExpression& rxp) {
+  int ind; 
+  this->progsize = rxp.progsize;		// Copy regular expression size
+  this->program = new char[this->progsize];	// Allocate storage
+  for(ind=this->progsize; ind-- != 0;)		// Copy regular expresion
+    this->program[ind] = rxp.program[ind];
+  this->startp[0] = rxp.startp[0];		// Copy pointers into last
+  this->endp[0] = rxp.endp[0];			// Successful "find" operation
+  this->regmust = rxp.regmust;			// Copy field
+  if (rxp.regmust != NULL) {
+    char* dum = rxp.program;
+    ind = 0;
+    while (dum != rxp.regmust) {
+      ++dum;
+      ++ind;
+    }
+    this->regmust = this->program + ind;
+  }
+  this->regstart = rxp.regstart;		// Copy starting index
+  this->reganch = rxp.reganch;			// Copy remaining private data
+  this->regmlen = rxp.regmlen;			// Copy remaining private data
+}
+
+
+// operator== -- Returns true if two regular expressions have the same
+// compiled program for pattern matching.
+
+bool cmRegularExpression::operator== (const cmRegularExpression& rxp) const {
+  if (this != &rxp) {				// Same address?
+    int ind = this->progsize;			// Get regular expression size
+    if (ind != rxp.progsize)			// If different size regexp
+      return false;				// Return failure
+    while(ind-- != 0)				// Else while still characters
+      if(this->program[ind] != rxp.program[ind]) // If regexp are different    
+	return false;				 // Return failure             
+  }
+  return true;					// Else same, return success  
+}
+
+
+// deep_equal -- Returns true if have the same compiled regular expressions
+// and the same start and end pointers.
+
+bool cmRegularExpression::deep_equal (const cmRegularExpression& rxp) const {
+  int ind = this->progsize;			// Get regular expression size
+  if (ind != rxp.progsize)			// If different size regexp
+    return false;				// Return failure
+  while(ind-- != 0)				// Else while still characters
+    if(this->program[ind] != rxp.program[ind])	// If regexp are different    
+      return false;				// Return failure             
+  return (this->startp[0] == rxp.startp[0] && 	// Else if same start/end ptrs,
+	  this->endp[0] == rxp.endp[0]);	// Return true
+}   
+
+// The remaining code in this file is derived from the  regular expression code
+// whose  copyright statement appears  below.  It has been  changed to work
+// with the class concepts of C++ and COOL.
+
+/*
+ * compile and find 
+ *
+ *	Copyright (c) 1986 by University of Toronto.
+ *	Written by Henry Spencer.  Not derived from licensed software.
+ *
+ *	Permission is granted to anyone to use this software for any
+ *	purpose on any computer system, and to redistribute it freely,
+ *	subject to the following restrictions:
+ *
+ *	1. The author is not responsible for the consequences of use of
+ *		this software, no matter how awful, even if they arise
+ *		from defects in it.
+ *
+ *	2. The origin of this software must not be misrepresented, either
+ *		by explicit claim or by omission.
+ *
+ *	3. Altered versions must be plainly marked as such, and must not
+ *		be misrepresented as being the original software.
+ *
+ * Beware that some of this code is subtly aware of the way operator
+ * precedence is structured in regular expressions.  Serious changes in
+ * regular-expression syntax might require a total rethink.
+ */
+
+/*
+ * The "internal use only" fields in regexp.h are present to pass info from
+ * compile to execute that permits the execute phase to run lots faster on
+ * simple cases.  They are:
+ *
+ * regstart	char that must begin a match; '\0' if none obvious
+ * reganch	is the match anchored (at beginning-of-line only)?
+ * regmust	string (pointer into program) that match must include, or NULL
+ * regmlen	length of regmust string
+ *
+ * Regstart and reganch permit very fast decisions on suitable starting points
+ * for a match, cutting down the work a lot.  Regmust permits fast rejection
+ * of lines that cannot possibly match.  The regmust tests are costly enough
+ * that compile() supplies a regmust only if the r.e. contains something
+ * potentially expensive (at present, the only such thing detected is * or +
+ * at the start of the r.e., which can involve a lot of backup).  Regmlen is
+ * supplied because the test in find() needs it and compile() is computing
+ * it anyway.
+ */
+
+/*
+ * Structure for regexp "program".  This is essentially a linear encoding
+ * of a nondeterministic finite-state machine (aka syntax charts or
+ * "railroad normal form" in parsing technology).  Each node is an opcode
+ * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
+ * all nodes except BRANCH implement concatenation; a "next" pointer with
+ * a BRANCH on both ends of it is connecting two alternatives.  (Here we
+ * have one of the subtle syntax dependencies:  an individual BRANCH (as
+ * opposed to a collection of them) is never concatenated with anything
+ * because of operator precedence.)  The operand of some types of node is
+ * a literal string; for others, it is a node leading into a sub-FSM.  In
+ * particular, the operand of a BRANCH node is the first node of the branch.
+ * (NB this is *not* a tree structure:  the tail of the branch connects
+ * to the thing following the set of BRANCHes.)  The opcodes are:
+ */
+
+// definition	number	opnd?	meaning
+#define	END	0		// no	End of program.
+#define	BOL	1		// no	Match "" at beginning of line.
+#define	EOL	2		// no	Match "" at end of line.
+#define	ANY	3		// no	Match any one character.
+#define	ANYOF	4		// str	Match any character in this string.
+#define	ANYBUT	5		// str	Match any character not in this
+				// string.
+#define	BRANCH	6		// node	Match this alternative, or the
+				// next...
+#define	BACK	7		// no	Match "", "next" ptr points backward.
+#define	EXACTLY	8		// str	Match this string.
+#define	NOTHING	9		// no	Match empty string.
+#define	STAR	10		// node	Match this (simple) thing 0 or more
+				// times.
+#define	PLUS	11		// node	Match this (simple) thing 1 or more
+				// times.
+#define	OPEN	20		// no	Mark this point in input as start of
+				// #n.
+// OPEN+1 is number 1, etc.
+#define	CLOSE	30		// no	Analogous to OPEN.
+
+/*
+ * Opcode notes:
+ *
+ * BRANCH	The set of branches constituting a single choice are hooked
+ *		together with their "next" pointers, since precedence prevents
+ *		anything being concatenated to any individual branch.  The
+ *		"next" pointer of the last BRANCH in a choice points to the
+ *		thing following the whole choice.  This is also where the
+ *		final "next" pointer of each individual branch points; each
+ *		branch starts with the operand node of a BRANCH node.
+ *
+ * BACK		Normal "next" pointers all implicitly point forward; BACK
+ *		exists to make loop structures possible.
+ *
+ * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
+ *		BRANCH structures using BACK.  Simple cases (one character
+ *		per match) are implemented with STAR and PLUS for speed
+ *		and to minimize recursive plunges.
+ *
+ * OPEN,CLOSE	...are numbered at compile time.
+ */
+
+/*
+ * A node is one char of opcode followed by two chars of "next" pointer.
+ * "Next" pointers are stored as two 8-bit pieces, high order first.  The
+ * value is a positive offset from the opcode of the node containing it.
+ * An operand, if any, simply follows the node.  (Note that much of the
+ * code generation knows about this implicit relationship.)
+ *
+ * Using two bytes for the "next" pointer is vast overkill for most things,
+ * but allows patterns to get big without disasters.
+ */
+
+#define	OP(p)		(*(p))
+#define	NEXT(p)		(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
+#define	OPERAND(p)	((p) + 3)
+
+const unsigned char MAGIC = 0234;
+/*
+ * Utility definitions.
+ */
+
+#define	UCHARAT(p)	((const unsigned char*)(p))[0]
+
+
+#define	FAIL(m)	{ regerror(m); return(NULL); }
+#define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
+#define	META	"^$.[()|?+*\\"
+
+
+/*
+ * Flags to be passed up and down.
+ */
+#define	HASWIDTH	01	// Known never to match null string.
+#define	SIMPLE		02	// Simple enough to be STAR/PLUS operand.
+#define	SPSTART		04	// Starts with * or +.
+#define	WORST		0	// Worst case.
+
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+//  COMPILE AND ASSOCIATED FUNCTIONS
+//
+/////////////////////////////////////////////////////////////////////////
+
+
+/*
+ * Global work variables for compile().
+ */
+static const char* regparse;	// Input-scan pointer.
+static       int   regnpar;	// () count.
+static       char  regdummy;
+static       char* regcode;	// Code-emit pointer; &regdummy = don't.
+static       long  regsize;	// Code size.
+
+/*
+ * Forward declarations for compile()'s friends.
+ */
+// #ifndef static
+// #define	static	static
+// #endif
+static       char* reg (int, int*);
+static       char* regbranch (int*);
+static       char* regpiece (int*);
+static       char* regatom (int*);
+static       char* regnode (char);
+static const char* regnext (register const char*);
+static       char* regnext (register char*);
+static void        regc (unsigned char);
+static void        reginsert (char, char*);
+static void        regtail (char*, const char*);
+static void        regoptail (char*, const char*);
+
+#ifdef STRCSPN
+static int strcspn ();
+#endif
+
+
+
+/*
+ * We can't allocate space until we know how big the compiled form will be,
+ * but we can't compile it (and thus know how big it is) until we've got a
+ * place to put the code.  So we cheat:  we compile it twice, once with code
+ * generation turned off and size counting turned on, and once "for real".
+ * This also means that we don't allocate space until we are sure that the
+ * thing really will compile successfully, and we never have to move the
+ * code and thus invalidate pointers into it.  (Note that it has to be in
+ * one piece because free() must be able to free it all.)
+ *
+ * Beware that the optimization-preparation code in here knows about some
+ * of the structure of the compiled regexp.
+ */
+
+
+// compile -- compile a regular expression into internal code
+// for later pattern matching.
+
+void cmRegularExpression::compile (const char* exp) {
+    register const char* scan;
+    register const char* longest;
+    register unsigned long len;
+             int         flags;
+
+    if (exp == NULL) {
+      //RAISE Error, SYM(cmRegularExpression), SYM(No_Expr),
+      printf ("cmRegularExpression::compile(): No expression supplied.\n");
+      return;
+    }
+
+    // First pass: determine size, legality.
+    regparse = exp;
+    regnpar = 1;
+    regsize = 0L;
+    regcode = &regdummy;
+    regc(MAGIC);
+    if(!reg(0, &flags))
+      {
+	printf ("cmRegularExpression::compile(): Error in compile.\n");
+	return;
+      }
+    this->startp[0] = this->endp[0] = this->searchstring = NULL;
+
+    // Small enough for pointer-storage convention? 
+    if (regsize >= 32767L) {	// Probably could be 65535L. 
+      //RAISE Error, SYM(cmRegularExpression), SYM(Expr_Too_Big),
+      printf ("cmRegularExpression::compile(): Expression too big.\n");
+      return;
+    }
+
+    // Allocate space. 
+//#ifndef WIN32
+    if (this->program != NULL) delete [] this->program;  
+//#endif
+    this->program = new char[regsize];
+    this->progsize = (int) regsize;
+
+    if (this->program == NULL) {
+      //RAISE Error, SYM(cmRegularExpression), SYM(Out_Of_Memory),
+      printf ("cmRegularExpression::compile(): Out of memory.\n"); 
+      return;
+    }
+
+    // Second pass: emit code.
+    regparse = exp;
+    regnpar = 1;
+    regcode = this->program;
+    regc(MAGIC);
+    reg(0, &flags);
+
+    // Dig out information for optimizations.
+    this->regstart = '\0';		// Worst-case defaults.
+    this->reganch = 0;
+    this->regmust = NULL;
+    this->regmlen = 0;
+    scan = this->program + 1;	// First BRANCH.
+    if (OP(regnext(scan)) == END) {	// Only one top-level choice.
+	scan = OPERAND(scan);
+
+	// Starting-point info.
+	if (OP(scan) == EXACTLY)
+	    this->regstart = *OPERAND(scan);
+	else if (OP(scan) == BOL)
+	    this->reganch++;
+
+	 //
+	 // If there's something expensive in the r.e., find the longest
+	 // literal string that must appear and make it the regmust.  Resolve
+	 // ties in favor of later strings, since the regstart check works
+	 // with the beginning of the r.e. and avoiding duplication
+	 // strengthens checking.  Not a strong reason, but sufficient in the
+	 // absence of others. 
+	 //
+	if (flags & SPSTART) {
+	    longest = NULL;
+	    len = 0;
+	    for (; scan != NULL; scan = regnext(scan))
+		if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
+		    longest = OPERAND(scan);
+		    len = strlen(OPERAND(scan));
+		}
+	    this->regmust = longest;
+	    this->regmlen = len;
+	}
+    }
+}
+
+
+/*
+ - reg - regular expression, i.e. main body or parenthesized thing
+ *
+ * Caller must absorb opening parenthesis.
+ *
+ * Combining parenthesis handling with the base level of regular expression
+ * is a trifle forced, but the need to tie the tails of the branches to what
+ * follows makes it hard to avoid.
+ */
+static char* reg (int paren, int *flagp) {
+    register char* ret;
+    register char* br;
+    register char* ender;
+    register int   parno =0;
+             int   flags;
+
+    *flagp = HASWIDTH;		// Tentatively.
+
+    // Make an OPEN node, if parenthesized.
+    if (paren) {
+	if (regnpar >= NSUBEXP) {
+	  //RAISE Error, SYM(cmRegularExpression), SYM(Too_Many_Parens),
+	  printf ("cmRegularExpression::compile(): Too many parentheses.\n");
+	  return 0;
+        }
+	parno = regnpar;
+	regnpar++;
+	ret = regnode(OPEN + parno);
+    }
+    else
+	ret = NULL;
+
+    // Pick up the branches, linking them together.
+    br = regbranch(&flags);
+    if (br == NULL)
+	return (NULL);
+    if (ret != NULL)
+	regtail(ret, br);	// OPEN -> first.
+    else
+	ret = br;
+    if (!(flags & HASWIDTH))
+	*flagp &= ~HASWIDTH;
+    *flagp |= flags & SPSTART;
+    while (*regparse == '|') {
+	regparse++;
+	br = regbranch(&flags);
+	if (br == NULL)
+	    return (NULL);
+	regtail(ret, br);	// BRANCH -> BRANCH.
+	if (!(flags & HASWIDTH))
+	    *flagp &= ~HASWIDTH;
+	*flagp |= flags & SPSTART;
+      }
+
+    // Make a closing node, and hook it on the end.
+    ender = regnode((paren) ? CLOSE + parno : END);
+    regtail(ret, ender);
+
+    // Hook the tails of the branches to the closing node.
+    for (br = ret; br != NULL; br = regnext(br))
+	regoptail(br, ender);
+
+    // Check for proper termination.
+    if (paren && *regparse++ != ')') {
+        //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
+        printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
+	return 0;
+    }
+    else if (!paren && *regparse != '\0') {
+        if (*regparse == ')') {
+            //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
+            printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
+	    return 0;
+	}
+	else {
+	    //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
+	    printf ("cmRegularExpression::compile(): Internal error.\n");
+	    return 0;
+        }
+	// NOTREACHED
+    }
+    return (ret);
+}
+
+
+/*
+ - regbranch - one alternative of an | operator
+ *
+ * Implements the concatenation operator.
+ */
+static char* regbranch (int *flagp) {
+    register char* ret;
+    register char* chain;
+    register char* latest;
+    int                  flags;
+
+    *flagp = WORST;		// Tentatively.
+
+    ret = regnode(BRANCH);
+    chain = NULL;
+    while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
+	latest = regpiece(&flags);
+	if (latest == NULL)
+	    return (NULL);
+	*flagp |= flags & HASWIDTH;
+	if (chain == NULL)	// First piece.
+	    *flagp |= flags & SPSTART;
+	else
+	    regtail(chain, latest);
+	chain = latest;
+    }
+    if (chain == NULL)		// Loop ran zero times.
+	regnode(NOTHING);
+
+    return (ret);
+}
+
+
+/*
+ - regpiece - something followed by possible [*+?]
+ *
+ * Note that the branching code sequences used for ? and the general cases
+ * of * and + are somewhat optimized:  they use the same NOTHING node as
+ * both the endmarker for their branch list and the body of the last branch.
+ * It might seem that this node could be dispensed with entirely, but the
+ * endmarker role is not redundant.
+ */
+static char* regpiece (int *flagp) {
+    register char* ret;
+    register char  op;
+    register char* next;
+    int            flags;
+
+    ret = regatom(&flags);
+    if (ret == NULL)
+	return (NULL);
+
+    op = *regparse;
+    if (!ISMULT(op)) {
+	*flagp = flags;
+	return (ret);
+    }
+
+    if (!(flags & HASWIDTH) && op != '?') {
+        //RAISE Error, SYM(cmRegularExpression), SYM(Empty_Operand),
+        printf ("cmRegularExpression::compile() : *+ operand could be empty.\n");
+	return 0;
+    }
+    *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
+
+    if (op == '*' && (flags & SIMPLE))
+	reginsert(STAR, ret);
+    else if (op == '*') {
+	// Emit x* as (x&|), where & means "self".
+	reginsert(BRANCH, ret);	// Either x
+	regoptail(ret, regnode(BACK));	// and loop
+	regoptail(ret, ret);	// back
+	regtail(ret, regnode(BRANCH));	// or
+	regtail(ret, regnode(NOTHING));	// null.
+    }
+    else if (op == '+' && (flags & SIMPLE))
+	reginsert(PLUS, ret);
+    else if (op == '+') {
+	// Emit x+ as x(&|), where & means "self".
+	next = regnode(BRANCH);	// Either
+	regtail(ret, next);
+	regtail(regnode(BACK), ret);	// loop back
+	regtail(next, regnode(BRANCH));	// or
+	regtail(ret, regnode(NOTHING));	// null.
+    }
+    else if (op == '?') {
+	// Emit x? as (x|)
+	reginsert(BRANCH, ret);	// Either x
+	regtail(ret, regnode(BRANCH));	// or
+	next = regnode(NOTHING);// null.
+	regtail(ret, next);
+	regoptail(ret, next);
+    }
+    regparse++;
+    if (ISMULT(*regparse)) {
+        //RAISE Error, SYM(cmRegularExpression), SYM(Nested_Operand),
+        printf ("cmRegularExpression::compile(): Nested *?+.\n");
+	return 0;
+    }
+    return (ret);
+}
+
+
+/*
+ - regatom - the lowest level
+ *
+ * Optimization:  gobbles an entire sequence of ordinary characters so that
+ * it can turn them into a single node, which is smaller to store and
+ * faster to run.  Backslashed characters are exceptions, each becoming a
+ * separate node; the code is simpler that way and it's not worth fixing.
+ */
+static char* regatom (int *flagp) {
+    register char* ret;
+             int   flags;
+
+    *flagp = WORST;		// Tentatively.
+
+    switch (*regparse++) {
+	case '^':
+	    ret = regnode(BOL);
+	    break;
+	case '$':
+	    ret = regnode(EOL);
+	    break;
+	case '.':
+	    ret = regnode(ANY);
+	    *flagp |= HASWIDTH | SIMPLE;
+	    break;
+	case '[':{
+		register int    rxpclass;
+		register int    rxpclassend;
+
+		if (*regparse == '^') {	// Complement of range.
+		    ret = regnode(ANYBUT);
+		    regparse++;
+		}
+		else
+		    ret = regnode(ANYOF);
+		if (*regparse == ']' || *regparse == '-')
+		    regc(*regparse++);
+		while (*regparse != '\0' && *regparse != ']') {
+		    if (*regparse == '-') {
+			regparse++;
+			if (*regparse == ']' || *regparse == '\0')
+			    regc('-');
+			else {
+			    rxpclass = UCHARAT(regparse - 2) + 1;
+			    rxpclassend = UCHARAT(regparse);
+			    if (rxpclass > rxpclassend + 1) {
+			       //RAISE Error, SYM(cmRegularExpression), SYM(Invalid_Range),
+			       printf ("cmRegularExpression::compile(): Invalid range in [].\n");
+			       return 0;
+                            }
+			    for (; rxpclass <= rxpclassend; rxpclass++)
+				regc(rxpclass);
+			    regparse++;
+			}
+		    }
+		    else
+			regc(*regparse++);
+		}
+		regc('\0');
+		if (*regparse != ']') {
+                    //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Bracket),
+                    printf ("cmRegularExpression::compile(): Unmatched [].\n");
+		    return 0;
+	        }
+		regparse++;
+		*flagp |= HASWIDTH | SIMPLE;
+	    }
+	    break;
+	case '(':
+	    ret = reg(1, &flags);
+	    if (ret == NULL)
+		return (NULL);
+	    *flagp |= flags & (HASWIDTH | SPSTART);
+	    break;
+	case '\0':
+	case '|':
+	case ')':
+	    //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
+            printf ("cmRegularExpression::compile(): Internal error.\n"); // Never here
+	    return 0;
+	case '?':
+	case '+':
+	case '*':
+	    //RAISE Error, SYM(cmRegularExpression), SYM(No_Operand),
+            printf ("cmRegularExpression::compile(): ?+* follows nothing.\n");
+	    return 0;
+	case '\\':
+	    if (*regparse == '\0') {
+	        //RAISE Error, SYM(cmRegularExpression), SYM(Trailing_Backslash),
+                printf ("cmRegularExpression::compile(): Trailing backslash.\n");
+		return 0;
+            }
+	    ret = regnode(EXACTLY);
+	    regc(*regparse++);
+	    regc('\0');
+	    *flagp |= HASWIDTH | SIMPLE;
+	    break;
+	default:{
+		register int    len;
+		register char   ender;
+
+		regparse--;
+		len = strcspn(regparse, META);
+		if (len <= 0) {
+		    //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
+                    printf ("cmRegularExpression::compile(): Internal error.\n");
+		    return 0;
+                }
+		ender = *(regparse + len);
+		if (len > 1 && ISMULT(ender))
+		    len--;	// Back off clear of ?+* operand.
+		*flagp |= HASWIDTH;
+		if (len == 1)
+		    *flagp |= SIMPLE;
+		ret = regnode(EXACTLY);
+		while (len > 0) {
+		    regc(*regparse++);
+		    len--;
+		}
+		regc('\0');
+	    }
+	    break;
+    }
+    return (ret);
+}
+
+
+/*
+ - regnode - emit a node
+   Location.
+ */
+static char* regnode (char op) {
+    register char* ret;
+    register char* ptr;
+
+    ret = regcode;
+    if (ret == &regdummy) {
+	regsize += 3;
+	return (ret);
+    }
+
+    ptr = ret;
+    *ptr++ = op;
+    *ptr++ = '\0';		// Null "next" pointer.
+    *ptr++ = '\0';
+    regcode = ptr;
+
+    return (ret);
+}
+
+
+/*
+ - regc - emit (if appropriate) a byte of code
+ */
+static void regc (unsigned char b) {
+    if (regcode != &regdummy)
+	*regcode++ = b;
+    else
+	regsize++;
+}
+
+
+/*
+ - reginsert - insert an operator in front of already-emitted operand
+ *
+ * Means relocating the operand.
+ */
+static void reginsert (char op, char* opnd) {
+    register char* src;
+    register char* dst;
+    register char* place;
+
+    if (regcode == &regdummy) {
+	regsize += 3;
+	return;
+    }
+
+    src = regcode;
+    regcode += 3;
+    dst = regcode;
+    while (src > opnd)
+	*--dst = *--src;
+
+    place = opnd;		// Op node, where operand used to be.
+    *place++ = op;
+    *place++ = '\0';
+    *place++ = '\0';
+}
+
+
+/*
+ - regtail - set the next-pointer at the end of a node chain
+ */
+static void regtail (char* p, const char* val) {
+    register char* scan;
+    register char* temp;
+    register int   offset;
+
+    if (p == &regdummy)
+	return;
+
+    // Find last node.
+    scan = p;
+    for (;;) {
+	temp = regnext(scan);
+	if (temp == NULL)
+	    break;
+	scan = temp;
+    }
+
+    if (OP(scan) == BACK)
+	offset = (const char*)scan - val;
+    else
+	offset = val - scan;
+    *(scan + 1) = (offset >> 8) & 0377;
+    *(scan + 2) = offset & 0377;
+}
+
+
+/*
+ - regoptail - regtail on operand of first argument; nop if operandless
+ */
+static void regoptail (char* p, const char* val) {
+    // "Operandless" and "op != BRANCH" are synonymous in practice.
+    if (p == NULL || p == &regdummy || OP(p) != BRANCH)
+	return;
+    regtail(OPERAND(p), val);
+}
+
+
+
+////////////////////////////////////////////////////////////////////////
+// 
+//  find and friends
+// 
+////////////////////////////////////////////////////////////////////////
+
+
+/*
+ * Global work variables for find().
+ */
+static const char*  reginput;	// String-input pointer.
+static const char*  regbol;	// Beginning of input, for ^ check.
+static const char* *regstartp;	// Pointer to startp array.
+static const char* *regendp;	// Ditto for endp.
+
+/*
+ * Forwards.
+ */
+static int regtry (const char*, const char* *,
+		   const char* *, const char*);
+static int regmatch (const char*);
+static int regrepeat (const char*);
+
+#ifdef DEBUG
+int          regnarrate = 0;
+void         regdump ();
+static char* regprop ();
+#endif
+
+bool cmRegularExpression::find (std::string const& s) {
+return find(s.c_str());
+}
+
+
+
+// find -- Matches the regular expression to the given string.
+// Returns true if found, and sets start and end indexes accordingly.
+
+bool cmRegularExpression::find (const char* string) {
+    register const char* s;
+
+    this->searchstring = string;
+
+     // Check validity of program.
+    if (!this->program || UCHARAT(this->program) != MAGIC) {
+        //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
+        printf ("cmRegularExpression::find(): Compiled regular expression corrupted.\n");
+        return 0;
+    }
+    
+    // If there is a "must appear" string, look for it.
+    if (this->regmust != NULL) {
+	s = string;
+	while ((s = strchr(s, this->regmust[0])) != NULL) {
+	    if (strncmp(s, this->regmust, this->regmlen) == 0)
+		break;		// Found it.
+	    s++;
+	}
+	if (s == NULL)		// Not present.
+	    return (0);
+    }
+     
+    // Mark beginning of line for ^ .
+    regbol = string;
+
+    // Simplest case:  anchored match need be tried only once.
+    if (this->reganch)
+	return (regtry(string, this->startp, this->endp, this->program) != 0);
+    
+    // Messy cases:  unanchored match.
+    s = string;
+    if (this->regstart != '\0')
+	// We know what char it must start with.
+	while ((s = strchr(s, this->regstart)) != NULL) {
+	    if (regtry(s, this->startp, this->endp, this->program))
+		return (1);
+	    s++;
+	  
+	}
+    else
+	// We don't -- general case.
+	do {
+	    if (regtry(s, this->startp, this->endp, this->program))
+		return (1);
+	} while (*s++ != '\0');
+    
+    // Failure.
+    return (0);
+}
+
+
+/*
+ - regtry - try match at specific point
+   0 failure, 1 success
+ */
+static int regtry (const char* string, const char* *start,
+		   const char* *end, const char* prog) {
+    register       int    i;
+    register const char* *sp1;
+    register const char* *ep;
+
+    reginput = string;
+    regstartp = start;
+    regendp = end;
+
+    sp1 = start;
+    ep = end;
+    for (i = NSUBEXP; i > 0; i--) {
+	*sp1++ = NULL;
+	*ep++ = NULL;
+    }
+    if (regmatch(prog + 1)) {
+	start[0] = string;
+	end[0] = reginput;
+	return (1);
+    }
+    else
+	return (0);
+}
+
+
+/*
+ - regmatch - main matching routine
+ *
+ * Conceptually the strategy is simple:  check to see whether the current
+ * node matches, call self recursively to see whether the rest matches,
+ * and then act accordingly.  In practice we make some effort to avoid
+ * recursion, in particular by going through "ordinary" nodes (that don't
+ * need to know whether the rest of the match failed) by a loop instead of
+ * by recursion.
+ * 0 failure, 1 success
+ */
+static int regmatch (const char* prog) {
+    register const char* scan;	// Current node.
+             const char* next;	// Next node.
+
+    scan = prog;
+
+    while (scan != NULL) {
+
+	next = regnext(scan);
+
+	switch (OP(scan)) {
+	    case BOL:
+		if (reginput != regbol)
+		    return (0);
+		break;
+	    case EOL:
+		if (*reginput != '\0')
+		    return (0);
+		break;
+	    case ANY:
+		if (*reginput == '\0')
+		    return (0);
+		reginput++;
+		break;
+	    case EXACTLY:{
+		    register int         len;
+		    register const char* opnd;
+
+		    opnd = OPERAND(scan);
+		    // Inline the first character, for speed.
+		    if (*opnd != *reginput)
+			return (0);
+		    len = strlen(opnd);
+		    if (len > 1 && strncmp(opnd, reginput, len) != 0)
+			return (0);
+		    reginput += len;
+		}
+		break;
+	    case ANYOF:
+		if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
+		    return (0);
+		reginput++;
+		break;
+	    case ANYBUT:
+		if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
+		    return (0);
+		reginput++;
+		break;
+	    case NOTHING:
+		break;
+	    case BACK:
+		break;
+	    case OPEN + 1:
+	    case OPEN + 2:
+	    case OPEN + 3:
+	    case OPEN + 4:
+	    case OPEN + 5:
+	    case OPEN + 6:
+	    case OPEN + 7:
+	    case OPEN + 8:
+	    case OPEN + 9:{
+		    register       int    no;
+		    register const char* save;
+
+		    no = OP(scan) - OPEN;
+		    save = reginput;
+
+		    if (regmatch(next)) {
+
+			//
+			// Don't set startp if some later invocation of the
+			// same parentheses already has. 
+			//
+			if (regstartp[no] == NULL)
+			    regstartp[no] = save;
+			return (1);
+		    }
+		    else
+			return (0);
+		}
+//		break;
+	    case CLOSE + 1:
+	    case CLOSE + 2:
+	    case CLOSE + 3:
+	    case CLOSE + 4:
+	    case CLOSE + 5:
+	    case CLOSE + 6:
+	    case CLOSE + 7:
+	    case CLOSE + 8:
+	    case CLOSE + 9:{
+		    register       int    no;
+		    register const char* save;
+
+		    no = OP(scan) - CLOSE;
+		    save = reginput;
+
+		    if (regmatch(next)) {
+
+			//
+			// Don't set endp if some later invocation of the
+			// same parentheses already has. 
+			//
+			if (regendp[no] == NULL)
+			    regendp[no] = save;
+			return (1);
+		    }
+		    else
+			return (0);
+		}
+//		break;
+	    case BRANCH:{
+	      
+	      register const char* save;
+
+		    if (OP(next) != BRANCH)	// No choice.
+			next = OPERAND(scan);	// Avoid recursion.
+		    else {
+			do {
+			    save = reginput;
+			    if (regmatch(OPERAND(scan)))
+				return (1);
+			    reginput = save;
+			    scan = regnext(scan);
+			} while (scan != NULL && OP(scan) == BRANCH);
+			return (0);
+			// NOTREACHED
+		    }
+		}
+		break;
+	    case STAR:
+	    case PLUS:{
+	      register char   nextch;
+		    register int        no;
+		    register const char* save;
+		    register int        min_no;
+
+		    //
+		    // Lookahead to avoid useless match attempts when we know
+		    // what character comes next. 
+		    //
+		    nextch = '\0';
+		    if (OP(next) == EXACTLY)
+			nextch = *OPERAND(next);
+		    min_no = (OP(scan) == STAR) ? 0 : 1;
+		    save = reginput;
+		    no = regrepeat(OPERAND(scan));
+		    while (no >= min_no) {
+			// If it could work, try it.
+			if (nextch == '\0' || *reginput == nextch)
+			    if (regmatch(next))
+				return (1);
+			// Couldn't or didn't -- back up.
+			no--;
+			reginput = save + no;
+		    }
+		    return (0);
+		}
+//		break;
+	    case END:
+ 		return (1);	// Success!
+
+	    default:
+	        //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
+                printf ("cmRegularExpression::find(): Internal error -- memory corrupted.\n");
+		return 0;
+	}
+	scan = next;
+    }
+
+    // 
+    //  We get here only if there's trouble -- normally "case END" is the
+    //  terminating point. 
+    // 
+    //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
+    printf ("cmRegularExpression::find(): Internal error -- corrupted pointers.\n");
+    return (0);
+}
+
+
+/*
+ - regrepeat - repeatedly match something simple, report how many
+ */
+static int regrepeat (const char* p) {
+    register       int   count = 0;
+    register const char* scan;
+    register const char* opnd;
+
+    scan = reginput;
+    opnd = OPERAND(p);
+    switch (OP(p)) {
+	case ANY:
+	    count = strlen(scan);
+	    scan += count;
+	    break;
+	case EXACTLY:
+	    while (*opnd == *scan) {
+		count++;
+		scan++;
+	    }
+	    break;
+	case ANYOF:
+	    while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
+		count++;
+		scan++;
+	    }
+	    break;
+	case ANYBUT:
+	    while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
+		count++;
+		scan++;
+	    }
+	    break;
+	default:		// Oh dear.  Called inappropriately.
+	    //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
+	    printf ("cm RegularExpression::find(): Internal error.\n");
+	    return 0;
+    }
+    reginput = scan;
+    return (count);
+}
+
+
+/*
+ - regnext - dig the "next" pointer out of a node
+ */
+static const char* regnext (register const char* p) {
+    register int offset;
+
+    if (p == &regdummy)
+	return (NULL);
+
+    offset = NEXT(p);
+    if (offset == 0)
+	return (NULL);
+
+    if (OP(p) == BACK)
+	return (p - offset);
+    else
+	return (p + offset);
+}
+
+
+static char* regnext (register char* p) {
+    register int offset;
+
+    if (p == &regdummy)
+	return (NULL);
+
+    offset = NEXT(p);
+    if (offset == 0)
+	return (NULL);
+
+    if (OP(p) == BACK)
+	return (p - offset);
+    else
+	return (p + offset);
+}
author	Bill Hoffman <bill.hoffman@kitware.com>	2000-08-29 19:26:29 (GMT)
committer	Bill Hoffman <bill.hoffman@kitware.com>	2000-08-29 19:26:29 (GMT)
commit	1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88 (patch)
tree	a7b88a2e4397167c0d6c72c9cf0fcf8f91dcd024 /Source/cmRegularExpression.cxx
parent	d6bdba1096f36268e414f32829e22159e983031a (diff)
download	CMake-1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88.zip CMake-1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88.tar.gz CMake-1f42f521cea8b953f4ad7ef5ca0d4c6d49c42a88.tar.bz2