From 52a5c4a877d066a283e76bd5b6e63a9f3eec31ea Mon Sep 17 00:00:00 2001 From: KWSys Upstream Date: Tue, 5 Dec 2017 11:30:36 -0500 Subject: KWSys 2017-12-05 (9376537e) Code extracted from: https://gitlab.kitware.com/utils/kwsys.git at commit 9376537ec0e4770a28f4b1705cfacf79650f71b6 (master). Upstream Shortlog ----------------- Brad King (1): e9557f37 RegularExpression: Fix regression in 'find' method Sebastian Holtermann (4): 4d1e8738 RegularExpression: Make compile() reentrant (thread safe) 64f80068 RegularExpression: Make find() reentrant (thread safe) bbc94ba8 RegularExpression: Remove unused code cff58f07 RegularExpression: New RegularExpressionMatch class --- RegularExpression.cxx | 189 ++++++++++++++++++++------------------- RegularExpression.hxx.in | 227 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 272 insertions(+), 144 deletions(-) diff --git a/RegularExpression.cxx b/RegularExpression.cxx index 26e84e0..fa3551c 100644 --- a/RegularExpression.cxx +++ b/RegularExpression.cxx @@ -45,9 +45,9 @@ RegularExpression::RegularExpression(const RegularExpression& rxp) this->program = new char[this->progsize]; // Allocate storage for (ind = this->progsize; ind-- != 0;) // Copy regular expresion this->program[ind] = rxp.program[ind]; - this->startp[0] = rxp.startp[0]; // Copy pointers into last - this->endp[0] = rxp.endp[0]; // Successful "find" operation - this->regmust = rxp.regmust; // Copy field + // Copy pointers into last successful "find" operation + this->regmatch = rxp.regmatch; + this->regmust = rxp.regmust; // Copy field if (rxp.regmust != 0) { char* dum = rxp.program; ind = 0; @@ -78,9 +78,9 @@ RegularExpression& RegularExpression::operator=(const RegularExpression& rxp) this->program = new char[this->progsize]; // Allocate storage for (ind = this->progsize; ind-- != 0;) // Copy regular expresion this->program[ind] = rxp.program[ind]; - this->startp[0] = rxp.startp[0]; // Copy pointers into last - this->endp[0] = rxp.endp[0]; // Successful 
"find" operation - this->regmust = rxp.regmust; // Copy field + // Copy pointers into last successful "find" operation + this->regmatch = rxp.regmatch; + this->regmust = rxp.regmust; // Copy field if (rxp.regmust != 0) { char* dum = rxp.program; ind = 0; @@ -123,8 +123,9 @@ bool RegularExpression::deep_equal(const RegularExpression& rxp) const while (ind-- != 0) // Else while still characters if (this->program[ind] != rxp.program[ind]) // If regexp are different return false; // Return failure - return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs, - this->endp[0] == rxp.endp[0]); // Return true + // Else if same start/end ptrs, return true + return (this->regmatch.start() == rxp.regmatch.start() && + this->regmatch.end() == rxp.regmatch.end()); } // The remaining code in this file is derived from the regular expression code @@ -276,31 +277,35 @@ const unsigned char MAGIC = 0234; ///////////////////////////////////////////////////////////////////////// /* - * Global work variables for compile(). + * Read only utility variables. */ -static const char* regparse; // Input-scan pointer. -static int regnpar; // () count. static char regdummy; -static char* regcode; // Code-emit pointer; &regdummy = don't. -static long regsize; // Code size. +static char* const regdummyptr = &regdummy; /* - * Forward declarations for compile()'s friends. + * Utility class for RegularExpression::compile(). */ -// #ifndef static -// #define static static -// #endif -static char* reg(int, int*); -static char* regbranch(int*); -static char* regpiece(int*); -static char* regatom(int*); -static char* regnode(char); +class RegExpCompile +{ +public: + const char* regparse; // Input-scan pointer. + int regnpar; // () count. + char* regcode; // Code-emit pointer; regdummyptr = don't. + long regsize; // Code size. 
+ + char* reg(int, int*); + char* regbranch(int*); + char* regpiece(int*); + char* regatom(int*); + char* regnode(char); + void regc(char); + void reginsert(char, char*); + static void regtail(char*, const char*); + static void regoptail(char*, const char*); +}; + static const char* regnext(const char*); static char* regnext(char*); -static void regc(char); -static void reginsert(char, char*); -static void regtail(char*, const char*); -static void regoptail(char*, const char*); #ifdef STRCSPN static int strcspn(); @@ -337,19 +342,20 @@ bool RegularExpression::compile(const char* exp) } // First pass: determine size, legality. - regparse = exp; - regnpar = 1; - regsize = 0L; - regcode = &regdummy; - regc(static_cast<char>(MAGIC)); - if (!reg(0, &flags)) { + RegExpCompile comp; + comp.regparse = exp; + comp.regnpar = 1; + comp.regsize = 0L; + comp.regcode = regdummyptr; + comp.regc(static_cast<char>(MAGIC)); + if (!comp.reg(0, &flags)) { printf("RegularExpression::compile(): Error in compile.\n"); return false; } - this->startp[0] = this->endp[0] = this->searchstring = 0; + this->regmatch.clear(); // Small enough for pointer-storage convention? - if (regsize >= 32767L) { // Probably could be 65535L. + if (comp.regsize >= 32767L) { // Probably could be 65535L. // RAISE Error, SYM(RegularExpression), SYM(Expr_Too_Big), printf("RegularExpression::compile(): Expression too big.\n"); return false; @@ -360,8 +366,8 @@ bool RegularExpression::compile(const char* exp) if (this->program != 0) delete[] this->program; //#endif - this->program = new char[regsize]; - this->progsize = static_cast<int>(regsize); + this->program = new char[comp.regsize]; + this->progsize = static_cast<int>(comp.regsize); if (this->program == 0) { // RAISE Error, SYM(RegularExpression), SYM(Out_Of_Memory), @@ -370,11 +376,11 @@ bool RegularExpression::compile(const char* exp) } // Second pass: emit code. 
- regparse = exp; - regnpar = 1; - regcode = this->program; - regc(static_cast<char>(MAGIC)); - reg(0, &flags); + comp.regparse = exp; + comp.regnpar = 1; + comp.regcode = this->program; + comp.regc(static_cast<char>(MAGIC)); + comp.reg(0, &flags); // Dig out information for optimizations. this->regstart = '\0'; // Worst-case defaults. @@ -423,7 +429,7 @@ bool RegularExpression::compile(const char* exp) * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ -static char* reg(int paren, int* flagp) +char* RegExpCompile::reg(int paren, int* flagp) { char* ret; char* br; @@ -435,7 +441,7 @@ static char* reg(int paren, int* flagp) // Make an OPEN node, if parenthesized. if (paren) { - if (regnpar >= RegularExpression::NSUBEXP) { + if (regnpar >= RegularExpressionMatch::NSUBEXP) { // RAISE Error, SYM(RegularExpression), SYM(Too_Many_Parens), printf("RegularExpression::compile(): Too many parentheses.\n"); return 0; @@ -501,7 +507,7 @@ static char* reg(int paren, int* flagp) * * Implements the concatenation operator. */ -static char* regbranch(int* flagp) +char* RegExpCompile::regbranch(int* flagp) { char* ret; char* chain; @@ -538,7 +544,7 @@ static char* regbranch(int* flagp) * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ -static char* regpiece(int* flagp) +char* RegExpCompile::regpiece(int* flagp) { char* ret; char op; @@ -605,7 +611,7 @@ static char* regpiece(int* flagp) * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ -static char* regatom(int* flagp) +char* RegExpCompile::regatom(int* flagp) { char* ret; int flags; @@ -724,13 +730,13 @@ static char* regatom(int* flagp) - regnode - emit a node Location. 
*/ -static char* regnode(char op) +char* RegExpCompile::regnode(char op) { char* ret; char* ptr; ret = regcode; - if (ret == &regdummy) { + if (ret == regdummyptr) { regsize += 3; return (ret); } @@ -747,9 +753,9 @@ static char* regnode(char op) /* - regc - emit (if appropriate) a byte of code */ -static void regc(char b) +void RegExpCompile::regc(char b) { - if (regcode != &regdummy) + if (regcode != regdummyptr) *regcode++ = b; else regsize++; @@ -760,13 +766,13 @@ static void regc(char b) * * Means relocating the operand. */ -static void reginsert(char op, char* opnd) +void RegExpCompile::reginsert(char op, char* opnd) { char* src; char* dst; char* place; - if (regcode == &regdummy) { + if (regcode == regdummyptr) { regsize += 3; return; } @@ -786,13 +792,13 @@ static void reginsert(char op, char* opnd) /* - regtail - set the next-pointer at the end of a node chain */ -static void regtail(char* p, const char* val) +void RegExpCompile::regtail(char* p, const char* val) { char* scan; char* temp; int offset; - if (p == &regdummy) + if (p == regdummyptr) return; // Find last node. @@ -815,10 +821,10 @@ static void regtail(char* p, const char* val) /* - regoptail - regtail on operand of first argument; nop if operandless */ -static void regoptail(char* p, const char* val) +void RegExpCompile::regoptail(char* p, const char* val) { // "Operandless" and "op != BRANCH" are synonymous in practice. - if (p == 0 || p == &regdummy || OP(p) != BRANCH) + if (p == 0 || p == regdummyptr || OP(p) != BRANCH) return; regtail(OPERAND(p), val); } @@ -830,34 +836,30 @@ static void regoptail(char* p, const char* val) //////////////////////////////////////////////////////////////////////// /* - * Global work variables for find(). + * Utility class for RegularExpression::find(). */ -static const char* reginput; // String-input pointer. -static const char* regbol; // Beginning of input, for ^ check. -static const char** regstartp; // Pointer to startp array. 
-static const char** regendp; // Ditto for endp. +class RegExpFind +{ +public: + const char* reginput; // String-input pointer. + const char* regbol; // Beginning of input, for ^ check. + const char** regstartp; // Pointer to startp array. + const char** regendp; // Ditto for endp. -/* - * Forwards. - */ -static int regtry(const char*, const char**, const char**, const char*); -static int regmatch(const char*); -static int regrepeat(const char*); - -#ifdef DEBUG -int regnarrate = 0; -void regdump(); -static char* regprop(); -#endif + int regtry(const char*, const char**, const char**, const char*); + int regmatch(const char*); + int regrepeat(const char*); +}; // find -- Matches the regular expression to the given string. // Returns true if found, and sets start and end indexes accordingly. - -bool RegularExpression::find(const char* string) +bool RegularExpression::find(char const* string, + RegularExpressionMatch& rmatch) const { const char* s; - this->searchstring = string; + rmatch.clear(); + rmatch.searchstring = string; if (!this->program) { return false; @@ -868,7 +870,7 @@ bool RegularExpression::find(const char* string) // RAISE Error, SYM(RegularExpression), SYM(Internal_Error), printf( "RegularExpression::find(): Compiled regular expression corrupted.\n"); - return 0; + return false; } // If there is a "must appear" string, look for it. @@ -880,42 +882,45 @@ bool RegularExpression::find(const char* string) s++; } if (s == 0) // Not present. - return (0); + return false; } + RegExpFind regFind; + // Mark beginning of line for ^ . - regbol = string; + regFind.regbol = string; // Simplest case: anchored match need be tried only once. if (this->reganch) - return (regtry(string, this->startp, this->endp, this->program) != 0); + return ( + regFind.regtry(string, rmatch.startp, rmatch.endp, this->program) != 0); // Messy cases: unanchored match. s = string; if (this->regstart != '\0') // We know what char it must start with. 
while ((s = strchr(s, this->regstart)) != 0) { - if (regtry(s, this->startp, this->endp, this->program)) - return (1); + if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program)) + return true; s++; } else // We don't -- general case. do { - if (regtry(s, this->startp, this->endp, this->program)) - return (1); + if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program)) + return true; } while (*s++ != '\0'); // Failure. - return (0); + return false; } /* - regtry - try match at specific point 0 failure, 1 success */ -static int regtry(const char* string, const char** start, const char** end, - const char* prog) +int RegExpFind::regtry(const char* string, const char** start, + const char** end, const char* prog) { int i; const char** sp1; @@ -927,7 +932,7 @@ static int regtry(const char* string, const char** start, const char** end, sp1 = start; ep = end; - for (i = RegularExpression::NSUBEXP; i > 0; i--) { + for (i = RegularExpressionMatch::NSUBEXP; i > 0; i--) { *sp1++ = 0; *ep++ = 0; } @@ -950,7 +955,7 @@ static int regtry(const char* string, const char** start, const char** end, * by recursion. * 0 failure, 1 success */ -static int regmatch(const char* prog) +int RegExpFind::regmatch(const char* prog) { const char* scan; // Current node. const char* next; // Next node. 
@@ -1129,7 +1134,7 @@ static int regmatch(const char* prog) /* - regrepeat - repeatedly match something simple, report how many */ -static int regrepeat(const char* p) +int RegExpFind::regrepeat(const char* p) { int count = 0; const char* scan; @@ -1176,7 +1181,7 @@ static const char* regnext(const char* p) { int offset; - if (p == &regdummy) + if (p == regdummyptr) return (0); offset = NEXT(p); @@ -1193,7 +1198,7 @@ static char* regnext(char* p) { int offset; - if (p == &regdummy) + if (p == regdummyptr) return (0); offset = NEXT(p); diff --git a/RegularExpression.hxx.in b/RegularExpression.hxx.in index 763fdab..a3fe72d 100644 --- a/RegularExpression.hxx.in +++ b/RegularExpression.hxx.in @@ -34,6 +34,115 @@ namespace @KWSYS_NAMESPACE@ { +// Forward declaration +class RegularExpression; + +/** \class RegularExpressionMatch + * \brief Stores the pattern matches of a RegularExpression + */ +class @KWSYS_NAMESPACE@_EXPORT RegularExpressionMatch +{ +public: + RegularExpressionMatch(); + + bool isValid() const; + void clear(); + + std::string::size_type start() const; + std::string::size_type end() const; + std::string::size_type start(int n) const; + std::string::size_type end(int n) const; + std::string match(int n) const; + + enum + { + NSUBEXP = 10 + }; + +private: + friend class RegularExpression; + const char* startp[NSUBEXP]; + const char* endp[NSUBEXP]; + const char* searchstring; +}; + +/** + * \brief Creates an invalid match object + */ +inline RegularExpressionMatch::RegularExpressionMatch() +{ + startp[0] = 0; + endp[0] = 0; + searchstring = 0; +} + +/** + * \brief Returns true if the match pointers are valid + */ +inline bool RegularExpressionMatch::isValid() const +{ + return (this->startp[0] != 0); +} + +/** + * \brief Resets to the (invalid) construction state. + */ +inline void RegularExpressionMatch::clear() +{ + startp[0] = 0; + endp[0] = 0; + searchstring = 0; +} + +/** + * \brief Returns the start index of the full match. 
+ */ +inline std::string::size_type RegularExpressionMatch::start() const +{ + return static_cast<std::string::size_type>(this->startp[0] - searchstring); +} + +/** + * \brief Returns the end index of the full match. + */ +inline std::string::size_type RegularExpressionMatch::end() const +{ + return static_cast<std::string::size_type>(this->endp[0] - searchstring); +} + +/** + * \brief Returns the start index of nth submatch. + * start(0) is the start of the full match. + */ +inline std::string::size_type RegularExpressionMatch::start(int n) const +{ + return static_cast<std::string::size_type>(this->startp[n] - + this->searchstring); +} + +/** + * \brief Returns the end index of nth submatch. + * end(0) is the end of the full match. + */ +inline std::string::size_type RegularExpressionMatch::end(int n) const +{ + return static_cast<std::string::size_type>(this->endp[n] - + this->searchstring); +} + +/** + * \brief Returns the nth submatch as a string. + */ +inline std::string RegularExpressionMatch::match(int n) const +{ + if (this->startp[n] == 0) { + return std::string(); + } else { + return std::string(this->startp[n], static_cast<std::string::size_type>( + this->endp[n] - this->startp[n])); + } +} + /** \class RegularExpression * \brief Implements pattern matching with regular expressions. * @@ -170,6 +279,9 @@ namespace @KWSYS_NAMESPACE@ { * the same as the two characters before the first p encounterd in * the line. It would match "drepa qrepb" in "rep drepa qrepb". * + * All methods of RegularExpression can be called simultaneously from + * different threads but only if each invocation uses an own instance of + * RegularExpression. */ class @KWSYS_NAMESPACE@_EXPORT RegularExpression { @@ -213,9 +325,19 @@ public: /** * Matches the regular expression to the given string. + * Returns true if found, and sets start and end indexes + * in the RegularExpressionMatch instance accordingly. + * + * This method is thread safe when called with different + * RegularExpressionMatch instances. 
+ */ + bool find(char const*, RegularExpressionMatch&) const; + + /** + * Matches the regular expression to the given string. * Returns true if found, and sets start and end indexes accordingly. */ - bool find(char const*); + inline bool find(char const*); /** * Matches the regular expression to the given std string. @@ -224,14 +346,18 @@ public: inline bool find(std::string const&); /** - * Index to start of first find. + * Match indices */ + inline RegularExpressionMatch const& regMatch() const; inline std::string::size_type start() const; + inline std::string::size_type end() const; + inline std::string::size_type start(int n) const; + inline std::string::size_type end(int n) const; /** - * Index to end of first find. + * Match strings */ - inline std::string::size_type end() const; + inline std::string match(int n) const; /** * Copy the given regular expression. @@ -266,29 +392,14 @@ public: */ inline void set_invalid(); - /** - * Destructor. - */ - // awf added - std::string::size_type start(int n) const; - std::string::size_type end(int n) const; - std::string match(int n) const; - - enum - { - NSUBEXP = 10 - }; - private: - const char* startp[NSUBEXP]; - const char* endp[NSUBEXP]; + RegularExpressionMatch regmatch; char regstart; // Internal use only char reganch; // Internal use only const char* regmust; // Internal use only std::string::size_type regmlen; // Internal use only char* program; int progsize; - const char* searchstring; }; /** @@ -344,51 +455,42 @@ inline bool RegularExpression::compile(std::string const& s) * Matches the regular expression to the given std string. * Returns true if found, and sets start and end indexes accordingly. */ -inline bool RegularExpression::find(std::string const& s) +inline bool RegularExpression::find(const char* s) { - return this->find(s.c_str()); + return this->find(s, this->regmatch); } /** - * Set the start position for the regular expression. + * Matches the regular expression to the given std string. 
+ * Returns true if found, and sets start and end indexes accordingly. */ -inline std::string::size_type RegularExpression::start() const +inline bool RegularExpression::find(std::string const& s) { - return static_cast<std::string::size_type>(this->startp[0] - searchstring); + return this->find(s.c_str()); } /** - * Returns the start/end index of the last item found. + * Returns the internal match object */ -inline std::string::size_type RegularExpression::end() const +inline RegularExpressionMatch const& RegularExpression::regMatch() const { - return static_cast<std::string::size_type>(this->endp[0] - searchstring); + return this->regmatch; } /** - * Returns true if two regular expressions have different - * compiled program for pattern matching. + * Returns the start index of the full match. */ -inline bool RegularExpression::operator!=(const RegularExpression& r) const +inline std::string::size_type RegularExpression::start() const { - return (!(*this == r)); + return regmatch.start(); } /** - * Returns true if a valid regular expression is compiled - * and ready for pattern matching. + * Returns the end index of the full match. 
*/ -inline bool RegularExpression::is_valid() const -{ - return (this->program != 0); -} - -inline void RegularExpression::set_invalid() +inline std::string::size_type RegularExpression::end() const { - //#ifndef _WIN32 - delete[] this->program; - //#endif - this->program = 0; + return regmatch.end(); } /** @@ -396,7 +498,7 @@ inline void RegularExpression::set_invalid() */ inline std::string::size_type RegularExpression::start(int n) const { - return static_cast<std::string::size_type>(this->startp[n] - searchstring); + return regmatch.start(n); } /** @@ -404,7 +506,7 @@ inline std::string::size_type RegularExpression::start(int n) const */ inline std::string::size_type RegularExpression::end(int n) const { - return static_cast<std::string::size_type>(this->endp[n] - searchstring); + return regmatch.end(n); } /** @@ -412,12 +514,33 @@ inline std::string::size_type RegularExpression::end(int n) const */ inline std::string RegularExpression::match(int n) const { - if (this->startp[n] == 0) { - return std::string(""); - } else { - return std::string(this->startp[n], static_cast<std::string::size_type>( - this->endp[n] - this->startp[n])); - } + return regmatch.match(n); +} + +/** + * Returns true if two regular expressions have different + * compiled program for pattern matching. + */ +inline bool RegularExpression::operator!=(const RegularExpression& r) const +{ + return (!(*this == r)); +} + +/** + * Returns true if a valid regular expression is compiled + * and ready for pattern matching. + */ +inline bool RegularExpression::is_valid() const +{ + return (this->program != 0); +} + +inline void RegularExpression::set_invalid() +{ + //#ifndef _WIN32 + delete[] this->program; + //#endif + this->program = 0; } } // namespace @KWSYS_NAMESPACE@ -- cgit v0.12