/****************************************************************************** * * Copyright (C) 1997-2021 by Dimitri van Heesch. * * Permission to use, copy, modify, and distribute this software and its * documentation under the terms of the GNU General Public License is hereby * granted. No representations are made about the suitability of this software * for any purpose. It is provided "as is" without express or implied warranty. * See the GNU General Public License for more details. * * Documents produced by Doxygen are derivative works derived from the * input used in their production; they are not affected by this license. * */ #include "regex.h" #include #include #include #include #include #define ENABLE_DEBUG 0 #if ENABLE_DEBUG #define DBG(fmt,...) do { fprintf(stderr,fmt,__VA_ARGS__); } while(0) #else #define DBG(fmt,...) do {} while(0) #endif namespace reg { static inline bool isspace(char c) { return c==' ' || c=='\t' || c=='\n' || c=='\r'; } static inline bool isalpha(char c) { return static_cast(c)>=128 || (c>='a' && c<='z') || (c>='A' && c<='Z'); } static inline bool isdigit(char c) { return c>='0' && c<='9'; } static inline bool isalnum(char c) { return isalpha(c) || isdigit(c); } /** Class representing a token in the compiled regular expression token stream. * A token has a kind and an optional value whose meaning depends on the kind. * It is also possible to store a (from,to) character range in a token. */ class PToken { public: /** The kind of token. * * Ranges per bit mask: * - `0x00FF` from part of a range, except for `0x0000` which is the End marker * - `0x1FFF` built-in ranges * - `0x2FFF` user defined ranges * - `0x4FFF` special operations * - `0x8000` literal character */ enum class Kind : uint16_t { End = 0x0000, WhiteSpace = 0x1001, // \s range [ \t\r\n] Digit = 0x1002, // \d range [0-9] Alpha = 0x1003, // \a range [a-z_A-Z\x80-\xFF] AlphaNum = 0x1004, // \w range [a-Z_A-Z0-9\x80-\xFF] CharClass = 0x2001, // [] NegCharClass = 0x2002, // [^] BeginOfLine = 0x4001, // ^ EndOfLine = 0x4002, // $ BeginOfWord = 0x4003, // \< EndOfWord = 0x4004, // \> BeginCapture = 0x4005, // ( EndCapture = 0x4006, // ) Any = 0x4007, // . Star = 0x4008, // * Optional = 0x4009, // ? Character = 0x8000 // c }; /** returns a string representation of the tokens kind (useful for debugging). */ const char *kindStr() const { if ((m_rep>>16)>=0x1000 || m_rep==0) { switch(static_cast((m_rep>>16))) { case Kind::End: return "End"; case Kind::Alpha: return "Alpha"; case Kind::AlphaNum: return "AlphaNum"; case Kind::WhiteSpace: return "WhiteSpace"; case Kind::Digit: return "Digit"; case Kind::CharClass: return "CharClass"; case Kind::NegCharClass: return "NegCharClass"; case Kind::Character: return "Character"; case Kind::BeginOfLine: return "BeginOfLine"; case Kind::EndOfLine: return "EndOfLine"; case Kind::BeginOfWord: return "BeginOfWord"; case Kind::EndOfWord: return "EndOfWord"; case Kind::BeginCapture: return "BeginCapture"; case Kind::EndCapture: return "EndCapture"; case Kind::Any: return "Any"; case Kind::Star: return "Star"; case Kind::Optional: return "Optional"; } } else { return "Range"; } } /** Creates a token of kind 'End' */ PToken() : m_rep(0) {} /** Creates a token of the given kind \a k */ explicit PToken(Kind k) : m_rep(static_cast(k)<<16) {} /** Create a token for an ASCII character */ PToken(char c) : m_rep((static_cast(Kind::Character)<<16) | static_cast(c)) {} /** Create a token for a byte of an UTF-8 character */ PToken(uint16_t v) : m_rep((static_cast(Kind::Character)<<16) | static_cast(v)) {} /** Create a token representing a range from one character \a from to another character \a to */ PToken(uint16_t from,uint16_t to) : m_rep(static_cast(from)<<16 | to) {} /** Sets the value for a token */ void setValue(uint16_t value) { m_rep = (m_rep & 0xFFFF0000) | value; } /** Returns the kind of the token */ Kind kind() const { return static_cast(m_rep>>16); } /** Returns the 'from' part of the character range. Only valid if this token represents a range */ uint16_t from() const { return m_rep>>16; } /** Returns the 'to' part of the character range. Only valid if this token represents a range */ uint16_t to() const { return m_rep & 0xFFFF; } /** Returns the value for this token */ uint16_t value() const { return m_rep & 0xFFFF; } /** Returns the value for this token as a ASCII character */ char asciiValue() const { return static_cast(m_rep); } /** Returns true iff this token represents a range of characters */ bool isRange() const { return m_rep!=0 && from()<=to(); } /** Returns true iff this token is a positive or negative character class */ bool isCharClass() const { return kind()==Kind::CharClass || kind()==Kind::NegCharClass; } private: uint32_t m_rep; }; /** Private members of a regular expression */ class Ex::Private { public: /** Creates the private part */ Private(const std::string &pat) : pattern(pat) { data.reserve(100); } void compile(); #if ENABLE_DEBUG void dump(); #endif bool matchAt(size_t tokenPos,const std::string &str,Match &match,size_t pos,int level) const; /** Flag indicating the expression was successfully compiled */ bool error = false; /** The token stream representing the compiled regular expression. */ std::vector data; // compiled pattern /** The pattern string as passed by the user */ std::string pattern; }; /** Compiles a regular expression passed as a string into a stream of tokens that can be used for * efficient searching. */ void Ex::Private::compile() { error = false; data.clear(); if (pattern.empty()) return; const char *start = pattern.c_str(); const char *ps = start; char c; int prevTokenPos=-1; int tokenPos=0; auto addToken = [&](PToken tok) { tokenPos++; data.emplace_back(tok); }; auto getNextCharacter = [&]() -> PToken { char cs=*ps; PToken result = PToken(cs); if (cs=='\\') // escaped character { ps++; cs=*ps; switch (cs) { case 'n': result = PToken('\n'); break; case 'r': result = PToken('\r'); break; case 't': result = PToken('\t'); break; case 's': result = PToken(PToken::Kind::WhiteSpace); break; case 'a': result = PToken(PToken::Kind::Alpha); break; case 'w': result = PToken(PToken::Kind::AlphaNum); break; case 'd': result = PToken(PToken::Kind::Digit); break; case '<': result = PToken(PToken::Kind::BeginOfWord); break; case '>': result = PToken(PToken::Kind::EndOfWord); break; case 'x': case 'X': { uint16_t v=0; for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits { int d = (cs>='a' && cs<='f') ? cs-'a'+10 : (cs>='A' && cs<='F') ? cs-'A'+10 : (cs>='0' && cs<='9') ? cs-'0' : -1; if (d>=0) { v<<=4; v|=d; ps++; } else break; } result = PToken(v); } break; case '\0': ps--; break; // backslash at the end of the pattern default: result = PToken(cs); break; } } return result; }; while ((c=*ps)) { switch (c) { case '^': // beginning of line (if first character of the pattern) prevTokenPos = tokenPos; addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) : PToken(c)); break; case '$': // end of the line (if last character of the pattern) prevTokenPos = tokenPos; addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) : PToken(c)); break; case '.': // any character prevTokenPos = tokenPos; addToken(PToken(PToken::Kind::Any)); break; case '(': // begin of capture group prevTokenPos = tokenPos; addToken(PToken(PToken::Kind::BeginCapture)); break; case ')': // end of capture group prevTokenPos = tokenPos; addToken(PToken(PToken::Kind::EndCapture)); break; case '[': // character class { prevTokenPos = tokenPos; ps++; if (*ps==0) { error=true; return; } bool esc = *ps=='\\'; PToken tok = getNextCharacter(); ps++; if (!esc && tok.kind()==PToken::Kind::Character && tok.asciiValue()=='^') // negated character class { addToken(PToken(PToken::Kind::NegCharClass)); if (*ps==0) { error=true; return; } tok = getNextCharacter(); ps++; } else { addToken(PToken(PToken::Kind::CharClass)); } uint16_t numTokens=0; while ((c=*ps)) { if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range { getNextCharacter(); ps++; PToken endTok = getNextCharacter(); ps++; if (tok.value()>endTok.value()) { addToken(PToken(endTok.value(),tok.value())); // swap start and end } else { addToken(PToken(tok.value(),endTok.value())); } numTokens++; } else // single char, from==to { if (tok.kind()==PToken::Kind::Character) { addToken(PToken(tok.value(),tok.value())); } else // special token, add as-is since from>to { addToken(tok); } numTokens++; } if (*ps==0) { error=true; return; } // expected at least a ] esc = *ps=='\\'; tok = getNextCharacter(); if (!esc && tok.kind()==PToken::Kind::Character && tok.value()==static_cast(']')) { break; // end of character class } if (*ps==0) { error=true; return; } // no ] found ps++; } // set the value of either NegCharClass or CharClass data[prevTokenPos].setValue(numTokens); } break; case '*': // 0 or more case '+': // 1 or more case '?': // optional: 0 or 1 { if (prevTokenPos==-1) { error=true; return; } switch (data[prevTokenPos].kind()) { case PToken::Kind::BeginOfLine: // $* or $+ or $? case PToken::Kind::BeginOfWord: // \<* or \<+ or \* or \>+ or \>? case PToken::Kind::Star: // ** or *+ or *? case PToken::Kind::Optional: // ?* or ?+ or ?? error=true; return; default: // ok break; } int ddiff = static_cast(tokenPos-prevTokenPos); if (*ps=='+') // convert + -> * { // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*] // ddiff=n ^prevTokenPos data.resize(data.size()+ddiff); std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos); prevTokenPos+=ddiff; tokenPos+=ddiff; } data.insert(data.begin()+prevTokenPos, c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star)); tokenPos++; addToken(PToken(PToken::Kind::End)); // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND] // ^prevTokenPos // same for 'T?'. } break; default: prevTokenPos = tokenPos; addToken(getNextCharacter()); break; } ps++; } //addToken(PToken(PToken::Kind::End)); } #if ENABLE_DEBUG /** Dump the compiled token stream for this regular expression. For debugging purposes. */ void Ex::Private::dump() { size_t l = data.size(); size_t i =0; DBG("==== compiled token stream for pattern '%s' ===\n",pattern.c_str()); while (i0 && i bool { PToken tok = data[tp]; bool negate = tok.kind()==PToken::Kind::NegCharClass; uint16_t numFields = tok.value(); bool found = false; for (uint16_t i=0;i(c); if (tok.from()<=v && v<=tok.to()) { found=true; break; } } } DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found); return negate ? !found : found; }; size_t index = pos; enum SequenceType { Star, Optional }; auto processSequence = [this,&tokenPos,&index,&str,&matchCharClass, &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool { size_t startIndex = index; PToken tok = data[++tokenPos]; if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's { char c_tok = tok.asciiValue(); while (index<=str.length() && str[index]==c_tok) { index++; if (type==Optional) break; } tokenPos++; } else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters { while (index<=str.length() && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; } tokenPos+=tok.value()+1; // skip over character ranges + end token } else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters { while (index<=str.length() && isStartIdChar(str[index])) { index++; if (type==Optional) break; } tokenPos++; } else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters { while (index<=str.length() && isIdChar(str[index])) { index++; if (type==Optional) break; } tokenPos++; } else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces { while (index<=str.length() && isspace(str[index])) { index++; if (type==Optional) break; } tokenPos++; } else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits { while (index<=str.length() && isdigit(str[index])) { index++; if (type==Optional) break; } tokenPos++; } else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all { if (type==Optional) index++; else index = str.length(); tokenPos++; } tokenPos++; // skip over end marker while ((int)index>=(int)startIndex) { // pattern 'x*xy' should match 'xy' and 'xxxxy' bool found = matchAt(tokenPos,str,match,index,level+1); if (found) { match.setMatch(pos,index-pos+match.length()); return true; } index--; } return false; }; while (tokenPos=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char index++,tokenPos++; } else if (tok.isCharClass()) { if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false; index++,tokenPos+=tok.value()+1; // skip over character ranges + end token } else { switch (tok.kind()) { case PToken::Kind::Alpha: if (index>=str.length() || !isStartIdChar(str[index])) return false; index++; break; case PToken::Kind::AlphaNum: if (index>=str.length() || !isIdChar(str[index])) return false; index++; break; case PToken::Kind::WhiteSpace: if (index>=str.length() || !isspace(str[index])) return false; index++; break; case PToken::Kind::Digit: if (index>=str.length() || !isdigit(str[index])) return false; index++; break; case PToken::Kind::BeginOfLine: if (index!=pos) return false; break; case PToken::Kind::EndOfLine: if (index0?str[index]-1:0, index>0?isIdChar(str[index-1]):-1); if (index>=str.length() || !isIdChar(str[index]) || (index>0 && isIdChar(str[index-1]))) return false; break; case PToken::Kind::EndOfWord: DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n", index,pos,str[index],isIdChar(str[index]), index==0 ? 0 : str[index-1], index==0 ? -1 : isIdChar(str[index-1])); if (index=str.length()) return false; index++; break; case PToken::Kind::Star: return processSequence(Star); case PToken::Kind::Optional: return processSequence(Optional); default: return false; } tokenPos++; } } match.setMatch(pos,index-pos); return true; } static std::string wildcard2regex(const std::string &pattern) { std::string result="^"; // match start of input char c; const char *p = pattern.c_str(); while ((c=*p++)) { switch(c) { case '*': result+=".*"; break; // '*' => '.*' case '?': result+='.'; break; // '?' => '.' case '.': case '+': case '\\': case '$': case '^': case '(': case ')': result+='\\'; result+=c; // escape break; case '[': if (*p=='^') // don't escape ^ after [ { result+="[^"; p++; } else { result+=c; } break; default: // just copy result+=c; break; } } result+='$'; // match end of input return result; } Ex::Ex(const std::string &pattern, Mode mode) : p(std::make_unique(mode==Mode::RegEx ? pattern : wildcard2regex(pattern))) { p->compile(); #if ENABLE_DEBUG p->dump(); assert(!p->error); #endif } Ex::~Ex() { } bool Ex::match(const std::string &str,Match &match,size_t pos) const { bool found=false; if (p->data.size()==0 || p->error) return found; match.init(&str); PToken tok = p->data[0]; if (tok.kind()==PToken::Kind::BeginOfLine) // only test match at the given position { found = p->matchAt(0,str,match,pos,0); } else { if (tok.kind()==PToken::Kind::Character) // search for the start character { size_t index = str.find(tok.asciiValue(),pos); if (index==std::string::npos) { DBG("Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",str.c_str(),pos,tok.asciiValue()); return false; } DBG("pos=%zu str='%s' char='%c' index=%zu\n",index,str.c_str(),tok.asciiValue(),index); pos=index; } while (posmatchAt(0,str,match,pos,0); if (found) break; pos++; } } DBG("Ex::match(str='%s',pos=%zu)=%d\n",str.c_str(),pos,found); return found; } bool Ex::isValid() const { return !p->pattern.empty() && !p->error; } //---------------------------------------------------------------------------------------- bool search(const std::string &str,Match &match,const Ex &re,size_t pos) { return re.match(str,match,pos); } bool search(const std::string &str,const Ex &re,size_t pos) { Match match; return re.match(str,match,pos); } bool match(const std::string &str,Match &match,const Ex &re) { return re.match(str,match,0) && match.position()==0 && match.length()==str.length(); } bool match(const std::string &str,const Ex &re) { Match match; return re.match(str,match,0) && match.position()==0 && match.length()==str.length(); } std::string replace(const std::string &str,const Ex &re,const std::string &replacement) { std::string result; Match match; size_t p=0; while (re.match(str,match,p)) { size_t i=match.position(); size_t l=match.length(); if (i>p) result+=str.substr(p,i-p); result+=replacement; p=i+l; } if (p