From 3d4f0313d20cc8f71ade094faa006a2171ff29c2 Mon Sep 17 00:00:00 2001 From: Dimitri van Heesch Date: Tue, 2 Mar 2021 21:40:36 +0100 Subject: Refactoring: replaced std::regex with own much faster implementation --- src/CMakeLists.txt | 1 + src/classdef.cpp | 1 - src/configimpl.l | 58 ++-- src/context.cpp | 2 +- src/definition.cpp | 10 +- src/docparser.cpp | 12 +- src/docsets.cpp | 2 +- src/doctokenizer.l | 26 +- src/doxygen.cpp | 44 +-- src/groupdef.cpp | 8 +- src/htmlhelp.cpp | 6 +- src/markdown.cpp | 10 +- src/memberdef.cpp | 87 ++---- src/pre.l | 20 +- src/regex.cpp | 736 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/regex.h | 336 +++++++++++++++++++++++ src/rtfstyle.cpp | 40 +-- src/scanner.l | 40 +-- src/template.cpp | 10 +- src/util.cpp | 213 +++++++-------- src/util.h | 6 +- src/vhdlcode.l | 6 +- src/vhdldocgen.cpp | 27 +- src/vhdljjparser.cpp | 20 +- 24 files changed, 1374 insertions(+), 347 deletions(-) create mode 100644 src/regex.cpp create mode 100644 src/regex.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c6af813..4488067 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -281,6 +281,7 @@ add_library(doxymain STATIC qhp.cpp qhpxmlwriter.cpp reflist.cpp + regex.cpp resourcemgr.cpp rtfdocvisitor.cpp rtfgen.cpp diff --git a/src/classdef.cpp b/src/classdef.cpp index 3e602b0..198a1c9 100644 --- a/src/classdef.cpp +++ b/src/classdef.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/src/configimpl.l b/src/configimpl.l index b99ddf8..1f9be8d 100644 --- a/src/configimpl.l +++ b/src/configimpl.l @@ -32,8 +32,8 @@ #include #include -#include +#include "regex.h" #include "configimpl.h" #include "version.h" #include "portable.h" @@ -1135,27 +1135,32 @@ void ConfigImpl::emptyValueToDefault() static void substEnvVarsInString(QCString &str) { if (str.isEmpty()) return; - // match e.g. 
$(HOME) but also $(PROGRAMFILES(X86)) - static const std::regex re("\\$\\(([[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF.-]*(\\([[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF.-]*\\))?)\\)", std::regex::optimize); - std::string s = str.str(); - std::sregex_iterator it(s.begin(),s.end(),re); - std::sregex_iterator end; - std::string result; - size_t p = 0; - for (; it!=end ; ++it) - { - const auto &match = *it; - size_t i = match.position(); - size_t l = match.length(); - result+=s.substr(p,i-p); - std::string matchContents = match[1].str(); - QCString env=Portable::getenv(matchContents.c_str()); // get content of $(..) match - substEnvVarsInString(env); // recursively expand variables if needed. - result+=env.str(); - p=i+l; - } - result+=s.substr(p); - str = QCString(result).stripWhiteSpace(); + auto replace = [](const std::string &s, const reg::Ex &re) -> std::string + { + reg::Iterator it(s,re); + reg::Iterator end; + std::string result; + size_t p = 0; + for (; it!=end ; ++it) + { + const auto &match = *it; + size_t i = match.position(); + size_t l = match.length(); + result+=s.substr(p,i-p); + std::string matchContents = match[1].str(); + QCString env=Portable::getenv(matchContents.c_str()); // get content of $(..) match + substEnvVarsInString(env); // recursively expand variables if needed. + result+=env.str(); + p=i+l; + } + result+=s.substr(p); + return result; + }; + + // match e.g. 
re1=$(HOME) but also re2=$(PROGRAMFILES(X86)) + static const reg::Ex re1(R"(\$\((\a[\w.-]*)\))"); + static const reg::Ex re2(R"(\$\((\a[\w.-]*\(\a[\w.-]*\))\))"); + str = QCString(replace(replace(str.str(),re1),re2)).stripWhiteSpace(); } static void substEnvVarsInStrList(StringVector &sl) @@ -1648,11 +1653,10 @@ void Config::checkAndCorrect() const StringVector &aliasList = Config_getList(ALIASES); for (const auto &alias : aliasList) { - // match aliases of the form 'name=' and 'name{2} =' - static const std::regex re("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*(\\{[[:digit:]]+\\})?[[:space:]]*=", std::regex::optimize); - std::sregex_iterator it(alias.begin(),alias.end(),re); - std::sregex_iterator end; - if (it==end) + // match aliases of the form re1='name=' and re2='name{2} =' + static const reg::Ex re1(R"(\a\w*\s*=)"); + static const reg::Ex re2(R"(\a\w*{\d+}\s*=)"); + if (!reg::search(alias,re1) && !reg::search(alias,re2)) { err("Illegal ALIASES format '%s'. Use \"name=value\" or \"name{n}=value\", where n is the number of arguments\n", alias.c_str()); diff --git a/src/context.cpp b/src/context.cpp index d2cfa9e..8bdc23c 100644 --- a/src/context.cpp +++ b/src/context.cpp @@ -136,7 +136,7 @@ class GenericConstIterator : public TemplateListIntf::ConstIterator } void toLast() { - m_index=m_list.size()-1; + m_index=(int)m_list.size()-1; } void toNext() { diff --git a/src/definition.cpp b/src/definition.cpp index 1140a7f..70c1839 100644 --- a/src/definition.cpp +++ b/src/definition.cpp @@ -19,13 +19,13 @@ #include #include #include -#include #include #include "md5.h" #include #include #include +#include "regex.h" #include "config.h" #include "definitionimpl.h" #include "doxygen.h" @@ -176,12 +176,10 @@ static bool matchExcludedSymbols(const char *name) pattern=pattern.left(pattern.length()-1),forceEnd=TRUE; if (pattern.find('*')!=-1) // wildcard mode { - const std::regex re(substitute(pattern,"*",".*").str()); - std::sregex_iterator 
it(symName.begin(),symName.end(),re); - std::sregex_iterator end; - if (it!=end) // wildcard match + const reg::Ex re(substitute(pattern,"*",".*").str()); + reg::Match match; + if (reg::search(symName,match,re)) // wildcard match { - const auto &match = *it; size_t ui = match.position(); size_t pl = match.length(); size_t sl = symName.length(); diff --git a/src/docparser.cpp b/src/docparser.cpp index e801133..6325cd8 100644 --- a/src/docparser.cpp +++ b/src/docparser.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -24,6 +23,7 @@ #include #include +#include "regex.h" #include "doxygen.h" #include "debug.h" #include "util.h" @@ -436,9 +436,9 @@ static void checkArgumentName(const std::string &name) //printf("isDocsForDefinition()=%d\n",g_memberDef->isDocsForDefinition()); if (al.empty()) return; // no argument list - static const std::regex re("(\\$[[:alnum:]\\x80-\\xFF_]|[[:alpha:]\\x80-\\xFF_])[[:alnum:]\\x80-\\xFF_]*\\.*", std::regex::optimize); - std::sregex_iterator it(name.begin(),name.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(\$?\w+\.*)"); + reg::Iterator it(name,re); + reg::Iterator end; for (; it!=end ; ++it) { const auto &match = *it; @@ -873,9 +873,9 @@ static int handleStyleArgument(DocNode *parent,DocNodeList &children, tok!=TK_ENDLIST ) { - static const std::regex specialChar("[.,|()\\[\\]:;\\?]", std::regex::optimize); + static const reg::Ex specialChar(R"([.,|()\[\]:;?])"); if (tok==TK_WORD && g_token->name.length()==1 && - std::regex_search(g_token->name.str(),specialChar)) + reg::match(g_token->name.str(),specialChar)) { // special character that ends the markup command return tok; diff --git a/src/docsets.cpp b/src/docsets.cpp index 4f04623..2911025 100644 --- a/src/docsets.cpp +++ b/src/docsets.cpp @@ -201,7 +201,7 @@ void DocSets::finalize() QCString DocSets::Private::indent() { QCString result; - result.fill(' ',(indentStack.size()+2)*2); + result.fill(' ',((int)indentStack.size()+2)*2); 
return result; } diff --git a/src/doctokenizer.l b/src/doctokenizer.l index f30dba7..9bafcab 100644 --- a/src/doctokenizer.l +++ b/src/doctokenizer.l @@ -26,7 +26,6 @@ #include #include -#include #include #include @@ -42,6 +41,7 @@ #include "doxygen.h" #include "portable.h" #include "cite.h" +#include "regex.h" #define YY_NO_INPUT 1 #define YY_NO_UNISTD_H 1 @@ -515,9 +515,9 @@ RCSID "$"("Author"|"Date"|"Header"|"Id"|"Locker"|"Log"|"Name"|"RCSfile"|"Revisio { lineCount(yytext,yyleng); std::string text=yytext; - static const std::regex re("[*+][^*+]*$", std::regex::optimize); // find last + or * - std::smatch match; - std::regex_search(text,match,re); + static const reg::Ex re(R"([*+][^*+]*$)"); // find last + or * + reg::Match match; + reg::search(text,match,re); size_t listPos = match.position(); g_token->isEnumList = FALSE; g_token->id = -1; @@ -533,9 +533,9 @@ RCSID "$"("Author"|"Date"|"Header"|"Id"|"Locker"|"Log"|"Name"|"RCSfile"|"Revisio else { std::string text=yytext; - static const std::regex re("[1-9]+", std::regex::optimize); - std::smatch match; - std::regex_search(text,match,re); + static const reg::Ex re(R"(\d+)"); + reg::Match match; + reg::search(text,match,re); g_token->isEnumList = true; g_token->id = std::stoul(match.str()); g_token->indent = computeIndent(yytext,match.position()); @@ -560,9 +560,9 @@ RCSID "$"("Author"|"Date"|"Header"|"Id"|"Locker"|"Log"|"Name"|"RCSfile"|"Revisio { lineCount(yytext,yyleng); std::string text=extractPartAfterNewLine(yytext).str(); - static const std::regex re("[*+][^*+]*$", std::regex::optimize); // find last + or * - std::smatch match; - std::regex_search(text,match,re); + static const reg::Ex re(R"([*+][^*+]*$)"); // find last + or * + reg::Match match; + reg::search(text,match,re); size_t markPos = match.position(); g_token->isEnumList = FALSE; g_token->id = -1; @@ -579,9 +579,9 @@ RCSID "$"("Author"|"Date"|"Header"|"Id"|"Locker"|"Log"|"Name"|"RCSfile"|"Revisio { lineCount(yytext,yyleng); std::string 
text=extractPartAfterNewLine(yytext).str(); - static const std::regex re("[1-9]+", std::regex::optimize); - std::smatch match; - std::regex_search(text,match,re); + static const reg::Ex re(R"(\d+)"); + reg::Match match; + reg::search(text,match,re); g_token->isEnumList = true; g_token->id = std::stoul(match.str()); g_token->indent = computeIndent(text.c_str(),match.position()); diff --git a/src/doxygen.cpp b/src/doxygen.cpp index d4127a7..ce89540 100644 --- a/src/doxygen.cpp +++ b/src/doxygen.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include "version.h" #include "doxygen.h" @@ -106,6 +105,7 @@ #include "threadpool.h" #include "clangparser.h" #include "symbolresolver.h" +#include "regex.h" #if USE_SQLITE3 #include @@ -2220,10 +2220,10 @@ static MemberDef *addVariableToFile( { ttype.stripPrefix("struct "); ttype.stripPrefix("union "); - static const std::regex re("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*", std::regex::optimize); - std::smatch match; + static const reg::Ex re(R"(\a\w*)"); + reg::Match match; std::string typ = ttype.str(); - if (std::regex_search(typ,match,re)) + if (reg::search(typ,match,re)) { QCString typeValue = match.str(); ClassDefMutable *cd = getClassMutable(typeValue); @@ -2439,11 +2439,11 @@ static int findFunctionPtr(const std::string &type,SrcLangExt lang, int *pLength return -1; // Fortran and VHDL do not have function pointers } - static const std::regex re("\\([^)]*[*\\^][^)]*\\)", std::regex::optimize); - std::smatch match; + static const reg::Ex re(R"(\([^)]*[*^][^)]*\))"); + reg::Match match; size_t i=std::string::npos; size_t l=0; - if (std::regex_search(type,match,re)) // contains (...*...) + if (reg::search(type,match,re)) // contains (...*...) 
{ i = match.position(); l = match.length(); @@ -2532,12 +2532,12 @@ static bool isVarWithConstructor(const Entry *root) } for (const Argument &a : root->argList) { - static const std::regex initChars("[0-9\"'&*!^]+", std::regex::optimize); - std::smatch match; + static const reg::Ex initChars(R"([\d"'&*!^]+)"); + reg::Match match; if (!a.name.isEmpty() || !a.defval.isEmpty()) { std::string name = a.name.str(); - if (std::regex_search(name,match,initChars) && match.position()==0) + if (reg::search(name,match,initChars) && match.position()==0) { result=TRUE; } @@ -2567,15 +2567,15 @@ static bool isVarWithConstructor(const Entry *root) goto done; } std::string atype = a.type.str(); - if (std::regex_search(atype,match,initChars) && match.position()==0) + if (reg::search(atype,match,initChars) && match.position()==0) { result=TRUE; // argument type starts with typical initializer char goto done; } std::string resType=resolveTypeDef(ctx,a.type).str(); if (resType.empty()) resType=atype; - static const std::regex idChars("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*", std::regex::optimize); - if (std::regex_search(resType,match,idChars) && match.position()==0) // resType starts with identifier + static const reg::Ex idChars(R"(\a\w*)"); + if (reg::search(resType,match,idChars) && match.position()==0) // resType starts with identifier { resType=match.str(); //printf("resType=%s\n",resType.data()); @@ -2625,9 +2625,9 @@ static void addVariable(const Entry *root,int isFuncPtr=-1) type=name; std::string sargs = args.str(); - static const std::regex reName("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*", std::regex::optimize); - std::smatch match; - if (std::regex_search(sargs,match,reName)) + static const reg::Ex reName(R"(\a\w*)"); + reg::Match match; + if (reg::search(sargs,match,reName)) { name = match.str(); // e.g. 'var' in '(var[10])' sargs = match.suffix().str(); // e.g. 
'[10]) in '(var[10])' @@ -3816,9 +3816,9 @@ static TemplateNameMap getTemplateArgumentsInName(const ArgumentList &templateAr int count=0; for (const Argument &arg : templateArguments) { - static const std::regex re("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_:]*", std::regex::optimize); - std::sregex_iterator it(name.begin(),name.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(\a[\w:]*)"); + reg::Iterator it(name,re); + reg::Iterator end; for (; it!=end ; ++it) { const auto &match = *it; @@ -5325,9 +5325,9 @@ static QCString substituteTemplatesInString( ) { std::string dst; - static const std::regex re("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*", std::regex::optimize); - std::sregex_iterator it(src.begin(),src.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(\a\w*)"); + reg::Iterator it(src,re); + reg::Iterator end; //printf("type=%s\n",sa->type.data()); size_t p=0; for (; it!=end ; ++it) // for each word in srcType diff --git a/src/groupdef.cpp b/src/groupdef.cpp index 5a89a4a..095069e 100644 --- a/src/groupdef.cpp +++ b/src/groupdef.cpp @@ -17,7 +17,6 @@ #include #include -#include #include @@ -46,6 +45,7 @@ #include "dirdef.h" #include "config.h" #include "definitionimpl.h" +#include "regex.h" //--------------------------------------------------------------------------- @@ -1090,10 +1090,10 @@ void GroupDefImpl::writeDocumentation(OutputList &ol) if (Doxygen::searchIndex) { Doxygen::searchIndex->setCurrentDoc(this,anchor(),FALSE); - static const std::regex we("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_\\-]*", std::regex::optimize); std::string title = m_title.str(); - std::sregex_iterator it(title.begin(),title.end(),we); - std::sregex_iterator end; + static const reg::Ex re(R"(\a[\w-]*)"); + reg::Iterator it(title,re); + reg::Iterator end; for (; it!=end ; ++it) { const auto &match = *it; diff --git a/src/htmlhelp.cpp b/src/htmlhelp.cpp index e3a007a..3d88e4f 100644 --- a/src/htmlhelp.cpp +++ b/src/htmlhelp.cpp 
@@ -16,7 +16,6 @@ */ #include -#include #include #include @@ -34,6 +33,7 @@ #include "filedef.h" #include "util.h" #include "linkedmap.h" +#include "regex.h" //---------------------------------------------------------------------------- @@ -154,10 +154,10 @@ void HtmlHelpIndex::addItem(const char *level1,const char *level2, const char *url,const char *anchor,bool hasLink, bool reversed) { - static const std::regex re("@[[:digit:]]+", std::regex::optimize); + static const reg::Ex re(R"(@\d+)"); std::string key = level1; if (level2) key+= std::string("?") + level2; - if (std::regex_search(key,re)) // skip anonymous stuff + if (reg::search(key,re)) // skip anonymous stuff { return; } diff --git a/src/markdown.cpp b/src/markdown.cpp index 8ac2e5f..185a43e 100644 --- a/src/markdown.cpp +++ b/src/markdown.cpp @@ -37,7 +37,6 @@ #include #include -#include #include "markdown.h" #include "growbuf.h" @@ -51,6 +50,7 @@ #include "section.h" #include "message.h" #include "portable.h" +#include "regex.h" #if !defined(NDEBUG) #define ENABLE_TRACING @@ -1472,13 +1472,13 @@ static QCString extractTitleId(QCString &title, int level) { TRACE(title.data()); // match e.g. 
'{#id-b11} ' and capture 'id-b11' - static const std::regex r2("\\{#([a-z_A-Z][a-z_A-Z0-9\\-]*)\\}[[:space:]]*$", std::regex::optimize); - std::smatch match; + static const reg::Ex r2(R"({#(\a[\w-]*)}\s*$)"); + reg::Match match; std::string ti = title.str(); - if (std::regex_search(ti,match,r2)) + if (reg::search(ti,match,r2)) { std::string id = match[1].str(); - title = title.left(match.position()); + title = title.left((int)match.position()); //printf("found match id='%s' title=%s\n",id.c_str(),title.data()); return id; } diff --git a/src/memberdef.cpp b/src/memberdef.cpp index 5d2a0da..c357c43 100644 --- a/src/memberdef.cpp +++ b/src/memberdef.cpp @@ -13,11 +13,11 @@ * */ -#include #include #include #include + #include "md5.h" #include "memberdef.h" #include "membername.h" @@ -37,7 +37,6 @@ #include "dotcallgraph.h" #include "searchindex.h" #include "parserintf.h" - #include "vhdldocgen.h" #include "arguments.h" #include "memberlist.h" @@ -45,6 +44,7 @@ #include "filedef.h" #include "config.h" #include "definitionimpl.h" +#include "regex.h" //----------------------------------------------------------------------------- @@ -1852,10 +1852,10 @@ ClassDef *MemberDefImpl::getClassDefOfAnonymousType() const // match expression if it contains at least one @1 marker, e.g. // 'struct A::@1::@2::B' matches 'A::@1::@2::B' but 'struct A::B' does not match. 
- static const std::regex r("[[:alnum:]\\x80-\\xFF_@:]*@[[:digit:]]+[[:alnum:]\\x80-\\xFF_@:]*", std::regex::optimize); std::string stype = ltype.str(); - std::smatch match; - if (std::regex_search(stype,match,r)) // found anonymous scope in type + static const reg::Ex r(R"([\w@:]*@\d+[\w@:]*)"); + reg::Match match; + if (reg::search(stype,match,r)) // found anonymous scope in type { QCString annName = match.str(); @@ -2103,11 +2103,11 @@ void MemberDefImpl::writeDeclaration(OutputList &ol, } // strip 'friend' keyword from ltype ltype.stripPrefix("friend "); - static const std::regex r("@[[:digit:]]+", std::regex::optimize); - std::smatch match; + static const reg::Ex r(R"(@\d+)"); + reg::Match match; std::string stype = ltype.str(); bool endAnonScopeNeeded=FALSE; - if (std::regex_search(stype,match,r)) // member has an anonymous type + if (reg::search(stype,match,r)) // member has an anonymous type { int i = (int)match.position(); int l = (int)match.length(); @@ -2905,9 +2905,6 @@ void MemberDefImpl::_writeTypeConstraints(OutputList &ol) const } } -// match from the start of the scope until the last marker -static const std::regex reAnonymous("[[:alnum:]\\x80-\\xFF_:]*@[[:digit:]]+([^@]*@[[:digit:]]+)?", std::regex::optimize); - void MemberDefImpl::_writeEnumValues(OutputList &ol,const Definition *container, const QCString &cfname,const QCString &ciname, const QCString &cname) const @@ -2978,6 +2975,9 @@ void MemberDefImpl::_writeEnumValues(OutputList &ol,const Definition *container, } } +// match from the start of the scope until the last marker +static const reg::Ex reAnonymous(R"([\w:@]*@\d+)"); + QCString MemberDefImpl::displayDefinition() const { QCString ldef = definition(); @@ -3010,8 +3010,8 @@ QCString MemberDefImpl::displayDefinition() const } std::string sdef = ldef.str(); - std::smatch match; - if (std::regex_search(sdef,match,reAnonymous)) + reg::Match match; + if (reg::search(sdef,match,reAnonymous)) { ldef = match.prefix().str() + " { ... 
} " + match.suffix().str(); } @@ -3203,10 +3203,10 @@ void MemberDefImpl::writeDocumentation(const MemberList *ml, QStrList sl; getLabels(sl,scopedContainer); - static const std::regex r("@[0-9]+", std::regex::optimize); - std::smatch match; + static const reg::Ex r(R"(@\d+)"); + reg::Match match; std::string sdef = ldef.str(); - if ((isVariable() || isTypedef()) && std::regex_search(sdef,match,r)) + if ((isVariable() || isTypedef()) && reg::search(sdef,match,r)) { // find enum type and insert it in the definition bool found=false; @@ -3234,7 +3234,7 @@ void MemberDefImpl::writeDocumentation(const MemberList *ml, // search for the last anonymous compound name in the definition ol.startMemberDocName(isObjCMethod()); - if (std::regex_search(sdef,match,reAnonymous)) + if (reg::search(sdef,match,reAnonymous)) { std::string prefix = match.prefix().str(); std::string suffix = match.suffix().str(); @@ -3612,61 +3612,18 @@ static QCString simplifyTypeForTable(const QCString &s) { QCString ts=removeAnonymousScopes(s); if (ts.right(2)=="::") ts = ts.left(ts.length()-2); - static const std::regex re("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*(<[^>]*>)?::([[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*)", std::regex::optimize); - std::smatch match; + static const reg::Ex re1(R"(\a\w*::(\a\w*))"); // non-template version + static const reg::Ex re2(R"(\a\w*<[^>]*>::(\a\w*))"); // template version + reg::Match match; std::string t = ts.str(); - if (std::regex_search(t,match,re)) + if (reg::search(t,match,re1) || reg::search(t,match,re2)) { - ts = match[2].str(); // take the identifier after the last :: (second capture group) + ts = match[1].str(); // take the identifier after the last :: } //printf("simplifyTypeForTable(%s)->%s\n",s.data(),ts.data()); return ts; } -#if 0 -/** Returns the type definition corresponding to a member's return type. - * @param[in] scope The scope in which to search for the class definition. 
- * @param[in] type The string representing the member's return type. - * @param[in] lang The programming language in which the class is defined. - * @param[out] start The string position where the class definition name was found. - * @param[out] length The length of the class definition's name. - */ -static Definition *getClassFromType(Definition *scope,const QCString &type,SrcLangExt lang,int &start,int &length) -{ - int pos=0; - int i; - QCString name; - QCString templSpec; - while ((i=extractClassNameFromType(type,pos,name,templSpec,lang))!=-1) - { - ClassDef *cd=0; - MemberDef *md=0; - int l = name.length()+templSpec.length(); - if (!templSpec.isEmpty()) - { - cd = getResolvedClass(scope,0,name+templSpec,&md); - } - cd = getResolvedClass(scope,0,name); - if (cd) - { - start=i; - length=l; - printf("getClassFromType: type=%s name=%s start=%d length=%d\n",type.data(),name.data(),start,length); - return cd; - } - else if (md) - { - start=i; - length=l; - printf("getClassFromType: type=%s name=%s start=%d length=%d\n",type.data(),name.data(),start,length); - return md; - } - pos=i+l; - } - return 0; -} -#endif - QCString MemberDefImpl::fieldType() const { QCString type = m_impl->accessorType; diff --git a/src/pre.l b/src/pre.l index 296295f..0b12ea9 100644 --- a/src/pre.l +++ b/src/pre.l @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -60,6 +59,7 @@ #include "condparser.h" #include "config.h" #include "filedef.h" +#include "regex.h" #define YY_NO_UNISTD_H 1 @@ -1962,17 +1962,17 @@ static void processConcatOperators(QCString &expr) if (expr.isEmpty()) return; //printf("processConcatOperators: in='%s'\n",expr.data()); std::string e = expr.str(); - static const std::regex r("[[:space:]]*##[[:space:]]*", std::regex::optimize); - std::sregex_iterator end; + static const reg::Ex r(R"(\s*##\s*)"); + reg::Iterator end; size_t i=0; for (;;) { - std::sregex_iterator it(e.begin()+i,e.end(),r); + reg::Iterator it(e,r,i); if (it!=end) { const auto 
&match = *it; - size_t n = i+match.position(); + size_t n = match.position(); size_t l = match.length(); //printf("Match: '%s'\n",expr.data()+i); if (n+l+1 argMap; @@ -3221,12 +3221,12 @@ static void initPredefined(yyscan_t yyscanner,const char *fileName) { size_t i=i_obrace+1; //printf("predefined function macro '%s'\n",ds.c_str()); - std::sregex_iterator it(ds.begin()+i,ds.end(),reId); + reg::Iterator it(ds,reId,i); // gather the formal arguments in a dictionary while (i0) // see bug375037 { @@ -3244,7 +3244,7 @@ static void initPredefined(yyscan_t yyscanner,const char *fileName) // strip definition part std::string definition; std::string in=ds.substr(i_equals+1); - std::sregex_iterator re_it(in.begin(),in.end(),reId); + reg::Iterator re_it(in,reId); size_t i=0; // substitute all occurrences of formal arguments by their // corresponding markers diff --git a/src/regex.cpp b/src/regex.cpp new file mode 100644 index 0000000..e3aa057 --- /dev/null +++ b/src/regex.cpp @@ -0,0 +1,736 @@ +/****************************************************************************** + * + * Copyright (C) 1997-2021 by Dimitri van Heesch. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation under the terms of the GNU General Public License is hereby + * granted. No representations are made about the suitability of this software + * for any purpose. It is provided "as is" without express or implied warranty. + * See the GNU General Public License for more details. + * + * Documents produced by Doxygen are derivative works derived from the + * input used in their production; they are not affected by this license. + * + */ + +#include "regex.h" +#include +#include +#include +#include +#include + +#define ENABLE_DEBUG 0 +#if ENABLE_DEBUG +#define DBG(fmt,...) do { fprintf(stderr,fmt,__VA_ARGS__); } while(0) +#else +#define DBG(fmt,...) 
do {} while(0) +#endif + +namespace reg +{ + +/** Class representing a token in the compiled regular expression token stream. + * A token has a kind and an optional value whose meaning depends on the kind. + * It is also possible to store a (from,to) character range in a token. + */ +class PToken +{ + public: + /** The kind of token. + * + * Ranges per bit mask: + * - `0x00FF` from part of a range, except for `0x0000` which is the End marker + * - `0x1FFF` built-in ranges + * - `0x2FFF` user defined ranges + * - `0x4FFF` special operations + * - `0x8000` literal character + */ + enum class Kind : uint16_t + { + End = 0x0000, + WhiteSpace = 0x1001, // \s range [ \t\r\n] + Digit = 0x1002, // \d range [0-9] + Alpha = 0x1003, // \a range [a-z_A-Z\x80-\xFF] + AlphaNum = 0x1004, // \w range [a-z_A-Z0-9\x80-\xFF] + CharClass = 0x2001, // [] + NegCharClass = 0x2002, // [^] + BeginOfLine = 0x4001, // ^ + EndOfLine = 0x4002, // $ + BeginOfWord = 0x4003, // \< + EndOfWord = 0x4004, // \> + BeginCapture = 0x4005, // ( + EndCapture = 0x4006, // ) + Any = 0x4007, // . + Star = 0x4008, // * + Optional = 0x4009, // ? + Character = 0x8000 // c + }; + + /** returns a string representation of the token's kind (useful for debugging). 
*/ + const char *kindStr() const + { + if ((m_rep>>16)>=0x1000 || m_rep==0) + { + switch(static_cast((m_rep>>16))) + { + case Kind::End: return "End"; + case Kind::Alpha: return "Alpha"; + case Kind::AlphaNum: return "AlphaNum"; + case Kind::WhiteSpace: return "WhiteSpace"; + case Kind::Digit: return "Digit"; + case Kind::CharClass: return "CharClass"; + case Kind::NegCharClass: return "NegCharClass"; + case Kind::Character: return "Character"; + case Kind::BeginOfLine: return "BeginOfLine"; + case Kind::EndOfLine: return "EndOfLine"; + case Kind::BeginOfWord: return "BeginOfWord"; + case Kind::EndOfWord: return "EndOfWord"; + case Kind::BeginCapture: return "BeginCapture"; + case Kind::EndCapture: return "EndCapture"; + case Kind::Any: return "Any"; + case Kind::Star: return "Star"; + case Kind::Optional: return "Optional"; + } + } + else + { + return "Range"; + } + } + + /** Creates a token of kind 'End' */ + PToken() : m_rep(0) {} + + /** Creates a token of the given kind \a k */ + explicit PToken(Kind k) : m_rep(static_cast(k)<<16) {} + + /** Create a token for an ASCII character */ + PToken(char c) : m_rep((static_cast(Kind::Character)<<16) | + static_cast(c)) {} + + /** Create a token for a byte of an UTF-8 character */ + PToken(uint16_t v) : m_rep((static_cast(Kind::Character)<<16) | + static_cast(v)) {} + + /** Create a token representing a range from one character \a from to another character \a to */ + PToken(uint16_t from,uint16_t to) : m_rep(static_cast(from)<<16 | to) {} + + /** Sets the value for a token */ + void setValue(uint16_t value) { m_rep = (m_rep & 0xFFFF0000) | value; } + + /** Returns the kind of the token */ + Kind kind() const { return static_cast(m_rep>>16); } + + /** Returns the 'from' part of the character range. Only valid if this token represents a range */ + uint16_t from() const { return m_rep>>16; } + + /** Returns the 'to' part of the character range. 
Only valid if this token represents a range */ + uint16_t to() const { return m_rep & 0xFFFF; } + + /** Returns the value for this token */ + uint16_t value() const { return m_rep & 0xFFFF; } + + /** Returns the value for this token as an ASCII character */ + char asciiValue() const { return static_cast(m_rep); } + + /** Returns true iff this token represents a range of characters */ + bool isRange() const { return m_rep!=0 && from()<=to(); } + + /** Returns true iff this token is a positive or negative character class */ + bool isCharClass() const { return kind()==Kind::CharClass || kind()==Kind::NegCharClass; } + + private: + uint32_t m_rep; +}; + +/** Private members of a regular expression */ +class Ex::Private +{ + public: + /** Creates the private part */ + Private(const std::string &pat) : pattern(pat) + { + data.reserve(100); + } + void compile(); +#if ENABLE_DEBUG + void dump(); +#endif + bool matchAt(size_t tokenPos,const std::string &str,Match &match,size_t pos,int level) const; + + /** Flag indicating that an error was detected while compiling the expression */ + bool error = false; + + /** The token stream representing the compiled regular expression. */ + std::vector data; // compiled pattern + + /** The pattern string as passed by the user */ + std::string pattern; +}; + +/** Compiles a regular expression passed as a string into a stream of tokens that can be used for + * efficient searching. 
+ */ +void Ex::Private::compile() +{ + error = false; + data.clear(); + if (pattern.empty()) return; + const char *start = pattern.c_str(); + const char *ps = start; + char c; + + int prevTokenPos=-1; + int tokenPos=0; + + auto addToken = [&](PToken tok) + { + tokenPos++; + data.emplace_back(tok); + }; + + auto getNextCharacter = [&]() -> PToken + { + char cs=*ps; + PToken result = PToken(cs); + if (cs=='\\') // escaped character + { + ps++; + cs=*ps; + switch (cs) + { + case 'n': result = PToken('\n'); break; + case 'r': result = PToken('\r'); break; + case 't': result = PToken('\t'); break; + case 's': result = PToken(PToken::Kind::WhiteSpace); break; + case 'a': result = PToken(PToken::Kind::Alpha); break; + case 'w': result = PToken(PToken::Kind::AlphaNum); break; + case 'd': result = PToken(PToken::Kind::Digit); break; + case '<': result = PToken(PToken::Kind::BeginOfWord); break; + case '>': result = PToken(PToken::Kind::EndOfWord); break; + case 'x': + case 'X': + { + uint16_t v=0; + for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits + { + int d = (cs>='a' && cs<='f') ? cs-'a'+10 : + (cs>='A' && cs<='F') ? cs-'A'+10 : + (cs>='0' && cs<='9') ? cs-'0' : + -1; + if (d>=0) { v<<=4; v|=d; ps++; } else break; + } + result = PToken(v); + } + break; + case '\0': ps--; break; // backslash at the end of the pattern + default: + result = PToken(cs); + break; + } + } + return result; + }; + + while ((c=*ps)) + { + switch (c) + { + case '^': // beginning of line (if first character of the pattern) + prevTokenPos = tokenPos; + addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) : + PToken(c)); + break; + case '$': // end of the line (if last character of the pattern) + prevTokenPos = tokenPos; + addToken(*(ps+1)=='\0' ? 
PToken(PToken::Kind::EndOfLine) : + PToken(c)); + break; + case '.': // any character + prevTokenPos = tokenPos; + addToken(PToken(PToken::Kind::Any)); + break; + case '(': // begin of capture group + prevTokenPos = tokenPos; + addToken(PToken(PToken::Kind::BeginCapture)); + break; + case ')': // end of capture group + prevTokenPos = tokenPos; + addToken(PToken(PToken::Kind::EndCapture)); + break; + case '[': // character class + { + prevTokenPos = tokenPos; + ps++; + if (*ps==0) { error=true; return; } + bool esc = *ps=='\\'; + PToken tok = getNextCharacter(); + ps++; + if (!esc && tok.kind()==PToken::Kind::Character && + tok.asciiValue()=='^') // negated character class + { + addToken(PToken(PToken::Kind::NegCharClass)); + if (*ps==0) { error=true; return; } + tok = getNextCharacter(); + ps++; + } + else + { + addToken(PToken(PToken::Kind::CharClass)); + } + uint16_t numTokens=0; + while ((c=*ps)) + { + if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range + { + getNextCharacter(); + ps++; + PToken endTok = getNextCharacter(); + ps++; + if (tok.value()>endTok.value()) + { + addToken(PToken(endTok.value(),tok.value())); // swap start and end + } + else + { + addToken(PToken(tok.value(),endTok.value())); + } + numTokens++; + } + else // single char, from==to + { + if (tok.kind()==PToken::Kind::Character) + { + addToken(PToken(tok.value(),tok.value())); + } + else // special token, add as-is since from>to + { + addToken(tok); + } + numTokens++; + } + if (*ps==0) { error=true; return; } // expected at least a ] + esc = *ps=='\\'; + tok = getNextCharacter(); + if (!esc && tok.kind()==PToken::Kind::Character && + tok.value()==static_cast(']')) + { + break; // end of character class + } + if (*ps==0) { error=true; return; } // no ] found + ps++; + } + // set the value of either NegCharClass or CharClass + data[prevTokenPos].setValue(numTokens); + } + break; + case '*': // 0 or more + case '+': // 1 or more + case '?': // optional: 0 or 1 + { + if (prevTokenPos==-1) + { + 
error=true; + return; + } + switch (data[prevTokenPos].kind()) + { + case PToken::Kind::BeginOfLine: // $* or $+ or $? + case PToken::Kind::BeginOfWord: // \<* or \<+ or \* or \>+ or \>? + case PToken::Kind::Star: // ** or *+ or *? + case PToken::Kind::Optional: // ?* or ?+ or ?? + error=true; + return; + default: // ok + break; + } + int ddiff = static_cast(tokenPos-prevTokenPos); + if (*ps=='+') // convert + -> * + { + // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*] + // ddiff=n ^prevTokenPos + data.resize(data.size()+ddiff); + std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos); + prevTokenPos+=ddiff; + tokenPos+=ddiff; + } + data.insert(data.begin()+prevTokenPos, + c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star)); + tokenPos++; + addToken(PToken(PToken::Kind::End)); + // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND] + // ^prevTokenPos + // same for 'T?'. + } + break; + default: + prevTokenPos = tokenPos; + addToken(getNextCharacter()); + break; + } + ps++; + } + //addToken(PToken(PToken::Kind::End)); +} + +#if ENABLE_DEBUG +/** Dump the compiled token stream for this regular expression. For debugging purposes. */ +void Ex::Private::dump() +{ + size_t l = data.size(); + size_t i =0; + DBG("==== compiled token stream for pattern '%s' ===\n",pattern.c_str()); + while (i0 && i bool + { + PToken tok = data[tp]; + bool negate = tok.kind()==PToken::Kind::NegCharClass; + uint16_t numFields = tok.value(); + bool found = false; + for (uint16_t i=0;i(c); + if (tok.from()<=v && v<=tok.to()) + { + found=true; + break; + } + } + } + DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found); + return negate ? 
!found : found; + }; + size_t index = pos; + enum SequenceType { Star, Optional }; + auto processSequence = [this,&tokenPos,&index,&str,&matchCharClass, + &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool + { + size_t startIndex = index; + PToken tok = data[++tokenPos]; + if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's + { + char c_tok = tok.asciiValue(); + while (index<=str.length() && str[index]==c_tok) { index++; if (type==Optional) break; } + tokenPos++; + } + else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters + { + while (index<=str.length() && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; } + tokenPos+=tok.value()+1; // skip over character ranges + end token + } + else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters + { + while (index<=str.length() && isStartIdChar(str[index])) { index++; if (type==Optional) break; } + tokenPos++; + } + else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters + { + while (index<=str.length() && isIdChar(str[index])) { index++; if (type==Optional) break; } + tokenPos++; + } + else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces + { + while (index<=str.length() && std::isspace(str[index])) { index++; if (type==Optional) break; } + tokenPos++; + } + else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits + { + while (index<=str.length() && std::isdigit(str[index])) { index++; if (type==Optional) break; } + tokenPos++; + } + else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all + { + if (type==Optional) index++; else index = str.length(); + tokenPos++; + } + tokenPos++; // skip over end marker + while ((int)index>=(int)startIndex) + { + // pattern 'x*xy' should match 'xy' and 'xxxxy' + bool found = matchAt(tokenPos,str,match,index,level+1); + if (found) + { + match.setMatch(pos,index-pos+match.length()); + return true; + } + index--; + } + return false; + }; + + while 
(tokenPos=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char + index++,tokenPos++; + } + else if (tok.isCharClass()) + { + if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false; + index++,tokenPos+=tok.value()+1; // skip over character ranges + end token + } + else + { + switch (tok.kind()) + { + case PToken::Kind::Alpha: + if (index>=str.length() || !isStartIdChar(str[index])) return false; + index++; + break; + case PToken::Kind::AlphaNum: + if (index>=str.length() || !isIdChar(str[index])) return false; + index++; + break; + case PToken::Kind::WhiteSpace: + if (index>=str.length() || !std::isspace(str[index])) return false; + index++; + break; + case PToken::Kind::Digit: + if (index>=str.length() || !std::isdigit(str[index])) return false; + index++; + break; + case PToken::Kind::BeginOfLine: + if (index!=pos) return false; + break; + case PToken::Kind::EndOfLine: + if (index0?str[index]-1:0, + index>0?isIdChar(str[index-1]):-1); + if (index>=str.length() || + !isIdChar(str[index]) || + (index>0 && isIdChar(str[index-1]))) return false; + break; + case PToken::Kind::EndOfWord: + DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n", + index,pos,str[index],isIdChar(str[index]), + index==0 ? 0 : str[index-1], + index==0 ? -1 : isIdChar(str[index-1])); + if (index=str.length()) return false; + index++; + break; + case PToken::Kind::Star: + return processSequence(Star); + case PToken::Kind::Optional: + return processSequence(Optional); + default: + return false; + } + tokenPos++; + } + } + match.setMatch(pos,index-pos); + return true; +} + +static std::string wildcard2regex(const std::string &pattern) +{ + std::string result="^"; // match start of input + char c; + const char *p = pattern.c_str(); + while ((c=*p++)) + { + switch(c) + { + case '*': + result+=".*"; + break; // '*' => '.*' + case '?': + result+='.'; + break; // '?' => '.' 
+ case '.': + case '+': + case '\\': + case '$': + case '^': + case '(': + case ')': + result+='\\'; result+=c; // escape + break; + case '[': + if (*p=='^') // don't escape ^ after [ + { + result+="[^"; + p++; + } + else + { + result+=c; + } + break; + default: // just copy + result+=c; + break; + } + } + result+='$'; // match end of input + return result; +} + + +Ex::Ex(const std::string &pattern, Mode mode) + : p(std::make_unique(mode==Mode::RegEx ? pattern : wildcard2regex(pattern))) +{ + p->compile(); +#if ENABLE_DEBUG + p->dump(); + assert(!p->error); +#endif +} + +Ex::~Ex() +{ +} + +bool Ex::match(const std::string &str,Match &match,size_t pos) const +{ + bool found=false; + if (p->data.size()==0 || p->error) return found; + match.init(&str); + + PToken tok = p->data[0]; + if (tok.kind()==PToken::Kind::BeginOfLine) // only test match at the given position + { + found = p->matchAt(0,str,match,pos,0); + } + else + { + if (tok.kind()==PToken::Kind::Character) // search for the start character + { + size_t index = str.find(tok.asciiValue(),pos); + if (index==std::string::npos) + { + DBG("Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",str.c_str(),pos,tok.asciiValue()); + return false; + } + DBG("pos=%zu str='%s' char='%c' index=%zu\n",index,str.c_str(),tok.asciiValue(),index); + pos=index; + } + while (posmatchAt(0,str,match,pos,0); + if (found) break; + pos++; + } + } + DBG("Ex::match(str='%s',pos=%zu)=%d\n",str.c_str(),pos,found); + return found; +} + +bool Ex::isValid() const +{ + return !p->pattern.empty() && !p->error; +} + +//---------------------------------------------------------------------------------------- + +bool search(const std::string &str,Match &match,const Ex &re,size_t pos) +{ + return re.match(str,match,pos); +} + +bool search(const std::string &str,const Ex &re,size_t pos) +{ + Match match; + return re.match(str,match,pos); +} + +bool match(const std::string &str,Match &match,const Ex &re) +{ + return re.match(str,match,0) && 
match.position()==0 && match.length()==str.length(); +} + +bool match(const std::string &str,const Ex &re) +{ + Match match; + return re.match(str,match,0) && match.position()==0 && match.length()==str.length(); +} + +std::string replace(const std::string &str,const Ex &re,const std::string &replacement) +{ + std::string result; + Match match; + size_t p=0; + while (re.match(str,match,p)) + { + size_t i=match.position(); + size_t l=match.length(); + if (i>p) result+=str.substr(p,i-p); + result+=replacement; + p=i+l; + } + if (p +#include +#include +#include + +/** Namespace for the regular expression functions */ +namespace reg +{ + +class Match; + +/** Class representing a regular expression. + * + * It has a similar API as `std::regex`, + * but is much faster (and also somewhat more limited). + */ +class Ex +{ + public: + /** Matching algorithm */ + enum class Mode + { + RegEx, /**< full regular expression. */ + Wildcard /**< simple globbing pattern. */ + }; + /** Creates a regular expression object given the pattern as a string. + * Two modes of matching are supported: RegEx and Wildcard + * + * The following special characters are supported in Mode::RegEx mode. + * - `c` matches character `c` + * - `.` matches any character + * - `^` matches the start of the input + * - `$` matches the end of the input + * - `\<` matches the start of a word + * - `\>` matches the end of a word + * - `[]` matches a set of characters + * - `x*` matches a sequence of zero or more `x`'s + * - `x+` matches a sequence of one or more `x`'s + * - `x?` matches an optional `x` + * - `(` matches the start of a capture range + * - `)` matches the ends a capture range + * - `\c` to escape a special character, such as `+`, `[`, `*`, `(`, etc. 
+ * - `\t` matches a tab character + * - `\n` matches a newline character + * - `\r` matches a return character + * - `\s` matches any whitespace as defined by `std::isspace()` + * - `\d` matches any digit as defined by `std::isdigit()` + * - `\a` matches any alphabetical character, same as `[a-z_A-Z\x80-\xFF]` + * - `\w` matches any alphanumerical character, same as `[a-z_A-Z0-9\x80-\xFF]` + * - `\xHH` matches a hexadecimal character, e.g. `\xA0` matches character code 160. + * + * A character range can be used to match a character that falls inside a range + * (or set of ranges). + * Within the opening `[` and closing `]` brackets of a character range the following + * is supported: + * - `^` if at the start of the range, a character matches if it is \e not in the range, + * e.g. `[^\d]` matches any character not a digit + * - `-` when placed between 2 characters it defines a range from the first character to the second. + * Any character that falls in the range will match, e.g. `[0-9]` matches the digits from 0 to 9. + * - `\s`, `\d`, `\a`, and `\w` as explained above. + * + * @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special + * meaning in a character range. `^` only has a special meaning as the first character. + * + * @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on + * capture ranges, e.g. `(abd)?` is not valid. If multiple capture ranges are + * specified then some character has to be in between them, + * e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`. + * + * In Wildcard mode `*` is used to match any sequence of zero or more characters. + * The character `?` can be used to match any single character. Character ranges are + * also supported, but other characters like `$` and `+` are just treated as + * literal characters. + * + */ + Ex(const std::string &pattern, Mode mode=Mode::RegEx); + + /** Destroys the regular expression object. Frees resources. 
*/ + ~Ex(); + + /** Check if a given string matches this regular expression. + * @param str The input string to match against. + * @param match The match object to hold the matching results. + * @param pos The position in the string at which to start the match. + * @returns true iff a match is found. Details are stored in the match object. + */ + bool match(const std::string &str,Match &match,size_t pos=0) const; + bool isValid() const; + private: + Ex(const Ex &) = delete; + Ex &operator=(const Ex &e) = delete; + + class Private; + std::unique_ptr p; +}; + +/** Object representing the match results of a capture range. */ +class SubMatch +{ + public: + /** Creates a match for a single capture range given a non-owning pointer to the string. */ + SubMatch(const std::string *str) : m_str(str) {} + + /** Returns the position in the string at which the match starts. */ + size_t position() const { return m_pos; } + + /** Returns the length of the matching part. */ + size_t length() const { return m_len; } + + /** Returns the matching part as a string */ + std::string str() const { return m_str ? m_str->substr(m_pos,m_len) : std::string(); } + + private: + friend class Match; + void setStart(size_t pos) { m_pos=pos; } + void setEnd(size_t pos) { m_len=pos-m_pos; } + void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; } + size_t m_pos = std::string::npos; + size_t m_len = std::string::npos; + const std::string *m_str = nullptr; +}; + +/** Object representing the matching results. It consists of an array of + * SubMatch objects. The first entry of the array represents the whole match, any + * next elements represent each of the capture ranges. + * + * For example string `@42` and expression `@(\\d+)` will have two + * Submatches, match[0] will point to the input string as a whole, and + * match[1] will point to the number 42 only. 
+ * + */ +class Match +{ + public: + /** Creates an empty match object */ + Match() {} + + /** Returns the position of the match or std::string::npos if no position is set. */ + size_t position() const { return m_subMatches[0].position(); } + + /** Returns the length of the match or std::string::npos if no length is set. */ + size_t length() const { return m_subMatches[0].length(); } + + /** Return a string representing the matching part. */ + std::string str() const { return m_subMatches[0].str(); } + + /** Return the part of the string before the match, i.e. the range [0,position()), same as std::match_results::prefix() */ + SubMatch prefix() const { SubMatch m(m_str); m.setMatch(0,position()); return m; } + + /** Return the part of the string after the match, i.e. from position()+length() up to the end of the string, same as std::match_results::suffix() */ + SubMatch suffix() const + { + SubMatch m(m_str); + if (m_str) + { + size_t e = position()+length(); + m.setMatch(e,m_str->length()-e); + } + return m; + } + + /** Returns the number of sub matches available in this match. */ + size_t size() const { return m_subMatches.size(); } + + /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object + * representing the whole match. 
+ */ + const SubMatch &operator[](size_t index) const { return m_subMatches[index]; } + + private: + friend class Ex; + void init(const std::string *str) + { + m_subMatches.clear(); + m_subMatches.emplace_back(str); + m_str = str; + } + void startCapture(size_t index) + { + if (!m_insideCapture) // when backtracking we can re-entry the capture multiple times + // only update the index, example `\s*(x)` + { + m_captureIndex = m_subMatches.size(); + m_subMatches.emplace_back(m_str); + m_insideCapture = true; + } + m_subMatches.back().setStart(index); + } + void endCapture(size_t index) + { + if (index>m_subMatches.back().position()) + { + m_captureIndex=0; + m_subMatches.back().setEnd(index); + m_insideCapture = false; + } + } + void setMatch(size_t pos,size_t len) + { + m_subMatches[m_captureIndex].setMatch(pos,len); + } + + std::vector m_subMatches; + size_t m_captureIndex=0; + const std::string *m_str = nullptr; + bool m_insideCapture=false; +}; + +/** Iterator class to iterator through matches. + */ +class Iterator +{ + public: + using value_type = Match; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::forward_iterator_tag; + + /** Creates an end-of-sequence iterator */ + Iterator() {} + + /** Creates an iterator for input string \a str, using regular expression \a re to search. + * @note the string and regular expression objects should remain valid while iterating. 
+ */ + Iterator(const std::string &str, const Ex &re, size_t pos=0) + : m_str(&str), m_re(&re), m_pos(pos) { findNext(); } + + // Iterator holds pointers, so prevent temporaries to be passed as string or + // regular expression + Iterator(std::string &&str, const Ex &re) = delete; + Iterator(const std::string &str, Ex &&re) = delete; + Iterator(std::string &&str, Ex &&re) = delete; + + /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */ + bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; } + + /** Returns true if the iterators are not pointing to the same match */ + bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; } + + /** Returns a reference to the current match */ + const value_type &operator*() const { return m_match; } + + /** Returns a pointer to the current match */ + const value_type *operator->() const { return &m_match; } + + /** Advances the iterator to the next match. */ + Iterator &operator++() { findNext(); return *this; } + + private: + void findNext() + { + if (!m_re || !m_str) { m_pos=std::string::npos; return; } // end marker + if (m_re->match(*m_str,m_match,m_pos)) + { + m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match + } + else // no more matches, make the iterator point to the 'end-of-sequence' + { + m_pos=std::string::npos; + } + } + const std::string *m_str = nullptr; + const Ex *m_re = nullptr; + size_t m_pos = std::string::npos; + Match m_match; +}; + +/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re. + * Returns true iff a match was found. + * Details of what part of the string has matched is returned via the \a match object. + * + * An example to show how to match all identifiers in a string. 
+ * @code + * static reg::Ex re(R"(\a\w*)"); + * std::string str = u8"void(Func是<B_C::Códe42>(42));"; + * while (reg::search(str,match,re,pos)) + * { + * std::cout << match.str() << std::endl; + * pos=match.position()+match.length(); + * } + * @endcode + * produces: + * @code + * void + * Func是 + * B_C + * Códe42 + * @endcode + * + * @see Ex::Ex() for details on the regular expression patterns. + */ +bool search(const std::string &str,Match &match,const Ex &re,size_t pos=0); + +/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re. + * Returns true iff a match was found. + */ +bool search(const std::string &str,const Ex &re,size_t pos=0); + +/** Matches a given string \a str for a match against regular expression \a re. + * Returns true iff a match was found for the whole string. + * Any capture groups are returned via the \a match object. + */ +bool match(const std::string &str,Match &match,const Ex &re); + +/** Matches a given string \a str for a match against regular expression \a re. + * Returns true iff a match was found for the whole string. + */ +bool match(const std::string &str,const Ex &re); + +/** Searches in a given input string \a str for parts that match regular expression \a re and + * replaces those parts by string \a replacement. + */ +std::string replace(const std::string &str,const Ex &re,const std::string &replacement); + +} // namespace + +#endif diff --git a/src/rtfstyle.cpp b/src/rtfstyle.cpp index cbe2468..d279eab 100644 --- a/src/rtfstyle.cpp +++ b/src/rtfstyle.cpp @@ -13,12 +13,12 @@ * */ -#include #include #include #include "rtfstyle.h" #include "message.h" +#include "regex.h" RTFListItemInfo rtf_listItemInfo[rtf_maxIndentLevels]; @@ -234,12 +234,12 @@ Rtf_Style_Default rtf_Style_Default[] = } }; -static const std::regex s_clause("\\\\s([[:digit:]]+)[[:space:]]*", std::regex::optimize); +static const reg::Ex s_clause(R"(\\s(\d+)\s*)"); // match, e.g. 
'\s30' and capture '30' StyleData::StyleData(const std::string &reference, const std::string &definition) { - std::smatch match; - if (std::regex_search(reference,match,s_clause)) + reg::Match match; + if (reg::search(reference,match,s_clause)) { m_index = static_cast(std::stoul(match[1].str())); } @@ -253,19 +253,19 @@ StyleData::StyleData(const std::string &reference, const std::string &definition bool StyleData::setStyle(const std::string &command, const std::string &styleName) { - std::smatch match; - if (!std::regex_search(command,match,s_clause)) + reg::Match match; + if (!reg::search(command,match,s_clause)) { err("Style sheet '%s' contains no '\\s' clause.\n{%s}", styleName.c_str(), command.c_str()); return false; } m_index = static_cast(std::stoul(match[1].str())); - static const std::regex definition_splitter("^(.*)(\\\\sbasedon[[:digit:]]+.*)$", std::regex::optimize); - if (std::regex_match(command,match,definition_splitter)) + size_t index = command.find("\\sbasedon"); + if (index!=std::string::npos) { - m_reference = match[1].str(); - m_definition = match[2].str(); + m_reference = command.substr(0,index); + m_definition = command.substr(index); } return true; @@ -287,12 +287,12 @@ void loadStylesheet(const char *name, StyleDataMap& map) for (std::string line ; getline(file,line) ; ) // for each line { if (line.empty() || line[0]=='#') continue; // skip blanks & comments - static const std::regex assignment_splitter("[[:space:]]*=[[:space:]]*", std::regex::optimize); - std::smatch match; - if (std::regex_search(line,match,assignment_splitter)) + static const reg::Ex assignment_splitter(R"(\s*=\s*)"); + reg::Match match; + if (reg::search(line,match,assignment_splitter)) { - std::string key = match.prefix(); - std::string value = match.suffix(); + std::string key = match.prefix().str(); + std::string value = match.suffix().str(); auto it = map.find(key); if (it!=map.end()) { @@ -329,12 +329,12 @@ void loadExtensions(const char *name) for (std::string 
line ; getline(file,line) ; ) // for each line { if (line.empty() || line[0]=='#') continue; // skip blanks & comments - std::smatch match; - static const std::regex assignment_splitter("[[:space:]]*=[[:space:]]*", std::regex::optimize); - if (std::regex_search(line,match,assignment_splitter)) + static const reg::Ex assignment_splitter(R"(\s*=\s*)"); + reg::Match match; + if (reg::search(line,match,assignment_splitter)) { - std::string key = match.prefix(); - std::string value = match.suffix(); + std::string key = match.prefix().str(); + std::string value = match.suffix().str(); auto it = g_styleMap.find(key); if (it!=g_styleMap.end()) { diff --git a/src/scanner.l b/src/scanner.l index 4a1e720..0193689 100644 --- a/src/scanner.l +++ b/src/scanner.l @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -51,6 +50,7 @@ #include "clangparser.h" #include "markdown.h" +#include "regex.h" #define YY_NO_INPUT 1 #define YY_NO_UNISTD_H 1 @@ -3739,9 +3739,9 @@ OPERATOR "operator"{B}*({ARITHOP}|{ASSIGNOP}|{LOGICOP}|{BITOP}) } else { - static const std::regex re("@[0-9]+$", std::regex::optimize); + static const reg::Ex re(R"(@\d+$)"); if (!yyextra->isTypedef && yyextra->memspecEntry && - !std::regex_search(yyextra->memspecEntry->name.str(),re)) // not typedef or anonymous type (see bug691071) + !reg::search(yyextra->memspecEntry->name.str(),re)) // not typedef or anonymous type (see bug691071) { // enabled the next two lines for bug 623424 yyextra->current->doc.resize(0); @@ -4831,11 +4831,11 @@ OPERATOR "operator"{B}*({ARITHOP}|{ASSIGNOP}|{LOGICOP}|{BITOP}) yyextra->current->fileName = yyextra->yyFileName; yyextra->current->startLine = yyextra->yyBegLineNr; yyextra->current->startColumn = yyextra->yyBegColNr; - static const std::regex re("\\([^)]*[*&][^]*\\)", std::regex::optimize); - std::smatch match; + static const reg::Ex re(R"(\([^)]*[*&][^)]*\))"); + reg::Match match; std::string type = yyextra->current->type.str(); int ti=-1; - if 
(std::regex_search(type,match,re)) + if (reg::search(type,match,re)) { ti = (int)match.position(); } @@ -6895,16 +6895,26 @@ static void splitKnRArg(yyscan_t yyscanner,QCString &oldStyleArgPtr,QCString &ol int si = yyextra->current->args.length(); if (yyextra->oldStyleArgType.isEmpty()) // new argument { - static const std::regex re("(\\([^)]*\\))[[:space:]]*(\\([^)]*\\))?", std::regex::optimize); std::string args = yyextra->current->args.str(); - std::smatch matches; - std::regex_search(args,matches,re); + static const reg::Ex re(R"(\([^)]*\).*)"); // find first (...) + int bi1=-1; + int bi2=-1; + reg::Match match; + if (reg::search(args,match,re)) + { + bi1=(int)match.position(); + size_t secondMatchStart = match.position()+match.length(); // search again after first match + if (reg::search(args,match,re,secondMatchStart)) + { + bi2=(int)match.position(); + } + } char c; - if (matches.length()==3 && !matches[2].str().empty()) // found something like "int (*func)(int arg)" + if (bi1!=-1 && bi2!=-1) // found something like "int (*func)(int arg)" { - size_t s = matches.position()+1; // keep opening ( + int s=bi2+1; // keep opening ( yyextra->oldStyleArgType = yyextra->current->args.left(s); - int i=(int)s; + int i=s; while (icurrent->args.at(i))=='*' || isspace((uchar)c))) i++; yyextra->oldStyleArgType += yyextra->current->args.mid(s,i-s); s=i; @@ -6912,12 +6922,12 @@ static void splitKnRArg(yyscan_t yyscanner,QCString &oldStyleArgPtr,QCString &ol oldStyleArgName = yyextra->current->args.mid(s,i-s); yyextra->oldStyleArgType+=yyextra->current->args.mid(i); } - else if (matches.length()==3) // redundant braces like in "int (*var)" + else if (bi1!=-1) // redundant braces like in "int (*var)" { - size_t s = matches.position(); // strip opening ( + int s=bi1; // strip opening ( yyextra->oldStyleArgType = yyextra->current->args.left(s); s++; - int i=(int)s+1; + int i=s+1; while (icurrent->args.at(i))=='*' || isspace((uchar)c))) i++; yyextra->oldStyleArgType += 
yyextra->current->args.mid(s,i-s); s=i; diff --git a/src/template.cpp b/src/template.cpp index 4c296aa..9f1eb92 100644 --- a/src/template.cpp +++ b/src/template.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -30,6 +29,7 @@ #include "util.h" #include "resourcemgr.h" #include "portable.h" +#include "regex.h" #define ENABLE_TRACING 0 @@ -4186,9 +4186,9 @@ class TemplateNodeMarkers : public TemplateNodeCreator c->push(); std::string str = patternStr.toString().str(); - static const std::regex marker("@([[:digit:]]+)", std::regex::optimize); - std::sregex_iterator re_it(str.begin(),str.end(),marker); - std::sregex_iterator end; + static const reg::Ex marker(R"(@\d+)"); + reg::Iterator re_it(str,marker); + reg::Iterator end; size_t index=0; for ( ; re_it!=end ; ++re_it) { @@ -4204,7 +4204,7 @@ class TemplateNodeMarkers : public TemplateNodeCreator { ts << part; // write text before marker } - unsigned long entryIndex = std::stoul(match[1].str()); + unsigned long entryIndex = std::stoul(match.str().substr(1)); TemplateVariant var; size_t i=0; // search for list element at position id diff --git a/src/util.cpp b/src/util.cpp index 9b76714..1677b26 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include +#include "regex.h" #include "util.h" #include "message.h" #include "classdef.h" @@ -188,10 +188,10 @@ QCString removeAnonymousScopes(const char *str) return false; }; - static const std::regex re("[ :]*@[[:digit:]]+[: ]*", std::regex::optimize); + static const reg::Ex re(R"([\s:]*@\d+[\s:]*)"); std::string s = str; - std::sregex_iterator iter( s.begin(), s.end(), re); - std::sregex_iterator end; + reg::Iterator iter(s,re); + reg::Iterator end; size_t p=0; size_t sl=s.length(); bool needsSeparator=false; @@ -221,8 +221,8 @@ QCString removeAnonymousScopes(const char *str) QCString replaceAnonymousScopes(const char *s,const char *replacement) { if 
(s==0) return QCString(); - static const std::regex marker("@[[:digit:]]+", std::regex::optimize); - std::string result = std::regex_replace(s,marker,replacement?replacement:"__anonymous__"); + static const reg::Ex marker(R"(@\d+)"); + std::string result = reg::replace(s,marker,replacement?replacement:"__anonymous__"); //printf("replaceAnonymousScopes('%s')='%s'\n",s.data(),result.data()); return result; } @@ -895,9 +895,9 @@ void linkifyText(const TextGeneratorIntf &out, const Definition *scope, size_t strLen = txtStr.length(); if (strLen==0) return; - static const std::regex regExp("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_~!\\\\.:$]*", std::regex::optimize); - std::sregex_iterator it( txtStr.begin(), txtStr.end(), regExp); - std::sregex_iterator end; + static const reg::Ex regExp(R"(\a[\w~!\\.:$]*)"); + reg::Iterator it(txtStr,regExp); + reg::Iterator end; //printf("linkifyText scope=%s fileScope=%s strtxt=%s strlen=%d external=%d\n", // scope?scope->name().data():"", @@ -1096,9 +1096,9 @@ void linkifyText(const TextGeneratorIntf &out, const Definition *scope, void writeMarkerList(OutputList &ol,const std::string &markerText,size_t numMarkers, std::function replaceFunc) { - static const std::regex marker("@([[:digit:]]+)", std::regex::optimize); - std::sregex_iterator it(markerText.begin(),markerText.end(),marker); - std::sregex_iterator end; + static const reg::Ex marker(R"(@(\d+))"); + reg::Iterator it(markerText,marker); + reg::Iterator end; size_t index=0; // now replace all markers in inheritLine with links to the classes for ( ; it!=end ; ++it) @@ -1107,7 +1107,7 @@ void writeMarkerList(OutputList &ol,const std::string &markerText,size_t numMark size_t newIndex = match.position(); size_t matchLen = match.length(); ol.parseText(markerText.substr(index,newIndex-index)); - unsigned long entryIndex = std::stoul(match[1]); + unsigned long entryIndex = std::stoul(match[1].str()); if (entryIndex<(unsigned long)numMarkers) { replaceFunc(entryIndex); @@ 
-1279,37 +1279,6 @@ int filterCRLF(char *buf,int len) return dest; // length of the valid part of the buf } -static std::string wildcard2regex(const std::string &pattern) -{ - std::string result="^"; // match start of input - char c; - const char *p = pattern.c_str(); - while ((c=*p++)) - { - switch(c) - { - case '*': result+=".*"; break; // '*' => '.*' - case '?': result+='.'; break; // '?' => '.' - case '.': case '+': case '\\': case '$': case '^': result+='\\'; result+=c; break; // escape - case '[': if (*p=='^') { result+="[^"; p++; } else result+=c; break; // don't escape ^ after [ - default: result+=c; break; // just copy - } - } - result+='$'; // match end of input - return result; -} - -static bool isMatchingWildcard(const std::string &input,const std::string &pattern, - bool caseSensitive=false) -{ - - std::regex::flag_type flags = std::regex::ECMAScript; - if (!caseSensitive) flags |= std::regex::icase; - std::string re_str = wildcard2regex(pattern); - std::regex rePattern(re_str,flags); - return std::regex_match(input,rePattern); -} - static QCString getFilterFromList(const char *name,const StringVector &filterList,bool &found) { found=FALSE; @@ -1321,7 +1290,14 @@ static QCString getFilterFromList(const char *name,const StringVector &filterLis if (i_equals!=-1) { QCString filterPattern = fs.left(i_equals); - if (isMatchingWildcard(name,filterPattern.str(),Portable::fileSystemIsCaseSensitive())) + QCString input = name; + if (!Portable::fileSystemIsCaseSensitive()) + { + filterPattern = filterPattern.lower(); + input = input.lower(); + } + reg::Ex re(filterPattern.str(),reg::Ex::Mode::Wildcard); + if (re.isValid() && reg::match(input.str(),re)) { // found a match! QCString filterName = fs.mid(i_equals+1); @@ -1906,10 +1882,10 @@ static QCString extractCanonicalType(const Definition *d,const FileDef *fs,QCStr // (i.e. type is not a template specialization) // then resolve any identifiers inside. 
{ - static const std::regex re("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*", std::regex::optimize); std::string ts = templSpec.str(); - std::sregex_iterator it(ts.begin(),ts.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(\a\w*)"); + reg::Iterator it(ts,re); + reg::Iterator end; size_t tp=0; // for each identifier template specifier @@ -4282,14 +4258,13 @@ QCString convertCharEntitiesToUTF8(const char *str) { if (str==0) return QCString(); - static const std::regex re("&[[:alpha:]\\x80-\\xFF][[:alnum:]\\x80-\\xFF]*;", std::regex::optimize); std::string s = str; - std::sregex_iterator it(s.begin(),s.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(&\a\w*;)"); + reg::Iterator it(s,re); + reg::Iterator end; GrowBuf growBuf; size_t p,i=0,l; - //while ((p=entityPat.match(s,i,&l))!=-1) for (; it!=end ; ++it) { const auto &match = *it; @@ -4447,9 +4422,9 @@ void addMembersToMemberGroup(MemberList *ml, */ int extractClassNameFromType(const char *type,int &pos,QCString &name,QCString &templSpec,SrcLangExt lang) { - static std::regex re_norm("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_:]*"); - static std::regex re_fortran("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_:()=]*"); - static std::regex &re = re_norm; + static reg::Ex re_norm(R"(\a[\w:]*)"); + static reg::Ex re_fortran(R"(\a[\w:()=]*)"); + static const reg::Ex *re = &re_norm; name.resize(0); templSpec.resize(0); @@ -4462,17 +4437,17 @@ int extractClassNameFromType(const char *type,int &pos,QCString &name,QCString & if (type[pos]==',') return -1; if (QCString(type).left(4).lower()!="type") { - re = re_fortran; + re = &re_fortran; } } std::string s = type; - std::sregex_iterator it(s.begin()+pos,s.end(),re); - std::sregex_iterator end; + reg::Iterator it(s,*re,(int)pos); + reg::Iterator end; if (it!=end) { const auto &match = *it; - int i = pos+(int)match.position(); + int i = (int)match.position(); int l = (int)match.length(); int ts = i+l; int te = ts; @@ -4530,10 
+4505,10 @@ QCString normalizeNonTemplateArgumentsInString( p++; QCString result = name.left(p); - static const std::regex re("[[:alpha:]\\x80-\\xFF_:][[:alnum:]\\x80-\\xFF_:]*", std::regex::optimize); std::string s = result.mid(p).str(); - std::sregex_iterator it(s.begin(),s.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"([\a:][\w:]*)"); + reg::Iterator it(s,re); + reg::Iterator end; size_t pi=0; // for each identifier in the template part (e.g. B -> T) for (; it!=end ; ++it) @@ -4594,9 +4569,9 @@ QCString substituteTemplateArgumentsInString( if (formalArgs.empty()) return name; std::string result; - static const std::regex re("[[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_:]*", std::regex::optimize); - std::sregex_iterator it(name.begin(),name.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(\a[\w:]*)"); + reg::Iterator it(name,re); + reg::Iterator end; size_t p=0; for (; it!=end ; ++it) @@ -5414,9 +5389,9 @@ QCString stripPath(const char *s) bool containsWord(const char *str,const char *word) { if (str==0 || word==0) return false; - static const std::regex re("[[:alpha:]\\x80-\\xFF_]+", std::regex::optimize); + static const reg::Ex re(R"(\a+)"); std::string s = str; - for (std::sregex_iterator it(s.begin(),s.end(),re) ; it!=std::sregex_iterator() ; ++it) + for (reg::Iterator it(s,re) ; it!=reg::Iterator() ; ++it) { if (it->str()==word) return true; } @@ -5429,36 +5404,36 @@ bool containsWord(const char *str,const char *word) */ bool findAndRemoveWord(QCString &sentence,const char *word) { - static const std::regex re("[^[:alpha:]\\x80-\\xFF_]+", std::regex::optimize); + static reg::Ex re(R"(\s*(\<\a+\>)\s*)"); std::string s = sentence.str(); - std::sregex_token_iterator it(s.begin(),s.end(),re,{-1,0}); - std::sregex_token_iterator end; - - bool found=false; + reg::Iterator it(s,re); + reg::Iterator end; std::string result; - bool keepSpaces=false; // skip leading whitespace - for (;it!=end;it++) + bool found=false; + size_t 
p=0; + for ( ; it!=end ; ++it) { - std::string part = it->str(); - bool whiteSpaceOnly = std::all_of(part.begin(),part.end(), - [](const auto ch) { return std::isspace(ch); }); - bool matchingWord = part==word; - if (!matchingWord && (keepSpaces || !whiteSpaceOnly)) + const auto match = *it; + std::string part = match[1].str(); + if (part!=word) { - result+=part; - keepSpaces=!whiteSpaceOnly; // skip sequences of spaces + size_t i = match.position(); + size_t l = match.length(); + result+=s.substr(p,i-p); + result+=match.str(); + p=i+l; } - else if (matchingWord) + else { found=true; + size_t i = match[1].position(); + size_t l = match[1].length(); + result+=s.substr(p,i-p); + p=i+l; } } - - // trim trailing whitespace - result.erase(std::find_if(result.rbegin(), result.rend(), - [](const auto ch) { return !std::isspace(ch); }).base(), result.end()); - - sentence = result; + result+=s.substr(p); + sentence = QCString(result).simplifyWhiteSpace(); return found; } @@ -6029,11 +6004,11 @@ static QCString escapeCommas(const QCString &s) static QCString expandAliasRec(StringUnorderedSet &aliasesProcessed,const std::string &s,bool allowRecursion) { - //QCString result; std::string result; - std::regex re("[\\\\@]([[:alpha:]\\x80-\\xFF_][[:alnum:]\\x80-\\xFF_]*)"); - std::sregex_iterator re_it(s.begin(),s.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"([\\@](\a\w*))"); + reg::Iterator re_it(s,re); + reg::Iterator end; + int p = 0; for ( ; re_it!=end ; ++re_it) { @@ -6354,9 +6329,9 @@ bool readInputFile(const char *fileName,BufStr &inBuf,bool filter,bool isSourceC QCString filterTitle(const std::string &title) { std::string tf; - std::regex re("%([A-Z_a-z]+)"); - std::sregex_iterator it(title.begin(),title.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(%[a-z_A-Z]+)"); + reg::Iterator it(title,re); + reg::Iterator end; size_t p = 0; for (; it!=end ; ++it) { @@ -6364,7 +6339,7 @@ QCString filterTitle(const std::string &title) size_t i 
= match.position(); size_t l = match.length(); if (i>p) tf+=title.substr(p,i-p); - tf+=match[1].str(); // skip % + tf+=match.str().substr(1); // skip % p=i+l; } tf+=title.substr(p); @@ -6399,9 +6374,15 @@ bool patternMatch(const QFileInfo &fi,const StringVector &patList) size_t i=pattern.find('='); if (i!=std::string::npos) pattern=pattern.substr(0,i); // strip of the extension specific filter name - found = isMatchingWildcard(fn,pattern,caseSenseNames) || - isMatchingWildcard(fp,pattern,caseSenseNames) || - isMatchingWildcard(afp,pattern,caseSenseNames); + if (!caseSenseNames) + { + pattern = QCString(pattern).lower().str(); + fn = QCString(fn).lower().str(); + fp = QCString(fp).lower().str(); /* lower-case the file path itself, not a copy of fn */ + afp = QCString(afp).lower().str(); /* likewise for the absolute file path */ + } + reg::Ex re(pattern,reg::Ex::Mode::Wildcard); + found = re.isValid() && (reg::match(fn,re) || reg::match(fp,re) || reg::match(afp,re)); if (found) break; //printf("Matching '%s' against pattern '%s' found=%d\n", // fi->fileName().data(),pattern.data(),found); @@ -6486,9 +6467,9 @@ QCString replaceColorMarkers(const char *str) if (str==0) return QCString(); std::string result; std::string s=str; - static const std::regex re("##([0-9A-Fa-f][0-9A-Fa-f])", std::regex::optimize); - std::sregex_iterator it(s.begin(),s.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(##[0-9A-Fa-f][0-9A-Fa-f])"); + reg::Iterator it(s,re); + reg::Iterator end; static int hue = Config_getInt(HTML_COLORSTYLE_HUE); static int sat = Config_getInt(HTML_COLORSTYLE_SAT); static int gamma = Config_getInt(HTML_COLORSTYLE_GAMMA); @@ -6500,7 +6481,7 @@ QCString replaceColorMarkers(const char *str) size_t i = match.position(); size_t l = match.length(); if (i>p) result+=s.substr(p,i-p); - std::string lumStr = match[1].str(); + std::string lumStr = match.str().substr(2); #define HEXTONUM(x) (((x)>='0' && (x)<='9') ? ((x)-'0') : \ ((x)>='a' && (x)<='f') ? ((x)-'a'+10) : \ ((x)>='A' && (x)<='F') ?
((x)-'A'+10) : 0) @@ -6974,10 +6955,10 @@ bool classVisibleInIndex(const ClassDef *cd) */ QCString extractDirection(QCString &docs) { - std::regex re("\\[([ inout,]+)\\]"); std::string s = docs.str(); - std::sregex_iterator it(s.begin(),s.end(),re); - std::sregex_iterator end; + static const reg::Ex re(R"(\[([ inout,]+)\])"); + reg::Iterator it(s,re); + reg::Iterator end; if (it!=end) { const auto &match = *it; @@ -7467,15 +7448,21 @@ StringVector split(const std::string &s,const std::string &delimiter) /// split input string \a s by regular expression delimiter \a delimiter. /// returns a vector of non-empty strings that are between the delimiters -StringVector split(const std::string &s,const std::regex &delimiter) +StringVector split(const std::string &s,const reg::Ex &delimiter) { StringVector result; - std::sregex_token_iterator iter(s.begin(), s.end(), delimiter, -1); - std::sregex_token_iterator end; + reg::Iterator iter(s, delimiter); + reg::Iterator end; + size_t p=0; for ( ; iter != end; ++iter) { - result.push_back(*iter); + const auto &match = *iter; + size_t i=match.position(); + size_t l=match.length(); + if (i>p) result.push_back(s.substr(p,i-p)); + p=i+l; } + if (p #include #include -#include #include #include "types.h" @@ -36,6 +35,7 @@ #include "containers.h" #include "namespacedef.h" #include "outputgen.h" +#include "regex.h" //-------------------------------------------------------------------- @@ -460,9 +460,9 @@ void writeExtraLatexPackages(FTextStream &t); void writeLatexSpecialFormulaChars(FTextStream &t); StringVector split(const std::string &s,const std::string &delimiter); -StringVector split(const std::string &s,const std::regex &delimiter); +StringVector split(const std::string &s,const reg::Ex &delimiter); int findIndex(const StringVector &sv,const std::string &s); -int findIndex(const std::string &s,const std::regex &re); +int findIndex(const std::string &s,const reg::Ex &re); bool recognizeFixedForm(const char* contents, 
FortranFormat format); FortranFormat convertFileNameFortranParserCode(QCString fn); diff --git a/src/vhdlcode.l b/src/vhdlcode.l index 3e754c7..c363a23 100644 --- a/src/vhdlcode.l +++ b/src/vhdlcode.l @@ -30,7 +30,6 @@ #include #include -#include /* * includes @@ -55,6 +54,7 @@ #include "classdef.h" #include "filedef.h" #include "tooltip.h" +#include "regex.h" #define YY_NO_INPUT 1 #define YY_NO_UNISTD_H 1 @@ -359,7 +359,7 @@ XILINX "INST"|"NET"|"PIN"|"BLKNM"|"BUFG"|"COLLAPSE"|"CPLD"|"COMPGRP"|"CONFI tt=tt.lower(); VhdlDocGen::deleteAllChars(tt,';'); tt.stripWhiteSpace(); - static const std::regex regg("[[:space:]]+", std::regex::optimize); + static const reg::Ex regg(R"(\s+)"); // any number of whitespace auto ql = split(tt.str(),regg); int index=findIndex(ql,"if")+1; index+=findIndex(ql,"case")+1; @@ -959,7 +959,7 @@ static bool checkVhdlString(yyscan_t yyscanner,QCString &name) if (name.at(0)=='"' && name.at(len-1)=='"' && len > 2) { std::string inside = name.str().substr(1,len-2); - static const std::regex regg("[[:space:]]+", std::regex::optimize); + static const reg::Ex regg(R"(\s+)"); // any number of whitespace auto qrl=split(inside,regg); if (VhdlDocGen::isNumber(qrl[0])) { diff --git a/src/vhdldocgen.cpp b/src/vhdldocgen.cpp index 3bd510f..86110a8 100644 --- a/src/vhdldocgen.cpp +++ b/src/vhdldocgen.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include @@ -62,7 +61,7 @@ #include "plantuml.h" #include "vhdljjparser.h" #include "VhdlParser.h" -//#include "vhdlcode.h" +#include "regex.h" #include "plantuml.h" //#define DEBUGFLOW #define theTranslator_vhdlType theTranslator->trVhdlType @@ -1034,7 +1033,7 @@ void VhdlDocGen::parseFuncProto(const char* text,QCString& name,QCString& ret,bo QCString VhdlDocGen::getIndexWord(const char* c,int index) { - static const std::regex reg("[[:space:]:|]",std::regex::optimize); + static const reg::Ex reg(R"([\s|])"); auto ql=split(c,reg); if ((size_t)index < ql.size()) @@ -1122,7 +1121,7 @@ QCString 
VhdlDocGen::getProcessNumber() void VhdlDocGen::writeFormatString(const QCString& s,OutputList&ol,const MemberDef* mdef) { - static const std::regex reg("[\\[\\]\\.\\/\\<\\>\\:\\s\\,\\;\\'\\+\\-\\*\\|\\&\\=\\(\\)\"]",std::regex::optimize); + static const reg::Ex reg(R"([\[\]./<>:\s,;'+*|&=()\"-])"); QCString qcs = s; qcs+=QCString(" ");// parsing the last sign QCString find=qcs; @@ -1191,8 +1190,8 @@ void VhdlDocGen::writeFormatString(const QCString& s,OutputList&ol,const MemberD */ bool VhdlDocGen::isNumber(const std::string& s) { - static const std::regex regg("[0-9][0-9eEfFbBcCdDaA_.#-+?xXzZ]*",std::regex::optimize); - return std::regex_match(s,regg); + static const reg::Ex regg(R"([0-9][0-9eEfFbBcCdDaA_.#+?xXzZ-]*)"); + return reg::match(s,regg); }// isNumber @@ -2298,7 +2297,7 @@ void VhdlDocGen::parseUCF(const char* input, Entry* entity,QCString fileName,b } else { - static const std::regex ee("[[:space:]=]",std::regex::optimize); + static const reg::Ex ee(R"([\s=])"); int in=findIndex(temp.str(),ee); QCString ff=temp.left(in); temp.stripPrefix(ff.data()); @@ -2324,7 +2323,7 @@ static void initUCF(Entry* root,const char* type,QCString & qcs,int line,QCStr VhdlDocGen::deleteAllChars(qcs,';'); qcs=qcs.stripWhiteSpace(); - static const std::regex reg("[[:space:]=]",std::regex::optimize); + static const reg::Ex reg(R"([\s=])"); int i = findIndex(qcs.str(),reg); if (i<0) return; if (i==0) @@ -2406,7 +2405,7 @@ QCString VhdlDocGen::parseForConfig(QCString & entity,QCString & arch) QCString label; if (!entity.contains(":")) return ""; - static const std::regex exp("[:()[[:space:]]",std::regex::optimize); + static const reg::Ex exp(R"([:()\s])"); auto ql=split(entity.str(),exp); if (ql.size()<2) { @@ -2436,7 +2435,7 @@ QCString VhdlDocGen::parseForConfig(QCString & entity,QCString & arch) QCString VhdlDocGen::parseForBinding(QCString & entity,QCString & arch) { - static const std::regex exp("[()[[:space:]]",std::regex::optimize); + static const reg::Ex 
exp(R"([()\s])"); auto ql = split(entity.str(),exp); @@ -2704,14 +2703,14 @@ void VhdlDocGen::addBaseClass(ClassDef* cd,ClassDef *ent) bcd.usedName.append("(2)"); return; } - static const std::regex reg("[[:digit:]]+",std::regex::optimize); + static const reg::Ex reg(R"(\d+)"); QCString s=n.left(i); QCString r=n.right(n.length()-i); std::string t=r.str(); VhdlDocGen::deleteAllChars(r,')'); VhdlDocGen::deleteAllChars(r,'('); r.setNum(r.toInt()+1); - std::regex_replace(t, reg, r.str()); + t = reg::replace(t, reg, r.str()); /* replace() returns the new string; keep it so the incremented counter is used */ s.append(t.c_str()); bcd.usedName=s; bcd.templSpecifiers=t; @@ -2952,8 +2951,8 @@ void FlowChart::printNode(const FlowChart& flo) { t=flo.text.str(); } - static const std::regex ep("[[:space:]]",std::regex::optimize); - t = std::regex_replace(t,ep,std::string("")); + static const reg::Ex ep(R"(\s)"); + t = reg::replace(t,ep,std::string()); if (t.empty()) { t=" "; diff --git a/src/vhdljjparser.cpp b/src/vhdljjparser.cpp index 3852045..9eaf167 100644 --- a/src/vhdljjparser.cpp +++ b/src/vhdljjparser.cpp @@ -10,7 +10,6 @@ * */ -#include #include #include @@ -34,6 +33,7 @@ #include "markdown.h" #include "VhdlParserTokenManager.h" #include "VhdlParserErrorHandler.hpp" +#include "regex.h" using namespace vhdl::parser; @@ -272,28 +272,28 @@ void VHDLOutlineParser::handleFlowComment(const char* doc) int VHDLOutlineParser::checkInlineCode(QCString &doc) { - static const std::regex csRe("[\\\\@]code", std::regex::optimize); - static const std::regex cendRe("[[:space:]]*[\\\\@]endcode", std::regex::optimize); - static const std::regex cbriefRe("[\\\\@]brief", std::regex::optimize); + static const reg::Ex csRe(R"([\\@]code)"); + static const reg::Ex cendRe(R"(\s*[\\@]endcode)"); + static const reg::Ex cbriefRe(R"([\\@]brief)"); // helper to simulate behavior of QString.find(const QRegExp &re,int pos) - auto findRe = [](const QCString &str,const reg::Ex &re,int pos=0) -> int + auto findRe = [](const QCString &str,const reg::Ex &re,int pos=0) -> int { if
((int)str.length() QCString + auto replaceRe = [](const QCString &str,const reg::Ex &re,const QCString &replacement) -> QCString { - return std::regex_replace(str.str(), re, replacement.str()); + return reg::replace(str.str(), re, replacement.str()); }; int index = findRe(doc,csRe); -- cgit v0.12