/******************************************************************************
 *
 * Copyright (C) 1997-2021 by Dimitri van Heesch.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation under the terms of the GNU General Public License is hereby
 * granted. No representations are made about the suitability of this software
 * for any purpose. It is provided "as is" without express or implied warranty.
 * See the GNU General Public License for more details.
 *
 * Documents produced by Doxygen are derivative works derived from the
 * input used in their production; they are not affected by this license.
 *
 */

#ifndef FREGEX_H
#define FREGEX_H

#include <cstddef>
#include <memory>
#include <string>
#include <vector>
#include <iterator>

/** Namespace for the regular expression functions */
namespace reg
{

class Match;

/** Class representing a regular expression.
 *
 *  It has a similar API as `std::regex`,
 *  but is much faster (and also somewhat more limited).
 */
class Ex
{
  public:
    /** Matching algorithm */
    enum class Mode
    {
      RegEx,    /**< full regular expression. */
      Wildcard  /**< simple globbing pattern. */
    };

    /** Creates a regular expression object given the pattern as a string.
     *  Two modes of matching are supported: RegEx and Wildcard
     *
     *  The following special characters are supported in Mode::RegEx mode.
     *  - `c`      matches character `c`
     *  - `.`      matches any character
     *  - `^`      matches the start of the input
     *  - `$`      matches the end of the input
     *  - `\<`     matches the start of a word
     *  - `\>`     matches the end of a word
     *  - `[]`     matches a set of characters
     *  - `x*`     matches a sequence of zero or more `x`'s
     *  - `x+`     matches a sequence of one or more `x`'s
     *  - `x?`     matches an optional `x`
     *  - `(`      matches the start of a capture range
     *  - `)`      matches the end of a capture range
     *  - `\c`     to escape a special character, such as `+`, `[`, `*`, `(`, etc.
     *  - `\t`     matches a tab character
     *  - `\n`     matches a newline character
     *  - `\r`     matches a return character
     *  - `\s`     matches any whitespace as defined by `std::isspace()`
     *  - `\d`     matches any digit as defined by `std::isdigit()`
     *  - `\a`     matches any alphabetical characters, same as `[a-z_A-Z\x80-\xFF]`
     *  - `\w`     matches any alpha numerical character, same as `[a-z_A-Z0-9\x80-\xFF]`
     *  - `\xHH`   matches a hexadecimal character, e.g. `\xA0` matches character code 160.
     *
     *  A character range can be used to match a character that falls inside a range
     *  (or set of ranges).
     *  Within the opening `[` and closing `]` brackets of a character range the following
     *  is supported:
     *  - `^`      if at the start of the range, a character matches if it is \e not in the range,
     *             e.g. `[^\d]` matches any character not a digit
     *  - `-`      when placed between 2 characters it defines a range from the first character to the second.
     *             any character that falls in the range will match, e.g. [0-9] matches the digits from 0 to 9.
     *  - `\s`, `\d`, `\a`, and `\w` as explained above.
     *
     *  @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
     *  meaning in a character range. `^` only has a special meaning as the first character.
     *
     *  @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on
     *  capture ranges. e.g. `(abd)?` is not valid. If multiple capture ranges are
     *  specified then some character has to be in between them,
     *  e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`.
     *
     *  In Wildcard mode `*` is used to match any sequence of zero or more characters.
     *  The character `?` can be used to match an optional character. Character ranges are
     *  also supported, but other characters like `$` and `+` are just treated as
     *  literal characters.
     *
     */
    Ex(const std::string &pattern, Mode mode=Mode::RegEx);

    /** Destroys the regular expression object. Frees resources. */
    ~Ex();

    /** Check if a given string matches this regular expression.
     *  @param str The input string to match against.
     *  @param match The match object to hold the matching results.
     *  @param pos The position in the string at which to start the match.
     *  @returns true iff a match is found. Details are stored in the match object.
     */
    bool match(const std::string &str,Match &match,size_t pos=0) const;

    /** Reports whether this expression object is usable.
     *  NOTE(review): presumably false when the pattern could not be parsed; the
     *  implementation is not part of this header, so confirm against the source file.
     */
    bool isValid() const;

  private:
    // non-copyable: the object exclusively owns its private implementation
    Ex(const Ex &) = delete;
    Ex &operator=(const Ex &e) = delete;

    class Private;
    std::unique_ptr<Private> p;
};

/** Object representing the match results of a capture range. */
class SubMatch
{
  public:
    /** Creates a match for a single capture range given a non-owning pointer to the string. */
    SubMatch(const std::string *str) : m_str(str) {}

    /** Returns the position in the string at which the match starts. */
    size_t position() const { return m_pos; }

    /** Returns the length of the matching part. */
    size_t length() const { return m_len; }

    /** Returns the matching part as a string (empty if no string is attached). */
    std::string str() const { return m_str ? m_str->substr(m_pos,m_len) : std::string(); }

  private:
    friend class Match;
    /** Sets the start position of the range. */
    void setStart(size_t pos) { m_pos=pos; }
    /** Sets the length such that the range ends at \a pos; the start has to be set first. */
    void setEnd(size_t pos) { m_len=pos-m_pos; }
    /** Sets both the start position and the length of the range. */
    void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; }

    size_t m_pos = std::string::npos;
    size_t m_len = std::string::npos;
    const std::string *m_str = nullptr;
};

/** Object representing the matching results. It consists of an array of
 *  SubMatch objects. The first entry of the array represents the whole match, any
 *  next elements represent each of the capture ranges.
 *
 *  For example string `@42` and expression `@(\\d+)` will have two
 *  Submatches, match[0] will point to the input string as a whole, and
 *  match[1] will point to the number 42 only.
 *
 */
class Match
{
  public:
    /** Creates an empty match object */
    Match() {}

    /** Returns the position of the match or std::string::npos if no position is set. */
    size_t position() const
    {
      // an empty (never initialized) match has no sub matches to query
      return m_subMatches.empty() ? std::string::npos : m_subMatches[0].position();
    }

    /** Returns the length of the match or std::string::npos if no length is set. */
    size_t length() const
    {
      return m_subMatches.empty() ? std::string::npos : m_subMatches[0].length();
    }

    /** Return a string representing the matching part (empty for an empty match object). */
    std::string str() const
    {
      return m_subMatches.empty() ? std::string() : m_subMatches[0].str();
    }

    /** Return the part of the string before the match */
    SubMatch prefix() const
    {
      SubMatch m(m_str);
      m.setMatch(0,position());
      return m;
    }

    /** Return the part of the string after the match */
    SubMatch suffix() const
    {
      SubMatch m(m_str);
      if (m_str)
      {
        size_t e = position()+length();
        m.setMatch(e,m_str->length()-e);
      }
      return m;
    }

    /** Returns the number of sub matches available in this match. */
    size_t size() const { return m_subMatches.size(); }

    /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object
     *  representing the whole match.
     */
    const SubMatch &operator[](size_t index) const { return m_subMatches[index]; }

  private:
    friend class Ex;

    /** Resets the object so it holds a single (unset) sub match for string \a str. */
    void init(const std::string *str)
    {
      m_subMatches.clear();
      m_subMatches.emplace_back(str);
      m_str = str;
      // the capture bookkeeping refers to entries that were just cleared, so
      // it has to be reset as well when the object is reused for a new match
      m_captureIndex = 0;
      m_insideCapture = false;
    }

    /** Marks position \a index as the start of a capture range. */
    void startCapture(size_t index)
    {
      if (!m_insideCapture) // when backtracking we can re-enter the capture multiple times
                            // only update the index, example `\s*(x)`
      {
        m_captureIndex = m_subMatches.size();
        m_subMatches.emplace_back(m_str);
        m_insideCapture = true;
      }
      m_subMatches.back().setStart(index);
    }

    /** Marks position \a index as the end of the current capture range;
     *  ignored unless \a index lies beyond the start of the range.
     */
    void endCapture(size_t index)
    {
      if (index>m_subMatches.back().position())
      {
        m_captureIndex=0;
        m_subMatches.back().setEnd(index);
        m_insideCapture = false;
      }
    }

    /** Stores position and length in the sub match selected by the capture index. */
    void setMatch(size_t pos,size_t len)
    {
      m_subMatches[m_captureIndex].setMatch(pos,len);
    }

    std::vector<SubMatch> m_subMatches;
    size_t m_captureIndex=0;
    const std::string *m_str = nullptr;
    bool m_insideCapture=false;
};

/** Iterator class to iterate through matches.
 */
class Iterator
{
  public:
    using value_type = Match;
    using difference_type = std::ptrdiff_t;
    using pointer = value_type*;
    using reference = value_type&;
    using iterator_category = std::forward_iterator_tag;

    /** Creates an end-of-sequence iterator */
    Iterator() {}

    /** Creates an iterator for input string \a str, using regular expression \a re to search.
     *  @note the string and regular expression objects should remain valid while iterating.
     */
    Iterator(const std::string &str, const Ex &re, size_t pos=0)
      : m_str(&str), m_re(&re), m_pos(pos) { findNext(); }

    // Iterator holds pointers, so prevent temporaries to be passed as string or
    // regular expression
    Iterator(std::string &&str, const Ex &re) = delete;
    Iterator(const std::string &str, Ex &&re) = delete;
    Iterator(std::string &&str, Ex &&re) = delete;

    /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */
    bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; }

    /** Returns true if the iterators are not pointing to the same match */
    bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; }

    /** Returns a reference to the current match */
    const value_type &operator*() const { return m_match; }

    /** Returns a pointer to the current match */
    const value_type *operator->() const { return &m_match; }

    /** Advances the iterator to the next match. */
    Iterator &operator++() { findNext(); return *this; }

  private:
    /** Searches from the current position and updates the position and match accordingly. */
    void findNext()
    {
      if (!m_re || !m_str) { m_pos=std::string::npos; return; } // end marker
      if (m_re->match(*m_str,m_match,m_pos))
      {
        m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match
      }
      else // no more matches, make the iterator point to the 'end-of-sequence'
      {
        m_pos=std::string::npos;
      }
    }

    const std::string *m_str = nullptr;
    const Ex *m_re = nullptr;
    size_t m_pos = std::string::npos;
    Match m_match;
};

/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
 *  Returns true iff a match was found.
 *  Details of what part of the string has matched is returned via the \a match object.
 *
 *  An example to show how to match all identifiers in a string.
 *  @code
 *  static reg::Ex re(R"(\a\w*)");
 *  std::string str = u8"void(Func是<B_C::Códe42>(42));";
 *  reg::Match match;
 *  size_t pos = 0;
 *  while (reg::search(str,match,re,pos))
 *  {
 *    std::cout << match.str() << std::endl;
 *    pos=match.position()+match.length();
 *  }
 *  @endcode
 *  produces:
 *  @code
 *  void
 *  Func是
 *  B_C
 *  Códe42
 *  @endcode
 *
 *  @see Ex::Ex() for details on the regular expression patterns.
 */
bool search(const std::string &str,Match &match,const Ex &re,size_t pos=0);

/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
 *  Returns true iff a match was found.
 */
bool search(const std::string &str,const Ex &re,size_t pos=0);

/** Matches a given string \a str for a match against regular expression \a re.
 *  Returns true iff a match was found for the whole string.
 *  Any capture groups are returned via the \a match object.
 */
bool match(const std::string &str,Match &match,const Ex &re);

/** Matches a given string \a str for a match against regular expression \a re.
 *  Returns true iff a match was found for the whole string.
 */
bool match(const std::string &str,const Ex &re);

/** Searching in a given input string \a str for parts that match regular expression \a re and
 *  replaces those parts by string \a replacement.
 */
std::string replace(const std::string &str,const Ex &re,const std::string &replacement);

} // namespace

#endif