diff options
Diffstat (limited to 'src/regex.h')
-rw-r--r-- | src/regex.h | 336 |
1 files changed, 336 insertions, 0 deletions
diff --git a/src/regex.h b/src/regex.h new file mode 100644 index 0000000..bedc052 --- /dev/null +++ b/src/regex.h @@ -0,0 +1,336 @@ +/****************************************************************************** + * + * Copyright (C) 1997-2021 by Dimitri van Heesch. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation under the terms of the GNU General Public License is hereby + * granted. No representations are made about the suitability of this software + * for any purpose. It is provided "as is" without express or implied warranty. + * See the GNU General Public License for more details. + * + * Documents produced by Doxygen are derivative works derived from the + * input used in their production; they are not affected by this license. + * + */ + +#ifndef FREGEX_H +#define FREGEX_H + +#include <memory> +#include <string> +#include <vector> +#include <iterator> + +/** Namespace for the regular expression functions */ +namespace reg +{ + +class Match; + +/** Class representing a regular expression. + * + * It has a similar API as `std::regex`, + * but is much faster (and also somewhat more limited). + */ +class Ex +{ + public: + /** Matching algorithm */ + enum class Mode + { + RegEx, /**< full regular expression. */ + Wildcard /**< simple globbing pattern. */ + }; + /** Creates a regular expression object given the pattern as a string. + * Two modes of matching are supported: RegEx and Wildcard + * + * The following special characters are supported in Mode::RegEx mode. + * - `c` matches character `c` + * - `.` matches any character + * - `^` matches the start of the input + * - `$` matches the end of the input + * - `\<` matches the start of a word + * - `\>` matches the end of a word + * - `[]` matches a set of characters + * - `x*` matches a sequence of zero or more `x`'s + * - `x+` matches a sequence of one or more `x`'s + * - `x?` matches an optional `x` + * - `(` matches the start of a capture range + * - `)` matches the ends a capture range + * - `\c` to escape a special character, such as `+`, `[`, `*`, `(`, etc. + * - `\t` matches a tab character + * - `\n` matches a newline character + * - `\r` matches a return character + * - `\s` matches any whitespace as defined by `std::isspace()` + * - `\d` matches any digit as defined by `std::digit()` + * - `\a` matches any alphabetical characters, same as `[a-z_A-Z\x80-\xFF]` + * - `\w` matches any alpha numercial character, same as `[a-z_A-Z0-9\x80-\xFF]` + * - `\xHH` matches a hexadecimal character, e.g. `\xA0` matches character code 160. + * + * A character range can be used to match a character that falls inside a range + * (or set of ranges). + * Within the opening `[` and closing `]` brackets of a character ranges the following + * is supported: + * - `^` if at the start of the range, a character matches if it is \e not in the range, + * e.g. `[^\d]` matches any character not a digit + * - `-` when placed between 2 characters it defines a range from the first character to the second. + * any character that falls in the range will match, e.g. [0-9] matches the digit from 0 to 9. + * - `\s`, `\d`, `\a`, and `\w` as explained above. + * + * @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special + * meaning in a character range. `^` only has a special meaning as the first character. + * + * @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on + * capture ranges. e.g. `(abd)?` is not valid. If multiple capture ranges are + * specified then some character has to be in between them, + * e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`. + * + * In Wildcard mode `*` is used to match any sequence of zero or more characters. + * The character `?` can be used to match an optional character. Character ranges are + * also supported, but other characters like `$` and `+` are just treated as + * literal characters. + * + */ + Ex(const std::string &pattern, Mode mode=Mode::RegEx); + + /** Destroys the regular expression object. Frees resources. */ + ~Ex(); + + /** Check if a given string matches this regular expression. + * @param str The input string to match against. + * @param match The match object to hold the matching results. + * @param pos The position in the string at which to start the match. + * @returns true iff a match is found. Details are stored in the match object. + */ + bool match(const std::string &str,Match &match,size_t pos=0) const; + bool isValid() const; + private: + Ex(const Ex &) = delete; + Ex &operator=(const Ex &e) = delete; + + class Private; + std::unique_ptr<Private> p; +}; + +/** Object representing the match results of a capture range. */ +class SubMatch +{ + public: + /** Creates a match for a single capture range given a non-owning pointer to the string. */ + SubMatch(const std::string *str) : m_str(str) {} + + /** Returns the position in the string at which the match starts. */ + size_t position() const { return m_pos; } + + /** Returns the length of the matching part. */ + size_t length() const { return m_len; } + + /** Returns the matching part as a string */ + std::string str() const { return m_str ? m_str->substr(m_pos,m_len) : std::string(); } + + private: + friend class Match; + void setStart(size_t pos) { m_pos=pos; } + void setEnd(size_t pos) { m_len=pos-m_pos; } + void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; } + size_t m_pos = std::string::npos; + size_t m_len = std::string::npos; + const std::string *m_str = nullptr; +}; + +/** Object representing the matching results. It consists of an array of + * SubMatch objects. The first entry of the array represents the whole match, any + * next elements represent each of the capture ranges. + * + * For example string `@42` and expression `@(\\d+)` will have two + * Submatches, match[0] will point to the input string as a whole, and + * match[1] will point to the number 42 only. + * + */ +class Match +{ + public: + /** Creates an empty match object */ + Match() {} + + /** Returns the position of the match or std::string::npos if no position is set. */ + size_t position() const { return m_subMatches[0].position(); } + + /** Returns the position of the match or std::string::npos if no length is set. */ + size_t length() const { return m_subMatches[0].length(); } + + /** Return a string representing the matching part. */ + std::string str() const { return m_subMatches[0].str(); } + + /** Return the part of the string after the match */ + SubMatch suffix() const { SubMatch m(m_str); m.setMatch(0,position()); return m; } + + /** Return the part of the string before the match */ + SubMatch prefix() const + { + SubMatch m(m_str); + if (m_str) + { + size_t e = position()+length(); + m.setMatch(e,m_str->length()-e); + } + return m; + } + + /** Returns the number of sub matches available in this match. */ + size_t size() const { return m_subMatches.size(); } + + /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object + * representing the whole match. + */ + const SubMatch &operator[](size_t index) const { return m_subMatches[index]; } + + private: + friend class Ex; + void init(const std::string *str) + { + m_subMatches.clear(); + m_subMatches.emplace_back(str); + m_str = str; + } + void startCapture(size_t index) + { + if (!m_insideCapture) // when backtracking we can re-entry the capture multiple times + // only update the index, example `\s*(x)` + { + m_captureIndex = m_subMatches.size(); + m_subMatches.emplace_back(m_str); + m_insideCapture = true; + } + m_subMatches.back().setStart(index); + } + void endCapture(size_t index) + { + if (index>m_subMatches.back().position()) + { + m_captureIndex=0; + m_subMatches.back().setEnd(index); + m_insideCapture = false; + } + } + void setMatch(size_t pos,size_t len) + { + m_subMatches[m_captureIndex].setMatch(pos,len); + } + + std::vector<SubMatch> m_subMatches; + size_t m_captureIndex=0; + const std::string *m_str = nullptr; + bool m_insideCapture=false; +}; + +/** Iterator class to iterator through matches. + */ +class Iterator +{ + public: + using value_type = Match; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::forward_iterator_tag; + + /** Creates an end-of-sequence iterator */ + Iterator() {} + + /** Creates an iterator for input string \a str, using regular expression \a re to search. + * @note the string and regular expression objects should remain valid while iterating. + */ + Iterator(const std::string &str, const Ex &re, size_t pos=0) + : m_str(&str), m_re(&re), m_pos(pos) { findNext(); } + + // Iterator holds pointers, so prevent temporaries to be passed as string or + // regular expression + Iterator(std::string &&str, const Ex &re) = delete; + Iterator(const std::string &str, Ex &&re) = delete; + Iterator(std::string &&str, Ex &&re) = delete; + + /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */ + bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; } + + /** Returns true if the iterators are not pointing to the same match */ + bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; } + + /** Returns a reference to the current match */ + const value_type &operator*() const { return m_match; } + + /** Returns a pointer to the current match */ + const value_type *operator->() const { return &m_match; } + + /** Advances the iterator to the next match. */ + Iterator &operator++() { findNext(); return *this; } + + private: + void findNext() + { + if (!m_re || !m_str) { m_pos=std::string::npos; return; } // end marker + if (m_re->match(*m_str,m_match,m_pos)) + { + m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match + } + else // no more matches, make the iterator point to the 'end-of-sequence' + { + m_pos=std::string::npos; + } + } + const std::string *m_str = nullptr; + const Ex *m_re = nullptr; + size_t m_pos = std::string::npos; + Match m_match; +}; + +/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re. + * Returns true iff a match was found. + * Details of what part of the string has matched is returned via the \a match object. + * + * An example to show how to match all identifiers in a string. + * @code + * static reg::Ex re(R"(\a\w*)"); + * std::string = u8"void(Func是<B_C::Códe42>(42));"; + * while (reg::search(str,match,re,pos)) + * { + * std::cout << match.str() << std::endl; + * pos=match.position()+match.length(); + * } + * @endcode + * produces: + * @code + * void + * Func是 + * B_C + * Códe42 + * @endcode + * + * @see Ex::Ex() for details on the regular expression patterns. + */ +bool search(const std::string &str,Match &match,const Ex &re,size_t pos=0); + +/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re. + * Returns true iff a match was found. + */ +bool search(const std::string &str,const Ex &re,size_t pos=0); + +/** Matches a given string \a str for a match against regular expression \a re. + * Returns true iff a match was found for the whole string. + * Any capture groups are returned via the \a match object. + */ +bool match(const std::string &str,Match &match,const Ex &re); + +/** Matches a given string \a str for a match against regular expression \a re. + * Returns true iff a match was found for the whole string. + */ +bool match(const std::string &str,const Ex &re); + +/** Searching in a given input string \a for parts that match regular expression \a re and + * replaces those parts by string \a replacement. + */ +std::string replace(const std::string &str,const Ex &re,const std::string &replacement); + +} // namespace + +#endif |