summaryrefslogtreecommitdiffstats
path: root/src/regex.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/regex.h')
-rw-r--r--src/regex.h336
1 files changed, 336 insertions, 0 deletions
diff --git a/src/regex.h b/src/regex.h
new file mode 100644
index 0000000..bedc052
--- /dev/null
+++ b/src/regex.h
@@ -0,0 +1,336 @@
+/******************************************************************************
+ *
+ * Copyright (C) 1997-2021 by Dimitri van Heesch.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation under the terms of the GNU General Public License is hereby
+ * granted. No representations are made about the suitability of this software
+ * for any purpose. It is provided "as is" without express or implied warranty.
+ * See the GNU General Public License for more details.
+ *
+ * Documents produced by Doxygen are derivative works derived from the
+ * input used in their production; they are not affected by this license.
+ *
+ */
+
+#ifndef FREGEX_H
+#define FREGEX_H
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <iterator>
+
+/** Namespace for the regular expression functions */
+namespace reg
+{
+
+class Match;
+
+/** Class representing a regular expression.
+ *
+ * It has a similar API as `std::regex`,
+ * but is much faster (and also somewhat more limited).
+ */
+class Ex
+{
+ public:
+ /** Matching algorithm */
+ enum class Mode
+ {
+ RegEx, /**< full regular expression. */
+ Wildcard /**< simple globbing pattern. */
+ };
+ /** Creates a regular expression object given the pattern as a string.
+ * Two modes of matching are supported: RegEx and Wildcard
+ *
+ * The following special characters are supported in Mode::RegEx mode.
+ * - `c` matches character `c`
+ * - `.` matches any character
+ * - `^` matches the start of the input
+ * - `$` matches the end of the input
+ * - `\<` matches the start of a word
+ * - `\>` matches the end of a word
+ * - `[]` matches a set of characters
+ * - `x*` matches a sequence of zero or more `x`'s
+ * - `x+` matches a sequence of one or more `x`'s
+ * - `x?` matches an optional `x`
+ * - `(` matches the start of a capture range
+ * - `)` matches the ends a capture range
+ * - `\c` to escape a special character, such as `+`, `[`, `*`, `(`, etc.
+ * - `\t` matches a tab character
+ * - `\n` matches a newline character
+ * - `\r` matches a return character
+ * - `\s` matches any whitespace as defined by `std::isspace()`
+ * - `\d` matches any digit as defined by `std::digit()`
+ * - `\a` matches any alphabetical characters, same as `[a-z_A-Z\x80-\xFF]`
+ * - `\w` matches any alpha numercial character, same as `[a-z_A-Z0-9\x80-\xFF]`
+ * - `\xHH` matches a hexadecimal character, e.g. `\xA0` matches character code 160.
+ *
+ * A character range can be used to match a character that falls inside a range
+ * (or set of ranges).
+ * Within the opening `[` and closing `]` brackets of a character ranges the following
+ * is supported:
+ * - `^` if at the start of the range, a character matches if it is \e not in the range,
+ * e.g. `[^\d]` matches any character not a digit
+ * - `-` when placed between 2 characters it defines a range from the first character to the second.
+ * any character that falls in the range will match, e.g. [0-9] matches the digit from 0 to 9.
+ * - `\s`, `\d`, `\a`, and `\w` as explained above.
+ *
+ * @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
+ * meaning in a character range. `^` only has a special meaning as the first character.
+ *
+ * @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on
+ * capture ranges. e.g. `(abd)?` is not valid. If multiple capture ranges are
+ * specified then some character has to be in between them,
+ * e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`.
+ *
+ * In Wildcard mode `*` is used to match any sequence of zero or more characters.
+ * The character `?` can be used to match an optional character. Character ranges are
+ * also supported, but other characters like `$` and `+` are just treated as
+ * literal characters.
+ *
+ */
+ Ex(const std::string &pattern, Mode mode=Mode::RegEx);
+
+ /** Destroys the regular expression object. Frees resources. */
+ ~Ex();
+
+ /** Check if a given string matches this regular expression.
+ * @param str The input string to match against.
+ * @param match The match object to hold the matching results.
+ * @param pos The position in the string at which to start the match.
+ * @returns true iff a match is found. Details are stored in the match object.
+ */
+ bool match(const std::string &str,Match &match,size_t pos=0) const;
+ bool isValid() const;
+ private:
+ Ex(const Ex &) = delete;
+ Ex &operator=(const Ex &e) = delete;
+
+ class Private;
+ std::unique_ptr<Private> p;
+};
+
+/** Object representing the match results of a capture range. */
+class SubMatch
+{
+ public:
+ /** Creates a match for a single capture range given a non-owning pointer to the string. */
+ SubMatch(const std::string *str) : m_str(str) {}
+
+ /** Returns the position in the string at which the match starts. */
+ size_t position() const { return m_pos; }
+
+ /** Returns the length of the matching part. */
+ size_t length() const { return m_len; }
+
+ /** Returns the matching part as a string */
+ std::string str() const { return m_str ? m_str->substr(m_pos,m_len) : std::string(); }
+
+ private:
+ friend class Match;
+ void setStart(size_t pos) { m_pos=pos; }
+ void setEnd(size_t pos) { m_len=pos-m_pos; }
+ void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; }
+ size_t m_pos = std::string::npos;
+ size_t m_len = std::string::npos;
+ const std::string *m_str = nullptr;
+};
+
+/** Object representing the matching results. It consists of an array of
+ * SubMatch objects. The first entry of the array represents the whole match, any
+ * next elements represent each of the capture ranges.
+ *
+ * For example string `@42` and expression `@(\\d+)` will have two
+ * Submatches, match[0] will point to the input string as a whole, and
+ * match[1] will point to the number 42 only.
+ *
+ */
+class Match
+{
+ public:
+ /** Creates an empty match object */
+ Match() {}
+
+ /** Returns the position of the match or std::string::npos if no position is set. */
+ size_t position() const { return m_subMatches[0].position(); }
+
+ /** Returns the position of the match or std::string::npos if no length is set. */
+ size_t length() const { return m_subMatches[0].length(); }
+
+ /** Return a string representing the matching part. */
+ std::string str() const { return m_subMatches[0].str(); }
+
+ /** Return the part of the string after the match */
+ SubMatch suffix() const { SubMatch m(m_str); m.setMatch(0,position()); return m; }
+
+ /** Return the part of the string before the match */
+ SubMatch prefix() const
+ {
+ SubMatch m(m_str);
+ if (m_str)
+ {
+ size_t e = position()+length();
+ m.setMatch(e,m_str->length()-e);
+ }
+ return m;
+ }
+
+ /** Returns the number of sub matches available in this match. */
+ size_t size() const { return m_subMatches.size(); }
+
+ /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object
+ * representing the whole match.
+ */
+ const SubMatch &operator[](size_t index) const { return m_subMatches[index]; }
+
+ private:
+ friend class Ex;
+ void init(const std::string *str)
+ {
+ m_subMatches.clear();
+ m_subMatches.emplace_back(str);
+ m_str = str;
+ }
+ void startCapture(size_t index)
+ {
+ if (!m_insideCapture) // when backtracking we can re-entry the capture multiple times
+ // only update the index, example `\s*(x)`
+ {
+ m_captureIndex = m_subMatches.size();
+ m_subMatches.emplace_back(m_str);
+ m_insideCapture = true;
+ }
+ m_subMatches.back().setStart(index);
+ }
+ void endCapture(size_t index)
+ {
+ if (index>m_subMatches.back().position())
+ {
+ m_captureIndex=0;
+ m_subMatches.back().setEnd(index);
+ m_insideCapture = false;
+ }
+ }
+ void setMatch(size_t pos,size_t len)
+ {
+ m_subMatches[m_captureIndex].setMatch(pos,len);
+ }
+
+ std::vector<SubMatch> m_subMatches;
+ size_t m_captureIndex=0;
+ const std::string *m_str = nullptr;
+ bool m_insideCapture=false;
+};
+
+/** Iterator class to iterator through matches.
+ */
+class Iterator
+{
+ public:
+ using value_type = Match;
+ using difference_type = std::ptrdiff_t;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::forward_iterator_tag;
+
+ /** Creates an end-of-sequence iterator */
+ Iterator() {}
+
+ /** Creates an iterator for input string \a str, using regular expression \a re to search.
+ * @note the string and regular expression objects should remain valid while iterating.
+ */
+ Iterator(const std::string &str, const Ex &re, size_t pos=0)
+ : m_str(&str), m_re(&re), m_pos(pos) { findNext(); }
+
+ // Iterator holds pointers, so prevent temporaries to be passed as string or
+ // regular expression
+ Iterator(std::string &&str, const Ex &re) = delete;
+ Iterator(const std::string &str, Ex &&re) = delete;
+ Iterator(std::string &&str, Ex &&re) = delete;
+
+ /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */
+ bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; }
+
+ /** Returns true if the iterators are not pointing to the same match */
+ bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; }
+
+ /** Returns a reference to the current match */
+ const value_type &operator*() const { return m_match; }
+
+ /** Returns a pointer to the current match */
+ const value_type *operator->() const { return &m_match; }
+
+ /** Advances the iterator to the next match. */
+ Iterator &operator++() { findNext(); return *this; }
+
+ private:
+ void findNext()
+ {
+ if (!m_re || !m_str) { m_pos=std::string::npos; return; } // end marker
+ if (m_re->match(*m_str,m_match,m_pos))
+ {
+ m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match
+ }
+ else // no more matches, make the iterator point to the 'end-of-sequence'
+ {
+ m_pos=std::string::npos;
+ }
+ }
+ const std::string *m_str = nullptr;
+ const Ex *m_re = nullptr;
+ size_t m_pos = std::string::npos;
+ Match m_match;
+};
+
+/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
+ * Returns true iff a match was found.
+ * Details of what part of the string has matched is returned via the \a match object.
+ *
+ * An example to show how to match all identifiers in a string.
+ * @code
+ * static reg::Ex re(R"(\a\w*)");
+ * std::string = u8"void(Func是<B_C::Códe42>(42));";
+ * while (reg::search(str,match,re,pos))
+ * {
+ * std::cout << match.str() << std::endl;
+ * pos=match.position()+match.length();
+ * }
+ * @endcode
+ * produces:
+ * @code
+ * void
+ * Func是
+ * B_C
+ * Códe42
+ * @endcode
+ *
+ * @see Ex::Ex() for details on the regular expression patterns.
+ */
+bool search(const std::string &str,Match &match,const Ex &re,size_t pos=0);
+
+/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
+ * Returns true iff a match was found.
+ */
+bool search(const std::string &str,const Ex &re,size_t pos=0);
+
+/** Matches a given string \a str for a match against regular expression \a re.
+ * Returns true iff a match was found for the whole string.
+ * Any capture groups are returned via the \a match object.
+ */
+bool match(const std::string &str,Match &match,const Ex &re);
+
+/** Matches a given string \a str for a match against regular expression \a re.
+ * Returns true iff a match was found for the whole string.
+ */
+bool match(const std::string &str,const Ex &re);
+
+/** Searching in a given input string \a for parts that match regular expression \a re and
+ * replaces those parts by string \a replacement.
+ */
+std::string replace(const std::string &str,const Ex &re,const std::string &replacement);
+
+} // namespace
+
+#endif