src/regex.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336

/******************************************************************************
 *
 * Copyright (C) 1997-2021 by Dimitri van Heesch.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation under the terms of the GNU General Public License is hereby
 * granted. No representations are made about the suitability of this software
 * for any purpose. It is provided "as is" without express or implied warranty.
 * See the GNU General Public License for more details.
 *
 * Documents produced by Doxygen are derivative works derived from the
 * input used in their production; they are not affected by this license.
 *
 */

#ifndef FREGEX_H
#define FREGEX_H

#include <memory>
#include <string>
#include <vector>
#include <iterator>

/** Namespace for the regular expression functions */
namespace reg
{

class Match;

/** Class representing a regular expression.
 *
 *  It has a similar API as `std::regex`,
 *  but is much faster (and also somewhat more limited).
 */
class Ex
{
  public:
    /** Matching algorithm */
    enum class Mode
    {
      RegEx,    /**< full regular expression. */
      Wildcard  /**< simple globbing pattern. */
    };
    /** Creates a regular expression object given the pattern as a string.
     *  Two modes of matching are supported: RegEx and Wildcard
     *
     *  The following special characters are supported in Mode::RegEx mode.
     *  - `c` matches character `c`
     *  - `.` matches any character
     *  - `^` matches the start of the input
     *  - `$` matches the end of the input
     *  - `\<` matches the start of a word
     *  - `\>` matches the end of a word
     *  - `[]` matches a set of characters
     *  - `x*` matches a sequence of zero or more `x`'s
     *  - `x+` matches a sequence of one or more `x`'s
     *  - `x?` matches an optional `x`
     *  - `(`  matches the start of a capture range
     *  - `)`  matches the ends a capture range
     *  - `\c` to escape a special character, such as `+`, `[`, `*`, `(`, etc.
     *  - `\t` matches a tab character
     *  - `\n` matches a newline character
     *  - `\r` matches a return character
     *  - `\s` matches any whitespace as defined by `std::isspace()`
     *  - `\d` matches any digit as defined by `std::digit()`
     *  - `\a` matches any alphabetical characters, same as `[a-z_A-Z\x80-\xFF]`
     *  - `\w` matches any alpha numercial character, same as `[a-z_A-Z0-9\x80-\xFF]`
     *  - `\xHH` matches a hexadecimal character, e.g. `\xA0` matches character code 160.
     *
     *  A character range can be used to match a character that falls inside a range
     *  (or set of ranges).
     *  Within the opening `[` and closing `]` brackets of a character ranges the following
     *  is supported:
     *  - `^` if at the start of the range, a character matches if it is \e not in the range,
     *        e.g. `[^\d]` matches any character not a digit
     *  - `-` when placed between 2 characters it defines a range from the first character to the second.
     *        any character that falls in the range will match, e.g. [0-9] matches the digit from 0 to 9.
     *  - `\s`, `\d`, `\a`, and `\w` as explained above.
     *
     *  @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
     *  meaning in a character range. `^` only has a special meaning as the first character.
     *
     *  @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on
     *  capture ranges. e.g. `(abd)?` is not valid. If multiple capture ranges are
     *  specified then some character has to be in between them,
     *  e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`.
     *
     *  In Wildcard mode `*` is used to match any sequence of zero or more characters.
     *  The character `?` can be used to match an optional character. Character ranges are
     *  also supported, but other characters like `$` and `+` are just treated as
     *  literal characters.
     *
     */
    Ex(const std::string &pattern, Mode mode=Mode::RegEx);

    /** Destroys the regular expression object. Frees resources. */
   ~Ex();

    /** Check if a given string matches this regular expression.
     *  @param str   The input string to match against.
     *  @param match The match object to hold the matching results.
     *  @param pos   The position in the string at which to start the match.
     *  @returns true iff a match is found. Details are stored in the match object.
     */
    bool match(const std::string &str,Match &match,size_t pos=0) const;
    bool isValid() const;
  private:
    Ex(const Ex &) = delete;
    Ex &operator=(const Ex &e) = delete;

    class Private;
    std::unique_ptr<Private> p;
};

/** Object representing the match results of a capture range. */
class SubMatch
{
  public:
    /** Creates a match for a single capture range given a non-owning pointer to the string. */
    SubMatch(const std::string *str) : m_str(str) {}

    /** Returns the position in the string at which the match starts. */
    size_t position()    const { return m_pos; }

    /** Returns the length of the matching part. */
    size_t length()      const { return m_len; }

    /** Returns the matching part as a string */
    std::string str()    const { return m_str ?  m_str->substr(m_pos,m_len) : std::string(); }

  private:
    friend class Match;
    void setStart(size_t pos) { m_pos=pos; }
    void setEnd(size_t pos)   { m_len=pos-m_pos; }
    void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; }
    size_t m_pos = std::string::npos;
    size_t m_len = std::string::npos;
    const std::string *m_str = nullptr;
};

/** Object representing the matching results. It consists of an array of
 *  SubMatch objects. The first entry of the array represents the whole match, any
 *  next elements represent each of the capture ranges.
 *
 *  For example string `@42` and expression `@(\\d+)` will have two
 *  Submatches, match[0] will point to the input string as a whole, and
 *  match[1] will point to the number 42 only.
 *
 */
class Match
{
  public:
    /** Creates an empty match object */
    Match() {}

    /** Returns the position of the match or std::string::npos if no position is set. */
    size_t position() const { return m_subMatches[0].position(); }

    /** Returns the position of the match or std::string::npos if no length is set. */
    size_t length()   const { return m_subMatches[0].length();   }

    /** Return a string representing the matching part. */
    std::string str() const { return m_subMatches[0].str(); }

    /** Return the part of the string before the match */
    SubMatch prefix() const { SubMatch m(m_str); m.setMatch(0,position()); return m; }

    /** Return the part of the string after the match */
    SubMatch suffix() const
    {
      SubMatch m(m_str);
      if (m_str)
      {
        size_t e = position()+length();
        m.setMatch(e,m_str->length()-e);
      }
      return m;
    }

    /** Returns the number of sub matches available in this match. */
    size_t size()     const { return m_subMatches.size(); }

    /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object
     *  representing the whole match.
     */
    const SubMatch &operator[](size_t index) const { return m_subMatches[index]; }

  private:
    friend class Ex;
    void init(const std::string *str)
    {
      m_subMatches.clear();
      m_subMatches.emplace_back(str);
      m_str = str;
    }
    void startCapture(size_t index)
    {
      if (!m_insideCapture) // when backtracking we can re-entry the capture multiple times
                            // only update the index, example `\s*(x)`
      {
        m_captureIndex = m_subMatches.size();
        m_subMatches.emplace_back(m_str);
        m_insideCapture = true;
      }
      m_subMatches.back().setStart(index);
    }
    void endCapture(size_t index)
    {
      if (index>m_subMatches.back().position())
      {
        m_captureIndex=0;
        m_subMatches.back().setEnd(index);
        m_insideCapture = false;
      }
    }
    void setMatch(size_t pos,size_t len)
    {
      m_subMatches[m_captureIndex].setMatch(pos,len);
    }

    std::vector<SubMatch> m_subMatches;
    size_t m_captureIndex=0;
    const std::string *m_str = nullptr;
    bool m_insideCapture=false;
};

/** Iterator class to iterator through matches.
 */
class Iterator
{
  public:
    using value_type = Match;
    using difference_type = std::ptrdiff_t;
    using pointer = value_type*;
    using reference = value_type&;
    using iterator_category = std::forward_iterator_tag;

    /** Creates an end-of-sequence iterator */
    Iterator() {}

    /** Creates an iterator for input string \a str, using regular expression \a re to search.
     *  @note the string and regular expression objects should remain valid while iterating.
     */
    Iterator(const std::string &str, const Ex &re, size_t pos=0)
      : m_str(&str), m_re(&re), m_pos(pos) { findNext(); }

    // Iterator holds pointers, so prevent temporaries to be passed as string or
    // regular expression
    Iterator(std::string &&str, const Ex &re) = delete;
    Iterator(const std::string &str, Ex &&re) = delete;
    Iterator(std::string &&str, Ex &&re) = delete;

    /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */
    bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; }

    /** Returns true if the iterators are not pointing to the same match */
    bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; }

    /** Returns a reference to the current match */
    const value_type &operator*()        const { return m_match; }

    /** Returns a pointer to the current match */
    const value_type *operator->()       const { return &m_match;  }

    /** Advances the iterator to the next match. */
    Iterator &operator++() { findNext(); return *this; }

  private:
    void findNext()
    {
      if (!m_re || !m_str) { m_pos=std::string::npos; return; } // end marker
      if (m_re->match(*m_str,m_match,m_pos))
      {
        m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match
      }
      else // no more matches, make the iterator point to the 'end-of-sequence'
      {
        m_pos=std::string::npos;
      }
    }
    const std::string *m_str = nullptr;
    const Ex *m_re = nullptr;
    size_t m_pos = std::string::npos;
    Match m_match;
};

/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
 *  Returns true iff a match was found.
 *  Details of what part of the string has matched is returned via the \a match object.
 *
 *  An example to show how to match all identifiers in a string.
 *  @code
 *  static reg::Ex re(R"(\a\w*)");
 *  std::string = u8"void(Func是<B_C::Códe42>(42));";
 *  while (reg::search(str,match,re,pos))
 *  {
 *    std::cout << match.str() << std::endl;
 *    pos=match.position()+match.length();
 *  }
 *  @endcode
 *  produces:
 *  @code
 *  void
 *  Func是
 *  B_C
 *  Códe42
 *  @endcode
 *
 *  @see Ex::Ex() for details on the regular expression patterns.
 */
bool search(const std::string &str,Match &match,const Ex &re,size_t pos=0);

/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
 *  Returns true iff a match was found.
 */
bool search(const std::string &str,const Ex &re,size_t pos=0);

/** Matches a given string \a str for a match against regular expression \a re.
 *  Returns true iff a match was found for the whole string.
 *  Any capture groups are returned via the \a match object.
 */
bool match(const std::string &str,Match &match,const Ex &re);

/** Matches a given string \a str for a match against regular expression \a re.
 *  Returns true iff a match was found for the whole string.
 */
bool match(const std::string &str,const Ex &re);

/** Searching in a given input string \a for parts that match regular expression \a re and
 *  replaces those parts by string \a replacement.
 */
std::string replace(const std::string &str,const Ex &re,const std::string &replacement);

} // namespace

#endif