From be0b62cab431089cb8693ca859316a86dbae2f94 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 29 Jun 1998 20:29:08 +0000 Subject: Added findall() to RegexObject -- return a list of all matches in a string. Added groupdict() to MatchObject -- return the named groups as a dict. Added default argument to groups() to specify what to return for unmatching groups; groupdict() also has this. --- Lib/re.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/Lib/re.py b/Lib/re.py index 75905c5..c5b71b8 100644 --- a/Lib/re.py +++ b/Lib/re.py @@ -57,6 +57,11 @@ def split(pattern, string, maxsplit=0): pattern = _cachecompile(pattern) return pattern.split(string, maxsplit) +def findall(pattern, string): + if type(pattern) == type(''): + pattern = _cachecompile(pattern) + return pattern.findall(string) + def escape(pattern): "Escape all non-alphanumeric characters in pattern." result = [] @@ -80,6 +85,7 @@ def compile(pattern, flags=0): # class RegexObject: + def __init__(self, pattern, flags, code, groupindex): self.code = code self.flags = flags @@ -171,7 +177,7 @@ class RegexObject: return (string.join(results, ''), n) def split(self, source, maxsplit=0): - """Split the \var{source} string by the occurrences of the pattern, + """Split the source string by the occurrences of the pattern, returning a list containing the resulting substrings.""" if maxsplit < 0: @@ -198,13 +204,38 @@ class RegexObject: results.append(source[lastmatch:i]) g = m.groups() if g: - if type(g)==type( "" ): g = [g] results[len(results):] = list(g) pos = lastmatch = j n = n + 1 results.append(source[lastmatch:]) return results + def findall(self, string): + """Return a list of all non-overlapping matches in the string. + + If one or more groups are present in the pattern, return a + list of groups; this will be a list of tuples if the pattern + has more than one group. + + Empty matches are included in the result. + + """ + pos = 0 + n = len(string) + result = [] + while pos <= n: + m = self.search(string, pos) + if not m: + break + gr = m.groups() + if not gr: + gr = m.group() + elif len(gr) == 1: + gr = gr[0] + result.append(gr) + pos = max(m.end(), pos+1) + return result + # The following 3 functions were contributed by Mike Fletcher, and # allow pickling and unpickling of RegexObject instances. def __getinitargs__(self): @@ -221,6 +252,7 @@ class RegexObject: self.code = apply(pcre_compile, statetuple) class MatchObject: + def __init__(self, re, string, pos, endpos, regs): self.re = re self.string = string @@ -234,7 +266,7 @@ class MatchObject: try: g = self.re.groupindex[g] except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') + raise IndexError, 'group %s is undefined' % `g` return self.regs[g][0] def end(self, g = 0): @@ -243,31 +275,31 @@ class MatchObject: try: g = self.re.groupindex[g] except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') + raise IndexError, 'group %s is undefined' % `g` return self.regs[g][1] def span(self, g = 0): - """Return a tuple containing the start,end of the substring - matched by group g""" + "Return (start, end) of the substring matched by group g" if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') + raise IndexError, 'group %s is undefined' % `g` return self.regs[g] - def groups(self): + def groups(self, default=None): "Return a tuple containing all subgroups of the match object" result = [] for g in range(1, self.re._num_regs): - if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): - result.append(None) + a, b = self.regs[g] + if a == -1 or b == -1: + result.append(default) else: - result.append(self.string[self.regs[g][0]:self.regs[g][1]]) + result.append(self.string[a:b]) return tuple(result) def group(self, *groups): - "Return one or more groups of the match." + "Return one or more groups of the match" if len(groups) == 0: groups = (0,) result = [] @@ -276,15 +308,28 @@ class MatchObject: try: g = self.re.groupindex[g] except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') - if len(self.regs)<=g: raise IndexError, ('group "' + str(g) + '" is undefined') - elif (self.regs[g][0] == -1) or (self.regs[g][1] == -1): + raise IndexError, 'group %s is undefined' % `g` + if g >= len(self.regs): + raise IndexError, 'group %s is undefined' % `g` + a, b = self.regs[g] + if a == -1 or b == -1: result.append(None) else: - result.append(self.string[self.regs[g][0]:self.regs[g][1]]) + result.append(self.string[a:b]) if len(result) > 1: return tuple(result) elif len(result) == 1: return result[0] else: return () + + def groupdict(self, default=None): + "Return a dictionary containing all named subgroups of the match" + dict = {} + for name, index in self.re.groupindex.items(): + a, b = self.regs[index] + if a == -1 or b == -1: + dict[name] = default + else: + dict[name] = self.string[a:b] + return dict -- cgit v0.12