diff options
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sre.py | 33 | ||||
-rw-r--r-- | Lib/sre_compile.py | 14 | ||||
-rw-r--r-- | Lib/sre_constants.py | 2 | ||||
-rw-r--r-- | Lib/sre_parse.py | 155 | ||||
-rw-r--r-- | Lib/test/output/test_sre | 2 |
5 files changed, 118 insertions, 88 deletions
@@ -10,9 +10,13 @@ # other compatibility work. # +# FIXME: change all FIXME's to XXX ;-) + import sre_compile import sre_parse +import string + # flags I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE L = LOCALE = sre_compile.SRE_FLAG_LOCALE @@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0): def compile(pattern, flags=0): return _compile(pattern, flags) +def purge(): + _cache.clear() + def template(pattern, flags=0): return _compile(pattern, flags|T) @@ -65,7 +72,7 @@ def escape(pattern): s[i] = "\\000" else: s[i] = "\\" + c - return pattern[:0].join(s) + return _join(s, pattern) # -------------------------------------------------------------------- # internals @@ -73,10 +80,14 @@ def escape(pattern): _cache = {} _MAXCACHE = 100 +def _join(seq, sep): + # internal: join into string having the same type as sep + return string.join(seq, sep[:0]) + def _compile(pattern, flags=0): # internal: compile pattern tp = type(pattern) - if tp not in (type(""), type(u"")): + if tp not in sre_compile.STRING_TYPES: return pattern key = (tp, pattern, flags) try: @@ -89,10 +100,6 @@ def _compile(pattern, flags=0): _cache[key] = p return p -def purge(): - # clear pattern cache - _cache.clear() - def _sub(pattern, template, string, count=0): # internal: pattern.sub implementation hook return _subn(pattern, template, string, count)[0] @@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0): i = e n = n + 1 append(string[i:]) - return string[:0].join(s), n + return _join(s, string[:0]), n def _split(pattern, string, maxsplit=0): # internal: pattern.split implementation hook @@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile) class Scanner: def __init__(self, lexicon): + from sre_constants import BRANCH, SUBPATTERN, INDEX self.lexicon = lexicon + # combine phrases into a compound pattern p = [] + s = sre_parse.Pattern() for phrase, action in lexicon: - p.append("(?:%s)(?P#%d)" % (phrase, len(p))) - self.scanner = _compile("|".join(p)) + p.append(sre_parse.SubPattern(s, [ + (SUBPATTERN, (None, sre_parse.parse(phrase))), + (INDEX, len(p)) + ])) + p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) + s.groups = len(p) + self.scanner = sre_compile.compile(p) def scan(self, string): result = [] append = result.append diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index d8d01ea..ef26e1c 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -197,10 +197,11 @@ def _compile(code, pattern, flags): else: emit(ATCODES[av]) elif op is BRANCH: - emit(OPCODES[op]) tail = [] for av in av[1]: + emit(OPCODES[op]) skip = len(code); emit(0) + emit(MAXCODE) # save mark _compile(code, av, flags) emit(OPCODES[JUMP]) tail.append(len(code)); emit(0) @@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags): emit(OPCODES[FAILURE]) code[skip] = len(code) - skip +STRING_TYPES = [type("")] + +try: + STRING_TYPES.append(type(unicode(""))) +except NameError: + pass + def compile(p, flags=0): # internal: convert pattern list to internal format # compile, as necessary - if type(p) in (type(""), type(u"")): + if type(p) in STRING_TYPES: import sre_parse pattern = p p = sre_parse.parse(p, flags) @@ -308,6 +316,8 @@ def compile(p, flags=0): code.append(OPCODES[SUCCESS]) + # print code + # FIXME: <fl> get rid of this limitation! assert p.pattern.groups <= 100,\ "sorry, but this version only supports 100 named groups" diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index c2cecdf..ef32c32 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -172,7 +172,7 @@ CH_UNICODE = { # flags SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) SRE_FLAG_IGNORECASE = 2 # case insensitive -SRE_FLAG_LOCALE = 4 # honor system locale +SRE_FLAG_LOCALE = 4 # honour system locale SRE_FLAG_MULTILINE = 8 # treat target as multiline string SRE_FLAG_DOTALL = 16 # treat target as a single string SRE_FLAG_UNICODE = 32 # use unicode locale diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 053335a..1b56352 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -25,12 +25,12 @@ CHARMASK = 0xff SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" -DIGITS = tuple(string.digits) +DIGITS = tuple("012345689") OCTDIGITS = tuple("01234567") HEXDIGITS = tuple("0123456789abcdefABCDEF") -WHITESPACE = tuple(string.whitespace) +WHITESPACE = tuple(" \t\n\r\v\f") ESCAPES = { r"\a": (LITERAL, 7), @@ -68,7 +68,8 @@ FLAGS = { "u": SRE_FLAG_UNICODE, } -class State: +class Pattern: + # master pattern object. keeps track of global attributes def __init__(self): self.flags = 0 self.groups = 1 @@ -88,6 +89,33 @@ class SubPattern: data = [] self.data = data self.width = None + def dump(self, level=0): + nl = 1 + for op, av in self.data: + print level*" " + op,; nl = 0 + if op == "in": + # member sublanguage + print; nl = 1 + for op, a in av: + print (level+1)*" " + op, a + elif op == "branch": + print; nl = 1 + i = 0 + for a in av[1]: + if i > 0: + print level*" " + "or" + a.dump(level+1); nl = 1 + i = i + 1 + elif type(av) in (type(()), type([])): + for a in av: + if isinstance(a, SubPattern): + if not nl: print + a.dump(level+1); nl = 1 + else: + print a, ; nl = 0 + else: + print av, ; nl = 0 + if not nl: print def __repr__(self): return repr(self.data) def __len__(self): @@ -255,10 +283,25 @@ def _escape(source, escape, state): pass raise error, "bogus escape: %s" % repr(escape) -def _branch(pattern, items): - # form a branch operator from a set of items +def _parse_sub(source, state, nested=1): + # parse an alternation: a|b|c - subpattern = SubPattern(pattern) + items = [] + while 1: + items.append(_parse(source, state)) + if source.match("|"): + continue + if not nested: + break + if not source.next or source.match(")"): + break + else: + raise error, "pattern not properly closed" + + if len(items) == 1: + return items[0] + + subpattern = SubPattern(state) # check if all items share a common prefix while 1: @@ -285,7 +328,7 @@ def _branch(pattern, items): break else: # we can store this as a character set instead of a - # branch (FIXME: use a range if possible) + # branch (the compiler may optimize this even more) set = [] for item in items: set.append(item[0]) @@ -296,8 +339,7 @@ def _branch(pattern, items): return subpattern def _parse(source, state): - - # parse regular expression pattern into an operator list. + # parse a simple pattern subpattern = SubPattern(state) @@ -451,22 +493,6 @@ def _parse(source, state): if gid is None: raise error, "unknown group name" subpattern.append((GROUPREF, gid)) - elif source.match("#"): - index = "" - while 1: - char = source.get() - if char is None: - raise error, "unterminated index" - if char == ")": - break - index = index + char - try: - index = int(index) - if index < 0 or index > MAXREPEAT: - raise ValueError - except ValueError: - raise error, "illegal index" - subpattern.append((INDEX, index)) continue else: char = source.get() @@ -491,48 +517,27 @@ def _parse(source, state): raise error, "syntax error" dir = -1 # lookbehind char = source.get() - b = [] - while 1: - p = _parse(source, state) - if source.next == ")": - if b: - b.append(p) - p = _branch(state, b) - if char == "=": - subpattern.append((ASSERT, (dir, p))) - else: - subpattern.append((ASSERT_NOT, (dir, p))) - break - elif source.match("|"): - b.append(p) - else: - raise error, "pattern not properly closed" + p = _parse_sub(source, state) + if char == "=": + subpattern.append((ASSERT, (dir, p))) + else: + subpattern.append((ASSERT_NOT, (dir, p))) + continue else: # flags while FLAGS.has_key(source.next): state.flags = state.flags | FLAGS[source.get()] if group: # parse group contents - b = [] if group == 2: # anonymous group group = None else: group = state.getgroup(name) - while 1: - p = _parse(source, state) - if group is not None: - p.append((INDEX, group)) - if source.match(")"): - if b: - b.append(p) - p = _branch(state, b) - subpattern.append((SUBPATTERN, (group, p))) - break - elif source.match("|"): - b.append(p) - else: - raise error, "group not properly closed" + p = _parse_sub(source, state) + subpattern.append((SUBPATTERN, (group, p))) + if group is not None: + p.append((INDEX, group)) else: while 1: char = source.get() @@ -555,26 +560,24 @@ def _parse(source, state): return subpattern -def parse(pattern, flags=0): +def parse(str, flags=0): # parse 're' pattern into list of (opcode, argument) tuples - source = Tokenizer(pattern) - state = State() - state.flags = flags - b = [] - while 1: - p = _parse(source, state) - tail = source.get() - if tail == "|": - b.append(p) - elif tail == ")": - raise error, "unbalanced parenthesis" - elif tail is None: - if b: - b.append(p) - p = _branch(state, b) - break - else: - raise error, "bogus characters at end of regular expression" + + source = Tokenizer(str) + + pattern = Pattern() + pattern.flags = flags + + p = _parse_sub(source, pattern, 0) + + tail = source.get() + if tail == ")": + raise error, "unbalanced parenthesis" + elif tail: + raise error, "bogus characters at end of regular expression" + + # p.dump() + return p def parse_template(source, pattern): @@ -656,4 +659,4 @@ def expand_template(template, match): if s is None: raise error, "empty group" a(s) - return sep.join(p) + return string.join(p, sep) diff --git a/Lib/test/output/test_sre b/Lib/test/output/test_sre index d949f25..05bcead 100644 --- a/Lib/test/output/test_sre +++ b/Lib/test/output/test_sre @@ -1,4 +1,6 @@ test_sre === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A') === Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a') +=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None' +=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None' === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A') |