diff options
author | Guido van Rossum <guido@python.org> | 2000-03-31 14:58:54 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2000-03-31 14:58:54 (GMT) |
commit | 7627c0de6968471996ce05aab200115d56efa1d5 (patch) | |
tree | 9c4191aa38d9b7428d11c1b6267b95e5add41afa /Lib/sre_parse.py | |
parent | 7a5b796322e2bf58fa8afe78bbaccfcc9492d178 (diff) | |
download | cpython-7627c0de6968471996ce05aab200115d56efa1d5.zip cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.gz cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.bz2 |
Added Fredrik Lundh's sre module and its supporting cast.
NOTE: THIS IS VERY ROUGH ALPHA CODE!
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- | Lib/sre_parse.py | 492 |
1 files changed, 492 insertions, 0 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py new file mode 100644 index 0000000..a563de3 --- /dev/null +++ b/Lib/sre_parse.py @@ -0,0 +1,492 @@ +# +# Secret Labs' Regular Expression Engine +# $Id$ +# +# convert re-style regular expression to SRE template. the current +# implementation is somewhat incomplete, and not very fast. should +# definitely be rewritten before Python 1.6 goes beta. +# +# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# +# This code can only be used for 1.6 alpha testing. All other use +# require explicit permission from Secret Labs AB. +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. +# + +# FIXME: comments marked with the FIXME tag are open issues. all such +# issues should be closed before the final beta. + +import string, sys + +from sre_constants import * + +SPECIAL_CHARS = ".\\[{()*+?^$|" +REPEAT_CHARS = "*+?{" + +OCTDIGITS = "01234567" +HEXDIGITS = "0123456789abcdefABCDEF" + +ESCAPES = { + "\\a": (LITERAL, chr(7)), + "\\b": (LITERAL, chr(8)), + "\\f": (LITERAL, chr(12)), + "\\n": (LITERAL, chr(10)), + "\\r": (LITERAL, chr(13)), + "\\t": (LITERAL, chr(9)), + "\\v": (LITERAL, chr(11)) +} + +CATEGORIES = { + "\\A": (AT, AT_BEGINNING), # start of string + "\\b": (AT, AT_BOUNDARY), + "\\B": (AT, AT_NON_BOUNDARY), + "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + "\\Z": (AT, AT_END), # end of string +} + +class Pattern: + # FIXME: <fl> rename class, and store flags in here too! + def __init__(self): + self.flags = [] + self.groups = 1 + self.groupdict = {} + def getgroup(self, name=None): + gid = self.groups + self.groups = gid + 1 + if name: + self.groupdict[name] = gid + return gid + def setflag(self, flag): + if flag not in self.flags: + self.flags.append(flag) + +class SubPattern: + # a subpattern, in intermediate form + def __init__(self, pattern, data=None): + self.pattern = pattern + if not data: + data = [] + self.data = data + self.flags = [] + self.width = None + def __repr__(self): + return repr(self.data) + def __len__(self): + return len(self.data) + def __delitem__(self, index): + del self.data[index] + def __getitem__(self, index): + return self.data[index] + def __setitem__(self, index, code): + self.data[index] = code + def __getslice__(self, start, stop): + return SubPattern(self.pattern, self.data[start:stop]) + def insert(self, index, code): + self.data.insert(index, code) + def append(self, code): + self.data.append(code) + def getwidth(self): + # determine the width (min, max) for this subpattern + if self.width: + return self.width + lo = hi = 0L + for op, av in self.data: + if op is BRANCH: + l = sys.maxint + h = 0 + for av in av[1]: + i, j = av.getwidth() + l = min(l, i) + h = min(h, j) + lo = lo + i + hi = hi + j + elif op is CALL: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[1].getwidth() + lo = lo + i + hi = hi + j + elif op in (MIN_REPEAT, MAX_REPEAT): + i, j = av[2].getwidth() + lo = lo + i * av[0] + hi = hi + j * av[1] + elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): + lo = lo + 1 + hi = hi + 1 + elif op == SUCCESS: + break + self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) + return self.width + def set(self, flag): + if not flag in self.flags: + self.flags.append(flag) + def reset(self, flag): + if flag in self.flags: + self.flags.remove(flag) + +class Tokenizer: + def __init__(self, string): + self.string = list(string) + self.next = self.__next() + def __next(self): + if not self.string: + return None + char = self.string[0] + if char[0] == "\\": + try: + c = self.string[1] + except IndexError: + raise SyntaxError, "bogus escape" + char = char + c + try: + if c == "x": + # hexadecimal constant + for i in xrange(2, sys.maxint): + c = self.string[i] + if c not in HEXDIGITS: + break + char = char + c + elif c in string.digits: + # decimal (or octal) number + for i in xrange(2, sys.maxint): + c = self.string[i] + # FIXME: if larger than current number of + # groups, interpret as an octal number + if c not in string.digits: + break + char = char + c + except IndexError: + pass # use what we've got this far + del self.string[0:len(char)] + return char + def match(self, char): + if char == self.next: + self.next = self.__next() + return 1 + return 0 + def match_set(self, set): + if self.next in set: + self.next = self.__next() + return 1 + return 0 + def get(self): + this = self.next + self.next = self.__next() + return this + +def _fixescape(escape, character_class=0): + # convert escape to (type, value) + if character_class: + # inside a character class, we'll look in the character + # escapes dictionary first + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) + else: + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) + if code: + return code + if not character_class: + try: + group = int(escape[1:]) + # FIXME: only valid if group <= current number of groups + return GROUP, group + except ValueError: + pass + try: + if escape[1:2] == "x": + escape = escape[2:] + return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff) + elif escape[1:2] in string.digits: + return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff) + elif len(escape) == 2: + return LITERAL, escape[1] + except ValueError: + pass + raise SyntaxError, "bogus escape: %s" % repr(escape) + +def _branch(subpattern, items): + + # form a branch operator from a set of items (FIXME: move this + # optimization to the compiler module!) + + # check if all items share a common prefix + while 1: + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". + # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break + + # check if the branch can be replaced by a character set + for item in items: + if len(item) != 1 or item[0][0] != LITERAL: + break + else: + # we can store this as a character set instead of a + # branch (FIXME: use a range if possible) + set = [] + for item in items: + set.append(item[0]) + subpattern.append((IN, set)) + return + + subpattern.append((BRANCH, (None, items))) + +def _parse(source, pattern, flags=()): + + # parse regular expression pattern into an operator list. + + subpattern = SubPattern(pattern) + + this = None + + while 1: + + if source.next in ("|", ")"): + break # end of subpattern + this = source.get() + if this is None: + break # end of pattern + + if this and this[0] not in SPECIAL_CHARS: + subpattern.append((LITERAL, this)) + + elif this == "[": + # character set + set = [] +## if source.match(":"): +## pass # handle character classes + if source.match("^"): + set.append((NEGATE, None)) + # check remaining characters + start = set[:] + while 1: + this = source.get() + if this == "]" and set != start: + break + elif this and this[0] == "\\": + code1 = _fixescape(this, 1) + elif this: + code1 = LITERAL, this + else: + raise SyntaxError, "unexpected end of regular expression" + if source.match("-"): + # potential range + this = source.get() + if this == "]": + set.append(code1) + set.append((LITERAL, "-")) + break + else: + if this[0] == "\\": + code2 = _fixescape(this, 1) + else: + code2 = LITERAL, this + if code1[0] != LITERAL or code2[0] != LITERAL: + raise SyntaxError, "illegal range" + if len(code1[1]) != 1 or len(code2[1]) != 1: + raise SyntaxError, "illegal range" + set.append((RANGE, (code1[1], code2[1]))) + else: + if code1[0] is IN: + code1 = code1[1][0] + set.append(code1) + + # FIXME: <fl> move set optimization to support function + if len(set)==1 and set[0][0] is LITERAL: + subpattern.append(set[0]) # optimization + elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: + subpattern.append((NOT_LITERAL, set[1][1])) # optimization + else: + # FIXME: <fl> add charmap optimization + subpattern.append((IN, set)) + + elif this and this[0] in REPEAT_CHARS: + # repeat previous item + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, sys.maxint + elif this == "+": + min, max = 1, sys.maxint + elif this == "{": + min, max = 0, sys.maxint + lo = hi = "" + while source.next in string.digits: + lo = lo + source.get() + if source.match(","): + while source.next in string.digits: + hi = hi + source.get() + else: + hi = lo + if not source.match("}"): + raise SyntaxError, "bogus range" + if lo: + min = int(lo) + if hi: + max = int(hi) + # FIXME: <fl> check that hi >= lo! + else: + raise SyntaxError, "not supported" + # figure out which item to repeat + # FIXME: should back up to the right mark, right? + if subpattern: + index = len(subpattern)-1 + while subpattern[index][0] is MARK: + index = index - 1 + item = subpattern[index:index+1] + else: + raise SyntaxError, "nothing to repeat" + if source.match("?"): + subpattern[index] = (MIN_REPEAT, (min, max, item)) + else: + subpattern[index] = (MAX_REPEAT, (min, max, item)) + elif this == ".": + subpattern.append((ANY, None)) + elif this == "(": + group = 1 + name = None + if source.match("?"): + group = 0 + # options + if source.match("P"): + # named group: skip forward to end of name + if source.match("<"): + name = "" + while 1: + char = source.get() + if char in (">", None): + break + name = name + char + group = 1 + elif source.match(":"): + # non-capturing group + group = 2 + elif source.match_set("iI"): + pattern.setflag("i") + elif source.match_set("lL"): + pattern.setflag("l") + elif source.match_set("mM"): + pattern.setflag("m") + elif source.match_set("sS"): + pattern.setflag("s") + elif source.match_set("xX"): + pattern.setflag("x") + if group: + # parse group contents + b = [] + if group == 2: + # anonymous group + group = None + else: + group = pattern.getgroup(name) + if group: + subpattern.append((MARK, (group-1)*2)) + while 1: + p = _parse(source, pattern, flags) + if source.match(")"): + if b: + b.append(p) + _branch(subpattern, b) + else: + subpattern.append((SUBPATTERN, (group, p))) + break + elif source.match("|"): + b.append(p) + else: + raise SyntaxError, "group not properly closed" + if group: + subpattern.append((MARK, (group-1)*2+1)) + else: + # FIXME: should this really be a while loop? + while source.get() not in (")", None): + pass + + elif this == "^": + subpattern.append((AT, AT_BEGINNING)) + + elif this == "$": + subpattern.append((AT, AT_END)) + + elif this and this[0] == "\\": + code =_fixescape(this) + subpattern.append(code) + + else: + raise SyntaxError, "parser error" + + return subpattern + +def parse(source, flags=()): + s = Tokenizer(source) + g = Pattern() + b = [] + while 1: + p = _parse(s, g, flags) + tail = s.get() + if tail == "|": + b.append(p) + elif tail == ")": + raise SyntaxError, "unbalanced parenthesis" + elif tail is None: + if b: + b.append(p) + p = SubPattern(g) + _branch(p, b) + break + else: + raise SyntaxError, "bogus characters at end of regular expression" + return p + +if __name__ == "__main__": + from pprint import pprint + from testpatterns import PATTERNS + a = b = c = 0 + for pattern, flags in PATTERNS: + if flags: + continue + print "-"*68 + try: + p = parse(pattern) + print repr(pattern), "->" + pprint(p.data) + import sre_compile + try: + code = sre_compile.compile(p) + c = c + 1 + except: + pass + a = a + 1 + except SyntaxError, v: + print "**", repr(pattern), v + b = b + 1 + print "-"*68 + print a, "of", b, "patterns successfully parsed" + print c, "of", b, "patterns successfully compiled" + |