From 7627c0de6968471996ce05aab200115d56efa1d5 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 31 Mar 2000 14:58:54 +0000 Subject: Added Fredrik Lundh's sre module and its supporting cast. NOTE: THIS IS VERY ROUGH ALPHA CODE! --- Lib/sre.py | 46 +++++ Lib/sre_compile.py | 187 ++++++++++++++++++++ Lib/sre_constants.py | 131 ++++++++++++++ Lib/sre_parse.py | 492 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 856 insertions(+) create mode 100644 Lib/sre.py create mode 100644 Lib/sre_compile.py create mode 100644 Lib/sre_constants.py create mode 100644 Lib/sre_parse.py diff --git a/Lib/sre.py b/Lib/sre.py new file mode 100644 index 0000000..0b41057 --- /dev/null +++ b/Lib/sre.py @@ -0,0 +1,46 @@ +# -*- Mode: Python; tab-width: 4 -*- +# +# Secret Labs' Regular Expression Engine +# $Id$ +# +# re-compatible interface for the sre matching engine +# +# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# +# This code can only be used for 1.6 alpha testing. All other use +# require explicit permission from Secret Labs AB. +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. +# + +""" +this is a long string +""" + +import sre_compile + +# -------------------------------------------------------------------- +# public interface + +def compile(pattern, flags=0): + return sre_compile.compile(pattern, _fixflags(flags)) + +def match(pattern, string, flags=0): + return compile(pattern, _fixflags(flags)).match(string) + +def search(pattern, string, flags=0): + assert flags == 0 + return compile(pattern, _fixflags(flags)).search(string) + +# FIXME: etc + +# -------------------------------------------------------------------- +# helpers + +def _fixflags(flags): + # convert flag bitmask to sequence + assert flags == 0 + return () + diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py new file mode 100644 index 0000000..3e9700b --- /dev/null +++ b/Lib/sre_compile.py @@ -0,0 +1,187 @@ +# +# Secret Labs' Regular Expression Engine +# $Id$ +# +# convert template to internal format +# +# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. +# +# This code can only be used for 1.6 alpha testing. All other use +# require explicit permission from Secret Labs AB. +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. +# + +# FIXME: formalize (objectify?) and document the compiler code +# format, so that other frontends can use this compiler + +import array, string, sys + +import _sre + +from sre_constants import * + +# find an array type code that matches the engine's code size +for WORDSIZE in "BHil": + if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): + break +else: + raise RuntimeError, "cannot find a useable array type" + +# FIXME: should move some optimizations from the parser to here! + +class Code: + def __init__(self): + self.data = [] + def __len__(self): + return len(self.data) + def __getitem__(self, index): + return self.data[index] + def __setitem__(self, index, code): + self.data[index] = code + def append(self, code): + self.data.append(code) + def todata(self): + # print self.data + return array.array(WORDSIZE, self.data).tostring() + +def _lower(literal): + # return _sre._lower(literal) # FIXME + return string.lower(literal) + +def _compile(code, pattern, flags): + append = code.append + for op, av in pattern: + if op is ANY: + if "s" in flags: + append(CODES[op]) # any character at all! + else: + append(CODES[NOT_LITERAL]) + append(10) + elif op in (SUCCESS, FAILURE): + append(CODES[op]) + elif op is AT: + append(CODES[op]) + append(POSITIONS[av]) + elif op is BRANCH: + append(CODES[op]) + tail = [] + for av in av[1]: + skip = len(code); append(0) + _compile(code, av, flags) + append(CODES[JUMP]) + tail.append(len(code)); append(0) + code[skip] = len(code) - skip + append(0) # end of branch + for tail in tail: + code[tail] = len(code) - tail + elif op is CALL: + append(CODES[op]) + skip = len(code); append(0) + _compile(code, av, flags) + append(CODES[SUCCESS]) + code[skip] = len(code) - skip + elif op is CATEGORY: # not used by current parser + append(CODES[op]) + append(CATEGORIES[av]) + elif op is GROUP: + if "i" in flags: + append(CODES[MAP_IGNORE[op]]) + else: + append(CODES[op]) + append(av) + elif op is IN: + if "i" in flags: + append(CODES[MAP_IGNORE[op]]) + def fixup(literal): + return ord(_lower(literal)) + else: + append(CODES[op]) + fixup = ord + skip = len(code); append(0) + for op, av in av: + append(CODES[op]) + if op is NEGATE: + pass + elif op is LITERAL: + append(fixup(av)) + elif op is RANGE: + append(fixup(av[0])) + append(fixup(av[1])) + elif op is CATEGORY: + append(CATEGORIES[av]) + else: + raise ValueError, "unsupported set operator" + append(CODES[FAILURE]) + code[skip] = len(code) - skip + elif op in (LITERAL, NOT_LITERAL): + if "i" in flags: + append(CODES[MAP_IGNORE[op]]) + append(ord(_lower(av))) + else: + append(CODES[op]) + append(ord(av)) + elif op is MARK: + append(CODES[op]) + append(av) + elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): + lo, hi = av[2].getwidth() + if lo == 0: + raise SyntaxError, "cannot repeat zero-width items" + if lo == hi == 1 and op is MAX_REPEAT: + append(CODES[MAX_REPEAT_ONE]) + skip = len(code); append(0) + append(av[0]) + append(av[1]) + _compile(code, av[2], flags) + append(CODES[SUCCESS]) + code[skip] = len(code) - skip + else: + append(CODES[op]) + skip = len(code); append(0) + append(av[0]) + append(av[1]) + _compile(code, av[2], flags) + if op is MIN_REPEAT: + append(CODES[MIN_UNTIL]) + else: + # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?) + append(CODES[MAX_UNTIL]) + code[skip] = len(code) - skip + elif op is SUBPATTERN: +## group = av[0] +## if group: +## append(CODES[MARK]) +## append((group-1)*2) + _compile(code, av[1], flags) +## if group: +## append(CODES[MARK]) +## append((group-1)*2+1) + else: + raise ValueError, ("unsupported operand type", op) + +def compile(p, flags=()): + # convert pattern list to internal format + if type(p) is type(""): + import sre_parse + pattern = p + p = sre_parse.parse(p) + else: + pattern = None + # print p.getwidth() + # print p + code = Code() + _compile(code, p.data, p.pattern.flags) + code.append(CODES[SUCCESS]) + # print list(code.data) + data = code.todata() + if 0: # debugging + print + print "-" * 68 + import sre_disasm + sre_disasm.disasm(data) + print "-" * 68 + # print len(data), p.pattern.groups, len(p.pattern.groupdict) + return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py new file mode 100644 index 0000000..af88309 --- /dev/null +++ b/Lib/sre_constants.py @@ -0,0 +1,131 @@ +# +# Secret Labs' Regular Expression Engine +# $Id$ +# +# various symbols used by the regular expression engine. +# run this script to update the _sre include files! +# +# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# +# This code can only be used for 1.6 alpha testing. All other use +# require explicit permission from Secret Labs AB. +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. +# + +# operators + +FAILURE = "failure" +SUCCESS = "success" + +ANY = "any" +ASSERT = "assert" +AT = "at" +BRANCH = "branch" +CALL = "call" +CATEGORY = "category" +GROUP = "group" +GROUP_IGNORE = "group_ignore" +IN = "in" +IN_IGNORE = "in_ignore" +JUMP = "jump" +LITERAL = "literal" +LITERAL_IGNORE = "literal_ignore" +MARK = "mark" +MAX_REPEAT = "max_repeat" +MAX_REPEAT_ONE = "max_repeat_one" +MAX_UNTIL = "max_until" +MIN_REPEAT = "min_repeat" +MIN_UNTIL = "min_until" +NEGATE = "negate" +NOT_LITERAL = "not_literal" +NOT_LITERAL_IGNORE = "not_literal_ignore" +RANGE = "range" +REPEAT = "repeat" +SUBPATTERN = "subpattern" + +# positions +AT_BEGINNING = "at_beginning" +AT_BOUNDARY = "at_boundary" +AT_NON_BOUNDARY = "at_non_boundary" +AT_END = "at_end" + +# categories + +CATEGORY_DIGIT = "category_digit" +CATEGORY_NOT_DIGIT = "category_not_digit" +CATEGORY_SPACE = "category_space" +CATEGORY_NOT_SPACE = "category_not_space" +CATEGORY_WORD = "category_word" +CATEGORY_NOT_WORD = "category_not_word" + +CODES = [ + + # failure=0 success=1 (just because it looks better that way :-) + FAILURE, SUCCESS, + + ANY, + ASSERT, + AT, + BRANCH, + CALL, + CATEGORY, + GROUP, GROUP_IGNORE, + IN, IN_IGNORE, + JUMP, + LITERAL, LITERAL_IGNORE, + MARK, + MAX_REPEAT, MAX_UNTIL, + MAX_REPEAT_ONE, + MIN_REPEAT, MIN_UNTIL, + NOT_LITERAL, NOT_LITERAL_IGNORE, + NEGATE, + RANGE, + REPEAT + +] + +# convert to dictionary +c = {} +i = 0 +for code in CODES: + c[code] = i + i = i + 1 +CODES = c + +# replacement operations for "ignore case" mode +MAP_IGNORE = { + GROUP: GROUP_IGNORE, + IN: IN_IGNORE, + LITERAL: LITERAL_IGNORE, + NOT_LITERAL: NOT_LITERAL_IGNORE +} + +POSITIONS = { + AT_BEGINNING: ord("a"), + AT_BOUNDARY: ord("b"), + AT_NON_BOUNDARY: ord("B"), + AT_END: ord("z"), +} + +CATEGORIES = { + CATEGORY_DIGIT: ord("d"), + CATEGORY_NOT_DIGIT: ord("D"), + CATEGORY_SPACE: ord("s"), + CATEGORY_NOT_SPACE: ord("S"), + CATEGORY_WORD: ord("w"), + CATEGORY_NOT_WORD: ord("W"), +} + +if __name__ == "__main__": + import string + items = CODES.items() + items.sort(lambda a, b: cmp(a[1], b[1])) + f = open("sre_constants.h", "w") + f.write("/* generated by sre_constants.py */\n") + for k, v in items: + f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n") + f.close() + print "done" diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py new file mode 100644 index 0000000..a563de3 --- /dev/null +++ b/Lib/sre_parse.py @@ -0,0 +1,492 @@ +# +# Secret Labs' Regular Expression Engine +# $Id$ +# +# convert re-style regular expression to SRE template. the current +# implementation is somewhat incomplete, and not very fast. should +# definitely be rewritten before Python 1.6 goes beta. +# +# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# +# This code can only be used for 1.6 alpha testing. All other use +# require explicit permission from Secret Labs AB. +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. +# + +# FIXME: comments marked with the FIXME tag are open issues. all such +# issues should be closed before the final beta. + +import string, sys + +from sre_constants import * + +SPECIAL_CHARS = ".\\[{()*+?^$|" +REPEAT_CHARS = "*+?{" + +OCTDIGITS = "01234567" +HEXDIGITS = "0123456789abcdefABCDEF" + +ESCAPES = { + "\\a": (LITERAL, chr(7)), + "\\b": (LITERAL, chr(8)), + "\\f": (LITERAL, chr(12)), + "\\n": (LITERAL, chr(10)), + "\\r": (LITERAL, chr(13)), + "\\t": (LITERAL, chr(9)), + "\\v": (LITERAL, chr(11)) +} + +CATEGORIES = { + "\\A": (AT, AT_BEGINNING), # start of string + "\\b": (AT, AT_BOUNDARY), + "\\B": (AT, AT_NON_BOUNDARY), + "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + "\\Z": (AT, AT_END), # end of string +} + +class Pattern: + # FIXME: rename class, and store flags in here too! + def __init__(self): + self.flags = [] + self.groups = 1 + self.groupdict = {} + def getgroup(self, name=None): + gid = self.groups + self.groups = gid + 1 + if name: + self.groupdict[name] = gid + return gid + def setflag(self, flag): + if flag not in self.flags: + self.flags.append(flag) + +class SubPattern: + # a subpattern, in intermediate form + def __init__(self, pattern, data=None): + self.pattern = pattern + if not data: + data = [] + self.data = data + self.flags = [] + self.width = None + def __repr__(self): + return repr(self.data) + def __len__(self): + return len(self.data) + def __delitem__(self, index): + del self.data[index] + def __getitem__(self, index): + return self.data[index] + def __setitem__(self, index, code): + self.data[index] = code + def __getslice__(self, start, stop): + return SubPattern(self.pattern, self.data[start:stop]) + def insert(self, index, code): + self.data.insert(index, code) + def append(self, code): + self.data.append(code) + def getwidth(self): + # determine the width (min, max) for this subpattern + if self.width: + return self.width + lo = hi = 0L + for op, av in self.data: + if op is BRANCH: + l = sys.maxint + h = 0 + for av in av[1]: + i, j = av.getwidth() + l = min(l, i) + h = min(h, j) + lo = lo + i + hi = hi + j + elif op is CALL: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[1].getwidth() + lo = lo + i + hi = hi + j + elif op in (MIN_REPEAT, MAX_REPEAT): + i, j = av[2].getwidth() + lo = lo + i * av[0] + hi = hi + j * av[1] + elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): + lo = lo + 1 + hi = hi + 1 + elif op == SUCCESS: + break + self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) + return self.width + def set(self, flag): + if not flag in self.flags: + self.flags.append(flag) + def reset(self, flag): + if flag in self.flags: + self.flags.remove(flag) + +class Tokenizer: + def __init__(self, string): + self.string = list(string) + self.next = self.__next() + def __next(self): + if not self.string: + return None + char = self.string[0] + if char[0] == "\\": + try: + c = self.string[1] + except IndexError: + raise SyntaxError, "bogus escape" + char = char + c + try: + if c == "x": + # hexadecimal constant + for i in xrange(2, sys.maxint): + c = self.string[i] + if c not in HEXDIGITS: + break + char = char + c + elif c in string.digits: + # decimal (or octal) number + for i in xrange(2, sys.maxint): + c = self.string[i] + # FIXME: if larger than current number of + # groups, interpret as an octal number + if c not in string.digits: + break + char = char + c + except IndexError: + pass # use what we've got this far + del self.string[0:len(char)] + return char + def match(self, char): + if char == self.next: + self.next = self.__next() + return 1 + return 0 + def match_set(self, set): + if self.next in set: + self.next = self.__next() + return 1 + return 0 + def get(self): + this = self.next + self.next = self.__next() + return this + +def _fixescape(escape, character_class=0): + # convert escape to (type, value) + if character_class: + # inside a character class, we'll look in the character + # escapes dictionary first + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) + else: + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) + if code: + return code + if not character_class: + try: + group = int(escape[1:]) + # FIXME: only valid if group <= current number of groups + return GROUP, group + except ValueError: + pass + try: + if escape[1:2] == "x": + escape = escape[2:] + return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff) + elif escape[1:2] in string.digits: + return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff) + elif len(escape) == 2: + return LITERAL, escape[1] + except ValueError: + pass + raise SyntaxError, "bogus escape: %s" % repr(escape) + +def _branch(subpattern, items): + + # form a branch operator from a set of items (FIXME: move this + # optimization to the compiler module!) + + # check if all items share a common prefix + while 1: + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". + # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break + + # check if the branch can be replaced by a character set + for item in items: + if len(item) != 1 or item[0][0] != LITERAL: + break + else: + # we can store this as a character set instead of a + # branch (FIXME: use a range if possible) + set = [] + for item in items: + set.append(item[0]) + subpattern.append((IN, set)) + return + + subpattern.append((BRANCH, (None, items))) + +def _parse(source, pattern, flags=()): + + # parse regular expression pattern into an operator list. + + subpattern = SubPattern(pattern) + + this = None + + while 1: + + if source.next in ("|", ")"): + break # end of subpattern + this = source.get() + if this is None: + break # end of pattern + + if this and this[0] not in SPECIAL_CHARS: + subpattern.append((LITERAL, this)) + + elif this == "[": + # character set + set = [] +## if source.match(":"): +## pass # handle character classes + if source.match("^"): + set.append((NEGATE, None)) + # check remaining characters + start = set[:] + while 1: + this = source.get() + if this == "]" and set != start: + break + elif this and this[0] == "\\": + code1 = _fixescape(this, 1) + elif this: + code1 = LITERAL, this + else: + raise SyntaxError, "unexpected end of regular expression" + if source.match("-"): + # potential range + this = source.get() + if this == "]": + set.append(code1) + set.append((LITERAL, "-")) + break + else: + if this[0] == "\\": + code2 = _fixescape(this, 1) + else: + code2 = LITERAL, this + if code1[0] != LITERAL or code2[0] != LITERAL: + raise SyntaxError, "illegal range" + if len(code1[1]) != 1 or len(code2[1]) != 1: + raise SyntaxError, "illegal range" + set.append((RANGE, (code1[1], code2[1]))) + else: + if code1[0] is IN: + code1 = code1[1][0] + set.append(code1) + + # FIXME: move set optimization to support function + if len(set)==1 and set[0][0] is LITERAL: + subpattern.append(set[0]) # optimization + elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: + subpattern.append((NOT_LITERAL, set[1][1])) # optimization + else: + # FIXME: add charmap optimization + subpattern.append((IN, set)) + + elif this and this[0] in REPEAT_CHARS: + # repeat previous item + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, sys.maxint + elif this == "+": + min, max = 1, sys.maxint + elif this == "{": + min, max = 0, sys.maxint + lo = hi = "" + while source.next in string.digits: + lo = lo + source.get() + if source.match(","): + while source.next in string.digits: + hi = hi + source.get() + else: + hi = lo + if not source.match("}"): + raise SyntaxError, "bogus range" + if lo: + min = int(lo) + if hi: + max = int(hi) + # FIXME: check that hi >= lo! + else: + raise SyntaxError, "not supported" + # figure out which item to repeat + # FIXME: should back up to the right mark, right? + if subpattern: + index = len(subpattern)-1 + while subpattern[index][0] is MARK: + index = index - 1 + item = subpattern[index:index+1] + else: + raise SyntaxError, "nothing to repeat" + if source.match("?"): + subpattern[index] = (MIN_REPEAT, (min, max, item)) + else: + subpattern[index] = (MAX_REPEAT, (min, max, item)) + elif this == ".": + subpattern.append((ANY, None)) + elif this == "(": + group = 1 + name = None + if source.match("?"): + group = 0 + # options + if source.match("P"): + # named group: skip forward to end of name + if source.match("<"): + name = "" + while 1: + char = source.get() + if char in (">", None): + break + name = name + char + group = 1 + elif source.match(":"): + # non-capturing group + group = 2 + elif source.match_set("iI"): + pattern.setflag("i") + elif source.match_set("lL"): + pattern.setflag("l") + elif source.match_set("mM"): + pattern.setflag("m") + elif source.match_set("sS"): + pattern.setflag("s") + elif source.match_set("xX"): + pattern.setflag("x") + if group: + # parse group contents + b = [] + if group == 2: + # anonymous group + group = None + else: + group = pattern.getgroup(name) + if group: + subpattern.append((MARK, (group-1)*2)) + while 1: + p = _parse(source, pattern, flags) + if source.match(")"): + if b: + b.append(p) + _branch(subpattern, b) + else: + subpattern.append((SUBPATTERN, (group, p))) + break + elif source.match("|"): + b.append(p) + else: + raise SyntaxError, "group not properly closed" + if group: + subpattern.append((MARK, (group-1)*2+1)) + else: + # FIXME: should this really be a while loop? + while source.get() not in (")", None): + pass + + elif this == "^": + subpattern.append((AT, AT_BEGINNING)) + + elif this == "$": + subpattern.append((AT, AT_END)) + + elif this and this[0] == "\\": + code =_fixescape(this) + subpattern.append(code) + + else: + raise SyntaxError, "parser error" + + return subpattern + +def parse(source, flags=()): + s = Tokenizer(source) + g = Pattern() + b = [] + while 1: + p = _parse(s, g, flags) + tail = s.get() + if tail == "|": + b.append(p) + elif tail == ")": + raise SyntaxError, "unbalanced parenthesis" + elif tail is None: + if b: + b.append(p) + p = SubPattern(g) + _branch(p, b) + break + else: + raise SyntaxError, "bogus characters at end of regular expression" + return p + +if __name__ == "__main__": + from pprint import pprint + from testpatterns import PATTERNS + a = b = c = 0 + for pattern, flags in PATTERNS: + if flags: + continue + print "-"*68 + try: + p = parse(pattern) + print repr(pattern), "->" + pprint(p.data) + import sre_compile + try: + code = sre_compile.compile(p) + c = c + 1 + except: + pass + a = a + 1 + except SyntaxError, v: + print "**", repr(pattern), v + b = b + 1 + print "-"*68 + print a, "of", b, "patterns successfully parsed" + print c, "of", b, "patterns successfully compiled" + -- cgit v0.12