diff options
author | Guido van Rossum <guido@python.org> | 2000-03-31 14:58:54 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2000-03-31 14:58:54 (GMT) |
commit | 7627c0de6968471996ce05aab200115d56efa1d5 (patch) | |
tree | 9c4191aa38d9b7428d11c1b6267b95e5add41afa /Lib/sre_compile.py | |
parent | 7a5b796322e2bf58fa8afe78bbaccfcc9492d178 (diff) | |
download | cpython-7627c0de6968471996ce05aab200115d56efa1d5.zip cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.gz cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.bz2 |
Added Fredrik Lundh's sre module and its supporting cast.
NOTE: THIS IS VERY ROUGH ALPHA CODE!
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- | Lib/sre_compile.py | 187 |
1 files changed, 187 insertions, 0 deletions
# Reconstructed from: diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
# (new file mode 100644, index 0000000..3e9700b, @@ -0,0 +1,187 @@)
#
# Secret Labs' Regular Expression Engine
# $Id$
#
# convert template to internal format
#
# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
#
# This code can only be used for 1.6 alpha testing. All other use
# require explicit permission from Secret Labs AB.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#

# FIXME: <fl> formalize (objectify?) and document the compiler code
# format, so that other frontends can use this compiler

import array
import string
import sys

import _sre

from sre_constants import *

# find an array type code that matches the engine's code size
# (the probe compares the byte length of a one-item array of each
# candidate type code against _sre.getcodesize(); WORDSIZE is left
# bound to the first candidate that matches)
for WORDSIZE in "BHil":
    if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
        break
else:
    raise RuntimeError("cannot find a useable array type")

# set to a true value to dump the generated code via sre_disasm
_DEBUG = 0

# FIXME: <fl> should move some optimizations from the parser to here!

class Code:
    # growable list of code words.  supports len(), indexing, item
    # assignment and append(); todata() packs the words into a string
    # using the WORDSIZE array type code.
    def __init__(self):
        self.data = []
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def append(self, code):
        self.data.append(code)
    def todata(self):
        # print self.data
        return array.array(WORDSIZE, self.data).tostring()

def _lower(literal):
    # return the lowercase form of a literal
    # return _sre._lower(literal) # FIXME
    return string.lower(literal)

def _lower_ord(literal):
    # character code of the lowercased literal (used when "i" is set)
    return ord(_lower(literal))

def _reserve(code):
    # append a zero placeholder word and return its index, so that it
    # can be filled in by _patch once the following code is emitted
    index = len(code)
    code.append(0)
    return index

def _patch(code, index):
    # store, at a placeholder created by _reserve, the distance from
    # that placeholder to the current end of the code
    code[index] = len(code) - index

def _compile_charset(code, op, items, flags):
    # emit an IN operator: opcode, skip word, one entry per set item,
    # and a terminating FAILURE opcode.  items is a sequence of
    # (operator, argument) pairs.  raises ValueError for a set
    # operator other than NEGATE, LITERAL, RANGE or CATEGORY.
    append = code.append
    if "i" in flags:
        # use the MAP_IGNORE variant and lowercase every literal
        append(CODES[MAP_IGNORE[op]])
        fixup = _lower_ord
    else:
        append(CODES[op])
        fixup = ord
    skip = _reserve(code)
    for item_op, item_av in items:
        append(CODES[item_op])
        if item_op is NEGATE:
            pass # opcode only, no argument
        elif item_op is LITERAL:
            append(fixup(item_av))
        elif item_op is RANGE:
            append(fixup(item_av[0]))
            append(fixup(item_av[1]))
        elif item_op is CATEGORY:
            append(CATEGORIES[item_av])
        else:
            raise ValueError("unsupported set operator")
    append(CODES[FAILURE])
    _patch(code, skip)

def _compile_repeat(code, op, av, flags):
    # emit a REPEAT, MIN_REPEAT or MAX_REPEAT operator.  av[0] and
    # av[1] are emitted verbatim after the skip word (presumably the
    # minimum and maximum repeat counts -- confirm against the
    # parser) and av[2] is the repeated subpattern.  raises
    # SyntaxError if the subpattern can match zero characters.
    append = code.append
    body = av[2]
    lo, hi = body.getwidth()
    if lo == 0:
        raise SyntaxError("cannot repeat zero-width items")
    # a MAX_REPEAT whose body is exactly one character wide gets the
    # dedicated MAX_REPEAT_ONE opcode, terminated by SUCCESS
    single = lo == hi == 1 and op is MAX_REPEAT
    if single:
        append(CODES[MAX_REPEAT_ONE])
    else:
        append(CODES[op])
    skip = _reserve(code)
    append(av[0])
    append(av[1])
    _compile(code, body, flags)
    if single:
        append(CODES[SUCCESS])
    elif op is MIN_REPEAT:
        append(CODES[MIN_UNTIL])
    else:
        # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
        append(CODES[MAX_UNTIL])
    _patch(code, skip)

def _compile(code, pattern, flags):
    # append the code words for pattern to code.
    #
    # code is a Code instance (anything with append, len and item
    # assignment will do), pattern is a sequence of (op, av) pairs,
    # and flags is a container of flag characters; only "s" and "i"
    # are tested here.  raises ValueError for an operator that is not
    # handled below, and SyntaxError for a repeat of a zero-width
    # item.
    append = code.append
    for op, av in pattern:
        if op is ANY:
            if "s" in flags:
                append(CODES[op]) # any character at all!
            else:
                # anything except newline (character code 10)
                append(CODES[NOT_LITERAL])
                append(ord("\n"))
        elif op in (SUCCESS, FAILURE):
            append(CODES[op])
        elif op is AT:
            append(CODES[op])
            append(POSITIONS[av])
        elif op is BRANCH:
            append(CODES[op])
            jumps = []
            for alternative in av[1]:
                # each alternative: skip word, code, JUMP, jump word
                skip = _reserve(code)
                _compile(code, alternative, flags)
                append(CODES[JUMP])
                jumps.append(_reserve(code))
                _patch(code, skip)
            append(0) # end of branch
            # every JUMP targets the position after the whole branch
            for jump in jumps:
                _patch(code, jump)
        elif op is CALL:
            append(CODES[op])
            skip = _reserve(code)
            _compile(code, av, flags)
            append(CODES[SUCCESS])
            _patch(code, skip)
        elif op is CATEGORY: # not used by current parser
            append(CODES[op])
            append(CATEGORIES[av])
        elif op is GROUP:
            if "i" in flags:
                append(CODES[MAP_IGNORE[op]])
            else:
                append(CODES[op])
            append(av)
        elif op is IN:
            _compile_charset(code, op, av, flags)
        elif op in (LITERAL, NOT_LITERAL):
            if "i" in flags:
                append(CODES[MAP_IGNORE[op]])
                append(_lower_ord(av))
            else:
                append(CODES[op])
                append(ord(av))
        elif op is MARK:
            append(CODES[op])
            append(av)
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            _compile_repeat(code, op, av, flags)
        elif op is SUBPATTERN:
            # only the subpattern body (av[1]) is compiled.  the
            # original source carries disabled code that would wrap it
            # in MARK opcodes with arguments (group-1)*2 and
            # (group-1)*2+1 whenever group = av[0] is non-zero; that
            # emission is still switched off here.
            _compile(code, av[1], flags)
        else:
            raise ValueError("unsupported operand type", op)

def compile(p, flags=()):
    # convert pattern list to internal format
    #
    # p is either a pattern string, which is parsed with
    # sre_parse.parse, or an already parsed pattern object providing
    # .data and .pattern (with .flags, .groups and .groupdict).
    # flags is an optional sequence of flag characters; they are
    # combined with the flags recorded on the parsed pattern (this
    # argument used to be accepted but silently ignored).  returns
    # the result of _sre.compile.
    #
    # NOTE(review): flags are not forwarded to sre_parse.parse since
    # its signature is not visible from this module -- confirm
    # whether the parser should see them as well.
    if type(p) is type(""):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p)
    else:
        # no source string available for a pre-parsed pattern
        pattern = None
    # with the default flags=() this is just the pattern's own flags
    combined = list(p.pattern.flags) + list(flags)
    code = Code()
    _compile(code, p.data, combined)
    code.append(CODES[SUCCESS])
    data = code.todata()
    if _DEBUG:
        print("")
        print("-" * 68)
        import sre_disasm
        sre_disasm.disasm(data)
        print("-" * 68)
    return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)