diff options
Diffstat (limited to 'Lib/dos-8x3/sre_comp.py')
-rw-r--r-- | Lib/dos-8x3/sre_comp.py | 221 |
1 files changed, 104 insertions, 117 deletions
diff --git a/Lib/dos-8x3/sre_comp.py b/Lib/dos-8x3/sre_comp.py index 8738061..c042375 100644 --- a/Lib/dos-8x3/sre_comp.py +++ b/Lib/dos-8x3/sre_comp.py @@ -1,24 +1,16 @@ # # Secret Labs' Regular Expression Engine -# $Id$ # # convert template to internal format # # Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. # -# This code can only be used for 1.6 alpha testing. All other use -# require explicit permission from Secret Labs AB. -# # Portions of this engine have been developed in cooperation with # CNRI. Hewlett-Packard provided funding for 1.6 integration and # other compatibility work. # -# FIXME: <fl> formalize (objectify?) and document the compiler code -# format, so that other frontends can use this compiler - -import array, string, sys - +import array import _sre from sre_constants import * @@ -30,158 +22,153 @@ for WORDSIZE in "BHil": else: raise RuntimeError, "cannot find a useable array type" -# FIXME: <fl> should move some optimizations from the parser to here! - -class Code: - def __init__(self): - self.data = [] - def __len__(self): - return len(self.data) - def __getitem__(self, index): - return self.data[index] - def __setitem__(self, index, code): - self.data[index] = code - def append(self, code): - self.data.append(code) - def todata(self): - # print self.data - return array.array(WORDSIZE, self.data).tostring() - -def _lower(literal): - # return _sre._lower(literal) # FIXME - return string.lower(literal) - def _compile(code, pattern, flags): - append = code.append + emit = code.append for op, av in pattern: if op is ANY: - if "s" in flags: - append(CODES[op]) # any character at all! + if flags & SRE_FLAG_DOTALL: + emit(OPCODES[op]) else: - append(CODES[NOT_LITERAL]) - append(10) + emit(OPCODES[CATEGORY]) + emit(CHCODES[CATEGORY_NOT_LINEBREAK]) elif op in (SUCCESS, FAILURE): - append(CODES[op]) + emit(OPCODES[op]) elif op is AT: - append(CODES[op]) - append(POSITIONS[av]) + emit(OPCODES[op]) + if flags & SRE_FLAG_MULTILINE: + emit(ATCODES[AT_MULTILINE[av]]) + else: + emit(ATCODES[av]) elif op is BRANCH: - append(CODES[op]) + emit(OPCODES[op]) tail = [] for av in av[1]: - skip = len(code); append(0) + skip = len(code); emit(0) _compile(code, av, flags) - append(CODES[JUMP]) - tail.append(len(code)); append(0) + emit(OPCODES[JUMP]) + tail.append(len(code)); emit(0) code[skip] = len(code) - skip - append(0) # end of branch - for tail in tail: + emit(0) # end of branch + for tail in tail: code[tail] = len(code) - tail elif op is CALL: - append(CODES[op]) - skip = len(code); append(0) + emit(OPCODES[op]) + skip = len(code); emit(0) _compile(code, av, flags) - append(CODES[SUCCESS]) + emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip - elif op is CATEGORY: # not used by current parser - append(CODES[op]) - append(CATEGORIES[av]) + elif op is CATEGORY: + emit(OPCODES[op]) + if flags & SRE_FLAG_LOCALE: + emit(CH_LOCALE[CHCODES[av]]) + elif flags & SRE_FLAG_UNICODE: + emit(CH_UNICODE[CHCODES[av]]) + else: + emit(CHCODES[av]) elif op is GROUP: - if "i" in flags: - append(CODES[MAP_IGNORE[op]]) + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) else: - append(CODES[op]) - append(av) + emit(OPCODES[op]) + emit(av-1) elif op is IN: - if "i" in flags: - append(CODES[MAP_IGNORE[op]]) - def fixup(literal): - return ord(_lower(literal)) + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + def fixup(literal, flags=flags): + return _sre.getlower(ord(literal), flags) else: - append(CODES[op]) + emit(OPCODES[op]) fixup = ord - skip = len(code); append(0) + skip = len(code); emit(0) for op, av in av: - append(CODES[op]) + emit(OPCODES[op]) if op is NEGATE: pass elif op is LITERAL: - append(fixup(av)) + emit(fixup(av)) elif op is RANGE: - append(fixup(av[0])) - append(fixup(av[1])) + emit(fixup(av[0])) + emit(fixup(av[1])) elif op is CATEGORY: - append(CATEGORIES[av]) + if flags & SRE_FLAG_LOCALE: + emit(CH_LOCALE[CHCODES[av]]) + elif flags & SRE_FLAG_UNICODE: + emit(CH_UNICODE[CHCODES[av]]) + else: + emit(CHCODES[av]) else: - raise ValueError, "unsupported set operator" - append(CODES[FAILURE]) + raise error, "internal: unsupported set operator" + emit(OPCODES[FAILURE]) code[skip] = len(code) - skip elif op in (LITERAL, NOT_LITERAL): - if "i" in flags: - append(CODES[MAP_IGNORE[op]]) - append(ord(_lower(av))) + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) else: - append(CODES[op]) - append(ord(av)) + emit(OPCODES[op]) + emit(ord(av)) elif op is MARK: - append(CODES[op]) - append(av) + emit(OPCODES[op]) + emit(av) elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): - lo, hi = av[2].getwidth() - if lo == 0: - raise SyntaxError, "cannot repeat zero-width items" - if lo == hi == 1 and op is MAX_REPEAT: - append(CODES[MAX_REPEAT_ONE]) - skip = len(code); append(0) - append(av[0]) - append(av[1]) + if flags & SRE_FLAG_TEMPLATE: + emit(OPCODES[REPEAT]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) _compile(code, av[2], flags) - append(CODES[SUCCESS]) + emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip else: - append(CODES[op]) - skip = len(code); append(0) - append(av[0]) - append(av[1]) - _compile(code, av[2], flags) - if op is MIN_REPEAT: - append(CODES[MIN_UNTIL]) + lo, hi = av[2].getwidth() + if lo == 0: + raise error, "nothing to repeat" + if 0 and lo == hi == 1 and op is MAX_REPEAT: + # FIXME: <fl> need a better way to figure out when + # it's safe to use this one (in the parser, probably) + emit(OPCODES[MAX_REPEAT_ONE]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip else: - # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?) - append(CODES[MAX_UNTIL]) - code[skip] = len(code) - skip + emit(OPCODES[op]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip elif op is SUBPATTERN: -## group = av[0] -## if group: -## append(CODES[MARK]) -## append((group-1)*2) + group = av[0] + if group: + emit(OPCODES[MARK]) + emit((group-1)*2) _compile(code, av[1], flags) -## if group: -## append(CODES[MARK]) -## append((group-1)*2+1) + if group: + emit(OPCODES[MARK]) + emit((group-1)*2+1) else: raise ValueError, ("unsupported operand type", op) -def compile(p, flags=()): - # convert pattern list to internal format +def compile(p, flags=0): + # internal: convert pattern list to internal format if type(p) in (type(""), type(u"")): import sre_parse pattern = p p = sre_parse.parse(p) else: pattern = None - # print p.getwidth() - # print p - code = Code() - _compile(code, p.data, p.pattern.flags) - code.append(CODES[SUCCESS]) - # print list(code.data) - data = code.todata() - if 0: # debugging - print - print "-" * 68 - import sre_disasm - sre_disasm.disasm(data) - print "-" * 68 - # print len(data), p.pattern.groups, len(p.pattern.groupdict) - return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict) + flags = p.pattern.flags | flags + code = [] + _compile(code, p.data, flags) + code.append(OPCODES[SUCCESS]) + # FIXME: <fl> get rid of this limitation + assert p.pattern.groups <= 100,\ + "sorry, but this version only supports 100 named groups" + return _sre.compile( + pattern, flags, + array.array(WORDSIZE, code).tostring(), + p.pattern.groups-1, p.pattern.groupdict + ) |