#!/usr/bin/env python # -*- mode: python -*- # $Id$ import string import reop # reop.error and re.error should be the same, since exceptions can be # raised from either module. error = reop.error # 're error' from reop import NORMAL, CHARCLASS, REPLACEMENT from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER # compilation flags IGNORECASE = I = 0x01 MULTILINE = M = 0x02 DOTALL = S = 0x04 VERBOSE = X = 0x08 repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', '{n,}', '{n,}?', '{n,m}', '{n,m}?'] # # # def valid_identifier(id): if len(id) == 0: return 0 if (not reop.syntax_table[id[0]] & reop.word) or \ (reop.syntax_table[id[0]] & reop.digit): return 0 for char in id[1:]: if not reop.syntax_table[char] & reop.word: return 0 return 1 # # # _cache = {} _MAXCACHE = 20 def _cachecompile(pattern, flags=0): key = (pattern, flags) try: return _cache[key] except KeyError: pass value = compile(pattern, flags) if len(_cache) >= _MAXCACHE: _cache.clear() _cache[key] = value return value def match(pattern, string, flags=0): return _cachecompile(pattern, flags).match(string) def search(pattern, string, flags=0): return _cachecompile(pattern, flags).search(string) def sub(pattern, repl, string, count=0): if type(pattern) == type(''): pattern = _cachecompile(pattern) return pattern.sub(repl, string, count) def subn(pattern, repl, string, count=0): if type(pattern) == type(''): pattern = _cachecompile(pattern) return pattern.subn(repl, string, count) def split(pattern, string, maxsplit=0): if type(pattern) == type(''): pattern = _cachecompile(pattern) return pattern.split(string, maxsplit) # # # def _expand(m, repl): results = [] index = 0 size = len(repl) while index < size: found = string.find(repl, '\\', index) if found < 0: results.append(repl[index:]) break if found > index: results.append(repl[index:found]) escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) if escape_type == CHAR: results.append(value) elif escape_type == MEMORY_REFERENCE: r = m.group(value) if r is None: raise error, ('group "' + str(value) + '" did not contribute ' 'to the match') results.append(m.group(value)) else: raise error, "bad escape in replacement" return string.join(results, '') class RegexObject: def __init__(self, pattern, flags, code, num_regs, groupindex): self.code = code self.num_regs = num_regs self.flags = flags self.pattern = pattern self.groupindex = groupindex self.fastmap = build_fastmap(code) if code[0].name == 'bol': self.anchor = 1 elif code[0].name == 'begbuf': self.anchor = 2 else: self.anchor = 0 self.buffer = assemble(code) def search(self, string, pos=0): regs = reop.search(self.buffer, self.num_regs, self.flags, self.fastmap.can_be_null, self.fastmap.fastmap(), self.anchor, string, pos) if regs is None: return None return MatchObject(self, string, pos, regs) def match(self, string, pos=0): regs = reop.match(self.buffer, self.num_regs, self.flags, self.fastmap.can_be_null, self.fastmap.fastmap(), self.anchor, string, pos) if regs is None: return None return MatchObject(self, string, pos, regs) def sub(self, repl, string, count=0): return self.subn(repl, string, count)[0] def subn(self, repl, source, count=0): if count < 0: raise ValueError, "negative substibution count" if count == 0: import sys count = sys.maxint if type(repl) == type(''): if '\\' in repl: repl = lambda m, r=repl: _expand(m, r) else: repl = lambda m, r=repl: r n = 0 # Number of matches pos = 0 # Where to start searching lastmatch = -1 # End of last match results = [] # Substrings making up the result end = len(source) while n < count and pos <= end: m = self.search(source, pos) if not m: break i, j = m.span(0) if i == j == lastmatch: # Empty match adjacent to previous match pos = pos + 1 results.append(source[lastmatch:pos]) continue if pos < i: results.append(source[pos:i]) results.append(repl(m)) pos = lastmatch = j if i == j: # Last match was empty; don't try here again pos = pos + 1 results.append(source[lastmatch:pos]) n = n + 1 results.append(source[pos:]) return (string.join(results, ''), n) def split(self, source, maxsplit=0): if maxsplit < 0: raise error, "negative split count" if maxsplit == 0: import sys maxsplit = sys.maxint n = 0 pos = 0 lastmatch = 0 results = [] end = len(source) while n < maxsplit: m = self.search(source, pos) if not m: break i, j = m.span(0) if i == j: # Empty match if pos >= end: break pos = pos+1 continue results.append(source[lastmatch:i]) g = m.group() if g: results[len(results):] = list(g) pos = lastmatch = j results.append(source[lastmatch:]) return results class MatchObject: def __init__(self, re, string, pos, regs): self.re = re self.string = string self.pos = pos self.regs = regs def start(self, g): if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') return self.regs[g][0] def end(self, g): if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') return self.regs[g][1] def span(self, g): if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') return self.regs[g] def group(self, *groups): if len(groups) == 0: groups = range(1, self.re.num_regs) use_all = 1 else: use_all = 0 result = [] for g in groups: if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): result.append(None) else: result.append(self.string[self.regs[g][0]:self.regs[g][1]]) if use_all or len(result) > 1: return tuple(result) elif len(result) == 1: return result[0] else: return () # # A set of classes to make assembly a bit easier, if a bit verbose. # class Instruction: def __init__(self, opcode, size=1): self.opcode = opcode self.size = size def assemble(self, position, labels): return self.opcode def __repr__(self): return '%-15s' % (self.name) class End(Instruction): name = 'end' def __init__(self): Instruction.__init__(self, chr(0)) class Bol(Instruction): name = 'bol' def __init__(self): self.name = 'bol' Instruction.__init__(self, chr(1)) class Eol(Instruction): name = 'eol' def __init__(self): Instruction.__init__(self, chr(2)) class Set(Instruction): name = 'set' def __init__(self, set, flags=0): self.set = set if flags & IGNORECASE: self.set=map(string.lower, self.set) if len(set)==1: # If only one element, use the "exact" opcode (it'll be faster) Instruction.__init__(self, chr(4), 2) else: # Use the "set" opcode Instruction.__init__(self, chr(3), 33) def assemble(self, position, labels): if len(self.set)==1: # If only one character in set, generate an "exact" opcode return self.opcode + self.set[0] result = self.opcode temp = 0 for i, c in map(lambda x: (x, chr(x)), range(256)): if c in self.set: temp = temp | (1 << (i & 7)) if (i % 8) == 7: result = result + chr(temp) temp = 0 return result def __repr__(self): result = '%-15s' % (self.name) self.set.sort() # XXX this should print more intelligently for char in self.set: result = result + char return result class Exact(Instruction): name = 'exact' def __init__(self, char, flags): self.char = char if flags & IGNORECASE: self.char=string.lower(self.char) Instruction.__init__(self, chr(4), 2) def assemble(self, position, labels): return self.opcode + self.char def __repr__(self): return '%-15s %s' % (self.name, `self.char`) class AnyChar(Instruction): name = 'anychar' def __init__(self): Instruction.__init__(self, chr(5)) def assemble(self, position, labels): return self.opcode class MemoryInstruction(Instruction): def __init__(self, opcode, register): self.register = register Instruction.__init__(self, opcode, 2) def assemble(self, position, labels): return self.opcode + chr(self.register) def __repr__(self): return '%-15s %i' % (self.name, self.register) class StartMemory(MemoryInstruction): name = 'start_memory' def __init__(self, register): MemoryInstruction.__init__(self, chr(6), register) class EndMemory(MemoryInstruction): name = 'end_memory' def __init__(self, register): MemoryInstruction.__init__(self, chr(7), register) class MatchMemory(MemoryInstruction): name = 'match_memory' def __init__(self, register): MemoryInstruction.__init__(self, chr(8), register) class JumpInstruction(Instruction): def __init__(self, opcode, label): self.label = label Instruction.__init__(self, opcode, 3) def compute_offset(self, start, dest): return dest - (start + 3) def pack_offset(self, offset): if offset > 32767: raise error, 'offset out of range (pos)' elif offset < -32768: raise error, 'offset out of range (neg)' elif offset < 0: offset = offset + 65536 return chr(offset & 0xff) + chr((offset >> 8) & 0xff) def assemble(self, position, labels): return self.opcode + \ self.pack_offset(self.compute_offset(position, labels[self.label])) def __repr__(self): return '%-15s %i' % (self.name, self.label) class Jump(JumpInstruction): name = 'jump' def __init__(self, label): JumpInstruction.__init__(self, chr(9), label) class StarJump(JumpInstruction): name = 'star_jump' def __init__(self, label): JumpInstruction.__init__(self, chr(10), label) class FailureJump(JumpInstruction): name = 'failure_jump' def __init__(self, label): JumpInstruction.__init__(self, chr(11), label) class UpdateFailureJump(JumpInstruction): name = 'update_failure_jump' def __init__(self, label): JumpInstruction.__init__(self, chr(12), label) class DummyFailureJump(JumpInstruction): name = 'dummy_failure_jump' def __init__(self, label): JumpInstruction.__init__(self, chr(13), label) class BegBuf(Instruction): name = 'begbuf' def __init__(self): Instruction.__init__(self, chr(14)) class EndBuf(Instruction): name = 'endbuf' def __init__(self): Instruction.__init__(self, chr(15)) class WordBeg(Instruction): name = 'wordbeg' def __init__(self): Instruction.__init__(self, chr(16)) class WordEnd(Instruction): name = 'wordend' def __init__(self): Instruction.__init__(self, chr(17)) class WordBound(Instruction): name = 'wordbound' def __init__(self): Instruction.__init__(self, chr(18)) class NotWordBound(Instruction): name = 'notwordbound' def __init__(self): Instruction.__init__(self, chr(19)) class SyntaxSpec(Instruction): name = 'syntaxspec' def __init__(self, syntax): self.syntax = syntax Instruction.__init__(self, chr(20), 2) def assemble(self, postition, labels): return self.opcode + chr(self.syntax) class NotSyntaxSpec(Instruction): name = 'notsyntaxspec' def __init__(self, syntax): self.syntax = syntax Instruction.__init__(self, chr(21), 2) def assemble(self, postition, labels): return self.opcode + chr(self.syntax) class Label(Instruction): name = 'label' def __init__(self, label): self.label = label Instruction.__init__(self, '', 0) def __repr__(self): return '%-15s %i' % (self.name, self.label) class OpenParen(Instruction): name = '(' def __init__(self, register): self.register = register Instruction.__init__(self, '', 0) def assemble(self, position, labels): raise error, 'unmatched open parenthesis' class Alternation(Instruction): name = '|' def __init__(self): Instruction.__init__(self, '', 0) def assemble(self, position, labels): raise error, 'an alternation was not taken care of' # # # def assemble(instructions): labels = {} position = 0 pass1 = [] for instruction in instructions: if instruction.name == 'label': labels[instruction.label] = position else: pass1.append((position, instruction)) position = position + instruction.size pass2 = '' for position, instruction in pass1: pass2 = pass2 + instruction.assemble(position, labels) return pass2 # # # def escape(pattern): result = [] for char in pattern: if not reop.syntax_table[char] & reop.word: result.append('\\') result.append(char) return string.join(result, '') # # # def registers_used(instructions): result = [] for instruction in instructions: if (instruction.name in ['set_memory', 'end_memory']) and \ (instruction.register not in result): result.append(instruction.register) return result # # # class Fastmap: def __init__(self): self.map = ['\000']*256 self.can_be_null = 0 def add(self, char): self.map[ord(char)] = '\001' def fastmap(self): return string.join(self.map, '') def __getitem__(self, char): return ord(self.map[ord(char)]) def __repr__(self): self.map.sort() return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' # # # def find_label(code, label): line = 0 for instruction in code: if (instruction.name == 'label') and (instruction.label == label): return line + 1 line = line + 1 def build_fastmap_aux(code, pos, visited, fastmap): if visited[pos]: return while 1: instruction = code[pos] visited[pos] = 1 pos = pos + 1 if instruction.name == 'end': fastmap.can_be_null = 1 return elif instruction.name == 'syntaxspec': for char in map(chr, range(256)): if reop.syntax_table[char] & instruction.syntax: fastmap.add(char) return elif instruction.name == 'notsyntaxspec': for char in map(chr, range(256)): if not reop.syntax_table[char] & instruction.syntax: fastmap.add(char) return elif instruction.name == 'eol': fastmap.add('\n') if fastmap.can_be_null == 0: fastmap.can_be_null = 2 return elif instruction.name == 'set': for char in instruction.set: fastmap.add(char) return elif instruction.name == 'exact': fastmap.add(instruction.char) elif instruction.name == 'anychar': for char in map(chr, range(256)): if char != '\n': fastmap.add(char) return elif instruction.name == 'match_memory': for char in map(chr, range(256)): fastmap.add(char) fastmap.can_be_null = 1 return elif instruction.name in ['jump', 'dummy_failure_jump', \ 'update_failure_jump', 'star_jump']: pos = find_label(code, instruction.label) if visited[pos]: return visited[pos] = 1 elif instruction.name == 'failure_jump': build_fastmap_aux(code, find_label(code, instruction.label), visited, fastmap) def build_fastmap(code, pos=0): visited = [0] * len(code) fastmap = Fastmap() build_fastmap_aux(code, pos, visited, fastmap) return fastmap # # # #[NORMAL, CHARCLASS, REPLACEMENT] = range(3) #[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, # NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) def expand_escape(pattern, index, context=NORMAL): if index >= len(pattern): raise error, 'escape ends too soon' elif pattern[index] == 't': return CHAR, chr(9), index + 1 elif pattern[index] == 'n': return CHAR, chr(10), index + 1 elif pattern[index] == 'v': return CHAR, chr(11), index + 1 elif pattern[index] == 'r': return CHAR, chr(13), index + 1 elif pattern[index] == 'f': return CHAR, chr(12), index + 1 elif pattern[index] == 'a': return CHAR, chr(7), index + 1 elif pattern[index] == 'x': # CAUTION: this is the Python rule, not the Perl rule! end = index + 1 # Skip over the 'x' character while (end < len(pattern)) and (pattern[end] in string.hexdigits): end = end + 1 if end == index: raise error, "\\x must be followed by hex digit(s)" # let Python evaluate it, so we don't incorrectly 2nd-guess # what it's doing (and Python in turn passes it on to sscanf, # so that *it* doesn't incorrectly 2nd-guess what C does!) char = eval ('"' + pattern[index-1:end] + '"') assert len(char) == 1 return CHAR, char, end elif pattern[index] == 'b': if context != NORMAL: return CHAR, chr(8), index + 1 else: return WORD_BOUNDARY, '', index + 1 elif pattern[index] == 'B': if context != NORMAL: return CHAR, 'B', index + 1 else: return NOT_WORD_BOUNDARY, '', index + 1 elif pattern[index] == 'A': if context != NORMAL: return CHAR, 'A', index + 1 else: return BEGINNING_OF_BUFFER, '', index + 1 elif pattern[index] == 'Z': if context != NORMAL: return CHAR, 'Z', index + 1 else: return END_OF_BUFFER, '', index + 1 elif pattern[index] in 'GluLUQE': raise error, ('\\' + pattern[index] + ' is not allowed') elif pattern[index] == 'w': if context == NORMAL: return SYNTAX, reop.word, index + 1 elif context == CHARCLASS: set = [] for char in reop.syntax_table.keys(): if reop.syntax_table[char] & reop.word: set.append(char) return SET, set, index + 1 else: return CHAR, 'w', index + 1 elif pattern[index] == 'W': if context == NORMAL: return NOT_SYNTAX, reop.word, index + 1 elif context == CHARCLASS: set = [] for char in reop.syntax_table.keys(): if not reop.syntax_table[char] & reop.word: set.append(char) return SET, set, index + 1 else: return CHAR, 'W', index + 1 elif pattern[index] == 's': if context == NORMAL: return SYNTAX, reop.whitespace, index + 1 elif context == CHARCLASS: set = [] for char in reop.syntax_table.keys(): if reop.syntax_table[char] & reop.whitespace: set.append(char) return SET, set, index + 1 else: return CHAR, 's', index + 1 elif pattern[index] == 'S': if context == NORMAL: return NOT_SYNTAX, reop.whitespace, index + 1 elif context == CHARCLASS: set = [] for char in reop.syntax_table.keys(): if not reop.syntax_table[char] & reop.whitespace: set.append(char) return SET, set, index + 1 else: return CHAR, 'S', index + 1 elif pattern[index] == 'd': if context == NORMAL: return SYNTAX, reop.digit, index + 1 elif context == CHARCLASS: set = [] for char in reop.syntax_table.keys(): if reop.syntax_table[char] & reop.digit: set.append(char) return SET, set, index + 1 else: return CHAR, 'd', index + 1 elif pattern[index] == 'D': if context == NORMAL: return NOT_SYNTAX, reop.digit, index + 1 elif context == CHARCLASS: set = [] for char in reop.syntax_table.keys(): if not reop.syntax_table[char] & reop.digit: set.append(char) return SET, set, index + 1 else: return CHAR, 'D', index + 1 elif pattern[index] in '0123456789': if pattern[index] == '0': if (index + 1 < len(pattern)) and \ (pattern[index + 1] in string.octdigits): if (index + 2 < len(pattern)) and \ (pattern[index + 2] in string.octdigits): value = string.atoi(pattern[index:index + 3], 8) index = index + 3 else: value = string.atoi(pattern[index:index + 2], 8) index = index + 2 else: value = 0 index = index + 1 if value > 255: raise error, 'octal value out of range' return CHAR, chr(value), index else: if (index + 1 < len(pattern)) and \ (pattern[index + 1] in string.digits): if (index + 2 < len(pattern)) and \ (pattern[index + 2] in string.octdigits) and \ (pattern[index + 1] in string.octdigits) and \ (pattern[index] in string.octdigits): value = string.atoi(pattern[index:index + 3], 8) if value > 255: raise error, 'octal value out of range' return CHAR, chr(value), index + 3 else: value = string.atoi(pattern[index:index + 2]) if (value < 1) or (value > 99): raise error, 'memory reference out of range' if context == CHARCLASS: raise error, ('cannot reference a register from ' 'inside a character class') return MEMORY_REFERENCE, value, index + 2 else: if context == CHARCLASS: raise error, ('cannot reference a register from ' 'inside a character class') value = string.atoi(pattern[index]) return MEMORY_REFERENCE, value, index + 1 elif pattern[index] == 'g': if context != REPLACEMENT: return CHAR, 'g', index + 1 index = index + 1 if index >= len(pattern): raise error, 'unfinished symbolic reference' if pattern[index] != '<': raise error, 'missing < in symbolic reference' index = index + 1 end = string.find(pattern, '>', index) if end == -1: raise error, 'unfinished symbolic reference' value = pattern[index:end] if not valid_identifier(value): raise error, 'illegal symbolic reference' return MEMORY_REFERENCE, value, end + 1 else: return CHAR, pattern[index], index + 1 def compile(pattern, flags=0): stack = [] label = 0 register = 1 groupindex = {} lastop = '' # look for embedded pattern modifiers at the beginning of the pattern index = 0 if len(pattern) >= 3 and \ (pattern[:2] == '(?') and \ (pattern[2] in 'iImMsSxX'): index = 2 while (index < len(pattern)) and (pattern[index] != ')'): if pattern[index] in 'iI': flags = flags | IGNORECASE elif pattern[index] in 'mM': flags = flags | MULTILINE elif pattern[index] in 'sS': flags = flags | DOTALL elif pattern[index] in 'xX': flags = flags | VERBOSE else: raise error, 'unknown modifier' index = index + 1 index = index + 1 # compile the rest of the pattern while (index < len(pattern)): char = pattern[index] index = index + 1 if char == '\\': escape_type, value, index = expand_escape(pattern, index) if escape_type == CHAR: stack.append([Exact(value, flags)]) lastop = '\\' + value elif escape_type == MEMORY_REFERENCE: if value >= register: raise error, ('cannot reference a register ' 'not yet used') stack.append([MatchMemory(value)]) lastop = '\\1' elif escape_type == BEGINNING_OF_BUFFER: stack.append([BegBuf()]) lastop = '\\A' elif escape_type == END_OF_BUFFER: stack.append([EndBuf()]) lastop = '\\Z' elif escape_type == WORD_BOUNDARY: stack.append([WordBound()]) lastop = '\\b' elif escape_type == NOT_WORD_BOUNDARY: stack.append([NotWordBound()]) lastop = '\\B' elif escape_type == SYNTAX: stack.append([SyntaxSpec(value)]) if value == reop.word: lastop = '\\w' elif value == reop.whitespace: lastop = '\\s' elif value == reop.digit: lastop = '\\d' else: lastop = '\\?' elif escape_type == NOT_SYNTAX: stack.append([NotSyntaxSpec(value)]) if value == reop.word: lastop = '\\W' elif value == reop.whitespace: lastop = '\\S' elif value == reop.digit: lastop = '\\D' else: lastop = '\\?' elif escape_type == SET: raise error, 'cannot use set escape type here' else: raise error, 'unknown escape type' elif char == '|': expr = [] while (len(stack) != 0) and \ (stack[-1][0].name != '(') and \ (stack[-1][0].name != '|'): expr = stack[-1] + expr del stack[-1] stack.append([FailureJump(label)] + \ expr + \ [Jump(-1), Label(label)]) stack.append([Alternation()]) label = label + 1 lastop = '|' elif char == '(': if index >= len(pattern): raise error, 'no matching close paren' elif pattern[index] == '?': # Perl style (?...) extensions index = index + 1 if index >= len(pattern): raise error, 'extension ends prematurely' elif pattern[index] == 'P': # Python extensions index = index + 1 if index >= len(pattern): raise error, 'extension ends prematurely' elif pattern[index] == '<': # Handle Python symbolic group names (?P<...>...) index = index + 1 end = string.find(pattern, '>', index) if end == -1: raise error, 'no end to symbolic group name' name = pattern[index:end] if not valid_identifier(name): raise error, ('symbolic group name must be a ' 'valid identifier') index = end + 1 groupindex[name] = register stack.append([OpenParen(register)]) register = register + 1 lastop = '(' elif pattern[index] == '=': # backreference to symbolic group name if index >= len(pattern): raise error, '(?P= at the end of the pattern' start = index + 1 end = string.find(pattern, ')', start) if end == -1: raise error, 'no ) to end symbolic group name' name = pattern[start:end] if name not in groupindex.keys(): raise error, ('symbolic group name ' + name + \ ' has not been used yet') stack.append([MatchMemory(groupindex[name])]) index = end + 1 lastop = '(?P=)' else: raise error, ('unknown Python extension: ' + \ pattern[index]) elif pattern[index] == ':': # grouping, but no registers index = index + 1 stack.append([OpenParen(-1)]) lastop = '(' elif pattern[index] == '#': # comment index = index + 1 end = string.find(pattern, ')', index) if end == -1: raise error, 'no end to comment' index = end + 1 # do not change lastop elif pattern[index] == '=': raise error, ('zero-width positive lookahead ' 'assertion is unsupported') elif pattern[index] == '!': raise error, ('zero-width negative lookahead ' 'assertion is unsupported') elif pattern[index] in 'iImMsSxX': raise error, ('embedded pattern modifiers are only ' 'allowed at the beginning of the pattern') else: raise error, 'unknown extension' else: stack.append([OpenParen(register)]) register = register + 1 lastop = '(' elif char == ')': # make one expression out of everything on the stack up to # the marker left by the last parenthesis expr = [] while (len(stack) > 0) and (stack[-1][0].name != '('): expr = stack[-1] + expr del stack[-1] if len(stack) == 0: raise error, 'too many close parens' # remove markers left by alternation expr = filter(lambda x: x.name != '|', expr) # clean up jumps inserted by alternation need_label = 0 for i in range(len(expr)): if (expr[i].name == 'jump') and (expr[i].label == -1): expr[i] = Jump(label) need_label = 1 if need_label: expr.append(Label(label)) label = label + 1 if stack[-1][0].register > 0: expr = [StartMemory(stack[-1][0].register)] + \ expr + \ [EndMemory(stack[-1][0].register)] del stack[-1] stack.append(expr) lastop = ')' elif char == '{': if len(stack) == 0: raise error, 'no expression to repeat' end = string.find(pattern, '}', index) if end == -1: raise error, ('no close curly bracket to match' ' open curly bracket') fields = map(string.strip, string.split(pattern[index:end], ',')) index = end + 1 minimal = 0 if (index < len(pattern)) and (pattern[index] == '?'): minimal = 1 index = index + 1 if len(fields) == 1: # {n} or {n}? (there's really no difference) try: count = string.atoi(fields[0]) except ValueError: raise error, ('count must be an integer ' 'inside curly braces') if count > 65535: raise error, 'repeat count out of range' expr = [] while count > 0: expr = expr + stack[-1] count = count - 1 del stack[-1] stack.append(expr) if minimal: lastop = '{n}?' else: lastop = '{n}' elif len(fields) == 2: # {n,} or {n,m} if fields[1] == '': # {n,} try: min = string.atoi(fields[0]) except ValueError: raise error, ('minimum must be an integer ' 'inside curly braces') if min > 65535: raise error, 'minimum repeat count out of range' expr = [] while min > 0: expr = expr + stack[-1] min = min - 1 if minimal: expr = expr + \ ([Jump(label + 1), Label(label)] + \ stack[-1] + \ [Label(label + 1), FailureJump(label)]) lastop = '{n,}?' else: expr = expr + \ ([Label(label), FailureJump(label + 1)] + stack[-1] + [StarJump(label), Label(label + 1)]) lastop = '{n,}' del stack[-1] stack.append(expr) label = label + 2 else: # {n,m} try: min = string.atoi(fields[0]) except ValueError: raise error, ('minimum must be an integer ' 'inside curly braces') try: max = string.atoi(fields[1]) except ValueError: raise error, ('maximum must be an integer ' 'inside curly braces') if min > 65535: raise error, ('minumim repeat count out ' 'of range') if max > 65535: raise error, ('maximum repeat count out ' 'of range') if min > max: raise error, ('minimum repeat count must be ' 'less than the maximum ' 'repeat count') expr = [] while min > 0: expr = expr + stack[-1] min = min - 1 max = max - 1 if minimal: while max > 0: expr = expr + \ [FailureJump(label), Jump(label + 1), Label(label)] + \ stack[-1] + \ [Label(label + 1)] max = max - 1 label = label + 2 del stack[-1] stack.append(expr) lastop = '{n,m}?' else: while max > 0: expr = expr + \ [FailureJump(label)] + \ stack[-1] max = max - 1 del stack[-1] stack.append(expr + [Label(label)]) label = label + 1 lastop = '{n,m}' else: raise error, ('there need to be one or two fields ' 'in a {} expression') elif char == '}': raise error, 'unbalanced close curly brace' elif char == '*': # Kleene closure if len(stack) == 0: raise error, '* needs something to repeat' if lastop in ['(', '|']: raise error, '* needs something to repeat' if lastop in repetition_operators: raise error, 'nested repetition operators' if (index < len(pattern)) and (pattern[index] == '?'): # non-greedy matching expr = [Jump(label + 1), Label(label)] + \ stack[-1] + \ [Label(label + 1), FailureJump(label)] index = index + 1 lastop = '*?' else: # greedy matching expr = [Label(label), FailureJump(label + 1)] + \ stack[-1] + \ [StarJump(label), Label(label + 1)] lastop = '*' del stack[-1] stack.append(expr) label = label + 2 elif char == '+': # positive closure if len(stack) == 0: raise error, '+ needs something to repeat' if lastop in ['(', '|']: raise error, '+ needs something to repeat' if lastop in repetition_operators: raise error, 'nested repetition operators' if (index < len(pattern)) and (pattern[index] == '?'): # non-greedy expr = [Label(label)] + \ stack[-1] + \ [FailureJump(label)] label = label + 1 index = index + 1 lastop = '+?' else: # greedy expr = [DummyFailureJump(label + 1), Label(label), FailureJump(label + 2), Label(label + 1)] + \ stack[-1] + \ [StarJump(label), Label(label + 2)] label = label + 3 lastop = '+' del stack[-1] stack.append(expr) elif char == '?': if len(stack) == 0: raise error, 'need something to be optional' if len(stack) == 0: raise error, '? needs something to repeat' if lastop in ['(', '|']: raise error, '? needs something to repeat' if lastop in repetition_operators: raise error, 'nested repetition operators' if (index < len(pattern)) and (pattern[index] == '?'): # non-greedy matching expr = [FailureJump(label), Jump(label + 1), Label(label)] + \ stack[-1] + \ [Label(label + 1)] label = label + 2 index = index + 1 lastop = '??' else: # greedy matching expr = [FailureJump(label)] + \ stack[-1] + \ [Label(label)] label = label + 1 lastop = '?' del stack[-1] stack.append(expr) elif char == '.': if flags & DOTALL: stack.append([Set(map(chr, range(256)), flags)]) else: stack.append([AnyChar()]) lastop = '.' elif char == '^': if flags & MULTILINE: stack.append([Bol()]) else: stack.append([BegBuf()]) lastop = '^' elif char == '$': if flags & MULTILINE: stack.append([Eol()]) else: stack.append([EndBuf()]) lastop = '$' elif char == '#': if flags & VERBOSE: # comment index = index + 1 end = string.find(pattern, '\n', index) if end == -1: index = len(pattern) else: index = end + 1 # do not change lastop else: stack.append([Exact(char, flags)]) lastop = '#' elif char in string.whitespace: if not (flags & VERBOSE): stack.append([Exact(char, flags)]) lastop = char elif char == '[': # compile character class if index >= len(pattern): raise error, 'unclosed character class' negate = 0 last = '' set = [] if pattern[index] == '^': negate = 1 index = index + 1 if index >= len(pattern): raise error, 'unclosed character class' if pattern[index] == ']': set.append(']') index = index + 1 if index >= len(pattern): raise error, 'unclosed character class' elif pattern[index] == '-': set.append('-') index = index + 1 if index >= len(pattern): raise error, 'unclosed character class' while (index < len(pattern)) and (pattern[index] != ']'): next = pattern[index] index = index + 1 if next == '-': if index >= len(pattern): raise error, 'incomplete range in character class' elif pattern[index] == ']': set.append('-') else: if last == '': raise error, ('improper use of range in ' 'character class') start = last if pattern[index] == '\\': escape_type, value, index = expand_escape(pattern, index + 1, CHARCLASS) if escape_type == CHAR: end = value else: raise error, ('illegal escape in character ' 'class range') else: end = pattern[index] index = index + 1 if start > end: raise error, ('range arguments out of order ' 'in character class') for char in map(chr, range(ord(start), ord(end) + 1)): if char not in set: set.append(char) last = '' elif next == '\\': # expand syntax meta-characters and add to set if index >= len(pattern): raise error, 'incomplete set' escape_type, value, index = expand_escape(pattern, index, CHARCLASS) if escape_type == CHAR: set.append(value) last = value elif escape_type == SET: for char in value: if char not in set: set.append(char) last = '' else: raise error, 'illegal escape type in character class' else: if next not in set: set.append(next) last = next if (index >= len(pattern)) or ( pattern[index] != ']'): raise error, 'incomplete set' index = index + 1 if negate: # If case is being ignored, then both upper- and lowercase # versions of the letters must be excluded. if flags & IGNORECASE: set=set+map(string.upper, set) notset = [] for char in map(chr, range(256)): if char not in set: notset.append(char) if len(notset) == 0: raise error, 'empty negated set' stack.append([Set(notset, flags)]) else: if len(set) == 0: raise error, 'empty set' stack.append([Set(set, flags)]) lastop = '[]' else: stack.append([Exact(char, flags)]) lastop = char code = [] while len(stack) > 0: if stack[-1][0].name == '(': raise error, 'too many open parens' code = stack[-1] + code del stack[-1] if len(code) == 0: raise error, 'no code generated' code = filter(lambda x: x.name != '|', code) need_label = 0 for i in range(len(code)): if (code[i].name == 'jump') and (code[i].label == -1): code[i] = Jump(label) need_label = 1 if need_label: code.append(Label(label)) label = label + 1 code.append(End()) # print code return RegexObject(pattern, flags, code, register, groupindex) # Replace expand_escape and _expand functions with their C equivalents. # If you suspect bugs in the C versions, comment out the next two lines expand_escape = reop.expand_escape _expand = reop._expand