From bf9d353babbd4126fc080a9a84ae65cc529b209d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 6 Oct 1997 14:45:17 +0000 Subject: New "re" regular expression support. The new re module was written by Andrew Kuchling and uses the pcre code in ../Modules/. The old re module has been renamed to re1, just in case you need it for comparison. --- Lib/re.py | 1212 +++--------------------------------------------- Lib/re1.py | 1508 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1575 insertions(+), 1145 deletions(-) create mode 100644 Lib/re1.py diff --git a/Lib/re.py b/Lib/re.py index 6c24797..6c3ee0b 100644 --- a/Lib/re.py +++ b/Lib/re.py @@ -2,42 +2,27 @@ # -*- mode: python -*- # $Id$ -import string -import reop - -# reop.error and re.error should be the same, since exceptions can be -# raised from either module. -error = reop.error # 're error' -from reop import NORMAL, CHARCLASS, REPLACEMENT -from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET -from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER - -# compilation flags - -IGNORECASE = I = 0x01 - -MULTILINE = M = 0x02 -DOTALL = S = 0x04 -VERBOSE = X = 0x08 +import sys +import string +from pcre import * -repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', - '{n,}', '{n,}?', '{n,m}', '{n,m}?'] +[ NORMAL, CHARCLASS, REPLACEMENT ] = range(3) +[ CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER ] = range(9) # -# +# First, the public part of the interface: # -def valid_identifier(id): - if len(id) == 0: - return 0 - if (not reop.syntax_table[id[0]] & reop.word) or \ - (reop.syntax_table[id[0]] & reop.digit): - return 0 - for char in id[1:]: - if not reop.syntax_table[char] & reop.word: - return 0 - return 1 +# pcre.error and re.error should be the same, since exceptions can be +# raised from either module. + +# compilation flags + +I = IGNORECASE +M = MULTILINE +S = DOTALL +X = VERBOSE # # @@ -83,60 +68,17 @@ def split(pattern, string, maxsplit=0): # # -def _expand(m, repl): - results = [] - index = 0 - size = len(repl) - while index < size: - found = string.find(repl, '\\', index) - if found < 0: - results.append(repl[index:]) - break - if found > index: - results.append(repl[index:found]) - escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) - if escape_type == CHAR: - results.append(value) - elif escape_type == MEMORY_REFERENCE: - r = m.group(value) - if r is None: - raise error, ('group "' + str(value) + '" did not contribute ' - 'to the match') - results.append(m.group(value)) - else: - raise error, "bad escape in replacement" - return string.join(results, '') - class RegexObject: - def __init__(self, pattern, flags, code, num_regs, groupindex): - self.code = code - self.num_regs = num_regs + def __init__(self, pattern, flags, code, groupindex): + self.code = code self.flags = flags self.pattern = pattern self.groupindex = groupindex - self.fastmap = build_fastmap(code) - - if code[0].name == 'bol': - self.anchor = 1 - - elif code[0].name == 'begbuf': - self.anchor = 2 - - else: - self.anchor = 0 - - self.buffer = assemble(code) def search(self, string, pos=0): - regs = reop.search(self.buffer, - self.num_regs, - self.flags, - self.fastmap.can_be_null, - self.fastmap.fastmap(), - self.anchor, - string, - pos) + regs = self.code.match(string, pos, 0) if regs is None: return None + self.num_regs=len(regs) return MatchObject(self, string, @@ -144,17 +86,10 @@ class RegexObject: regs) def match(self, string, pos=0): - regs = reop.match(self.buffer, - self.num_regs, - self.flags, - self.fastmap.can_be_null, - self.fastmap.fastmap(), - self.anchor, - string, - pos) + regs = self.code.match(string, pos, ANCHORED) if regs is None: return None - + self.num_regs=len(regs)/2 return MatchObject(self, string, pos, @@ -165,13 +100,13 @@ class RegexObject: def subn(self, repl, source, count=0): if count < 0: - raise ValueError, "negative substibution count" + raise error, "negative substitution count" if count == 0: import sys count = sys.maxint if type(repl) == type(''): if '\\' in repl: - repl = lambda m, r=repl: _expand(m, r) + repl = lambda m, r=repl: pcre_expand(m, r) else: repl = lambda m, r=repl: r n = 0 # Number of matches @@ -275,7 +210,8 @@ class MatchObject: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') - if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): + if len(self.regs)<=g: raise IndexError, ('group "' + str(g) + '" is undefined') + elif (self.regs[g][0] == -1) or (self.regs[g][1] == -1): result.append(None) else: result.append(self.string[self.regs[g][0]:self.regs[g][1]]) @@ -286,364 +222,56 @@ class MatchObject: else: return () -# -# A set of classes to make assembly a bit easier, if a bit verbose. -# - -class Instruction: - def __init__(self, opcode, size=1): - self.opcode = opcode - self.size = size - def assemble(self, position, labels): - return self.opcode - def __repr__(self): - return '%-15s' % (self.name) - -class End(Instruction): - name = 'end' - def __init__(self): - Instruction.__init__(self, chr(0)) - -class Bol(Instruction): - name = 'bol' - def __init__(self): - self.name = 'bol' - Instruction.__init__(self, chr(1)) - -class Eol(Instruction): - name = 'eol' - def __init__(self): - Instruction.__init__(self, chr(2)) - -class Set(Instruction): - name = 'set' - def __init__(self, set, flags=0): - self.set = set - if flags & IGNORECASE: self.set=map(string.lower, self.set) - if len(set)==1: - # If only one element, use the "exact" opcode (it'll be faster) - Instruction.__init__(self, chr(4), 2) - else: - # Use the "set" opcode - Instruction.__init__(self, chr(3), 33) - def assemble(self, position, labels): - if len(self.set)==1: - # If only one character in set, generate an "exact" opcode - return self.opcode + self.set[0] - result = self.opcode - temp = 0 - for i, c in map(lambda x: (x, chr(x)), range(256)): - if c in self.set: - temp = temp | (1 << (i & 7)) - if (i % 8) == 7: - result = result + chr(temp) - temp = 0 - return result - def __repr__(self): - result = '%-15s' % (self.name) - self.set.sort() - # XXX this should print more intelligently - for char in self.set: - result = result + char - return result - -class Exact(Instruction): - name = 'exact' - def __init__(self, char, flags): - self.char = char - if flags & IGNORECASE: self.char=string.lower(self.char) - Instruction.__init__(self, chr(4), 2) - def assemble(self, position, labels): - return self.opcode + self.char - def __repr__(self): - return '%-15s %s' % (self.name, `self.char`) - -class AnyChar(Instruction): - name = 'anychar' - def __init__(self): - Instruction.__init__(self, chr(5)) - def assemble(self, position, labels): - return self.opcode - -class MemoryInstruction(Instruction): - def __init__(self, opcode, register): - self.register = register - Instruction.__init__(self, opcode, 2) - def assemble(self, position, labels): - return self.opcode + chr(self.register) - def __repr__(self): - return '%-15s %i' % (self.name, self.register) - -class StartMemory(MemoryInstruction): - name = 'start_memory' - def __init__(self, register): - MemoryInstruction.__init__(self, chr(6), register) - -class EndMemory(MemoryInstruction): - name = 'end_memory' - def __init__(self, register): - MemoryInstruction.__init__(self, chr(7), register) - -class MatchMemory(MemoryInstruction): - name = 'match_memory' - def __init__(self, register): - MemoryInstruction.__init__(self, chr(8), register) - -class JumpInstruction(Instruction): - def __init__(self, opcode, label): - self.label = label - Instruction.__init__(self, opcode, 3) - def compute_offset(self, start, dest): - return dest - (start + 3) - def pack_offset(self, offset): - if offset > 32767: - raise error, 'offset out of range (pos)' - elif offset < -32768: - raise error, 'offset out of range (neg)' - elif offset < 0: - offset = offset + 65536 - return chr(offset & 0xff) + chr((offset >> 8) & 0xff) - def assemble(self, position, labels): - return self.opcode + \ - self.pack_offset(self.compute_offset(position, - labels[self.label])) - def __repr__(self): - return '%-15s %i' % (self.name, self.label) - -class Jump(JumpInstruction): - name = 'jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(9), label) - -class StarJump(JumpInstruction): - name = 'star_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(10), label) - -class FailureJump(JumpInstruction): - name = 'failure_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(11), label) - -class UpdateFailureJump(JumpInstruction): - name = 'update_failure_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(12), label) - -class DummyFailureJump(JumpInstruction): - name = 'dummy_failure_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(13), label) - -class BegBuf(Instruction): - name = 'begbuf' - def __init__(self): - Instruction.__init__(self, chr(14)) - -class EndBuf(Instruction): - name = 'endbuf' - def __init__(self): - Instruction.__init__(self, chr(15)) - -class WordBeg(Instruction): - name = 'wordbeg' - def __init__(self): - Instruction.__init__(self, chr(16)) - -class WordEnd(Instruction): - name = 'wordend' - def __init__(self): - Instruction.__init__(self, chr(17)) - -class WordBound(Instruction): - name = 'wordbound' - def __init__(self): - Instruction.__init__(self, chr(18)) - -class NotWordBound(Instruction): - name = 'notwordbound' - def __init__(self): - Instruction.__init__(self, chr(19)) - -class SyntaxSpec(Instruction): - name = 'syntaxspec' - def __init__(self, syntax): - self.syntax = syntax - Instruction.__init__(self, chr(20), 2) - def assemble(self, postition, labels): - return self.opcode + chr(self.syntax) - -class NotSyntaxSpec(Instruction): - name = 'notsyntaxspec' - def __init__(self, syntax): - self.syntax = syntax - Instruction.__init__(self, chr(21), 2) - def assemble(self, postition, labels): - return self.opcode + chr(self.syntax) - -class Label(Instruction): - name = 'label' - def __init__(self, label): - self.label = label - Instruction.__init__(self, '', 0) - def __repr__(self): - return '%-15s %i' % (self.name, self.label) - -class OpenParen(Instruction): - name = '(' - def __init__(self, register): - self.register = register - Instruction.__init__(self, '', 0) - def assemble(self, position, labels): - raise error, 'unmatched open parenthesis' - -class Alternation(Instruction): - name = '|' - def __init__(self): - Instruction.__init__(self, '', 0) - def assemble(self, position, labels): - raise error, 'an alternation was not taken care of' - -# -# -# - -def assemble(instructions): - labels = {} - position = 0 - pass1 = [] - for instruction in instructions: - if instruction.name == 'label': - labels[instruction.label] = position - else: - pass1.append((position, instruction)) - position = position + instruction.size - pass2 = '' - for position, instruction in pass1: - pass2 = pass2 + instruction.assemble(position, labels) - return pass2 - -# -# -# - def escape(pattern): result = [] + alphanum=string.letters+'_'+string.digits for char in pattern: - if not reop.syntax_table[char] & reop.word: + if char not in alphanum: result.append('\\') result.append(char) return string.join(result, '') -# -# -# - -def registers_used(instructions): - result = [] - for instruction in instructions: - if (instruction.name in ['set_memory', 'end_memory']) and \ - (instruction.register not in result): - result.append(instruction.register) - return result - -# -# -# - -class Fastmap: - def __init__(self): - self.map = ['\000']*256 - self.can_be_null = 0 - def add(self, char): - self.map[ord(char)] = '\001' - def fastmap(self): - return string.join(self.map, '') - def __getitem__(self, char): - return ord(self.map[ord(char)]) - def __repr__(self): - self.map.sort() - return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' - -# -# -# - -def find_label(code, label): - line = 0 - for instruction in code: - if (instruction.name == 'label') and (instruction.label == label): - return line + 1 - line = line + 1 - -def build_fastmap_aux(code, pos, visited, fastmap): - if visited[pos]: - return - while 1: - instruction = code[pos] - visited[pos] = 1 - pos = pos + 1 - if instruction.name == 'end': - fastmap.can_be_null = 1 - return - elif instruction.name == 'syntaxspec': - for char in map(chr, range(256)): - if reop.syntax_table[char] & instruction.syntax: - fastmap.add(char) - return - elif instruction.name == 'notsyntaxspec': - for char in map(chr, range(256)): - if not reop.syntax_table[char] & instruction.syntax: - fastmap.add(char) - return - elif instruction.name == 'eol': - fastmap.add('\n') - if fastmap.can_be_null == 0: - fastmap.can_be_null = 2 - return - elif instruction.name == 'set': - for char in instruction.set: - fastmap.add(char) - return - elif instruction.name == 'exact': - fastmap.add(instruction.char) - elif instruction.name == 'anychar': - for char in map(chr, range(256)): - if char != '\n': - fastmap.add(char) - return - elif instruction.name == 'match_memory': - for char in map(chr, range(256)): - fastmap.add(char) - fastmap.can_be_null = 1 - return - elif instruction.name in ['jump', 'dummy_failure_jump', \ - 'update_failure_jump', 'star_jump']: - pos = find_label(code, instruction.label) - if visited[pos]: - return - visited[pos] = 1 - elif instruction.name == 'failure_jump': - build_fastmap_aux(code, - find_label(code, instruction.label), - visited, - fastmap) - -def build_fastmap(code, pos=0): - visited = [0] * len(code) - fastmap = Fastmap() - build_fastmap_aux(code, pos, visited, fastmap) - return fastmap - -# -# -# +def valid_identifier(id): + import string + if len(id) == 0: + return 0 + if id[0] not in string.letters+'_': + return 0 + for char in id[1:]: + if not syntax_table[char] & word: + return 0 + return 1 -#[NORMAL, CHARCLASS, REPLACEMENT] = range(3) -#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, -# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) +def compile(pattern, flags=0): + groupindex={} + code=pcre_compile(pattern, flags, groupindex) + return RegexObject(pattern, flags, code, groupindex) + +def _expand(m, repl): + results = [] + index = 0 + size = len(repl) + while index < size: + found = string.find(repl, '\\', index) + if found < 0: + results.append(repl[index:]) + break + if found > index: + results.append(repl[index:found]) + escape_type, value, index = _expand_escape(repl, found+1, REPLACEMENT) + if escape_type == CHAR: + results.append(value) + elif escape_type == MEMORY_REFERENCE: + r = m.group(value) + if r is None: + raise error, ('group "' + str(value) + '" did not contribute ' + 'to the match') + results.append(m.group(value)) + else: + raise error, "bad escape in replacement" + return string.join(results, '') -def expand_escape(pattern, index, context=NORMAL): +def _expand_escape(pattern, index, context=NORMAL): if index >= len(pattern): raise error, 'escape ends too soon' @@ -676,7 +304,7 @@ def expand_escape(pattern, index, context=NORMAL): # what it's doing (and Python in turn passes it on to sscanf, # so that *it* doesn't incorrectly 2nd-guess what C does!) char = eval ('"' + pattern[index-1:end] + '"') - assert len(char) == 1 +# assert len(char) == 1 return CHAR, char, end elif pattern[index] == 'b': @@ -707,75 +335,21 @@ def expand_escape(pattern, index, context=NORMAL): raise error, ('\\' + pattern[index] + ' is not allowed') elif pattern[index] == 'w': - if context == NORMAL: - return SYNTAX, reop.word, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if reop.syntax_table[char] & reop.word: - set.append(char) - return SET, set, index + 1 - else: return CHAR, 'w', index + 1 elif pattern[index] == 'W': - if context == NORMAL: - return NOT_SYNTAX, reop.word, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if not reop.syntax_table[char] & reop.word: - set.append(char) - return SET, set, index + 1 - else: return CHAR, 'W', index + 1 elif pattern[index] == 's': - if context == NORMAL: - return SYNTAX, reop.whitespace, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if reop.syntax_table[char] & reop.whitespace: - set.append(char) - return SET, set, index + 1 - else: return CHAR, 's', index + 1 elif pattern[index] == 'S': - if context == NORMAL: - return NOT_SYNTAX, reop.whitespace, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if not reop.syntax_table[char] & reop.whitespace: - set.append(char) - return SET, set, index + 1 - else: return CHAR, 'S', index + 1 elif pattern[index] == 'd': - if context == NORMAL: - return SYNTAX, reop.digit, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if reop.syntax_table[char] & reop.digit: - set.append(char) - return SET, set, index + 1 - else: return CHAR, 'd', index + 1 elif pattern[index] == 'D': - if context == NORMAL: - return NOT_SYNTAX, reop.digit, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if not reop.syntax_table[char] & reop.digit: - set.append(char) - return SET, set, index + 1 - else: return CHAR, 'D', index + 1 elif pattern[index] in '0123456789': @@ -854,655 +428,3 @@ def expand_escape(pattern, index, context=NORMAL): else: return CHAR, pattern[index], index + 1 -def compile(pattern, flags=0): - stack = [] - label = 0 - register = 1 - groupindex = {} - lastop = '' - - # look for embedded pattern modifiers at the beginning of the pattern - - index = 0 - - if len(pattern) >= 3 and \ - (pattern[:2] == '(?') and \ - (pattern[2] in 'iImMsSxX'): - index = 2 - while (index < len(pattern)) and (pattern[index] != ')'): - if pattern[index] in 'iI': - flags = flags | IGNORECASE - elif pattern[index] in 'mM': - flags = flags | MULTILINE - elif pattern[index] in 'sS': - flags = flags | DOTALL - elif pattern[index] in 'xX': - flags = flags | VERBOSE - else: - raise error, 'unknown modifier' - index = index + 1 - index = index + 1 - - # compile the rest of the pattern - - while (index < len(pattern)): - char = pattern[index] - index = index + 1 - if char == '\\': - escape_type, value, index = expand_escape(pattern, index) - - if escape_type == CHAR: - stack.append([Exact(value, flags)]) - lastop = '\\' + value - - elif escape_type == MEMORY_REFERENCE: - if value >= register: - raise error, ('cannot reference a register ' - 'not yet used') - stack.append([MatchMemory(value)]) - lastop = '\\1' - - elif escape_type == BEGINNING_OF_BUFFER: - stack.append([BegBuf()]) - lastop = '\\A' - - elif escape_type == END_OF_BUFFER: - stack.append([EndBuf()]) - lastop = '\\Z' - - elif escape_type == WORD_BOUNDARY: - stack.append([WordBound()]) - lastop = '\\b' - - elif escape_type == NOT_WORD_BOUNDARY: - stack.append([NotWordBound()]) - lastop = '\\B' - - elif escape_type == SYNTAX: - stack.append([SyntaxSpec(value)]) - if value == reop.word: - lastop = '\\w' - elif value == reop.whitespace: - lastop = '\\s' - elif value == reop.digit: - lastop = '\\d' - else: - lastop = '\\?' - - elif escape_type == NOT_SYNTAX: - stack.append([NotSyntaxSpec(value)]) - if value == reop.word: - lastop = '\\W' - elif value == reop.whitespace: - lastop = '\\S' - elif value == reop.digit: - lastop = '\\D' - else: - lastop = '\\?' - - elif escape_type == SET: - raise error, 'cannot use set escape type here' - - else: - raise error, 'unknown escape type' - - elif char == '|': - expr = [] - - while (len(stack) != 0) and \ - (stack[-1][0].name != '(') and \ - (stack[-1][0].name != '|'): - expr = stack[-1] + expr - del stack[-1] - stack.append([FailureJump(label)] + \ - expr + \ - [Jump(-1), - Label(label)]) - stack.append([Alternation()]) - label = label + 1 - lastop = '|' - - elif char == '(': - if index >= len(pattern): - raise error, 'no matching close paren' - - elif pattern[index] == '?': - # Perl style (?...) extensions - index = index + 1 - if index >= len(pattern): - raise error, 'extension ends prematurely' - - elif pattern[index] == 'P': - # Python extensions - index = index + 1 - if index >= len(pattern): - raise error, 'extension ends prematurely' - - elif pattern[index] == '<': - # Handle Python symbolic group names (?P<...>...) - index = index + 1 - end = string.find(pattern, '>', index) - if end == -1: - raise error, 'no end to symbolic group name' - name = pattern[index:end] - if not valid_identifier(name): - raise error, ('symbolic group name must be a ' - 'valid identifier') - index = end + 1 - groupindex[name] = register - stack.append([OpenParen(register)]) - register = register + 1 - lastop = '(' - - elif pattern[index] == '=': - # backreference to symbolic group name - if index >= len(pattern): - raise error, '(?P= at the end of the pattern' - start = index + 1 - end = string.find(pattern, ')', start) - if end == -1: - raise error, 'no ) to end symbolic group name' - name = pattern[start:end] - if name not in groupindex.keys(): - raise error, ('symbolic group name ' + name + \ - ' has not been used yet') - stack.append([MatchMemory(groupindex[name])]) - index = end + 1 - lastop = '(?P=)' - - else: - raise error, ('unknown Python extension: ' + \ - pattern[index]) - - elif pattern[index] == ':': - # grouping, but no registers - index = index + 1 - stack.append([OpenParen(-1)]) - lastop = '(' - - elif pattern[index] == '#': - # comment - index = index + 1 - end = string.find(pattern, ')', index) - if end == -1: - raise error, 'no end to comment' - index = end + 1 - # do not change lastop - - elif pattern[index] == '=': - raise error, ('zero-width positive lookahead ' - 'assertion is unsupported') - - elif pattern[index] == '!': - raise error, ('zero-width negative lookahead ' - 'assertion is unsupported') - - elif pattern[index] in 'iImMsSxX': - raise error, ('embedded pattern modifiers are only ' - 'allowed at the beginning of the pattern') - - else: - raise error, 'unknown extension' - - else: - stack.append([OpenParen(register)]) - register = register + 1 - lastop = '(' - - elif char == ')': - # make one expression out of everything on the stack up to - # the marker left by the last parenthesis - expr = [] - while (len(stack) > 0) and (stack[-1][0].name != '('): - expr = stack[-1] + expr - del stack[-1] - - if len(stack) == 0: - raise error, 'too many close parens' - - # remove markers left by alternation - expr = filter(lambda x: x.name != '|', expr) - - # clean up jumps inserted by alternation - need_label = 0 - for i in range(len(expr)): - if (expr[i].name == 'jump') and (expr[i].label == -1): - expr[i] = Jump(label) - need_label = 1 - if need_label: - expr.append(Label(label)) - label = label + 1 - - if stack[-1][0].register > 0: - expr = [StartMemory(stack[-1][0].register)] + \ - expr + \ - [EndMemory(stack[-1][0].register)] - del stack[-1] - stack.append(expr) - lastop = ')' - - elif char == '{': - if len(stack) == 0: - raise error, 'no expression to repeat' - end = string.find(pattern, '}', index) - if end == -1: - raise error, ('no close curly bracket to match' - ' open curly bracket') - - fields = map(string.strip, - string.split(pattern[index:end], ',')) - index = end + 1 - - minimal = 0 - if (index < len(pattern)) and (pattern[index] == '?'): - minimal = 1 - index = index + 1 - - if len(fields) == 1: - # {n} or {n}? (there's really no difference) - try: - count = string.atoi(fields[0]) - except ValueError: - raise error, ('count must be an integer ' - 'inside curly braces') - if count > 65535: - raise error, 'repeat count out of range' - expr = [] - while count > 0: - expr = expr + stack[-1] - count = count - 1 - del stack[-1] - stack.append(expr) - if minimal: - lastop = '{n}?' - else: - lastop = '{n}' - - elif len(fields) == 2: - # {n,} or {n,m} - if fields[1] == '': - # {n,} - try: - min = string.atoi(fields[0]) - except ValueError: - raise error, ('minimum must be an integer ' - 'inside curly braces') - if min > 65535: - raise error, 'minimum repeat count out of range' - - expr = [] - while min > 0: - expr = expr + stack[-1] - min = min - 1 - if minimal: - expr = expr + \ - ([Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1), - FailureJump(label)]) - lastop = '{n,}?' - else: - expr = expr + \ - ([Label(label), - FailureJump(label + 1)] + - stack[-1] + - [StarJump(label), - Label(label + 1)]) - lastop = '{n,}' - - del stack[-1] - stack.append(expr) - label = label + 2 - - else: - # {n,m} - try: - min = string.atoi(fields[0]) - except ValueError: - raise error, ('minimum must be an integer ' - 'inside curly braces') - try: - max = string.atoi(fields[1]) - except ValueError: - raise error, ('maximum must be an integer ' - 'inside curly braces') - if min > 65535: - raise error, ('minumim repeat count out ' - 'of range') - if max > 65535: - raise error, ('maximum repeat count out ' - 'of range') - if min > max: - raise error, ('minimum repeat count must be ' - 'less than the maximum ' - 'repeat count') - expr = [] - while min > 0: - expr = expr + stack[-1] - min = min - 1 - max = max - 1 - if minimal: - while max > 0: - expr = expr + \ - [FailureJump(label), - Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1)] - max = max - 1 - label = label + 2 - del stack[-1] - stack.append(expr) - lastop = '{n,m}?' - else: - while max > 0: - expr = expr + \ - [FailureJump(label)] + \ - stack[-1] - max = max - 1 - del stack[-1] - stack.append(expr + [Label(label)]) - label = label + 1 - lastop = '{n,m}' - - else: - raise error, ('there need to be one or two fields ' - 'in a {} expression') - - elif char == '}': - raise error, 'unbalanced close curly brace' - - elif char == '*': - # Kleene closure - if len(stack) == 0: - raise error, '* needs something to repeat' - - if lastop in ['(', '|']: - raise error, '* needs something to repeat' - - if lastop in repetition_operators: - raise error, 'nested repetition operators' - - if (index < len(pattern)) and (pattern[index] == '?'): - # non-greedy matching - expr = [Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1), - FailureJump(label)] - index = index + 1 - lastop = '*?' - else: - # greedy matching - expr = [Label(label), - FailureJump(label + 1)] + \ - stack[-1] + \ - [StarJump(label), - Label(label + 1)] - lastop = '*' - del stack[-1] - stack.append(expr) - label = label + 2 - - elif char == '+': - # positive closure - if len(stack) == 0: - raise error, '+ needs something to repeat' - - if lastop in ['(', '|']: - raise error, '+ needs something to repeat' - - if lastop in repetition_operators: - raise error, 'nested repetition operators' - - if (index < len(pattern)) and (pattern[index] == '?'): - # non-greedy - expr = [Label(label)] + \ - stack[-1] + \ - [FailureJump(label)] - label = label + 1 - index = index + 1 - lastop = '+?' - - else: - # greedy - expr = [DummyFailureJump(label + 1), - Label(label), - FailureJump(label + 2), - Label(label + 1)] + \ - stack[-1] + \ - [StarJump(label), - Label(label + 2)] - label = label + 3 - lastop = '+' - - del stack[-1] - stack.append(expr) - - elif char == '?': - if len(stack) == 0: - raise error, 'need something to be optional' - - if len(stack) == 0: - raise error, '? needs something to repeat' - - if lastop in ['(', '|']: - raise error, '? needs something to repeat' - - if lastop in repetition_operators: - raise error, 'nested repetition operators' - - if (index < len(pattern)) and (pattern[index] == '?'): - # non-greedy matching - expr = [FailureJump(label), - Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1)] - label = label + 2 - index = index + 1 - lastop = '??' - - else: - # greedy matching - expr = [FailureJump(label)] + \ - stack[-1] + \ - [Label(label)] - label = label + 1 - lastop = '?' - - del stack[-1] - stack.append(expr) - - elif char == '.': - if flags & DOTALL: - stack.append([Set(map(chr, range(256)), flags)]) - else: - stack.append([AnyChar()]) - lastop = '.' - - elif char == '^': - if flags & MULTILINE: - stack.append([Bol()]) - else: - stack.append([BegBuf()]) - lastop = '^' - - elif char == '$': - if flags & MULTILINE: - stack.append([Eol()]) - else: - stack.append([EndBuf()]) - lastop = '$' - - elif char == '#': - if flags & VERBOSE: - # comment - index = index + 1 - end = string.find(pattern, '\n', index) - if end == -1: - index = len(pattern) - else: - index = end + 1 - # do not change lastop - else: - stack.append([Exact(char, flags)]) - lastop = '#' - - elif char in string.whitespace: - if not (flags & VERBOSE): - stack.append([Exact(char, flags)]) - lastop = char - - elif char == '[': - # compile character class - - if index >= len(pattern): - raise error, 'unclosed character class' - - negate = 0 - last = '' - set = [] - - if pattern[index] == '^': - negate = 1 - index = index + 1 - if index >= len(pattern): - raise error, 'unclosed character class' - - if pattern[index] == ']': - set.append(']') - index = index + 1 - if index >= len(pattern): - raise error, 'unclosed character class' - - elif pattern[index] == '-': - set.append('-') - index = index + 1 - if index >= len(pattern): - raise error, 'unclosed character class' - - while (index < len(pattern)) and (pattern[index] != ']'): - next = pattern[index] - index = index + 1 - if next == '-': - if index >= len(pattern): - raise error, 'incomplete range in character class' - - elif pattern[index] == ']': - set.append('-') - - else: - if last == '': - raise error, ('improper use of range in ' - 'character class') - - start = last - - if pattern[index] == '\\': - escape_type, - value, - index = expand_escape(pattern, - index + 1, - CHARCLASS) - - if escape_type == CHAR: - end = value - - else: - raise error, ('illegal escape in character ' - 'class range') - else: - end = pattern[index] - index = index + 1 - - if start > end: - raise error, ('range arguments out of order ' - 'in character class') - - for char in map(chr, range(ord(start), ord(end) + 1)): - if char not in set: - set.append(char) - - last = '' - - elif next == '\\': - # expand syntax meta-characters and add to set - if index >= len(pattern): - raise error, 'incomplete set' - - escape_type, value, index = expand_escape(pattern, - index, - CHARCLASS) - - if escape_type == CHAR: - set.append(value) - last = value - - elif escape_type == SET: - for char in value: - if char not in set: - set.append(char) - last = '' - - else: - raise error, 'illegal escape type in character class' - - else: - if next not in set: - set.append(next) - last = next - - if (index >= len(pattern)) or ( pattern[index] != ']'): - raise error, 'incomplete set' - - index = index + 1 - - if negate: - # If case is being ignored, then both upper- and lowercase - # versions of the letters must be excluded. - if flags & IGNORECASE: set=set+map(string.upper, set) - notset = [] - for char in map(chr, range(256)): - if char not in set: - notset.append(char) - if len(notset) == 0: - raise error, 'empty negated set' - stack.append([Set(notset, flags)]) - else: - if len(set) == 0: - raise error, 'empty set' - stack.append([Set(set, flags)]) - - lastop = '[]' - - else: - stack.append([Exact(char, flags)]) - lastop = char - - code = [] - while len(stack) > 0: - if stack[-1][0].name == '(': - raise error, 'too many open parens' - code = stack[-1] + code - del stack[-1] - if len(code) == 0: - raise error, 'no code generated' - code = filter(lambda x: x.name != '|', code) - need_label = 0 - for i in range(len(code)): - if (code[i].name == 'jump') and (code[i].label == -1): - code[i] = Jump(label) - need_label = 1 - if need_label: - code.append(Label(label)) - label = label + 1 - code.append(End()) -# print code - return RegexObject(pattern, flags, code, register, groupindex) - -# Replace expand_escape and _expand functions with their C equivalents. -# If you suspect bugs in the C versions, comment out the next two lines -expand_escape = reop.expand_escape -_expand = reop._expand diff --git a/Lib/re1.py b/Lib/re1.py new file mode 100644 index 0000000..6c24797 --- /dev/null +++ b/Lib/re1.py @@ -0,0 +1,1508 @@ +#!/usr/bin/env python +# -*- mode: python -*- +# $Id$ + +import string +import reop + +# reop.error and re.error should be the same, since exceptions can be +# raised from either module. +error = reop.error # 're error' + +from reop import NORMAL, CHARCLASS, REPLACEMENT +from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET +from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER + +# compilation flags + +IGNORECASE = I = 0x01 + +MULTILINE = M = 0x02 +DOTALL = S = 0x04 +VERBOSE = X = 0x08 + +repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', + '{n,}', '{n,}?', '{n,m}', '{n,m}?'] + +# +# +# + +def valid_identifier(id): + if len(id) == 0: + return 0 + if (not reop.syntax_table[id[0]] & reop.word) or \ + (reop.syntax_table[id[0]] & reop.digit): + return 0 + for char in id[1:]: + if not reop.syntax_table[char] & reop.word: + return 0 + return 1 + +# +# +# + +_cache = {} +_MAXCACHE = 20 + +def _cachecompile(pattern, flags=0): + key = (pattern, flags) + try: + return _cache[key] + except KeyError: + pass + value = compile(pattern, flags) + if len(_cache) >= _MAXCACHE: + _cache.clear() + _cache[key] = value + return value + +def match(pattern, string, flags=0): + return _cachecompile(pattern, flags).match(string) + +def search(pattern, string, flags=0): + return _cachecompile(pattern, flags).search(string) + +def sub(pattern, repl, string, count=0): + if type(pattern) == type(''): + pattern = _cachecompile(pattern) + return pattern.sub(repl, string, count) + +def subn(pattern, repl, string, count=0): + if type(pattern) == type(''): + pattern = _cachecompile(pattern) + return pattern.subn(repl, string, count) + +def split(pattern, string, maxsplit=0): + if type(pattern) == type(''): + pattern = _cachecompile(pattern) + return pattern.split(string, maxsplit) + +# +# +# + +def _expand(m, repl): + results = [] + index = 0 + size = len(repl) + while index < size: + found = string.find(repl, '\\', index) + if found < 0: + results.append(repl[index:]) + break + if found > index: + results.append(repl[index:found]) + escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) + if escape_type == CHAR: + results.append(value) + elif escape_type == MEMORY_REFERENCE: + r = m.group(value) + if r is None: + raise error, ('group "' + str(value) + '" did not contribute ' + 'to the match') + results.append(m.group(value)) + else: + raise error, "bad escape in replacement" + return string.join(results, '') + +class RegexObject: + def __init__(self, pattern, flags, code, num_regs, groupindex): + self.code = code + self.num_regs = num_regs + self.flags = flags + self.pattern = pattern + self.groupindex = groupindex + self.fastmap = build_fastmap(code) + + if code[0].name == 'bol': + self.anchor = 1 + + elif code[0].name == 'begbuf': + self.anchor = 2 + + else: + self.anchor = 0 + + self.buffer = assemble(code) + def search(self, string, pos=0): + regs = reop.search(self.buffer, + self.num_regs, + self.flags, + self.fastmap.can_be_null, + self.fastmap.fastmap(), + self.anchor, + string, + pos) + if regs is None: + return None + + return MatchObject(self, + string, + pos, + regs) + + def match(self, string, pos=0): + regs = reop.match(self.buffer, + self.num_regs, + self.flags, + self.fastmap.can_be_null, + self.fastmap.fastmap(), + self.anchor, + string, + pos) + if regs is None: + return None + + return MatchObject(self, + string, + pos, + regs) + + def sub(self, repl, string, count=0): + return self.subn(repl, string, count)[0] + + def subn(self, repl, source, count=0): + if count < 0: + raise ValueError, "negative substibution count" + if count == 0: + import sys + count = sys.maxint + if type(repl) == type(''): + if '\\' in repl: + repl = lambda m, r=repl: _expand(m, r) + else: + repl = lambda m, r=repl: r + n = 0 # Number of matches + pos = 0 # Where to start searching + lastmatch = -1 # End of last match + results = [] # Substrings making up the result + end = len(source) + while n < count and pos <= end: + m = self.search(source, pos) + if not m: + break + i, j = m.span(0) + if i == j == lastmatch: + # Empty match adjacent to previous match + pos = pos + 1 + results.append(source[lastmatch:pos]) + continue + if pos < i: + results.append(source[pos:i]) + results.append(repl(m)) + pos = lastmatch = j + if i == j: + # Last match was empty; don't try here again + pos = pos + 1 + results.append(source[lastmatch:pos]) + n = n + 1 + results.append(source[pos:]) + return (string.join(results, ''), n) + + def split(self, source, maxsplit=0): + if maxsplit < 0: + raise error, "negative split count" + if maxsplit == 0: + import sys + maxsplit = sys.maxint + n = 0 + pos = 0 + lastmatch = 0 + results = [] + end = len(source) + while n < maxsplit: + m = self.search(source, pos) + if not m: + break + i, j = m.span(0) + if i == j: + # Empty match + if pos >= end: + break + pos = pos+1 + continue + results.append(source[lastmatch:i]) + g = m.group() + if g: + results[len(results):] = list(g) + pos = lastmatch = j + results.append(source[lastmatch:]) + return results + +class MatchObject: + def __init__(self, re, string, pos, regs): + self.re = re + self.string = string + self.pos = pos + self.regs = regs + + def start(self, g): + if type(g) == type(''): + try: + g = self.re.groupindex[g] + except (KeyError, TypeError): + raise IndexError, ('group "' + g + '" is undefined') + return self.regs[g][0] + + def end(self, g): + if type(g) == type(''): + try: + g = self.re.groupindex[g] + except (KeyError, TypeError): + raise IndexError, ('group "' + g + '" is undefined') + return self.regs[g][1] + + def span(self, g): + if type(g) == type(''): + try: + g = self.re.groupindex[g] + except (KeyError, TypeError): + raise IndexError, ('group "' + g + '" is undefined') + return self.regs[g] + + def group(self, *groups): + if len(groups) == 0: + groups = range(1, self.re.num_regs) + use_all = 1 + else: + use_all = 0 + result = [] + for g in groups: + if type(g) == type(''): + try: + g = self.re.groupindex[g] + except (KeyError, TypeError): + raise IndexError, ('group "' + g + '" is undefined') + if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): + result.append(None) + else: + result.append(self.string[self.regs[g][0]:self.regs[g][1]]) + if use_all or len(result) > 1: + return tuple(result) + elif len(result) == 1: + return result[0] + else: + return () + +# +# A set of classes to make assembly a bit easier, if a bit verbose. +# + +class Instruction: + def __init__(self, opcode, size=1): + self.opcode = opcode + self.size = size + def assemble(self, position, labels): + return self.opcode + def __repr__(self): + return '%-15s' % (self.name) + +class End(Instruction): + name = 'end' + def __init__(self): + Instruction.__init__(self, chr(0)) + +class Bol(Instruction): + name = 'bol' + def __init__(self): + self.name = 'bol' + Instruction.__init__(self, chr(1)) + +class Eol(Instruction): + name = 'eol' + def __init__(self): + Instruction.__init__(self, chr(2)) + +class Set(Instruction): + name = 'set' + def __init__(self, set, flags=0): + self.set = set + if flags & IGNORECASE: self.set=map(string.lower, self.set) + if len(set)==1: + # If only one element, use the "exact" opcode (it'll be faster) + Instruction.__init__(self, chr(4), 2) + else: + # Use the "set" opcode + Instruction.__init__(self, chr(3), 33) + def assemble(self, position, labels): + if len(self.set)==1: + # If only one character in set, generate an "exact" opcode + return self.opcode + self.set[0] + result = self.opcode + temp = 0 + for i, c in map(lambda x: (x, chr(x)), range(256)): + if c in self.set: + temp = temp | (1 << (i & 7)) + if (i % 8) == 7: + result = result + chr(temp) + temp = 0 + return result + def __repr__(self): + result = '%-15s' % (self.name) + self.set.sort() + # XXX this should print more intelligently + for char in self.set: + result = result + char + return result + +class Exact(Instruction): + name = 'exact' + def __init__(self, char, flags): + self.char = char + if flags & IGNORECASE: self.char=string.lower(self.char) + Instruction.__init__(self, chr(4), 2) + def assemble(self, position, labels): + return self.opcode + self.char + def __repr__(self): + return '%-15s %s' % (self.name, `self.char`) + +class AnyChar(Instruction): + name = 'anychar' + def __init__(self): + Instruction.__init__(self, chr(5)) + def assemble(self, position, labels): + return self.opcode + +class MemoryInstruction(Instruction): + def __init__(self, opcode, register): + self.register = register + Instruction.__init__(self, opcode, 2) + def assemble(self, position, labels): + return self.opcode + chr(self.register) + def __repr__(self): + return '%-15s %i' % (self.name, self.register) + +class StartMemory(MemoryInstruction): + name = 'start_memory' + def __init__(self, register): + MemoryInstruction.__init__(self, chr(6), register) + +class EndMemory(MemoryInstruction): + name = 'end_memory' + def __init__(self, register): + MemoryInstruction.__init__(self, chr(7), register) + +class MatchMemory(MemoryInstruction): + name = 'match_memory' + def __init__(self, register): + MemoryInstruction.__init__(self, chr(8), register) + +class JumpInstruction(Instruction): + def __init__(self, opcode, label): + self.label = label + Instruction.__init__(self, opcode, 3) + def compute_offset(self, start, dest): + return dest - (start + 3) + def pack_offset(self, offset): + if offset > 32767: + raise error, 'offset out of range (pos)' + elif offset < -32768: + raise error, 'offset out of range (neg)' + elif offset < 0: + offset = offset + 65536 + return chr(offset & 0xff) + chr((offset >> 8) & 0xff) + def assemble(self, position, labels): + return self.opcode + \ + self.pack_offset(self.compute_offset(position, + labels[self.label])) + def __repr__(self): + return '%-15s %i' % (self.name, self.label) + +class Jump(JumpInstruction): + name = 'jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(9), label) + +class StarJump(JumpInstruction): + name = 'star_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(10), label) + +class FailureJump(JumpInstruction): + name = 'failure_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(11), label) + +class UpdateFailureJump(JumpInstruction): + name = 'update_failure_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(12), label) + +class DummyFailureJump(JumpInstruction): + name = 'dummy_failure_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(13), label) + +class BegBuf(Instruction): + name = 'begbuf' + def __init__(self): + Instruction.__init__(self, chr(14)) + +class EndBuf(Instruction): + name = 'endbuf' + def __init__(self): + Instruction.__init__(self, chr(15)) + +class WordBeg(Instruction): + name = 'wordbeg' + def __init__(self): + Instruction.__init__(self, chr(16)) + +class WordEnd(Instruction): + name = 'wordend' + def __init__(self): + Instruction.__init__(self, chr(17)) + +class WordBound(Instruction): + name = 'wordbound' + def __init__(self): + Instruction.__init__(self, chr(18)) + +class NotWordBound(Instruction): + name = 'notwordbound' + def __init__(self): + Instruction.__init__(self, chr(19)) + +class SyntaxSpec(Instruction): + name = 'syntaxspec' + def __init__(self, syntax): + self.syntax = syntax + Instruction.__init__(self, chr(20), 2) + def assemble(self, postition, labels): + return self.opcode + chr(self.syntax) + +class NotSyntaxSpec(Instruction): + name = 'notsyntaxspec' + def __init__(self, syntax): + self.syntax = syntax + Instruction.__init__(self, chr(21), 2) + def assemble(self, postition, labels): + return self.opcode + chr(self.syntax) + +class Label(Instruction): + name = 'label' + def __init__(self, label): + self.label = label + Instruction.__init__(self, '', 0) + def __repr__(self): + return '%-15s %i' % (self.name, self.label) + +class OpenParen(Instruction): + name = '(' + def __init__(self, register): + self.register = register + Instruction.__init__(self, '', 0) + def assemble(self, position, labels): + raise error, 'unmatched open parenthesis' + +class Alternation(Instruction): + name = '|' + def __init__(self): + Instruction.__init__(self, '', 0) + def assemble(self, position, labels): + raise error, 'an alternation was not taken care of' + +# +# +# + +def assemble(instructions): + labels = {} + position = 0 + pass1 = [] + for instruction in instructions: + if instruction.name == 'label': + labels[instruction.label] = position + else: + pass1.append((position, instruction)) + position = position + instruction.size + pass2 = '' + for position, instruction in pass1: + pass2 = pass2 + instruction.assemble(position, labels) + return pass2 + +# +# +# + +def escape(pattern): + result = [] + for char in pattern: + if not reop.syntax_table[char] & reop.word: + result.append('\\') + result.append(char) + return string.join(result, '') + +# +# +# + +def registers_used(instructions): + result = [] + for instruction in instructions: + if (instruction.name in ['set_memory', 'end_memory']) and \ + (instruction.register not in result): + result.append(instruction.register) + return result + +# +# +# + +class Fastmap: + def __init__(self): + self.map = ['\000']*256 + self.can_be_null = 0 + def add(self, char): + self.map[ord(char)] = '\001' + def fastmap(self): + return string.join(self.map, '') + def __getitem__(self, char): + return ord(self.map[ord(char)]) + def __repr__(self): + self.map.sort() + return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' + +# +# +# + +def find_label(code, label): + line = 0 + for instruction in code: + if (instruction.name == 'label') and (instruction.label == label): + return line + 1 + line = line + 1 + +def build_fastmap_aux(code, pos, visited, fastmap): + if visited[pos]: + return + while 1: + instruction = code[pos] + visited[pos] = 1 + pos = pos + 1 + if instruction.name == 'end': + fastmap.can_be_null = 1 + return + elif instruction.name == 'syntaxspec': + for char in map(chr, range(256)): + if reop.syntax_table[char] & instruction.syntax: + fastmap.add(char) + return + elif instruction.name == 'notsyntaxspec': + for char in map(chr, range(256)): + if not reop.syntax_table[char] & instruction.syntax: + fastmap.add(char) + return + elif instruction.name == 'eol': + fastmap.add('\n') + if fastmap.can_be_null == 0: + fastmap.can_be_null = 2 + return + elif instruction.name == 'set': + for char in instruction.set: + fastmap.add(char) + return + elif instruction.name == 'exact': + fastmap.add(instruction.char) + elif instruction.name == 'anychar': + for char in map(chr, range(256)): + if char != '\n': + fastmap.add(char) + return + elif instruction.name == 'match_memory': + for char in map(chr, range(256)): + fastmap.add(char) + fastmap.can_be_null = 1 + return + elif instruction.name in ['jump', 'dummy_failure_jump', \ + 'update_failure_jump', 'star_jump']: + pos = find_label(code, instruction.label) + if visited[pos]: + return + visited[pos] = 1 + elif instruction.name == 'failure_jump': + build_fastmap_aux(code, + find_label(code, instruction.label), + visited, + fastmap) + +def build_fastmap(code, pos=0): + visited = [0] * len(code) + fastmap = Fastmap() + build_fastmap_aux(code, pos, visited, fastmap) + return fastmap + +# +# +# + +#[NORMAL, CHARCLASS, REPLACEMENT] = range(3) +#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, +# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) + +def expand_escape(pattern, index, context=NORMAL): + if index >= len(pattern): + raise error, 'escape ends too soon' + + elif pattern[index] == 't': + return CHAR, chr(9), index + 1 + + elif pattern[index] == 'n': + return CHAR, chr(10), index + 1 + + elif pattern[index] == 'v': + return CHAR, chr(11), index + 1 + + elif pattern[index] == 'r': + return CHAR, chr(13), index + 1 + + elif pattern[index] == 'f': + return CHAR, chr(12), index + 1 + + elif pattern[index] == 'a': + return CHAR, chr(7), index + 1 + + elif pattern[index] == 'x': + # CAUTION: this is the Python rule, not the Perl rule! + end = index + 1 # Skip over the 'x' character + while (end < len(pattern)) and (pattern[end] in string.hexdigits): + end = end + 1 + if end == index: + raise error, "\\x must be followed by hex digit(s)" + # let Python evaluate it, so we don't incorrectly 2nd-guess + # what it's doing (and Python in turn passes it on to sscanf, + # so that *it* doesn't incorrectly 2nd-guess what C does!) + char = eval ('"' + pattern[index-1:end] + '"') + assert len(char) == 1 + return CHAR, char, end + + elif pattern[index] == 'b': + if context != NORMAL: + return CHAR, chr(8), index + 1 + else: + return WORD_BOUNDARY, '', index + 1 + + elif pattern[index] == 'B': + if context != NORMAL: + return CHAR, 'B', index + 1 + else: + return NOT_WORD_BOUNDARY, '', index + 1 + + elif pattern[index] == 'A': + if context != NORMAL: + return CHAR, 'A', index + 1 + else: + return BEGINNING_OF_BUFFER, '', index + 1 + + elif pattern[index] == 'Z': + if context != NORMAL: + return CHAR, 'Z', index + 1 + else: + return END_OF_BUFFER, '', index + 1 + + elif pattern[index] in 'GluLUQE': + raise error, ('\\' + pattern[index] + ' is not allowed') + + elif pattern[index] == 'w': + if context == NORMAL: + return SYNTAX, reop.word, index + 1 + elif context == CHARCLASS: + set = [] + for char in reop.syntax_table.keys(): + if reop.syntax_table[char] & reop.word: + set.append(char) + return SET, set, index + 1 + else: + return CHAR, 'w', index + 1 + + elif pattern[index] == 'W': + if context == NORMAL: + return NOT_SYNTAX, reop.word, index + 1 + elif context == CHARCLASS: + set = [] + for char in reop.syntax_table.keys(): + if not reop.syntax_table[char] & reop.word: + set.append(char) + return SET, set, index + 1 + else: + return CHAR, 'W', index + 1 + + elif pattern[index] == 's': + if context == NORMAL: + return SYNTAX, reop.whitespace, index + 1 + elif context == CHARCLASS: + set = [] + for char in reop.syntax_table.keys(): + if reop.syntax_table[char] & reop.whitespace: + set.append(char) + return SET, set, index + 1 + else: + return CHAR, 's', index + 1 + + elif pattern[index] == 'S': + if context == NORMAL: + return NOT_SYNTAX, reop.whitespace, index + 1 + elif context == CHARCLASS: + set = [] + for char in reop.syntax_table.keys(): + if not reop.syntax_table[char] & reop.whitespace: + set.append(char) + return SET, set, index + 1 + else: + return CHAR, 'S', index + 1 + + elif pattern[index] == 'd': + if context == NORMAL: + return SYNTAX, reop.digit, index + 1 + elif context == CHARCLASS: + set = [] + for char in reop.syntax_table.keys(): + if reop.syntax_table[char] & reop.digit: + set.append(char) + return SET, set, index + 1 + else: + return CHAR, 'd', index + 1 + + elif pattern[index] == 'D': + if context == NORMAL: + return NOT_SYNTAX, reop.digit, index + 1 + elif context == CHARCLASS: + set = [] + for char in reop.syntax_table.keys(): + if not reop.syntax_table[char] & reop.digit: + set.append(char) + return SET, set, index + 1 + else: + return CHAR, 'D', index + 1 + + elif pattern[index] in '0123456789': + + if pattern[index] == '0': + if (index + 1 < len(pattern)) and \ + (pattern[index + 1] in string.octdigits): + if (index + 2 < len(pattern)) and \ + (pattern[index + 2] in string.octdigits): + value = string.atoi(pattern[index:index + 3], 8) + index = index + 3 + + else: + value = string.atoi(pattern[index:index + 2], 8) + index = index + 2 + + else: + value = 0 + index = index + 1 + + if value > 255: + raise error, 'octal value out of range' + + return CHAR, chr(value), index + + else: + if (index + 1 < len(pattern)) and \ + (pattern[index + 1] in string.digits): + if (index + 2 < len(pattern)) and \ + (pattern[index + 2] in string.octdigits) and \ + (pattern[index + 1] in string.octdigits) and \ + (pattern[index] in string.octdigits): + value = string.atoi(pattern[index:index + 3], 8) + if value > 255: + raise error, 'octal value out of range' + + return CHAR, chr(value), index + 3 + + else: + value = string.atoi(pattern[index:index + 2]) + if (value < 1) or (value > 99): + raise error, 'memory reference out of range' + + if context == CHARCLASS: + raise error, ('cannot reference a register from ' + 'inside a character class') + return MEMORY_REFERENCE, value, index + 2 + + else: + if context == CHARCLASS: + raise error, ('cannot reference a register from ' + 'inside a character class') + + value = string.atoi(pattern[index]) + return MEMORY_REFERENCE, value, index + 1 + + elif pattern[index] == 'g': + if context != REPLACEMENT: + return CHAR, 'g', index + 1 + + index = index + 1 + if index >= len(pattern): + raise error, 'unfinished symbolic reference' + if pattern[index] != '<': + raise error, 'missing < in symbolic reference' + + index = index + 1 + end = string.find(pattern, '>', index) + if end == -1: + raise error, 'unfinished symbolic reference' + value = pattern[index:end] + if not valid_identifier(value): + raise error, 'illegal symbolic reference' + return MEMORY_REFERENCE, value, end + 1 + + else: + return CHAR, pattern[index], index + 1 + +def compile(pattern, flags=0): + stack = [] + label = 0 + register = 1 + groupindex = {} + lastop = '' + + # look for embedded pattern modifiers at the beginning of the pattern + + index = 0 + + if len(pattern) >= 3 and \ + (pattern[:2] == '(?') and \ + (pattern[2] in 'iImMsSxX'): + index = 2 + while (index < len(pattern)) and (pattern[index] != ')'): + if pattern[index] in 'iI': + flags = flags | IGNORECASE + elif pattern[index] in 'mM': + flags = flags | MULTILINE + elif pattern[index] in 'sS': + flags = flags | DOTALL + elif pattern[index] in 'xX': + flags = flags | VERBOSE + else: + raise error, 'unknown modifier' + index = index + 1 + index = index + 1 + + # compile the rest of the pattern + + while (index < len(pattern)): + char = pattern[index] + index = index + 1 + if char == '\\': + escape_type, value, index = expand_escape(pattern, index) + + if escape_type == CHAR: + stack.append([Exact(value, flags)]) + lastop = '\\' + value + + elif escape_type == MEMORY_REFERENCE: + if value >= register: + raise error, ('cannot reference a register ' + 'not yet used') + stack.append([MatchMemory(value)]) + lastop = '\\1' + + elif escape_type == BEGINNING_OF_BUFFER: + stack.append([BegBuf()]) + lastop = '\\A' + + elif escape_type == END_OF_BUFFER: + stack.append([EndBuf()]) + lastop = '\\Z' + + elif escape_type == WORD_BOUNDARY: + stack.append([WordBound()]) + lastop = '\\b' + + elif escape_type == NOT_WORD_BOUNDARY: + stack.append([NotWordBound()]) + lastop = '\\B' + + elif escape_type == SYNTAX: + stack.append([SyntaxSpec(value)]) + if value == reop.word: + lastop = '\\w' + elif value == reop.whitespace: + lastop = '\\s' + elif value == reop.digit: + lastop = '\\d' + else: + lastop = '\\?' + + elif escape_type == NOT_SYNTAX: + stack.append([NotSyntaxSpec(value)]) + if value == reop.word: + lastop = '\\W' + elif value == reop.whitespace: + lastop = '\\S' + elif value == reop.digit: + lastop = '\\D' + else: + lastop = '\\?' + + elif escape_type == SET: + raise error, 'cannot use set escape type here' + + else: + raise error, 'unknown escape type' + + elif char == '|': + expr = [] + + while (len(stack) != 0) and \ + (stack[-1][0].name != '(') and \ + (stack[-1][0].name != '|'): + expr = stack[-1] + expr + del stack[-1] + stack.append([FailureJump(label)] + \ + expr + \ + [Jump(-1), + Label(label)]) + stack.append([Alternation()]) + label = label + 1 + lastop = '|' + + elif char == '(': + if index >= len(pattern): + raise error, 'no matching close paren' + + elif pattern[index] == '?': + # Perl style (?...) extensions + index = index + 1 + if index >= len(pattern): + raise error, 'extension ends prematurely' + + elif pattern[index] == 'P': + # Python extensions + index = index + 1 + if index >= len(pattern): + raise error, 'extension ends prematurely' + + elif pattern[index] == '<': + # Handle Python symbolic group names (?P<...>...) + index = index + 1 + end = string.find(pattern, '>', index) + if end == -1: + raise error, 'no end to symbolic group name' + name = pattern[index:end] + if not valid_identifier(name): + raise error, ('symbolic group name must be a ' + 'valid identifier') + index = end + 1 + groupindex[name] = register + stack.append([OpenParen(register)]) + register = register + 1 + lastop = '(' + + elif pattern[index] == '=': + # backreference to symbolic group name + if index >= len(pattern): + raise error, '(?P= at the end of the pattern' + start = index + 1 + end = string.find(pattern, ')', start) + if end == -1: + raise error, 'no ) to end symbolic group name' + name = pattern[start:end] + if name not in groupindex.keys(): + raise error, ('symbolic group name ' + name + \ + ' has not been used yet') + stack.append([MatchMemory(groupindex[name])]) + index = end + 1 + lastop = '(?P=)' + + else: + raise error, ('unknown Python extension: ' + \ + pattern[index]) + + elif pattern[index] == ':': + # grouping, but no registers + index = index + 1 + stack.append([OpenParen(-1)]) + lastop = '(' + + elif pattern[index] == '#': + # comment + index = index + 1 + end = string.find(pattern, ')', index) + if end == -1: + raise error, 'no end to comment' + index = end + 1 + # do not change lastop + + elif pattern[index] == '=': + raise error, ('zero-width positive lookahead ' + 'assertion is unsupported') + + elif pattern[index] == '!': + raise error, ('zero-width negative lookahead ' + 'assertion is unsupported') + + elif pattern[index] in 'iImMsSxX': + raise error, ('embedded pattern modifiers are only ' + 'allowed at the beginning of the pattern') + + else: + raise error, 'unknown extension' + + else: + stack.append([OpenParen(register)]) + register = register + 1 + lastop = '(' + + elif char == ')': + # make one expression out of everything on the stack up to + # the marker left by the last parenthesis + expr = [] + while (len(stack) > 0) and (stack[-1][0].name != '('): + expr = stack[-1] + expr + del stack[-1] + + if len(stack) == 0: + raise error, 'too many close parens' + + # remove markers left by alternation + expr = filter(lambda x: x.name != '|', expr) + + # clean up jumps inserted by alternation + need_label = 0 + for i in range(len(expr)): + if (expr[i].name == 'jump') and (expr[i].label == -1): + expr[i] = Jump(label) + need_label = 1 + if need_label: + expr.append(Label(label)) + label = label + 1 + + if stack[-1][0].register > 0: + expr = [StartMemory(stack[-1][0].register)] + \ + expr + \ + [EndMemory(stack[-1][0].register)] + del stack[-1] + stack.append(expr) + lastop = ')' + + elif char == '{': + if len(stack) == 0: + raise error, 'no expression to repeat' + end = string.find(pattern, '}', index) + if end == -1: + raise error, ('no close curly bracket to match' + ' open curly bracket') + + fields = map(string.strip, + string.split(pattern[index:end], ',')) + index = end + 1 + + minimal = 0 + if (index < len(pattern)) and (pattern[index] == '?'): + minimal = 1 + index = index + 1 + + if len(fields) == 1: + # {n} or {n}? (there's really no difference) + try: + count = string.atoi(fields[0]) + except ValueError: + raise error, ('count must be an integer ' + 'inside curly braces') + if count > 65535: + raise error, 'repeat count out of range' + expr = [] + while count > 0: + expr = expr + stack[-1] + count = count - 1 + del stack[-1] + stack.append(expr) + if minimal: + lastop = '{n}?' + else: + lastop = '{n}' + + elif len(fields) == 2: + # {n,} or {n,m} + if fields[1] == '': + # {n,} + try: + min = string.atoi(fields[0]) + except ValueError: + raise error, ('minimum must be an integer ' + 'inside curly braces') + if min > 65535: + raise error, 'minimum repeat count out of range' + + expr = [] + while min > 0: + expr = expr + stack[-1] + min = min - 1 + if minimal: + expr = expr + \ + ([Jump(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1), + FailureJump(label)]) + lastop = '{n,}?' + else: + expr = expr + \ + ([Label(label), + FailureJump(label + 1)] + + stack[-1] + + [StarJump(label), + Label(label + 1)]) + lastop = '{n,}' + + del stack[-1] + stack.append(expr) + label = label + 2 + + else: + # {n,m} + try: + min = string.atoi(fields[0]) + except ValueError: + raise error, ('minimum must be an integer ' + 'inside curly braces') + try: + max = string.atoi(fields[1]) + except ValueError: + raise error, ('maximum must be an integer ' + 'inside curly braces') + if min > 65535: + raise error, ('minumim repeat count out ' + 'of range') + if max > 65535: + raise error, ('maximum repeat count out ' + 'of range') + if min > max: + raise error, ('minimum repeat count must be ' + 'less than the maximum ' + 'repeat count') + expr = [] + while min > 0: + expr = expr + stack[-1] + min = min - 1 + max = max - 1 + if minimal: + while max > 0: + expr = expr + \ + [FailureJump(label), + Jump(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1)] + max = max - 1 + label = label + 2 + del stack[-1] + stack.append(expr) + lastop = '{n,m}?' + else: + while max > 0: + expr = expr + \ + [FailureJump(label)] + \ + stack[-1] + max = max - 1 + del stack[-1] + stack.append(expr + [Label(label)]) + label = label + 1 + lastop = '{n,m}' + + else: + raise error, ('there need to be one or two fields ' + 'in a {} expression') + + elif char == '}': + raise error, 'unbalanced close curly brace' + + elif char == '*': + # Kleene closure + if len(stack) == 0: + raise error, '* needs something to repeat' + + if lastop in ['(', '|']: + raise error, '* needs something to repeat' + + if lastop in repetition_operators: + raise error, 'nested repetition operators' + + if (index < len(pattern)) and (pattern[index] == '?'): + # non-greedy matching + expr = [Jump(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1), + FailureJump(label)] + index = index + 1 + lastop = '*?' + else: + # greedy matching + expr = [Label(label), + FailureJump(label + 1)] + \ + stack[-1] + \ + [StarJump(label), + Label(label + 1)] + lastop = '*' + del stack[-1] + stack.append(expr) + label = label + 2 + + elif char == '+': + # positive closure + if len(stack) == 0: + raise error, '+ needs something to repeat' + + if lastop in ['(', '|']: + raise error, '+ needs something to repeat' + + if lastop in repetition_operators: + raise error, 'nested repetition operators' + + if (index < len(pattern)) and (pattern[index] == '?'): + # non-greedy + expr = [Label(label)] + \ + stack[-1] + \ + [FailureJump(label)] + label = label + 1 + index = index + 1 + lastop = '+?' + + else: + # greedy + expr = [DummyFailureJump(label + 1), + Label(label), + FailureJump(label + 2), + Label(label + 1)] + \ + stack[-1] + \ + [StarJump(label), + Label(label + 2)] + label = label + 3 + lastop = '+' + + del stack[-1] + stack.append(expr) + + elif char == '?': + if len(stack) == 0: + raise error, 'need something to be optional' + + if len(stack) == 0: + raise error, '? needs something to repeat' + + if lastop in ['(', '|']: + raise error, '? needs something to repeat' + + if lastop in repetition_operators: + raise error, 'nested repetition operators' + + if (index < len(pattern)) and (pattern[index] == '?'): + # non-greedy matching + expr = [FailureJump(label), + Jump(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1)] + label = label + 2 + index = index + 1 + lastop = '??' + + else: + # greedy matching + expr = [FailureJump(label)] + \ + stack[-1] + \ + [Label(label)] + label = label + 1 + lastop = '?' + + del stack[-1] + stack.append(expr) + + elif char == '.': + if flags & DOTALL: + stack.append([Set(map(chr, range(256)), flags)]) + else: + stack.append([AnyChar()]) + lastop = '.' + + elif char == '^': + if flags & MULTILINE: + stack.append([Bol()]) + else: + stack.append([BegBuf()]) + lastop = '^' + + elif char == '$': + if flags & MULTILINE: + stack.append([Eol()]) + else: + stack.append([EndBuf()]) + lastop = '$' + + elif char == '#': + if flags & VERBOSE: + # comment + index = index + 1 + end = string.find(pattern, '\n', index) + if end == -1: + index = len(pattern) + else: + index = end + 1 + # do not change lastop + else: + stack.append([Exact(char, flags)]) + lastop = '#' + + elif char in string.whitespace: + if not (flags & VERBOSE): + stack.append([Exact(char, flags)]) + lastop = char + + elif char == '[': + # compile character class + + if index >= len(pattern): + raise error, 'unclosed character class' + + negate = 0 + last = '' + set = [] + + if pattern[index] == '^': + negate = 1 + index = index + 1 + if index >= len(pattern): + raise error, 'unclosed character class' + + if pattern[index] == ']': + set.append(']') + index = index + 1 + if index >= len(pattern): + raise error, 'unclosed character class' + + elif pattern[index] == '-': + set.append('-') + index = index + 1 + if index >= len(pattern): + raise error, 'unclosed character class' + + while (index < len(pattern)) and (pattern[index] != ']'): + next = pattern[index] + index = index + 1 + if next == '-': + if index >= len(pattern): + raise error, 'incomplete range in character class' + + elif pattern[index] == ']': + set.append('-') + + else: + if last == '': + raise error, ('improper use of range in ' + 'character class') + + start = last + + if pattern[index] == '\\': + escape_type, + value, + index = expand_escape(pattern, + index + 1, + CHARCLASS) + + if escape_type == CHAR: + end = value + + else: + raise error, ('illegal escape in character ' + 'class range') + else: + end = pattern[index] + index = index + 1 + + if start > end: + raise error, ('range arguments out of order ' + 'in character class') + + for char in map(chr, range(ord(start), ord(end) + 1)): + if char not in set: + set.append(char) + + last = '' + + elif next == '\\': + # expand syntax meta-characters and add to set + if index >= len(pattern): + raise error, 'incomplete set' + + escape_type, value, index = expand_escape(pattern, + index, + CHARCLASS) + + if escape_type == CHAR: + set.append(value) + last = value + + elif escape_type == SET: + for char in value: + if char not in set: + set.append(char) + last = '' + + else: + raise error, 'illegal escape type in character class' + + else: + if next not in set: + set.append(next) + last = next + + if (index >= len(pattern)) or ( pattern[index] != ']'): + raise error, 'incomplete set' + + index = index + 1 + + if negate: + # If case is being ignored, then both upper- and lowercase + # versions of the letters must be excluded. + if flags & IGNORECASE: set=set+map(string.upper, set) + notset = [] + for char in map(chr, range(256)): + if char not in set: + notset.append(char) + if len(notset) == 0: + raise error, 'empty negated set' + stack.append([Set(notset, flags)]) + else: + if len(set) == 0: + raise error, 'empty set' + stack.append([Set(set, flags)]) + + lastop = '[]' + + else: + stack.append([Exact(char, flags)]) + lastop = char + + code = [] + while len(stack) > 0: + if stack[-1][0].name == '(': + raise error, 'too many open parens' + code = stack[-1] + code + del stack[-1] + if len(code) == 0: + raise error, 'no code generated' + code = filter(lambda x: x.name != '|', code) + need_label = 0 + for i in range(len(code)): + if (code[i].name == 'jump') and (code[i].label == -1): + code[i] = Jump(label) + need_label = 1 + if need_label: + code.append(Label(label)) + label = label + 1 + code.append(End()) +# print code + return RegexObject(pattern, flags, code, register, groupindex) + +# Replace expand_escape and _expand functions with their C equivalents. +# If you suspect bugs in the C versions, comment out the next two lines +expand_escape = reop.expand_escape +_expand = reop._expand -- cgit v0.12