diff options
Diffstat (limited to 'Lib/re.py')
| -rw-r--r-- | Lib/re.py | 1212 | 
1 files changed, 67 insertions, 1145 deletions
@@ -2,42 +2,27 @@  # -*- mode: python -*-  # $Id$ -import string -import reop - -# reop.error and re.error should be the same, since exceptions can be -# raised from  either module. -error = reop.error # 're error' -from reop import NORMAL, CHARCLASS, REPLACEMENT -from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET -from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER - -# compilation flags - -IGNORECASE = I = 0x01 - -MULTILINE = M = 0x02 -DOTALL = S = 0x04 -VERBOSE = X = 0x08 +import sys +import string +from pcre import * -repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', -			'{n,}', '{n,}?', '{n,m}', '{n,m}?'] +[ NORMAL, CHARCLASS, REPLACEMENT ] = range(3) +[ CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER ] = range(9)  # -# +# First, the public part of the interface:  # -def valid_identifier(id): -    if len(id) == 0: -	return 0 -    if (not reop.syntax_table[id[0]] & reop.word) or \ -       (reop.syntax_table[id[0]] & reop.digit): -	return 0 -    for char in id[1:]: -	if not reop.syntax_table[char] & reop.word: -	    return 0 -    return 1 +# pcre.error and re.error should be the same, since exceptions can be +# raised from  either module. + +# compilation flags + +I = IGNORECASE +M = MULTILINE +S = DOTALL  +X = VERBOSE   #  # @@ -83,60 +68,17 @@ def split(pattern, string, maxsplit=0):  #  # -def _expand(m, repl): -    results = [] -    index = 0 -    size = len(repl) -    while index < size: -	found = string.find(repl, '\\', index) -	if found < 0: -	    results.append(repl[index:]) -	    break -	if found > index: -	    results.append(repl[index:found]) -	escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) -	if escape_type == CHAR: -	    results.append(value) -	elif escape_type == MEMORY_REFERENCE: -	    r = m.group(value) -	    if r is None: -		raise error, ('group "' + str(value) + '" did not contribute ' -			      'to the match') -	    results.append(m.group(value)) -	else: -	    raise error, "bad escape in replacement" -    return string.join(results, '') -  class RegexObject: -    def __init__(self, pattern, flags, code, num_regs, groupindex): -	self.code = code -	self.num_regs = num_regs +    def __init__(self, pattern, flags, code, groupindex): +	self.code = code   	self.flags = flags  	self.pattern = pattern  	self.groupindex = groupindex -	self.fastmap = build_fastmap(code) -	 -	if code[0].name == 'bol': -	    self.anchor = 1 -	     -	elif code[0].name == 'begbuf': -	    self.anchor = 2 -	     -	else: -	    self.anchor = 0 -	     -	self.buffer = assemble(code)      def search(self, string, pos=0): -	regs = reop.search(self.buffer, -			   self.num_regs, -			   self.flags, -			   self.fastmap.can_be_null, -			   self.fastmap.fastmap(), -			   self.anchor, -			   string, -			   pos) +	regs = self.code.match(string, pos, 0)  	if regs is None:  	    return None +	self.num_regs=len(regs)  	return MatchObject(self,  			   string, @@ -144,17 +86,10 @@ class RegexObject:  			   regs)      def match(self, string, pos=0): -	regs = reop.match(self.buffer, -			  self.num_regs, -			  self.flags, -			  self.fastmap.can_be_null, -			  self.fastmap.fastmap(), -			  self.anchor, -			  string, -			  pos) +	regs = self.code.match(string, pos, ANCHORED)  	if regs is None:  	    return None -	 +	self.num_regs=len(regs)/2  	return MatchObject(self,  			   string,  			   pos, @@ -165,13 +100,13 @@ class RegexObject:      def subn(self, repl, source, count=0):  	if count < 0: -	    raise ValueError, "negative substibution count" +	    raise error, "negative substitution count"  	if count == 0:  	    import sys  	    count = sys.maxint  	if type(repl) == type(''):  	    if '\\' in repl: -		repl = lambda m, r=repl: _expand(m, r) +		repl = lambda m, r=repl: pcre_expand(m, r)  	    else:  		repl = lambda m, r=repl: r  	n = 0           # Number of matches @@ -275,7 +210,8 @@ class MatchObject:  		    g = self.re.groupindex[g]  		except (KeyError, TypeError):  		    raise IndexError, ('group "' + g + '" is undefined') -	    if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): +	    if len(self.regs)<=g: raise IndexError, ('group "' + str(g) + '" is undefined') +	    elif (self.regs[g][0] == -1) or (self.regs[g][1] == -1):  		result.append(None)  	    else:  		result.append(self.string[self.regs[g][0]:self.regs[g][1]]) @@ -286,364 +222,56 @@ class MatchObject:  	else:  	    return () -# -# A set of classes to make assembly a bit easier, if a bit verbose. -# - -class Instruction: -    def __init__(self, opcode, size=1): -	self.opcode = opcode -	self.size = size -    def assemble(self, position, labels): -	return self.opcode -    def __repr__(self): -	return '%-15s' % (self.name) - -class End(Instruction): -    name = 'end' -    def __init__(self): -	Instruction.__init__(self, chr(0)) - -class Bol(Instruction): -    name = 'bol' -    def __init__(self): -	self.name = 'bol' -	Instruction.__init__(self, chr(1)) - -class Eol(Instruction): -    name = 'eol' -    def __init__(self): -	Instruction.__init__(self, chr(2)) - -class Set(Instruction): -    name = 'set' -    def __init__(self, set, flags=0): -	self.set = set -	if flags & IGNORECASE: self.set=map(string.lower, self.set) -	if len(set)==1:  -	    # If only one element, use the "exact" opcode (it'll be faster) -	    Instruction.__init__(self, chr(4), 2) -	else: -	    # Use the "set" opcode -	    Instruction.__init__(self, chr(3), 33) -    def assemble(self, position, labels): -	if len(self.set)==1: -	    # If only one character in set, generate an "exact" opcode -	    return self.opcode + self.set[0] -	result = self.opcode -	temp = 0 -	for i, c in map(lambda x: (x, chr(x)), range(256)): -	    if c in self.set: -		temp = temp | (1 << (i & 7)) -	    if (i % 8) == 7: -		result = result + chr(temp) -		temp = 0 -	return result -    def __repr__(self): -	result = '%-15s' % (self.name) -	self.set.sort() -	# XXX this should print more intelligently -	for char in self.set: -	    result = result + char -	return result -     -class Exact(Instruction): -    name = 'exact' -    def __init__(self, char, flags): -	self.char = char -	if flags & IGNORECASE: self.char=string.lower(self.char) -	Instruction.__init__(self, chr(4), 2) -    def assemble(self, position, labels): -	return self.opcode + self.char -    def __repr__(self): -	return '%-15s %s' % (self.name, `self.char`) -     -class AnyChar(Instruction): -    name = 'anychar' -    def __init__(self): -	Instruction.__init__(self, chr(5)) -    def assemble(self, position, labels): -	return self.opcode - -class MemoryInstruction(Instruction): -    def __init__(self, opcode, register): -	self.register = register -	Instruction.__init__(self, opcode, 2) -    def assemble(self, position, labels): -	return self.opcode + chr(self.register) -    def __repr__(self): -	return '%-15s %i' % (self.name, self.register) - -class StartMemory(MemoryInstruction): -    name = 'start_memory' -    def __init__(self, register): -	MemoryInstruction.__init__(self, chr(6), register) - -class EndMemory(MemoryInstruction): -    name = 'end_memory' -    def __init__(self, register): -	MemoryInstruction.__init__(self, chr(7), register) - -class MatchMemory(MemoryInstruction): -    name = 'match_memory' -    def __init__(self, register): -	MemoryInstruction.__init__(self, chr(8), register) - -class JumpInstruction(Instruction): -    def __init__(self, opcode, label): -	self.label = label -	Instruction.__init__(self, opcode, 3) -    def compute_offset(self, start, dest): -	return dest - (start + 3) -    def pack_offset(self, offset): -	if offset > 32767: -	    raise error, 'offset out of range (pos)' -	elif offset < -32768: -	    raise error, 'offset out of range (neg)' -	elif offset < 0: -	    offset = offset + 65536 -	return chr(offset & 0xff) + chr((offset >> 8) & 0xff) -    def assemble(self, position, labels): -	return self.opcode + \ -	       self.pack_offset(self.compute_offset(position, -						    labels[self.label])) -    def __repr__(self): -	return '%-15s %i' % (self.name, self.label) - -class Jump(JumpInstruction): -    name = 'jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(9), label) - -class StarJump(JumpInstruction): -    name = 'star_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(10), label) - -class FailureJump(JumpInstruction): -    name = 'failure_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(11), label) - -class UpdateFailureJump(JumpInstruction): -    name = 'update_failure_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(12), label) - -class DummyFailureJump(JumpInstruction): -    name = 'dummy_failure_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(13), label) -	 -class BegBuf(Instruction): -    name = 'begbuf' -    def __init__(self): -	Instruction.__init__(self, chr(14)) - -class EndBuf(Instruction): -    name = 'endbuf' -    def __init__(self): -	Instruction.__init__(self, chr(15)) - -class WordBeg(Instruction): -    name = 'wordbeg' -    def __init__(self): -	Instruction.__init__(self, chr(16)) - -class WordEnd(Instruction): -    name = 'wordend' -    def __init__(self): -	Instruction.__init__(self, chr(17)) - -class WordBound(Instruction): -    name = 'wordbound' -    def __init__(self): -	Instruction.__init__(self, chr(18)) - -class NotWordBound(Instruction): -    name = 'notwordbound' -    def __init__(self): -	Instruction.__init__(self, chr(19)) - -class SyntaxSpec(Instruction): -    name = 'syntaxspec' -    def __init__(self, syntax): -	self.syntax = syntax -	Instruction.__init__(self, chr(20), 2) -    def assemble(self, postition, labels): -	return self.opcode + chr(self.syntax) - -class NotSyntaxSpec(Instruction): -    name = 'notsyntaxspec' -    def __init__(self, syntax): -	self.syntax = syntax -	Instruction.__init__(self, chr(21), 2) -    def assemble(self, postition, labels): -	return self.opcode + chr(self.syntax) - -class Label(Instruction): -    name = 'label' -    def __init__(self, label): -	self.label = label -	Instruction.__init__(self, '', 0) -    def __repr__(self): -	return '%-15s %i' % (self.name, self.label) - -class OpenParen(Instruction): -    name = '(' -    def __init__(self, register): -	self.register = register -	Instruction.__init__(self, '', 0) -    def assemble(self, position, labels): -	raise error, 'unmatched open parenthesis' - -class Alternation(Instruction): -    name = '|' -    def __init__(self): -	Instruction.__init__(self, '', 0) -    def assemble(self, position, labels): -	raise error, 'an alternation was not taken care of' - -# -# -# - -def assemble(instructions): -    labels = {} -    position = 0 -    pass1 = [] -    for instruction in instructions: -	if instruction.name == 'label': -	    labels[instruction.label] = position -	else: -	    pass1.append((position, instruction)) -	    position = position + instruction.size -    pass2 = '' -    for position, instruction in pass1: -	pass2 = pass2 + instruction.assemble(position, labels) -    return pass2 - -# -# -# -  def escape(pattern):      result = [] +    alphanum=string.letters+'_'+string.digits      for char in pattern: -	if not reop.syntax_table[char] & reop.word: +	if char not in alphanum:  	    result.append('\\')  	result.append(char)      return string.join(result, '') -# -# -# - -def registers_used(instructions): -    result = [] -    for instruction in instructions: -	if (instruction.name in ['set_memory', 'end_memory']) and \ -	   (instruction.register not in result): -	    result.append(instruction.register) -    return result - -# -# -# - -class Fastmap: -    def __init__(self): -	self.map = ['\000']*256 -	self.can_be_null = 0 -    def add(self, char): -	self.map[ord(char)] = '\001' -    def fastmap(self): -	return string.join(self.map, '') -    def __getitem__(self, char): -	return ord(self.map[ord(char)]) -    def __repr__(self): -	self.map.sort() -	return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' - -# -# -# - -def find_label(code, label): -    line = 0 -    for instruction in code: -	if (instruction.name == 'label') and (instruction.label == label): -	    return line + 1 -	line = line + 1 -	 -def build_fastmap_aux(code, pos, visited, fastmap): -    if visited[pos]: -	return -    while 1: -	instruction = code[pos] -	visited[pos] = 1 -	pos = pos + 1 -	if instruction.name == 'end': -	    fastmap.can_be_null = 1 -	    return -	elif instruction.name == 'syntaxspec': -	    for char in map(chr, range(256)): -		if reop.syntax_table[char] & instruction.syntax: -		    fastmap.add(char) -	    return -	elif instruction.name == 'notsyntaxspec': -	    for char in map(chr, range(256)): -		if not reop.syntax_table[char] & instruction.syntax: -		    fastmap.add(char) -	    return -	elif instruction.name == 'eol': -	    fastmap.add('\n') -	    if fastmap.can_be_null == 0: -		fastmap.can_be_null = 2 -	    return -	elif instruction.name == 'set': -	    for char in instruction.set: -		fastmap.add(char) -	    return -	elif instruction.name == 'exact': -	    fastmap.add(instruction.char) -	elif instruction.name == 'anychar': -	    for char in map(chr, range(256)): -		if char != '\n': -		    fastmap.add(char) -	    return -	elif instruction.name == 'match_memory': -	    for char in map(chr, range(256)): -		fastmap.add(char) -	    fastmap.can_be_null = 1 -	    return -	elif instruction.name in ['jump', 'dummy_failure_jump', \ -				  'update_failure_jump', 'star_jump']: -	    pos = find_label(code, instruction.label) -	    if visited[pos]: -		return -	    visited[pos] = 1 -	elif instruction.name  == 'failure_jump': -	    build_fastmap_aux(code, -			      find_label(code, instruction.label), -			      visited, -			      fastmap) -	 -def build_fastmap(code, pos=0): -    visited = [0] * len(code) -    fastmap = Fastmap() -    build_fastmap_aux(code, pos, visited, fastmap) -    return fastmap - -# -# -# +def valid_identifier(id): +    import string +    if len(id) == 0: +	return 0 +    if id[0] not in string.letters+'_': +	return 0 +    for char in id[1:]: +	if not syntax_table[char] & word: +	    return 0 +    return 1 -#[NORMAL, CHARCLASS, REPLACEMENT] = range(3) -#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, -# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) +def compile(pattern, flags=0): +    groupindex={} +    code=pcre_compile(pattern, flags, groupindex) +    return RegexObject(pattern, flags, code, groupindex) +     +def _expand(m, repl): +    results = [] +    index = 0 +    size = len(repl) +    while index < size: +	found = string.find(repl, '\\', index) +	if found < 0: +	    results.append(repl[index:]) +	    break +	if found > index: +	    results.append(repl[index:found]) +	escape_type, value, index = _expand_escape(repl, found+1, REPLACEMENT) +	if escape_type == CHAR: +	    results.append(value) +	elif escape_type == MEMORY_REFERENCE: +	    r = m.group(value) +	    if r is None: +		raise error, ('group "' + str(value) + '" did not contribute ' +			      'to the match') +	    results.append(m.group(value)) +	else: +	    raise error, "bad escape in replacement" +    return string.join(results, '') -def expand_escape(pattern, index, context=NORMAL): +def _expand_escape(pattern, index, context=NORMAL):      if index >= len(pattern):  	raise error, 'escape ends too soon' @@ -676,7 +304,7 @@ def expand_escape(pattern, index, context=NORMAL):  	# what it's doing (and Python in turn passes it on to sscanf,  	# so that *it* doesn't incorrectly 2nd-guess what C does!)  	char = eval ('"' + pattern[index-1:end] + '"') -	assert len(char) == 1 +#	assert len(char) == 1  	return CHAR, char, end      elif pattern[index] == 'b': @@ -707,75 +335,21 @@ def expand_escape(pattern, index, context=NORMAL):  	raise error, ('\\' + pattern[index] + ' is not allowed')      elif pattern[index] == 'w': -	if context == NORMAL: -	    return SYNTAX, reop.word, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if reop.syntax_table[char] & reop.word: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'w', index + 1      elif pattern[index] == 'W': -	if context == NORMAL: -	    return NOT_SYNTAX, reop.word, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if not reop.syntax_table[char] & reop.word: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'W', index + 1      elif pattern[index] == 's': -	if context == NORMAL: -	    return SYNTAX, reop.whitespace, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if reop.syntax_table[char] & reop.whitespace: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 's', index + 1      elif pattern[index] == 'S': -	if context == NORMAL: -	    return NOT_SYNTAX, reop.whitespace, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if not reop.syntax_table[char] & reop.whitespace: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'S', index + 1      elif pattern[index] == 'd': -	if context == NORMAL: -	    return SYNTAX, reop.digit, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if reop.syntax_table[char] & reop.digit: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'd', index + 1      elif pattern[index] == 'D': -	if context == NORMAL: -	    return NOT_SYNTAX, reop.digit, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if not reop.syntax_table[char] & reop.digit: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'D', index + 1      elif pattern[index] in '0123456789': @@ -854,655 +428,3 @@ def expand_escape(pattern, index, context=NORMAL):      else:  	return CHAR, pattern[index], index + 1 -def compile(pattern, flags=0): -    stack = [] -    label = 0 -    register = 1 -    groupindex = {} -    lastop = '' - -    # look for embedded pattern modifiers at the beginning of the pattern - -    index = 0 - -    if len(pattern) >= 3 and \ -       (pattern[:2] == '(?') and \ -       (pattern[2] in 'iImMsSxX'): -	index = 2 -	while (index < len(pattern)) and (pattern[index] != ')'): -	    if pattern[index] in 'iI': -		flags = flags | IGNORECASE -	    elif pattern[index] in 'mM': -		flags = flags | MULTILINE -	    elif pattern[index] in 'sS': -		flags = flags | DOTALL -	    elif pattern[index] in 'xX': -		flags = flags | VERBOSE -	    else: -		raise error, 'unknown modifier' -	    index = index + 1 -	index = index + 1 - -    # compile the rest of the pattern -     -    while (index < len(pattern)): -	char = pattern[index] -	index = index + 1 -	if char == '\\': -	    escape_type, value, index = expand_escape(pattern, index) - -	    if escape_type == CHAR: -		stack.append([Exact(value, flags)]) -		lastop = '\\' + value -		 -	    elif escape_type == MEMORY_REFERENCE: -		if value >= register: -		    raise error, ('cannot reference a register ' -				  'not yet used') -		stack.append([MatchMemory(value)]) -		lastop = '\\1' -		 -	    elif escape_type == BEGINNING_OF_BUFFER: -		stack.append([BegBuf()]) -		lastop = '\\A' -		 -	    elif escape_type == END_OF_BUFFER: -		stack.append([EndBuf()]) -		lastop = '\\Z' -		 -	    elif escape_type == WORD_BOUNDARY: -		stack.append([WordBound()]) -		lastop = '\\b' -		 -	    elif escape_type == NOT_WORD_BOUNDARY: -		stack.append([NotWordBound()]) -		lastop = '\\B' -		 -	    elif escape_type == SYNTAX: -		stack.append([SyntaxSpec(value)]) -		if value == reop.word: -		    lastop = '\\w' -		elif value == reop.whitespace: -		    lastop = '\\s' -		elif value == reop.digit: -		    lastop = '\\d' -		else: -		    lastop = '\\?' -		     -	    elif escape_type == NOT_SYNTAX: -		stack.append([NotSyntaxSpec(value)]) -		if value == reop.word: -		    lastop = '\\W' -		elif value == reop.whitespace: -		    lastop = '\\S' -		elif value == reop.digit: -		    lastop = '\\D' -		else: -		    lastop = '\\?' -		 -	    elif escape_type == SET: -		raise error, 'cannot use set escape type here' -	     -	    else: -		raise error, 'unknown escape type' - -	elif char == '|': -	    expr = [] -	     -	    while (len(stack) != 0) and \ -		  (stack[-1][0].name != '(') and \ -		  (stack[-1][0].name != '|'): -		expr = stack[-1] + expr -		del stack[-1] -	    stack.append([FailureJump(label)] + \ -			 expr + \ -			 [Jump(-1), -			  Label(label)]) -	    stack.append([Alternation()]) -	    label = label + 1 -	    lastop = '|' -	     -	elif char == '(': -	    if index >= len(pattern): -		raise error, 'no matching close paren' - -	    elif pattern[index] == '?': -		# Perl style (?...) extensions -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'extension ends prematurely' - -		elif pattern[index] == 'P': -		    # Python extensions -		    index = index + 1 -		    if index >= len(pattern): -			raise error, 'extension ends prematurely' - -		    elif pattern[index] == '<': -			# Handle Python symbolic group names (?P<...>...) -			index = index + 1 -			end = string.find(pattern, '>', index) -			if end == -1: -			    raise error, 'no end to symbolic group name' -			name = pattern[index:end] -			if not valid_identifier(name): -			    raise error, ('symbolic group name must be a ' -					  'valid identifier') -			index = end + 1 -			groupindex[name] = register -			stack.append([OpenParen(register)]) -			register = register + 1 -			lastop = '(' - -		    elif pattern[index] == '=': -			# backreference to symbolic group name -			if index >= len(pattern): -			    raise error, '(?P= at the end of the pattern' -			start = index + 1 -			end = string.find(pattern, ')', start) -			if end == -1: -			    raise error, 'no ) to end symbolic group name' -			name = pattern[start:end] -			if name not in groupindex.keys(): -			    raise error, ('symbolic group name ' + name + \ -					  ' has not been used yet') -			stack.append([MatchMemory(groupindex[name])]) -			index = end + 1 -			lastop = '(?P=)' -			 -		    else: -			raise error, ('unknown Python extension: ' + \ -				      pattern[index]) -		     -		elif pattern[index] == ':': -		    # grouping, but no registers -		    index = index + 1 -		    stack.append([OpenParen(-1)]) -		    lastop = '(' -		     -		elif pattern[index] == '#': -		    # comment -		    index = index + 1 -		    end = string.find(pattern, ')', index) -		    if end == -1: -			raise error, 'no end to comment' -		    index = end + 1 -		    # do not change lastop -		     -		elif pattern[index] == '=': -		    raise error, ('zero-width positive lookahead ' -				  'assertion is unsupported') - -		elif pattern[index] == '!': -		    raise error, ('zero-width negative lookahead ' -				  'assertion is unsupported') - -		elif pattern[index] in 'iImMsSxX': -		    raise error, ('embedded pattern modifiers are only ' -				  'allowed at the beginning of the pattern') - -		else: -		    raise error, 'unknown extension' - -	    else: -		stack.append([OpenParen(register)]) -		register = register + 1 -		lastop = '(' - -	elif char == ')': -	    # make one expression out of everything on the stack up to -	    # the marker left by the last parenthesis -	    expr = [] -	    while (len(stack) > 0) and (stack[-1][0].name != '('): -		expr = stack[-1] + expr -		del stack[-1] - -	    if len(stack) == 0: -		raise error, 'too many close parens' -	     -	    # remove markers left by alternation -	    expr = filter(lambda x: x.name != '|', expr) - -	    # clean up jumps inserted by alternation -	    need_label = 0 -	    for i in range(len(expr)): -		if (expr[i].name == 'jump') and (expr[i].label == -1): -		    expr[i] = Jump(label) -		    need_label = 1 -	    if need_label: -		expr.append(Label(label)) -		label = label + 1 - -	    if stack[-1][0].register > 0: -		expr = [StartMemory(stack[-1][0].register)] + \ -		       expr + \ -		       [EndMemory(stack[-1][0].register)] -	    del stack[-1] -	    stack.append(expr) -	    lastop = ')' - -	elif char == '{': -	    if len(stack) == 0: -		raise error, 'no expression to repeat' -	    end = string.find(pattern, '}', index) -	    if end == -1: -		raise error, ('no close curly bracket to match' -			      ' open curly bracket') - -	    fields = map(string.strip, -			 string.split(pattern[index:end], ',')) -	    index = end + 1 - -	    minimal = 0 -	    if (index < len(pattern)) and (pattern[index] == '?'): -		minimal = 1 -		index = index + 1 - -	    if len(fields) == 1: -		# {n} or {n}? (there's really no difference) -		try: -		    count = string.atoi(fields[0]) -		except ValueError: -		    raise error, ('count must be an integer ' -				  'inside curly braces') -		if count > 65535: -		    raise error, 'repeat count out of range' -		expr = [] -		while count > 0: -		    expr = expr + stack[-1] -		    count = count - 1 -		del stack[-1] -		stack.append(expr) -		if minimal: -		    lastop = '{n}?' -		else: -		    lastop = '{n}' - -	    elif len(fields) == 2: -		# {n,} or {n,m} -		if fields[1] == '': -		    # {n,} -		    try: -			min = string.atoi(fields[0]) -		    except ValueError: -			raise error, ('minimum must be an integer ' -				      'inside curly braces') -		    if min > 65535: -			raise error, 'minimum repeat count out of range' - -		    expr = [] -		    while min > 0: -			expr = expr + stack[-1] -			min = min - 1 -		    if minimal: -			expr = expr + \ -			       ([Jump(label + 1), -				 Label(label)] + \ -				stack[-1] + \ -				[Label(label + 1), -				 FailureJump(label)]) -			lastop = '{n,}?' -		    else: -			expr = expr + \ -			       ([Label(label), -				 FailureJump(label + 1)] + -				stack[-1] + -				[StarJump(label), -				 Label(label + 1)]) -			lastop = '{n,}' - -		    del stack[-1] -		    stack.append(expr) -		    label = label + 2 - -		else: -		    # {n,m} -		    try: -			min = string.atoi(fields[0]) -		    except ValueError: -			raise error, ('minimum must be an integer ' -				      'inside curly braces') -		    try: -			max = string.atoi(fields[1]) -		    except ValueError: -			raise error, ('maximum must be an integer ' -				      'inside curly braces') -		    if min > 65535: -			raise error, ('minumim repeat count out ' -				      'of range') -		    if max > 65535: -			raise error, ('maximum repeat count out ' -				      'of range') -		    if min > max: -			raise error, ('minimum repeat count must be ' -				      'less than the maximum ' -				      'repeat count') -		    expr = [] -		    while min > 0: -			expr = expr + stack[-1] -			min = min - 1 -			max = max - 1 -		    if minimal: -			while max > 0: -			    expr = expr + \ -				   [FailureJump(label), -				    Jump(label + 1), -				    Label(label)] + \ -				   stack[-1] + \ -				   [Label(label + 1)] -			    max = max - 1 -			    label = label + 2 -			del stack[-1] -			stack.append(expr) -			lastop = '{n,m}?' -		    else: -			while max > 0: -			    expr = expr + \ -				   [FailureJump(label)] + \ -				   stack[-1] -			    max = max - 1 -			del stack[-1] -			stack.append(expr + [Label(label)]) -			label = label + 1 -			lastop = '{n,m}' - -	    else: -		raise error, ('there need to be one or two fields ' -			      'in a {} expression') - -	elif char == '}': -	    raise error, 'unbalanced close curly brace' - -	elif char == '*': -	    # Kleene closure -	    if len(stack) == 0: -		raise error, '* needs something to repeat' - -	    if lastop in ['(', '|']: -		raise error, '* needs something to repeat' - -	    if lastop in repetition_operators: -		raise error, 'nested repetition operators' -	     -	    if (index < len(pattern)) and (pattern[index] == '?'): -		# non-greedy matching -		expr = [Jump(label + 1), -			Label(label)] + \ -		       stack[-1] + \ -		       [Label(label + 1), -			FailureJump(label)] -		index = index + 1 -		lastop = '*?' -	    else: -		# greedy matching -		expr = [Label(label), -			FailureJump(label + 1)] + \ -		       stack[-1] + \ -		       [StarJump(label), -			Label(label + 1)] -		lastop = '*' -	    del stack[-1] -	    stack.append(expr) -	    label = label + 2 - -	elif char == '+': -	    # positive closure -	    if len(stack) == 0: -		raise error, '+ needs something to repeat' -	     -	    if lastop in ['(', '|']: -		raise error, '+ needs something to repeat' - -	    if lastop in repetition_operators: -		raise error, 'nested repetition operators' - -	    if (index < len(pattern)) and (pattern[index] == '?'): -		# non-greedy -		expr = [Label(label)] + \ -		       stack[-1] + \ -		       [FailureJump(label)] -		label = label + 1 -		index = index + 1 -		lastop = '+?' -		 -	    else: -		# greedy -		expr = [DummyFailureJump(label + 1), -			Label(label), -			FailureJump(label + 2), -			Label(label + 1)] + \ -		       stack[-1] + \ -		       [StarJump(label), -			Label(label + 2)] -		label = label + 3 -		lastop = '+' -		 -	    del stack[-1] -	    stack.append(expr) - -	elif char == '?': -	    if len(stack) == 0: -		raise error, 'need something to be optional' -	     -	    if len(stack) == 0: -		raise error, '? needs something to repeat' -	     -	    if lastop in ['(', '|']: -		raise error, '? needs something to repeat' - -	    if lastop in repetition_operators: -		raise error, 'nested repetition operators' - -	    if (index < len(pattern)) and (pattern[index] == '?'): -		# non-greedy matching -		expr = [FailureJump(label), -			Jump(label + 1), -			Label(label)] + \ -		       stack[-1] + \ -		       [Label(label + 1)] -		label = label + 2 -		index = index + 1 -		lastop = '??' -		 -	    else: -		# greedy matching -		expr = [FailureJump(label)] + \ -		       stack[-1] + \ -		       [Label(label)] -		label = label + 1 -		lastop = '?' -		 -	    del stack[-1] -	    stack.append(expr) - -	elif char == '.': -	    if flags & DOTALL: -		stack.append([Set(map(chr, range(256)), flags)]) -	    else: -		stack.append([AnyChar()]) -	    lastop = '.' - -	elif char == '^': -	    if flags & MULTILINE: -		stack.append([Bol()]) -	    else: -		stack.append([BegBuf()]) -	    lastop = '^' - -	elif char == '$': -	    if flags & MULTILINE: -		stack.append([Eol()]) -	    else: -		stack.append([EndBuf()]) -	    lastop = '$' - -	elif char == '#': -	    if flags & VERBOSE: -		# comment -		index = index + 1 -		end = string.find(pattern, '\n', index) -		if end == -1: -		    index = len(pattern) -		else: -		    index = end + 1 -		# do not change lastop -	    else: -		stack.append([Exact(char, flags)]) -		lastop = '#' - -	elif char in string.whitespace: -	    if not (flags & VERBOSE): -		stack.append([Exact(char, flags)]) -		lastop = char - -	elif char == '[': -	    # compile character class -	     -	    if index >= len(pattern): -		raise error, 'unclosed character class' -	     -	    negate = 0 -	    last = '' -	    set = [] - -	    if pattern[index] == '^': -		negate = 1 -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'unclosed character class' - -	    if pattern[index] == ']': -		set.append(']') -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'unclosed character class' -		 -	    elif pattern[index] == '-': -		set.append('-') -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'unclosed character class' -		 -	    while (index < len(pattern)) and (pattern[index] != ']'): -		next = pattern[index] -		index = index + 1 -		if next == '-': -		    if index >= len(pattern): -			raise error, 'incomplete range in character class' - -		    elif pattern[index] == ']': -			set.append('-') -			 -		    else: -			if last == '': -			    raise error, ('improper use of range in ' -					  'character class') - -			start = last -			 -			if pattern[index] == '\\': -			    escape_type, -			    value, -			    index = expand_escape(pattern, -						  index + 1, -						  CHARCLASS) - -			    if escape_type == CHAR: -				end = value -				 -			    else: -				raise error, ('illegal escape in character ' -					      'class range') -			else: -			    end = pattern[index] -			    index = index + 1 - -			if start > end: -			    raise error, ('range arguments out of order ' -					  'in character class') -			 -			for char in map(chr, range(ord(start), ord(end) + 1)): -			    if char not in set: -				set.append(char) -			     -			last = '' - -		elif next == '\\': -		    # expand syntax meta-characters and add to set -		    if index >= len(pattern): -			raise error, 'incomplete set' - -		    escape_type, value, index = expand_escape(pattern, -							      index, -							      CHARCLASS) - -		    if escape_type == CHAR: -			set.append(value) -			last = value - -		    elif escape_type == SET: -			for char in value: -			    if char not in set: -				set.append(char) -			last = '' - -		    else: -			raise error, 'illegal escape type in character class' - -		else: -		    if next not in set: -			set.append(next) -		    last = next -		     -	    if (index >= len(pattern)) or ( pattern[index] != ']'): -		raise error, 'incomplete set' - -	    index = index + 1 - -	    if negate: -		# If case is being ignored, then both upper- and lowercase -		# versions of the letters must be excluded. -		if flags & IGNORECASE: set=set+map(string.upper, set) -		notset = [] -		for char in map(chr, range(256)): -		    if char not in set: -			notset.append(char) -		if len(notset) == 0: -		    raise error, 'empty negated set' -		stack.append([Set(notset, flags)]) -	    else: -		if len(set) == 0: -		    raise error, 'empty set' -		stack.append([Set(set, flags)]) - -	    lastop = '[]' - -	else: -	    stack.append([Exact(char, flags)]) -	    lastop = char - -    code = [] -    while len(stack) > 0: -	if stack[-1][0].name == '(': -	    raise error, 'too many open parens' -	code = stack[-1] + code -	del stack[-1] -    if len(code) == 0: -	raise error, 'no code generated' -    code = filter(lambda x: x.name != '|', code) -    need_label = 0 -    for i in range(len(code)): -	if (code[i].name == 'jump') and (code[i].label == -1): -	    code[i] = Jump(label) -	    need_label = 1 -    if need_label: -	code.append(Label(label)) -	label = label + 1 -    code.append(End()) -#    print code -    return RegexObject(pattern, flags, code, register, groupindex) - -# Replace expand_escape and _expand functions with their C equivalents. -# If you suspect bugs in the C versions, comment out the next two lines -expand_escape = reop.expand_escape -_expand  = reop._expand  | 
