diff options
author | Guido van Rossum <guido@python.org> | 2000-06-29 19:35:29 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2000-06-29 19:35:29 (GMT) |
commit | 3e06ab1d447442a56af739a906546d8d1998dfdc (patch) | |
tree | 23145a16354b8b2e59932767c6dc9a5b56bb12cf /Lib/dos-8x3/sre_pars.py | |
parent | 45cd9de2bb2faa96bb18eb11d20261d7d1b8c20e (diff) | |
download | cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.zip cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.tar.gz cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.tar.bz2 |
The usual :)
Diffstat (limited to 'Lib/dos-8x3/sre_pars.py')
-rw-r--r-- | Lib/dos-8x3/sre_pars.py | 421 |
1 files changed, 255 insertions, 166 deletions
diff --git a/Lib/dos-8x3/sre_pars.py b/Lib/dos-8x3/sre_pars.py index 8b68ea1..93a7b5d 100644 --- a/Lib/dos-8x3/sre_pars.py +++ b/Lib/dos-8x3/sre_pars.py @@ -1,37 +1,34 @@ # # Secret Labs' Regular Expression Engine -# $Id$ # -# convert re-style regular expression to SRE template. the current -# implementation is somewhat incomplete, and not very fast. should -# definitely be rewritten before Python 1.6 goes beta. +# convert re-style regular expression to sre pattern # # Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. # -# This code can only be used for 1.6 alpha testing. All other use -# require explicit permission from Secret Labs AB. -# # Portions of this engine have been developed in cooperation with # CNRI. Hewlett-Packard provided funding for 1.6 integration and # other compatibility work. # -# FIXME: comments marked with the FIXME tag are open issues. all such -# issues should be closed before the final beta. - import string, sys +import _sre + from sre_constants import * +# FIXME: should be 65535, but the arraymodule is still broken +MAXREPEAT = 32767 + SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" -# FIXME: string in tuple tests may explode with if char is unicode :-( DIGITS = tuple(string.digits) OCTDIGITS = tuple("01234567") HEXDIGITS = tuple("0123456789abcdefABCDEF") +WHITESPACE = string.whitespace + ESCAPES = { "\\a": (LITERAL, chr(7)), "\\b": (LITERAL, chr(8)), @@ -55,10 +52,21 @@ CATEGORIES = { "\\Z": (AT, AT_END), # end of string } -class Pattern: - # FIXME: <fl> rename class, and store flags in here too! +FLAGS = { + # standard flags + "i": SRE_FLAG_IGNORECASE, + "L": SRE_FLAG_LOCALE, + "m": SRE_FLAG_MULTILINE, + "s": SRE_FLAG_DOTALL, + "x": SRE_FLAG_VERBOSE, + # extensions + "t": SRE_FLAG_TEMPLATE, + "u": SRE_FLAG_UNICODE, +} + +class State: def __init__(self): - self.flags = [] + self.flags = 0 self.groups = 1 self.groupdict = {} def getgroup(self, name=None): @@ -67,9 +75,6 @@ class Pattern: if name: self.groupdict[name] = gid return gid - def setflag(self, flag): - if flag in self.flags: - self.flags.append(flag) class SubPattern: # a subpattern, in intermediate form @@ -78,7 +83,6 @@ class SubPattern: if not data: data = [] self.data = data - self.flags = [] self.width = None def __repr__(self): return repr(self.data) @@ -121,8 +125,8 @@ class SubPattern: hi = hi + j elif op in (MIN_REPEAT, MAX_REPEAT): i, j = av[2].getwidth() - lo = lo + i * av[0] - hi = hi + j * av[1] + lo = lo + long(i) * av[0] + hi = hi + long(j) * av[1] elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): lo = lo + 1 hi = hi + 1 @@ -130,47 +134,23 @@ class SubPattern: break self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) return self.width - def set(self, flag): - if not flag in self.flags: - self.flags.append(flag) - def reset(self, flag): - if flag in self.flags: - self.flags.remove(flag) class Tokenizer: def __init__(self, string): - self.string = list(string) + self.index = 0 + self.string = string self.next = self.__next() def __next(self): - if not self.string: + if self.index >= len(self.string): return None - char = self.string[0] + char = self.string[self.index] if char[0] == "\\": try: - c = self.string[1] + c = self.string[self.index + 1] except IndexError: - raise SyntaxError, "bogus escape" + raise error, "bogus escape" char = char + c - try: - if c == "x": - # hexadecimal constant - for i in xrange(2, sys.maxint): - c = self.string[i] - if str(c) not in HEXDIGITS: - break - char = char + c - elif str(c) in DIGITS: - # decimal (or octal) number - for i in xrange(2, sys.maxint): - c = self.string[i] - # FIXME: if larger than current number of - # groups, interpret as an octal number - if str(c) not in DIGITS: - break - char = char + c - except IndexError: - pass # use what we've got this far - del self.string[0:len(char)] + self.index = self.index + len(char) return char def match(self, char): if char == self.next: @@ -187,45 +167,103 @@ class Tokenizer: self.next = self.__next() return this -def _fixescape(escape, character_class=0): - # convert escape to (type, value) - if character_class: - # inside a character class, we'll look in the character - # escapes dictionary first - code = ESCAPES.get(escape) - if code: - return code - code = CATEGORIES.get(escape) - else: - code = CATEGORIES.get(escape) - if code: - return code - code = ESCAPES.get(escape) +def isident(char): + return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" + +def isdigit(char): + return "0" <= char <= "9" + +def isname(name): + # check that group name is a valid string + # FIXME: <fl> this code is really lame. should use a regular + # expression instead, but I seem to have certain bootstrapping + # problems here ;-) + if not isident(name[0]): + return 0 + for char in name: + if not isident(char) and not isdigit(char): + return 0 + return 1 + +def _group(escape, state): + # check if the escape string represents a valid group + try: + group = int(escape[1:]) + if group and group < state.groups: + return group + except ValueError: + pass + return None # not a valid group + +def _class_escape(source, escape): + # handle escape code inside character class + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) if code: return code - if not character_class: - try: - group = int(escape[1:]) - # FIXME: only valid if group <= current number of groups - return GROUP, group - except ValueError: - pass try: if escape[1:2] == "x": + while source.next in HEXDIGITS: + escape = escape + source.get() escape = escape[2:] - return LITERAL, chr(int(escape[-2:], 16) & 0xff) - elif str(escape[1:2]) in DIGITS: - return LITERAL, chr(int(escape[1:], 8) & 0xff) - elif len(escape) == 2: + # FIXME: support unicode characters! + return LITERAL, chr(int(escape[-4:], 16) & 0xff) + elif str(escape[1:2]) in OCTDIGITS: + while source.next in OCTDIGITS: + escape = escape + source.get() + escape = escape[1:] + # FIXME: support unicode characters! + return LITERAL, chr(int(escape[-6:], 8) & 0xff) + if len(escape) == 2: return LITERAL, escape[1] except ValueError: pass - raise SyntaxError, "bogus escape: %s" % repr(escape) + raise error, "bogus escape: %s" % repr(escape) -def _branch(subpattern, items): +def _escape(source, escape, state): + # handle escape code in expression + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) + if code: + return code + try: + if escape[1:2] == "x": + while source.next in HEXDIGITS: + escape = escape + source.get() + escape = escape[2:] + # FIXME: support unicode characters! + return LITERAL, chr(int(escape[-4:], 16) & 0xff) + elif escape[1:2] in DIGITS: + while 1: + group = _group(escape, state) + if group: + if (not source.next or + not _group(escape + source.next, state)): + return GROUP, group + escape = escape + source.get() + elif source.next in OCTDIGITS: + escape = escape + source.get() + else: + break + escape = escape[1:] + # FIXME: support unicode characters! + return LITERAL, chr(int(escape[-6:], 8) & 0xff) + if len(escape) == 2: + return LITERAL, escape[1] + except ValueError: + pass + raise error, "bogus escape: %s" % repr(escape) - # form a branch operator from a set of items (FIXME: move this - # optimization to the compiler module!) + +def _branch(pattern, items): + + # form a branch operator from a set of items + + subpattern = SubPattern(pattern) # check if all items share a common prefix while 1: @@ -257,26 +295,36 @@ def _branch(subpattern, items): for item in items: set.append(item[0]) subpattern.append((IN, set)) - return + return subpattern subpattern.append((BRANCH, (None, items))) + return subpattern -def _parse(source, pattern, flags=()): +def _parse(source, state, flags=0): # parse regular expression pattern into an operator list. - subpattern = SubPattern(pattern) - - this = None + subpattern = SubPattern(state) while 1: - if str(source.next) in ("|", ")"): + if source.next in ("|", ")"): break # end of subpattern this = source.get() if this is None: break # end of pattern + if state.flags & SRE_FLAG_VERBOSE: + # skip whitespace and comments + if this in WHITESPACE: + continue + if this == "#": + while 1: + this = source.get() + if this in (None, "\n"): + break + continue + if this and this[0] not in SPECIAL_CHARS: subpattern.append((LITERAL, this)) @@ -294,11 +342,11 @@ def _parse(source, pattern, flags=()): if this == "]" and set != start: break elif this and this[0] == "\\": - code1 = _fixescape(this, 1) + code1 = _class_escape(source, this) elif this: code1 = LITERAL, this else: - raise SyntaxError, "unexpected end of regular expression" + raise error, "unexpected end of regular expression" if source.match("-"): # potential range this = source.get() @@ -308,20 +356,20 @@ def _parse(source, pattern, flags=()): break else: if this[0] == "\\": - code2 = _fixescape(this, 1) + code2 = _class_escape(source, this) else: code2 = LITERAL, this if code1[0] != LITERAL or code2[0] != LITERAL: - raise SyntaxError, "illegal range" + raise error, "illegal range" if len(code1[1]) != 1 or len(code2[1]) != 1: - raise SyntaxError, "illegal range" + raise error, "illegal range" set.append((RANGE, (code1[1], code2[1]))) else: if code1[0] is IN: code1 = code1[1][0] set.append(code1) - # FIXME: <fl> move set optimization to support function + # FIXME: <fl> move set optimization to compiler! if len(set)==1 and set[0][0] is LITERAL: subpattern.append(set[0]) # optimization elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: @@ -335,43 +383,41 @@ def _parse(source, pattern, flags=()): if this == "?": min, max = 0, 1 elif this == "*": - min, max = 0, sys.maxint + min, max = 0, MAXREPEAT elif this == "+": - min, max = 1, sys.maxint + min, max = 1, MAXREPEAT elif this == "{": - min, max = 0, sys.maxint + min, max = 0, MAXREPEAT lo = hi = "" - while str(source.next) in DIGITS: + while source.next in DIGITS: lo = lo + source.get() if source.match(","): - while str(source.next) in DIGITS: + while source.next in DIGITS: hi = hi + source.get() else: hi = lo if not source.match("}"): - raise SyntaxError, "bogus range" + raise error, "bogus range" if lo: min = int(lo) if hi: max = int(hi) # FIXME: <fl> check that hi >= lo! else: - raise SyntaxError, "not supported" + raise error, "not supported" # figure out which item to repeat - # FIXME: should back up to the right mark, right? if subpattern: - index = len(subpattern)-1 - while subpattern[index][0] is MARK: - index = index - 1 - item = subpattern[index:index+1] + item = subpattern[-1:] else: - raise SyntaxError, "nothing to repeat" + raise error, "nothing to repeat" if source.match("?"): - subpattern[index] = (MIN_REPEAT, (min, max, item)) + subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: - subpattern[index] = (MAX_REPEAT, (min, max, item)) + subpattern[-1] = (MAX_REPEAT, (min, max, item)) + elif this == ".": subpattern.append((ANY, None)) + elif this == "(": group = 1 name = None @@ -379,28 +425,41 @@ def _parse(source, pattern, flags=()): group = 0 # options if source.match("P"): - # named group: skip forward to end of name + # python extensions if source.match("<"): + # named group: skip forward to end of name name = "" while 1: char = source.get() - if char is None or char == ">": + if char is None: + raise error, "unterminated name" + if char == ">": break name = name + char group = 1 + if not isname(name): + raise error, "illegal character in group name" + elif source.match("="): + # named backreference + raise error, "not yet implemented" + else: + char = source.get() + if char is None: + raise error, "unexpected end of pattern" + raise error, "unknown specifier: ?P%s" % char elif source.match(":"): # non-capturing group group = 2 - elif source.match_set("iI"): - pattern.setflag("i") - elif source.match_set("lL"): - pattern.setflag("l") - elif source.match_set("mM"): - pattern.setflag("m") - elif source.match_set("sS"): - pattern.setflag("s") - elif source.match_set("xX"): - pattern.setflag("x") + elif source.match("#"): + # comment + while 1: + if source.next is None or source.next == ")": + break + source.get() + else: + # flags + while FLAGS.has_key(source.next): + state.flags = state.flags | FLAGS[source.get()] if group: # parse group contents b = [] @@ -408,30 +467,25 @@ def _parse(source, pattern, flags=()): # anonymous group group = None else: - group = pattern.getgroup(name) - if group: - subpattern.append((MARK, (group-1)*2)) + group = state.getgroup(name) while 1: - p = _parse(source, pattern, flags) + p = _parse(source, state, flags) if source.match(")"): if b: b.append(p) - _branch(subpattern, b) - else: - subpattern.append((SUBPATTERN, (group, p))) + p = _branch(state, b) + subpattern.append((SUBPATTERN, (group, p))) break elif source.match("|"): b.append(p) else: - raise SyntaxError, "group not properly closed" - if group: - subpattern.append((MARK, (group-1)*2+1)) + raise error, "group not properly closed" else: - # FIXME: should this really be a while loop? while 1: char = source.get() if char is None or char == ")": break + raise error, "unknown extension" elif this == "^": subpattern.append((AT, AT_BEGINNING)) @@ -440,58 +494,93 @@ def _parse(source, pattern, flags=()): subpattern.append((AT, AT_END)) elif this and this[0] == "\\": - code =_fixescape(this) + code = _escape(source, this, state) subpattern.append(code) else: - raise SyntaxError, "parser error" + raise error, "parser error" return subpattern -def parse(source, flags=()): - s = Tokenizer(source) - g = Pattern() +def parse(pattern, flags=0): + # parse 're' pattern into list of (opcode, argument) tuples + source = Tokenizer(pattern) + state = State() b = [] while 1: - p = _parse(s, g, flags) - tail = s.get() + p = _parse(source, state, flags) + tail = source.get() if tail == "|": b.append(p) elif tail == ")": - raise SyntaxError, "unbalanced parenthesis" + raise error, "unbalanced parenthesis" elif tail is None: if b: b.append(p) - p = SubPattern(g) - _branch(p, b) + p = _branch(state, b) break else: - raise SyntaxError, "bogus characters at end of regular expression" + raise error, "bogus characters at end of regular expression" return p -if __name__ == "__main__": - from pprint import pprint - from testpatterns import PATTERNS - a = b = c = 0 - for pattern, flags in PATTERNS: - if flags: - continue - print "-"*68 - try: - p = parse(pattern) - print repr(pattern), "->" - pprint(p.data) - import sre_compile - try: - code = sre_compile.compile(p) - c = c + 1 - except: - pass - a = a + 1 - except SyntaxError, v: - print "**", repr(pattern), v - b = b + 1 - print "-"*68 - print a, "of", b, "patterns successfully parsed" - print c, "of", b, "patterns successfully compiled" +def parse_template(source, pattern): + # parse 're' replacement string into list of literals and + # group references + s = Tokenizer(source) + p = [] + a = p.append + while 1: + this = s.get() + if this is None: + break # end of replacement string + if this and this[0] == "\\": + if this == "\\g": + name = "" + if s.match("<"): + while 1: + char = s.get() + if char is None: + raise error, "unterminated group name" + if char == ">": + break + name = name + char + if not name: + raise error, "bad group name" + try: + index = int(name) + except ValueError: + if not isname(name): + raise error, "illegal character in group name" + try: + index = pattern.groupindex[name] + except KeyError: + raise IndexError, "unknown group name" + a((MARK, index)) + elif len(this) > 1 and this[1] in DIGITS: + while s.next in DIGITS: + this = this + s.get() + a((MARK, int(this[1:]))) + else: + try: + a(ESCAPES[this]) + except KeyError: + for char in this: + a((LITERAL, char)) + else: + a((LITERAL, this)) + return p +def expand_template(template, match): + # FIXME: <fl> this is sooooo slow. drop in the slicelist + # code instead + p = [] + a = p.append + for c, s in template: + if c is LITERAL: + a(s) + elif c is MARK: + s = match.group(s) + if s is None: + raise error, "empty group" + a(s) + return match.string[:0].join(p) |