The usual :)

author: Guido van Rossum <guido@python.org> 2000-06-29 19:35:29 (GMT)
committer: Guido van Rossum <guido@python.org> 2000-06-29 19:35:29 (GMT)
commit: 3e06ab1d447442a56af739a906546d8d1998dfdc (patch)
tree: 23145a16354b8b2e59932767c6dc9a5b56bb12cf /Lib/dos-8x3/sre_pars.py
parent: 45cd9de2bb2faa96bb18eb11d20261d7d1b8c20e (diff)
download: cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.zip
cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.tar.gz
cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.tar.bz2
1 files changed, 255 insertions, 166 deletions
diff --git a/Lib/dos-8x3/sre_pars.py b/Lib/dos-8x3/sre_pars.py
index 8b68ea1..93a7b5d 100644
--- a/Lib/dos-8x3/sre_pars.py
+++ b/Lib/dos-8x3/sre_pars.py
@@ -1,37 +1,34 @@
 #
 # Secret Labs' Regular Expression Engine
-# $Id$
 #
-# convert re-style regular expression to SRE template.  the current
-# implementation is somewhat incomplete, and not very fast.  should
-# definitely be rewritten before Python 1.6 goes beta.
+# convert re-style regular expression to sre pattern
 #
 # Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
 #
-# This code can only be used for 1.6 alpha testing.  All other use
-# require explicit permission from Secret Labs AB.
-#
 # Portions of this engine have been developed in cooperation with
 # CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 # other compatibility work.
 #
 
-# FIXME: comments marked with the FIXME tag are open issues.  all such
-# issues should be closed before the final beta.
-
 import string, sys
 
+import _sre
+
 from sre_constants import *
 
+# FIXME: should be 65535, but the arraymodule is still broken
+MAXREPEAT = 32767
+
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS  = "*+?{"
 
-# FIXME: string in tuple tests may explode with if char is unicode :-(
 DIGITS = tuple(string.digits)
 
 OCTDIGITS = tuple("01234567")
 HEXDIGITS = tuple("0123456789abcdefABCDEF")
 
+WHITESPACE = string.whitespace
+
 ESCAPES = {
     "\\a": (LITERAL, chr(7)),
     "\\b": (LITERAL, chr(8)),
@@ -55,10 +52,21 @@ CATEGORIES = {
     "\\Z": (AT, AT_END), # end of string
 }
 
-class Pattern:
-    # FIXME: <fl> rename class, and store flags in here too!
+FLAGS = {
+    # standard flags
+    "i": SRE_FLAG_IGNORECASE,
+    "L": SRE_FLAG_LOCALE,
+    "m": SRE_FLAG_MULTILINE,
+    "s": SRE_FLAG_DOTALL,
+    "x": SRE_FLAG_VERBOSE,
+    # extensions
+    "t": SRE_FLAG_TEMPLATE,
+    "u": SRE_FLAG_UNICODE,
+}
+
+class State:
     def __init__(self):
-	self.flags = []
+	self.flags = 0
 	self.groups = 1
 	self.groupdict = {}
     def getgroup(self, name=None):
@@ -67,9 +75,6 @@ class Pattern:
 	if name:
 	    self.groupdict[name] = gid
 	return gid
-    def setflag(self, flag):
-	if flag in self.flags:
-	    self.flags.append(flag)
 
 class SubPattern:
     # a subpattern, in intermediate form
@@ -78,7 +83,6 @@ class SubPattern:
 	if not data:
 	    data = []
 	self.data = data
-	self.flags = []
 	self.width = None
     def __repr__(self):
 	return repr(self.data)
@@ -121,8 +125,8 @@ class SubPattern:
 		hi = hi + j
 	    elif op in (MIN_REPEAT, MAX_REPEAT):
 		i, j = av[2].getwidth()
-		lo = lo + i * av[0]
-		hi = hi + j * av[1]
+		lo = lo + long(i) * av[0]
+		hi = hi + long(j) * av[1]
 	    elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
 		lo = lo + 1
 		hi = hi + 1
@@ -130,47 +134,23 @@ class SubPattern:
 		break
 	self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
 	return self.width
-    def set(self, flag):
-	if not flag in self.flags:
-	    self.flags.append(flag)
-    def reset(self, flag):
-	if flag in self.flags:
-	    self.flags.remove(flag)
 
 class Tokenizer:
     def __init__(self, string):
-	self.string = list(string)
+	self.index = 0
+	self.string = string
 	self.next = self.__next()
     def __next(self):
-	if not self.string:
+	if self.index >= len(self.string):
 	    return None
-	char = self.string[0]
+	char = self.string[self.index]
 	if char[0] == "\\":
 	    try:
-		c = self.string[1]
+		c = self.string[self.index + 1]
 	    except IndexError:
-		raise SyntaxError, "bogus escape"
+		raise error, "bogus escape"
 	    char = char + c
-	    try:
-		if c == "x":
-		    # hexadecimal constant
-		    for i in xrange(2, sys.maxint):
-			c = self.string[i]
-			if str(c) not in HEXDIGITS:
-			    break
-			char = char + c
-		elif str(c) in DIGITS:
-		    # decimal (or octal) number
-		    for i in xrange(2, sys.maxint):
-			c = self.string[i]
-			# FIXME: if larger than current number of
-			# groups, interpret as an octal number 
-			if str(c) not in DIGITS:
-			    break
-			char = char + c
-	    except IndexError:
-		pass # use what we've got this far
-	del self.string[0:len(char)]
+	self.index = self.index + len(char)
 	return char
     def match(self, char):
 	if char == self.next:
@@ -187,45 +167,103 @@ class Tokenizer:
 	self.next = self.__next()
 	return this
 
-def _fixescape(escape, character_class=0):
-    # convert escape to (type, value)
-    if character_class:
-	# inside a character class, we'll look in the character
-	# escapes dictionary first
-	code = ESCAPES.get(escape)
-	if code:
-	    return code
-	code = CATEGORIES.get(escape)
-    else:
-	code = CATEGORIES.get(escape)
-	if code:
-	    return code
-	code = ESCAPES.get(escape)
+def isident(char):
+    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
+
+def isdigit(char):
+    return "0" <= char <= "9"
+
+def isname(name):
+    # check that group name is a valid string
+    # FIXME: <fl> this code is really lame.  should use a regular
+    # expression instead, but I seem to have certain bootstrapping
+    # problems here ;-)
+    if not isident(name[0]):
+	return 0
+    for char in name:
+	if not isident(char) and not isdigit(char):
+	    return 0
+    return 1
+
+def _group(escape, state):
+    # check if the escape string represents a valid group
+    try:
+	group = int(escape[1:])
+	if group and group < state.groups:
+	    return group
+    except ValueError:
+	pass
+    return None # not a valid group
+
+def _class_escape(source, escape):
+    # handle escape code inside character class
+    code = ESCAPES.get(escape)
+    if code:
+	return code
+    code = CATEGORIES.get(escape)
     if code:
 	return code
-    if not character_class:
-	try:
-	    group = int(escape[1:])
-	    # FIXME: only valid if group <= current number of groups
-	    return GROUP, group
-	except ValueError:
-	    pass
     try:
 	if escape[1:2] == "x":
+	    while source.next in HEXDIGITS:
+		escape = escape + source.get()
 	    escape = escape[2:]
-	    return LITERAL, chr(int(escape[-2:], 16) & 0xff)
-	elif str(escape[1:2]) in DIGITS:
-	    return LITERAL, chr(int(escape[1:], 8) & 0xff)
-	elif len(escape) == 2:
+	    # FIXME: support unicode characters!
+	    return LITERAL, chr(int(escape[-4:], 16) & 0xff)
+	elif str(escape[1:2]) in OCTDIGITS:
+	    while source.next in OCTDIGITS:
+		escape = escape + source.get()
+	    escape = escape[1:]
+	    # FIXME: support unicode characters!
+	    return LITERAL, chr(int(escape[-6:], 8) & 0xff)
+	if len(escape) == 2:
 	    return LITERAL, escape[1]
     except ValueError:
 	pass
-    raise SyntaxError, "bogus escape: %s" % repr(escape)
+    raise error, "bogus escape: %s" % repr(escape)
 
-def _branch(subpattern, items):
+def _escape(source, escape, state):
+    # handle escape code in expression
+    code = CATEGORIES.get(escape)
+    if code:
+	return code
+    code = ESCAPES.get(escape)
+    if code:
+	return code
+    try:
+	if escape[1:2] == "x":
+	    while source.next in HEXDIGITS:
+		escape = escape + source.get()
+	    escape = escape[2:]
+	    # FIXME: support unicode characters!
+	    return LITERAL, chr(int(escape[-4:], 16) & 0xff)
+	elif escape[1:2] in DIGITS:
+	    while 1:
+		group = _group(escape, state)
+		if group:
+		    if (not source.next or
+			not _group(escape + source.next, state)):
+		        return GROUP, group
+		    escape = escape + source.get()
+		elif source.next in OCTDIGITS:
+		    escape = escape + source.get()
+		else:
+		    break
+	    escape = escape[1:]
+	    # FIXME: support unicode characters!
+	    return LITERAL, chr(int(escape[-6:], 8) & 0xff)
+	if len(escape) == 2:
+	    return LITERAL, escape[1]
+    except ValueError:
+	pass
+    raise error, "bogus escape: %s" % repr(escape)
 
-    # form a branch operator from a set of items (FIXME: move this
-    # optimization to the compiler module!)
+
+def _branch(pattern, items):
+
+    # form a branch operator from a set of items
+
+    subpattern = SubPattern(pattern)
 
     # check if all items share a common prefix
     while 1:
@@ -257,26 +295,36 @@ def _branch(subpattern, items):
 	for item in items:
 	    set.append(item[0])
 	subpattern.append((IN, set))
-	return
+	return subpattern
 
     subpattern.append((BRANCH, (None, items)))
+    return subpattern
 
-def _parse(source, pattern, flags=()):
+def _parse(source, state, flags=0):
 
     # parse regular expression pattern into an operator list.
 
-    subpattern = SubPattern(pattern)
-
-    this = None
+    subpattern = SubPattern(state)
 
     while 1:
 
-	if str(source.next) in ("|", ")"):
+	if source.next in ("|", ")"):
 	    break # end of subpattern
 	this = source.get()
 	if this is None:
 	    break # end of pattern
 
+	if state.flags & SRE_FLAG_VERBOSE:
+	    # skip whitespace and comments
+	    if this in WHITESPACE:
+		continue
+	    if this == "#":
+		while 1:
+		    this = source.get()
+		    if this in (None, "\n"):
+			break
+		continue
+
 	if this and this[0] not in SPECIAL_CHARS:
 	    subpattern.append((LITERAL, this))
 
@@ -294,11 +342,11 @@ def _parse(source, pattern, flags=()):
 		if this == "]" and set != start:
 		    break
 		elif this and this[0] == "\\":
-		    code1 = _fixescape(this, 1)
+		    code1 = _class_escape(source, this)
 		elif this:
 		    code1 = LITERAL, this
 		else:
-		    raise SyntaxError, "unexpected end of regular expression"
+		    raise error, "unexpected end of regular expression"
 		if source.match("-"):
 		    # potential range
 		    this = source.get()
@@ -308,20 +356,20 @@ def _parse(source, pattern, flags=()):
 			break
 		    else:
 			if this[0] == "\\":
-			    code2 = _fixescape(this, 1)
+			    code2 = _class_escape(source, this)
 			else:
 			    code2 = LITERAL, this
 			if code1[0] != LITERAL or code2[0] != LITERAL:
-			    raise SyntaxError, "illegal range"
+			    raise error, "illegal range"
 			if len(code1[1]) != 1 or len(code2[1]) != 1:
-			    raise SyntaxError, "illegal range"
+			    raise error, "illegal range"
 			set.append((RANGE, (code1[1], code2[1])))
 		else:
 		    if code1[0] is IN:
 			code1 = code1[1][0]
 		    set.append(code1)
 
-	    # FIXME: <fl> move set optimization to support function
+	    # FIXME: <fl> move set optimization to compiler!
 	    if len(set)==1 and set[0][0] is LITERAL:
 		subpattern.append(set[0]) # optimization
 	    elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
@@ -335,43 +383,41 @@ def _parse(source, pattern, flags=()):
 	    if this == "?":
 		min, max = 0, 1
 	    elif this == "*":
-		min, max = 0, sys.maxint
+		min, max = 0, MAXREPEAT
 	    elif this == "+":
-		min, max = 1, sys.maxint
+		min, max = 1, MAXREPEAT
 	    elif this == "{":
-		min, max = 0, sys.maxint
+		min, max = 0, MAXREPEAT
 		lo = hi = ""
-		while str(source.next) in DIGITS:
+		while source.next in DIGITS:
 		    lo = lo + source.get()
 		if source.match(","):
-		    while str(source.next) in DIGITS:
+		    while source.next in DIGITS:
 			hi = hi + source.get()
 		else:
 		    hi = lo
 		if not source.match("}"):
-		    raise SyntaxError, "bogus range"
+		    raise error, "bogus range"
 		if lo:
 		    min = int(lo)
 		if hi:
 		    max = int(hi)
 		# FIXME: <fl> check that hi >= lo!
 	    else:
-		raise SyntaxError, "not supported"
+		raise error, "not supported"
 	    # figure out which item to repeat
-	    # FIXME: should back up to the right mark, right?
 	    if subpattern:
-		index = len(subpattern)-1
-		while subpattern[index][0] is MARK:
-		    index = index - 1
-		item = subpattern[index:index+1]
+		item = subpattern[-1:]
 	    else:
-		raise SyntaxError, "nothing to repeat"
+		raise error, "nothing to repeat"
 	    if source.match("?"):
-		subpattern[index] = (MIN_REPEAT, (min, max, item))
+		subpattern[-1] = (MIN_REPEAT, (min, max, item))
 	    else:
-		subpattern[index] = (MAX_REPEAT, (min, max, item))
+		subpattern[-1] = (MAX_REPEAT, (min, max, item))
+
 	elif this == ".":
 	    subpattern.append((ANY, None))
+
 	elif this == "(":
 	    group = 1
 	    name = None
@@ -379,28 +425,41 @@ def _parse(source, pattern, flags=()):
 		group = 0
 		# options
 		if source.match("P"):
-		    # named group: skip forward to end of name
+		    # python extensions
 		    if source.match("<"):
+			# named group: skip forward to end of name
 			name = ""
 			while 1:
 			    char = source.get()
-			    if char is None or char == ">":
+			    if char is None:
+				raise error, "unterminated name"
+			    if char == ">":
 				break
 			    name = name + char
 			group = 1
+			if not isname(name):
+			    raise error, "illegal character in group name"
+		    elif source.match("="):
+			# named backreference
+			raise error, "not yet implemented"
+		    else:
+			char = source.get()
+			if char is None:
+			    raise error, "unexpected end of pattern"
+			raise error, "unknown specifier: ?P%s" % char
 		elif source.match(":"):
 		    # non-capturing group
 		    group = 2
-		elif source.match_set("iI"):
-		    pattern.setflag("i")
-		elif source.match_set("lL"):
-		    pattern.setflag("l")
-		elif source.match_set("mM"):
-		    pattern.setflag("m")
-		elif source.match_set("sS"):
-		    pattern.setflag("s")
-		elif source.match_set("xX"):
-		    pattern.setflag("x")
+		elif source.match("#"):
+		    # comment
+		    while 1:
+			if source.next is None or source.next == ")":
+			    break
+			source.get()
+		else:
+		    # flags
+		    while FLAGS.has_key(source.next):
+			state.flags = state.flags | FLAGS[source.get()]
 	    if group:
 		# parse group contents
 		b = []
@@ -408,30 +467,25 @@ def _parse(source, pattern, flags=()):
 		    # anonymous group
 		    group = None
 		else:
-		    group = pattern.getgroup(name)
- 		if group:
- 		    subpattern.append((MARK, (group-1)*2))
+		    group = state.getgroup(name)
 		while 1:
-		    p = _parse(source, pattern, flags)
+		    p = _parse(source, state, flags)
 		    if source.match(")"):
 			if b:
 			    b.append(p)
-			    _branch(subpattern, b)
-			else:
-			    subpattern.append((SUBPATTERN, (group, p)))
+			    p = _branch(state, b)
+			subpattern.append((SUBPATTERN, (group, p)))
 			break
 		    elif source.match("|"):
 			b.append(p)
 		    else:
-			raise SyntaxError, "group not properly closed"
- 		if group:
- 		    subpattern.append((MARK, (group-1)*2+1))
+			raise error, "group not properly closed"
 	    else:
-		# FIXME: should this really be a while loop?
 		while 1:
 		    char = source.get()
 		    if char is None or char == ")":
 			break
+		    raise error, "unknown extension"
 
 	elif this == "^":
 	    subpattern.append((AT, AT_BEGINNING))
@@ -440,58 +494,93 @@ def _parse(source, pattern, flags=()):
 	    subpattern.append((AT, AT_END))
 
 	elif this and this[0] == "\\":
-	    code =_fixescape(this)
+	    code = _escape(source, this, state)
 	    subpattern.append(code)
 
 	else:
-	    raise SyntaxError, "parser error"
+	    raise error, "parser error"
 
     return subpattern
 
-def parse(source, flags=()):
-    s = Tokenizer(source)
-    g = Pattern()
+def parse(pattern, flags=0):
+    # parse 're' pattern into list of (opcode, argument) tuples
+    source = Tokenizer(pattern)
+    state = State()
     b = []
     while 1:
-	p = _parse(s, g, flags)
-	tail = s.get()
+	p = _parse(source, state, flags)
+	tail = source.get()
 	if tail == "|":
 	    b.append(p)
 	elif tail == ")":
-	    raise SyntaxError, "unbalanced parenthesis"
+	    raise error, "unbalanced parenthesis"
 	elif tail is None:
 	    if b:
 		b.append(p)
-		p = SubPattern(g)
-		_branch(p, b)
+		p = _branch(state, b)
 	    break
 	else:
-	    raise SyntaxError, "bogus characters at end of regular expression"
+	    raise error, "bogus characters at end of regular expression"
     return p
 
-if __name__ == "__main__":
-    from pprint import pprint
-    from testpatterns import PATTERNS
-    a = b = c = 0
-    for pattern, flags in PATTERNS:
-	if flags:
-	    continue
-	print "-"*68
-	try:
-	    p = parse(pattern)
-	    print repr(pattern), "->"
-	    pprint(p.data)
-	    import sre_compile
-	    try:
-		code = sre_compile.compile(p)
-		c = c + 1
-	    except:
-		pass
-	    a = a + 1
-	except SyntaxError, v:
-	    print "**", repr(pattern), v
-	b = b + 1
-    print "-"*68
-    print a, "of", b, "patterns successfully parsed"
-    print c, "of", b, "patterns successfully compiled"
+def parse_template(source, pattern):
+    # parse 're' replacement string into list of literals and
+    # group references
+    s = Tokenizer(source)
+    p = []
+    a = p.append
+    while 1:
+	this = s.get()
+	if this is None:
+	    break # end of replacement string
+	if this and this[0] == "\\":
+	    if this == "\\g":
+		name = ""
+		if s.match("<"):
+		    while 1:
+			char = s.get()
+			if char is None:
+			    raise error, "unterminated group name"
+			if char == ">":
+			    break
+			name = name + char
+		if not name:
+		    raise error, "bad group name"
+		try:
+		    index = int(name)
+		except ValueError:
+		    if not isname(name):
+			raise error, "illegal character in group name"
+		    try:
+			index = pattern.groupindex[name]
+		    except KeyError:
+			raise IndexError, "unknown group name"
+		a((MARK, index))
+	    elif len(this) > 1 and this[1] in DIGITS:
+		while s.next in DIGITS:
+		    this = this + s.get()
+		a((MARK, int(this[1:])))
+	    else:
+		try:
+		    a(ESCAPES[this])
+		except KeyError:
+		    for char in this:
+			a((LITERAL, char))
+	else:
+	    a((LITERAL, this))
+    return p
 
+def expand_template(template, match):
+    # FIXME: <fl> this is sooooo slow.  drop in the slicelist
+    # code instead
+    p = []
+    a = p.append
+    for c, s in template:
+	if c is LITERAL:
+	    a(s)
+	elif c is MARK:
+	    s = match.group(s)
+	    if s is None:
+		raise error, "empty group"
+	    a(s)
+    return match.string[:0].join(p)
author	Guido van Rossum <guido@python.org>	2000-06-29 19:35:29 (GMT)
committer	Guido van Rossum <guido@python.org>	2000-06-29 19:35:29 (GMT)
commit	3e06ab1d447442a56af739a906546d8d1998dfdc (patch)
tree	23145a16354b8b2e59932767c6dc9a5b56bb12cf /Lib/dos-8x3/sre_pars.py
parent	45cd9de2bb2faa96bb18eb11d20261d7d1b8c20e (diff)
download	cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.zip cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.tar.gz cpython-3e06ab1d447442a56af739a906546d8d1998dfdc.tar.bz2