Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!
author: Guido van Rossum <guido@python.org> 2000-03-31 14:58:54 (GMT)
committer: Guido van Rossum <guido@python.org> 2000-03-31 14:58:54 (GMT)
commit: 7627c0de6968471996ce05aab200115d56efa1d5 (patch)
tree: 9c4191aa38d9b7428d11c1b6267b95e5add41afa /Lib/sre_parse.py
parent: 7a5b796322e2bf58fa8afe78bbaccfcc9492d178 (diff)
download: cpython-7627c0de6968471996ce05aab200115d56efa1d5.zip
cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.gz
cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.bz2
1 files changed, 492 insertions, 0 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
new file mode 100644
index 0000000..a563de3
--- /dev/null
+++ b/Lib/sre_parse.py
@@ -0,0 +1,492 @@
+#
+# Secret Labs' Regular Expression Engine
+# $Id$
+#
+# convert re-style regular expression to SRE template.  the current
+# implementation is somewhat incomplete, and not very fast.  should
+# definitely be rewritten before Python 1.6 goes beta.
+#
+# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
+#
+# This code can only be used for 1.6 alpha testing.  All other use
+# require explicit permission from Secret Labs AB.
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+#
+
+# FIXME: comments marked with the FIXME tag are open issues.  all such
+# issues should be closed before the final beta.
+
+import string, sys
+
+from sre_constants import *
+
+SPECIAL_CHARS = ".\\[{()*+?^$|"
+REPEAT_CHARS  = "*+?{"
+
+OCTDIGITS = "01234567"
+HEXDIGITS = "0123456789abcdefABCDEF"
+
+ESCAPES = {
+    "\\a": (LITERAL, chr(7)),
+    "\\b": (LITERAL, chr(8)),
+    "\\f": (LITERAL, chr(12)),
+    "\\n": (LITERAL, chr(10)),
+    "\\r": (LITERAL, chr(13)),
+    "\\t": (LITERAL, chr(9)),
+    "\\v": (LITERAL, chr(11))
+}
+
+CATEGORIES = {
+    "\\A": (AT, AT_BEGINNING), # start of string
+    "\\b": (AT, AT_BOUNDARY),
+    "\\B": (AT, AT_NON_BOUNDARY),
+    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
+    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
+    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
+    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
+    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
+    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
+    "\\Z": (AT, AT_END), # end of string
+}
+
+class Pattern:
+    # FIXME: <fl> rename class, and store flags in here too!
+    def __init__(self):
+	self.flags = []
+	self.groups = 1
+	self.groupdict = {}
+    def getgroup(self, name=None):
+	gid = self.groups
+	self.groups = gid + 1
+	if name:
+	    self.groupdict[name] = gid
+	return gid
+    def setflag(self, flag):
+	if flag not in self.flags:
+	    self.flags.append(flag)
+
+class SubPattern:
+    # a subpattern, in intermediate form
+    def __init__(self, pattern, data=None):
+	self.pattern = pattern
+	if not data:
+	    data = []
+	self.data = data
+	self.flags = []
+	self.width = None
+    def __repr__(self):
+	return repr(self.data)
+    def __len__(self):
+	return len(self.data)
+    def __delitem__(self, index):
+	del self.data[index]
+    def __getitem__(self, index):
+	return self.data[index]
+    def __setitem__(self, index, code):
+	self.data[index] = code
+    def __getslice__(self, start, stop):
+	return SubPattern(self.pattern, self.data[start:stop])
+    def insert(self, index, code):
+	self.data.insert(index, code)
+    def append(self, code):
+	self.data.append(code)
+    def getwidth(self):
+	# determine the width (min, max) for this subpattern
+	if self.width:
+	    return self.width
+	lo = hi = 0L
+	for op, av in self.data:
+	    if op is BRANCH:
+		l = sys.maxint
+		h = 0
+		for av in av[1]:
+		    i, j = av.getwidth()
+		    l = min(l, i)
+		    h = min(h, j)
+		lo = lo + i
+		hi = hi + j
+	    elif op is CALL:
+		i, j = av.getwidth()
+		lo = lo + i
+		hi = hi + j
+	    elif op is SUBPATTERN:
+		i, j = av[1].getwidth()
+		lo = lo + i
+		hi = hi + j
+	    elif op in (MIN_REPEAT, MAX_REPEAT):
+		i, j = av[2].getwidth()
+		lo = lo + i * av[0]
+		hi = hi + j * av[1]
+	    elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
+		lo = lo + 1
+		hi = hi + 1
+	    elif op == SUCCESS:
+		break
+	self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
+	return self.width
+    def set(self, flag):
+	if not flag in self.flags:
+	    self.flags.append(flag)
+    def reset(self, flag):
+	if flag in self.flags:
+	    self.flags.remove(flag)
+
+class Tokenizer:
+    def __init__(self, string):
+	self.string = list(string)
+	self.next = self.__next()
+    def __next(self):
+	if not self.string:
+	    return None
+	char = self.string[0]
+	if char[0] == "\\":
+	    try:
+		c = self.string[1]
+	    except IndexError:
+		raise SyntaxError, "bogus escape"
+	    char = char + c
+	    try:
+		if c == "x":
+		    # hexadecimal constant
+		    for i in xrange(2, sys.maxint):
+			c = self.string[i]
+			if c not in HEXDIGITS:
+			    break
+			char = char + c
+		elif c in string.digits:
+		    # decimal (or octal) number
+		    for i in xrange(2, sys.maxint):
+			c = self.string[i]
+			# FIXME: if larger than current number of
+			# groups, interpret as an octal number 
+			if c not in string.digits:
+			    break
+			char = char + c
+	    except IndexError:
+		pass # use what we've got this far
+	del self.string[0:len(char)]
+	return char
+    def match(self, char):
+	if char == self.next:
+	    self.next = self.__next()
+	    return 1
+	return 0
+    def match_set(self, set):
+	if self.next in set:
+	    self.next = self.__next()
+	    return 1
+	return 0
+    def get(self):
+	this = self.next
+	self.next = self.__next()
+	return this
+
+def _fixescape(escape, character_class=0):
+    # convert escape to (type, value)
+    if character_class:
+	# inside a character class, we'll look in the character
+	# escapes dictionary first
+	code = ESCAPES.get(escape)
+	if code:
+	    return code
+	code = CATEGORIES.get(escape)
+    else:
+	code = CATEGORIES.get(escape)
+	if code:
+	    return code
+	code = ESCAPES.get(escape)
+    if code:
+	return code
+    if not character_class:
+	try:
+	    group = int(escape[1:])
+	    # FIXME: only valid if group <= current number of groups
+	    return GROUP, group
+	except ValueError:
+	    pass
+    try:
+	if escape[1:2] == "x":
+	    escape = escape[2:]
+	    return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff)
+	elif escape[1:2] in string.digits:
+	    return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff)
+	elif len(escape) == 2:
+	    return LITERAL, escape[1]
+    except ValueError:
+	pass
+    raise SyntaxError, "bogus escape: %s" % repr(escape)
+
+def _branch(subpattern, items):
+
+    # form a branch operator from a set of items (FIXME: move this
+    # optimization to the compiler module!)
+
+    # check if all items share a common prefix
+    while 1:
+	prefix = None
+	for item in items:
+	    if not item:
+		break
+	    if prefix is None:
+		prefix = item[0]
+	    elif item[0] != prefix:
+		break
+	else:
+	    # all subitems start with a common "prefix".
+	    # move it out of the branch
+	    for item in items:
+		del item[0]
+	    subpattern.append(prefix)
+	    continue # check next one
+	break
+
+    # check if the branch can be replaced by a character set
+    for item in items:
+	if len(item) != 1 or item[0][0] != LITERAL:
+	    break
+    else:
+	# we can store this as a character set instead of a
+	# branch (FIXME: use a range if possible)
+	set = []
+	for item in items:
+	    set.append(item[0])
+	subpattern.append((IN, set))
+	return
+
+    subpattern.append((BRANCH, (None, items)))
+
+def _parse(source, pattern, flags=()):
+
+    # parse regular expression pattern into an operator list.
+
+    subpattern = SubPattern(pattern)
+
+    this = None
+
+    while 1:
+
+	if source.next in ("|", ")"):
+	    break # end of subpattern
+	this = source.get()
+	if this is None:
+	    break # end of pattern
+
+	if this and this[0] not in SPECIAL_CHARS:
+	    subpattern.append((LITERAL, this))
+
+	elif this == "[":
+	    # character set
+	    set = []
+## 	    if source.match(":"):
+## 		pass # handle character classes
+	    if source.match("^"):
+		set.append((NEGATE, None))
+	    # check remaining characters
+	    start = set[:]
+	    while 1:
+		this = source.get()
+		if this == "]" and set != start:
+		    break
+		elif this and this[0] == "\\":
+		    code1 = _fixescape(this, 1)
+		elif this:
+		    code1 = LITERAL, this
+		else:
+		    raise SyntaxError, "unexpected end of regular expression"
+		if source.match("-"):
+		    # potential range
+		    this = source.get()
+		    if this == "]":
+			set.append(code1)
+			set.append((LITERAL, "-"))
+			break
+		    else:
+			if this[0] == "\\":
+			    code2 = _fixescape(this, 1)
+			else:
+			    code2 = LITERAL, this
+			if code1[0] != LITERAL or code2[0] != LITERAL:
+			    raise SyntaxError, "illegal range"
+			if len(code1[1]) != 1 or len(code2[1]) != 1:
+			    raise SyntaxError, "illegal range"
+			set.append((RANGE, (code1[1], code2[1])))
+		else:
+		    if code1[0] is IN:
+			code1 = code1[1][0]
+		    set.append(code1)
+
+	    # FIXME: <fl> move set optimization to support function
+	    if len(set)==1 and set[0][0] is LITERAL:
+		subpattern.append(set[0]) # optimization
+	    elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
+		subpattern.append((NOT_LITERAL, set[1][1])) # optimization
+	    else:
+		# FIXME: <fl> add charmap optimization
+		subpattern.append((IN, set))
+
+	elif this and this[0] in REPEAT_CHARS:
+	    # repeat previous item
+	    if this == "?":
+		min, max = 0, 1
+	    elif this == "*":
+		min, max = 0, sys.maxint
+	    elif this == "+":
+		min, max = 1, sys.maxint
+	    elif this == "{":
+		min, max = 0, sys.maxint
+		lo = hi = ""
+		while source.next in string.digits:
+		    lo = lo + source.get()
+		if source.match(","):
+		    while source.next in string.digits:
+			hi = hi + source.get()
+		else:
+		    hi = lo
+		if not source.match("}"):
+		    raise SyntaxError, "bogus range"
+		if lo:
+		    min = int(lo)
+		if hi:
+		    max = int(hi)
+		# FIXME: <fl> check that hi >= lo!
+	    else:
+		raise SyntaxError, "not supported"
+	    # figure out which item to repeat
+	    # FIXME: should back up to the right mark, right?
+	    if subpattern:
+		index = len(subpattern)-1
+		while subpattern[index][0] is MARK:
+		    index = index - 1
+		item = subpattern[index:index+1]
+	    else:
+		raise SyntaxError, "nothing to repeat"
+	    if source.match("?"):
+		subpattern[index] = (MIN_REPEAT, (min, max, item))
+	    else:
+		subpattern[index] = (MAX_REPEAT, (min, max, item))
+	elif this == ".":
+	    subpattern.append((ANY, None))
+	elif this == "(":
+	    group = 1
+	    name = None
+	    if source.match("?"):
+		group = 0
+		# options
+		if source.match("P"):
+		    # named group: skip forward to end of name
+		    if source.match("<"):
+			name = ""
+			while 1:
+			    char = source.get()
+			    if char in (">", None):
+				break
+			    name = name + char
+			group = 1
+		elif source.match(":"):
+		    # non-capturing group
+		    group = 2
+		elif source.match_set("iI"):
+		    pattern.setflag("i")
+		elif source.match_set("lL"):
+		    pattern.setflag("l")
+		elif source.match_set("mM"):
+		    pattern.setflag("m")
+		elif source.match_set("sS"):
+		    pattern.setflag("s")
+		elif source.match_set("xX"):
+		    pattern.setflag("x")
+	    if group:
+		# parse group contents
+		b = []
+		if group == 2:
+		    # anonymous group
+		    group = None
+		else:
+		    group = pattern.getgroup(name)
+ 		if group:
+ 		    subpattern.append((MARK, (group-1)*2))
+		while 1:
+		    p = _parse(source, pattern, flags)
+		    if source.match(")"):
+			if b:
+			    b.append(p)
+			    _branch(subpattern, b)
+			else:
+			    subpattern.append((SUBPATTERN, (group, p)))
+			break
+		    elif source.match("|"):
+			b.append(p)
+		    else:
+			raise SyntaxError, "group not properly closed"
+ 		if group:
+ 		    subpattern.append((MARK, (group-1)*2+1))
+	    else:
+		# FIXME: should this really be a while loop?
+ 		while source.get() not in (")", None):
+ 		    pass
+
+	elif this == "^":
+	    subpattern.append((AT, AT_BEGINNING))
+
+	elif this == "$":
+	    subpattern.append((AT, AT_END))
+
+	elif this and this[0] == "\\":
+	    code =_fixescape(this)
+	    subpattern.append(code)
+
+	else:
+	    raise SyntaxError, "parser error"
+
+    return subpattern
+
+def parse(source, flags=()):
+    s = Tokenizer(source)
+    g = Pattern()
+    b = []
+    while 1:
+	p = _parse(s, g, flags)
+	tail = s.get()
+	if tail == "|":
+	    b.append(p)
+	elif tail == ")":
+	    raise SyntaxError, "unbalanced parenthesis"
+	elif tail is None:
+	    if b:
+		b.append(p)
+		p = SubPattern(g)
+		_branch(p, b)
+	    break
+	else:
+	    raise SyntaxError, "bogus characters at end of regular expression"
+    return p
+
+if __name__ == "__main__":
+    from pprint import pprint
+    from testpatterns import PATTERNS
+    a = b = c = 0
+    for pattern, flags in PATTERNS:
+	if flags:
+	    continue
+	print "-"*68
+	try:
+	    p = parse(pattern)
+	    print repr(pattern), "->"
+	    pprint(p.data)
+	    import sre_compile
+	    try:
+		code = sre_compile.compile(p)
+		c = c + 1
+	    except:
+		pass
+	    a = a + 1
+	except SyntaxError, v:
+	    print "**", repr(pattern), v
+	b = b + 1
+    print "-"*68
+    print a, "of", b, "patterns successfully parsed"
+    print c, "of", b, "patterns successfully compiled"
+
author	Guido van Rossum <guido@python.org>	2000-03-31 14:58:54 (GMT)
committer	Guido van Rossum <guido@python.org>	2000-03-31 14:58:54 (GMT)
commit	7627c0de6968471996ce05aab200115d56efa1d5 (patch)
tree	9c4191aa38d9b7428d11c1b6267b95e5add41afa /Lib/sre_parse.py
parent	7a5b796322e2bf58fa8afe78bbaccfcc9492d178 (diff)
download	cpython-7627c0de6968471996ce05aab200115d56efa1d5.zip cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.gz cpython-7627c0de6968471996ce05aab200115d56efa1d5.tar.bz2