Fredrik Lundh: new snapshot. Mostly reindented.

This one should work with unicode expressions, and compile a bit more silently.
author: Guido van Rossum <guido@python.org> 2000-04-10 17:10:48 (GMT)
committer: Guido van Rossum <guido@python.org> 2000-04-10 17:10:48 (GMT)
commit: b81e70ebdb28246e427249d386518bc03d08c959 (patch)
tree: 4f2ba435b4815d7ff7f4f6abab7505fb16f4c7c7 /Lib
parent: 5de435a245fd7158b1a8db1201154ad73fd4bf13 (diff)
download: cpython-b81e70ebdb28246e427249d386518bc03d08c959.zip
cpython-b81e70ebdb28246e427249d386518bc03d08c959.tar.gz
cpython-b81e70ebdb28246e427249d386518bc03d08c959.tar.bz2
3 files changed, 490 insertions, 485 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 600b237..8738061 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -26,7 +26,7 @@ from sre_constants import *
 # find an array type code that matches the engine's code size
 for WORDSIZE in "BHil":
     if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
-        break
+	break
 else:
     raise RuntimeError, "cannot find a useable array type"
 
@@ -34,18 +34,18 @@ else:
 
 class Code:
     def __init__(self):
-        self.data = []
+	self.data = []
     def __len__(self):
-        return len(self.data)
+	return len(self.data)
     def __getitem__(self, index):
-        return self.data[index]
+	return self.data[index]
     def __setitem__(self, index, code):
-        self.data[index] = code
+	self.data[index] = code
     def append(self, code):
-        self.data.append(code)
+	self.data.append(code)
     def todata(self):
-        # print self.data
-        return array.array(WORDSIZE, self.data).tostring()
+	# print self.data
+	return array.array(WORDSIZE, self.data).tostring()
 
 def _lower(literal):
     # return _sre._lower(literal) # FIXME
@@ -54,122 +54,122 @@ def _lower(literal):
 def _compile(code, pattern, flags):
     append = code.append
     for op, av in pattern:
-        if op is ANY:
-            if "s" in flags:
-                append(CODES[op]) # any character at all!
-            else:
-                append(CODES[NOT_LITERAL])
-                append(10)
-        elif op in (SUCCESS, FAILURE):
-            append(CODES[op])
-        elif op is AT:
-            append(CODES[op])
-            append(POSITIONS[av])
-        elif op is BRANCH:
-            append(CODES[op])
-            tail = []
-            for av in av[1]:
-                skip = len(code); append(0)
-                _compile(code, av, flags)
-                append(CODES[JUMP])
-                tail.append(len(code)); append(0)
-                code[skip] = len(code) - skip
-            append(0) # end of branch
-            for tail in tail:
-                code[tail] = len(code) - tail
-        elif op is CALL:
-            append(CODES[op])
-            skip = len(code); append(0)
-            _compile(code, av, flags)
-            append(CODES[SUCCESS])
-            code[skip] = len(code) - skip
-        elif op is CATEGORY: # not used by current parser
-            append(CODES[op])
-            append(CATEGORIES[av])
-        elif op is GROUP:
-            if "i" in flags:
-                append(CODES[MAP_IGNORE[op]])
-            else:
-                append(CODES[op])
-            append(av)
-        elif op is IN:
-            if "i" in flags:
-                append(CODES[MAP_IGNORE[op]])
-                def fixup(literal):
-                    return ord(_lower(literal))
-            else:
-                append(CODES[op])
-                fixup = ord
-            skip = len(code); append(0)
-            for op, av in av:
-                append(CODES[op])
-                if op is NEGATE:
-                    pass
-                elif op is LITERAL:
-                    append(fixup(av))
-                elif op is RANGE:
-                    append(fixup(av[0]))
-                    append(fixup(av[1]))
-                elif op is CATEGORY:
-                    append(CATEGORIES[av])
-                else:
-                    raise ValueError, "unsupported set operator"
-            append(CODES[FAILURE])
-            code[skip] = len(code) - skip
-        elif op in (LITERAL, NOT_LITERAL):
-            if "i" in flags:
-                append(CODES[MAP_IGNORE[op]])
-                append(ord(_lower(av)))
-            else:
-                append(CODES[op])
-                append(ord(av))
-        elif op is MARK:
-            append(CODES[op])
-            append(av)
-        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
-            lo, hi = av[2].getwidth()
-            if lo == 0:
-                raise SyntaxError, "cannot repeat zero-width items"
-            if lo == hi == 1 and op is MAX_REPEAT:
-                append(CODES[MAX_REPEAT_ONE])
-                skip = len(code); append(0)
-                append(av[0])
-                append(av[1])
-                _compile(code, av[2], flags)
-                append(CODES[SUCCESS])
-                code[skip] = len(code) - skip
-            else:
-                append(CODES[op])
-                skip = len(code); append(0)
-                append(av[0])
-                append(av[1])
-                _compile(code, av[2], flags)
-                if op is MIN_REPEAT:
-                    append(CODES[MIN_UNTIL])
-                else:
-                    # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
-                    append(CODES[MAX_UNTIL])
-                code[skip] = len(code) - skip
-        elif op is SUBPATTERN:
-##          group = av[0]
-##          if group:
-##              append(CODES[MARK])
-##              append((group-1)*2)
-            _compile(code, av[1], flags)
-##          if group:
-##              append(CODES[MARK])
-##              append((group-1)*2+1)
-        else:
-            raise ValueError, ("unsupported operand type", op)
+	if op is ANY:
+	    if "s" in flags:
+		append(CODES[op]) # any character at all!
+	    else:
+		append(CODES[NOT_LITERAL])
+		append(10)
+	elif op in (SUCCESS, FAILURE):
+	    append(CODES[op])
+	elif op is AT:
+	    append(CODES[op])
+	    append(POSITIONS[av])
+	elif op is BRANCH:
+	    append(CODES[op])
+	    tail = []
+	    for av in av[1]:
+		skip = len(code); append(0)
+		_compile(code, av, flags)
+		append(CODES[JUMP])
+		tail.append(len(code)); append(0)
+		code[skip] = len(code) - skip
+	    append(0) # end of branch
+	    for tail in tail:
+		code[tail] = len(code) - tail
+	elif op is CALL:
+	    append(CODES[op])
+	    skip = len(code); append(0)
+	    _compile(code, av, flags)
+	    append(CODES[SUCCESS])
+	    code[skip] = len(code) - skip
+	elif op is CATEGORY: # not used by current parser
+	    append(CODES[op])
+	    append(CATEGORIES[av])
+	elif op is GROUP:
+	    if "i" in flags:
+		append(CODES[MAP_IGNORE[op]])
+	    else:
+		append(CODES[op])
+	    append(av)
+	elif op is IN:
+	    if "i" in flags:
+		append(CODES[MAP_IGNORE[op]])
+		def fixup(literal):
+		    return ord(_lower(literal))
+	    else:
+		append(CODES[op])
+		fixup = ord
+	    skip = len(code); append(0)
+	    for op, av in av:
+		append(CODES[op])
+		if op is NEGATE:
+		    pass
+		elif op is LITERAL:
+		    append(fixup(av))
+		elif op is RANGE:
+		    append(fixup(av[0]))
+		    append(fixup(av[1]))
+		elif op is CATEGORY:
+		    append(CATEGORIES[av])
+		else:
+		    raise ValueError, "unsupported set operator"
+	    append(CODES[FAILURE])
+	    code[skip] = len(code) - skip
+	elif op in (LITERAL, NOT_LITERAL):
+	    if "i" in flags:
+		append(CODES[MAP_IGNORE[op]])
+		append(ord(_lower(av)))
+	    else:
+		append(CODES[op])
+		append(ord(av))
+	elif op is MARK:
+	    append(CODES[op])
+	    append(av)
+ 	elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
+	    lo, hi = av[2].getwidth()
+ 	    if lo == 0:
+ 		raise SyntaxError, "cannot repeat zero-width items"
+	    if lo == hi == 1 and op is MAX_REPEAT:
+		append(CODES[MAX_REPEAT_ONE])
+		skip = len(code); append(0)
+		append(av[0])
+		append(av[1])
+		_compile(code, av[2], flags)
+		append(CODES[SUCCESS])
+		code[skip] = len(code) - skip
+	    else:
+		append(CODES[op])
+		skip = len(code); append(0)
+		append(av[0])
+		append(av[1])
+		_compile(code, av[2], flags)
+		if op is MIN_REPEAT:
+		    append(CODES[MIN_UNTIL])
+		else:
+		    # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
+		    append(CODES[MAX_UNTIL])
+		code[skip] = len(code) - skip
+	elif op is SUBPATTERN:
+## 	    group = av[0]
+## 	    if group:
+## 		append(CODES[MARK])
+## 		append((group-1)*2)
+	    _compile(code, av[1], flags)
+## 	    if group:
+## 		append(CODES[MARK])
+## 		append((group-1)*2+1)
+	else:
+	    raise ValueError, ("unsupported operand type", op)
 
 def compile(p, flags=()):
     # convert pattern list to internal format
-    if type(p) is type(""):
-        import sre_parse
-        pattern = p
-        p = sre_parse.parse(p)
+    if type(p) in (type(""), type(u"")):
+	import sre_parse
+	pattern = p
+	p = sre_parse.parse(p)
     else:
-        pattern = None
+	pattern = None
     # print p.getwidth()
     # print p
     code = Code()
@@ -178,10 +178,10 @@ def compile(p, flags=()):
     # print list(code.data)
     data = code.todata()
     if 0: # debugging
-        print
-        print "-" * 68
-        import sre_disasm
-        sre_disasm.disasm(data)
-        print "-" * 68
+	print
+	print "-" * 68
+	import sre_disasm
+	sre_disasm.disasm(data)
+	print "-" * 68
     # print len(data), p.pattern.groups, len(p.pattern.groupdict)
     return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index f05c797..af88309 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -126,6 +126,6 @@ if __name__ == "__main__":
     f = open("sre_constants.h", "w")
     f.write("/* generated by sre_constants.py */\n")
     for k, v in items:
-        f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
+	f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
     f.close()
     print "done"
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index db4c500..8b68ea1 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -26,8 +26,11 @@ from sre_constants import *
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS  = "*+?{"
 
-OCTDIGITS = "01234567"
-HEXDIGITS = "0123456789abcdefABCDEF"
+# FIXME: string-in-tuple tests may explode if char is unicode :-(
+DIGITS = tuple(string.digits)
+
+OCTDIGITS = tuple("01234567")
+HEXDIGITS = tuple("0123456789abcdefABCDEF")
 
 ESCAPES = {
     "\\a": (LITERAL, chr(7)),
@@ -55,168 +58,168 @@ CATEGORIES = {
 class Pattern:
     # FIXME: <fl> rename class, and store flags in here too!
     def __init__(self):
-        self.flags = []
-        self.groups = 1
-        self.groupdict = {}
+	self.flags = []
+	self.groups = 1
+	self.groupdict = {}
     def getgroup(self, name=None):
-        gid = self.groups
-        self.groups = gid + 1
-        if name:
-            self.groupdict[name] = gid
-        return gid
+	gid = self.groups
+	self.groups = gid + 1
+	if name:
+	    self.groupdict[name] = gid
+	return gid
     def setflag(self, flag):
-        if flag not in self.flags:
-            self.flags.append(flag)
+	if flag not in self.flags: # record each flag once; "in" here would never add anything
+	    self.flags.append(flag)
 
 class SubPattern:
     # a subpattern, in intermediate form
     def __init__(self, pattern, data=None):
-        self.pattern = pattern
-        if not data:
-            data = []
-        self.data = data
-        self.flags = []
-        self.width = None
+	self.pattern = pattern
+	if not data:
+	    data = []
+	self.data = data
+	self.flags = []
+	self.width = None
     def __repr__(self):
-        return repr(self.data)
+	return repr(self.data)
     def __len__(self):
-        return len(self.data)
+	return len(self.data)
     def __delitem__(self, index):
-        del self.data[index]
+	del self.data[index]
     def __getitem__(self, index):
-        return self.data[index]
+	return self.data[index]
     def __setitem__(self, index, code):
-        self.data[index] = code
+	self.data[index] = code
     def __getslice__(self, start, stop):
-        return SubPattern(self.pattern, self.data[start:stop])
+	return SubPattern(self.pattern, self.data[start:stop])
     def insert(self, index, code):
-        self.data.insert(index, code)
+	self.data.insert(index, code)
     def append(self, code):
-        self.data.append(code)
+	self.data.append(code)
     def getwidth(self):
-        # determine the width (min, max) for this subpattern
-        if self.width:
-            return self.width
-        lo = hi = 0L
-        for op, av in self.data:
-            if op is BRANCH:
-                l = sys.maxint
-                h = 0
-                for av in av[1]:
-                    i, j = av.getwidth()
-                    l = min(l, i)
-                    h = min(h, j)
-                lo = lo + i
-                hi = hi + j
-            elif op is CALL:
-                i, j = av.getwidth()
-                lo = lo + i
-                hi = hi + j
-            elif op is SUBPATTERN:
-                i, j = av[1].getwidth()
-                lo = lo + i
-                hi = hi + j
-            elif op in (MIN_REPEAT, MAX_REPEAT):
-                i, j = av[2].getwidth()
-                lo = lo + i * av[0]
-                hi = hi + j * av[1]
-            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
-                lo = lo + 1
-                hi = hi + 1
-            elif op == SUCCESS:
-                break
-        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
-        return self.width
+	# determine the width (min, max) for this subpattern
+	if self.width:
+	    return self.width
+	lo = hi = 0L
+	for op, av in self.data:
+	    if op is BRANCH:
+		l = sys.maxint
+		h = 0
+		for av in av[1]:
+		    i, j = av.getwidth()
+		    l = min(l, i) # narrowest alternative bounds the minimum
+		    h = max(h, j) # widest alternative bounds the maximum
+		lo = lo + l # add the branch totals, not the last alternative's
+		hi = hi + h
+	    elif op is CALL:
+		i, j = av.getwidth()
+		lo = lo + i
+		hi = hi + j
+	    elif op is SUBPATTERN:
+		i, j = av[1].getwidth()
+		lo = lo + i
+		hi = hi + j
+	    elif op in (MIN_REPEAT, MAX_REPEAT):
+		i, j = av[2].getwidth()
+		lo = lo + i * av[0]
+		hi = hi + j * av[1]
+	    elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
+		lo = lo + 1
+		hi = hi + 1
+	    elif op == SUCCESS:
+		break
+	self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
+	return self.width
     def set(self, flag):
-        if not flag in self.flags:
-            self.flags.append(flag)
+	if not flag in self.flags:
+	    self.flags.append(flag)
     def reset(self, flag):
-        if flag in self.flags:
-            self.flags.remove(flag)
+	if flag in self.flags:
+	    self.flags.remove(flag)
 
 class Tokenizer:
     def __init__(self, string):
-        self.string = list(string)
-        self.next = self.__next()
+	self.string = list(string)
+	self.next = self.__next()
     def __next(self):
-        if not self.string:
-            return None
-        char = self.string[0]
-        if char[0] == "\\":
-            try:
-                c = self.string[1]
-            except IndexError:
-                raise SyntaxError, "bogus escape"
-            char = char + c
-            try:
-                if c == "x":
-                    # hexadecimal constant
-                    for i in xrange(2, sys.maxint):
-                        c = self.string[i]
-                        if c not in HEXDIGITS:
-                            break
-                        char = char + c
-                elif c in string.digits:
-                    # decimal (or octal) number
-                    for i in xrange(2, sys.maxint):
-                        c = self.string[i]
-                        # FIXME: if larger than current number of
-                        # groups, interpret as an octal number 
-                        if c not in string.digits:
-                            break
-                        char = char + c
-            except IndexError:
-                pass # use what we've got this far
-        del self.string[0:len(char)]
-        return char
+	if not self.string:
+	    return None
+	char = self.string[0]
+	if char[0] == "\\":
+	    try:
+		c = self.string[1]
+	    except IndexError:
+		raise SyntaxError, "bogus escape"
+	    char = char + c
+	    try:
+		if c == "x":
+		    # hexadecimal constant
+		    for i in xrange(2, sys.maxint):
+			c = self.string[i]
+			if str(c) not in HEXDIGITS:
+			    break
+			char = char + c
+		elif str(c) in DIGITS:
+		    # decimal (or octal) number
+		    for i in xrange(2, sys.maxint):
+			c = self.string[i]
+			# FIXME: if larger than current number of
+			# groups, interpret as an octal number 
+			if str(c) not in DIGITS:
+			    break
+			char = char + c
+	    except IndexError:
+		pass # use what we've got this far
+	del self.string[0:len(char)]
+	return char
     def match(self, char):
-        if char == self.next:
-            self.next = self.__next()
-            return 1
-        return 0
+	if char == self.next:
+	    self.next = self.__next()
+	    return 1
+	return 0
     def match_set(self, set):
-        if self.next in set:
-            self.next = self.__next()
-            return 1
-        return 0
+	if self.next and self.next in set:
+	    self.next = self.__next()
+	    return 1
+	return 0
     def get(self):
-        this = self.next
-        self.next = self.__next()
-        return this
+	this = self.next
+	self.next = self.__next()
+	return this
 
 def _fixescape(escape, character_class=0):
     # convert escape to (type, value)
     if character_class:
-        # inside a character class, we'll look in the character
-        # escapes dictionary first
-        code = ESCAPES.get(escape)
-        if code:
-            return code
-        code = CATEGORIES.get(escape)
+	# inside a character class, we'll look in the character
+	# escapes dictionary first
+	code = ESCAPES.get(escape)
+	if code:
+	    return code
+	code = CATEGORIES.get(escape)
     else:
-        code = CATEGORIES.get(escape)
-        if code:
-            return code
-        code = ESCAPES.get(escape)
+	code = CATEGORIES.get(escape)
+	if code:
+	    return code
+	code = ESCAPES.get(escape)
     if code:
-        return code
+	return code
     if not character_class:
-        try:
-            group = int(escape[1:])
-            # FIXME: only valid if group <= current number of groups
-            return GROUP, group
-        except ValueError:
-            pass
+	try:
+	    group = int(escape[1:])
+	    # FIXME: only valid if group <= current number of groups
+	    return GROUP, group
+	except ValueError:
+	    pass
     try:
-        if escape[1:2] == "x":
-            escape = escape[2:]
-            return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff)
-        elif escape[1:2] in string.digits:
-            return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff)
-        elif len(escape) == 2:
-            return LITERAL, escape[1]
+	if escape[1:2] == "x":
+	    escape = escape[2:]
+	    return LITERAL, chr(int(escape[-2:], 16) & 0xff)
+	elif str(escape[1:2]) in DIGITS:
+	    return LITERAL, chr(int(escape[1:], 8) & 0xff)
+	elif len(escape) == 2:
+	    return LITERAL, escape[1]
     except ValueError:
-        pass
+	pass
     raise SyntaxError, "bogus escape: %s" % repr(escape)
 
 def _branch(subpattern, items):
@@ -226,35 +229,35 @@ def _branch(subpattern, items):
 
     # check if all items share a common prefix
     while 1:
-        prefix = None
-        for item in items:
-            if not item:
-                break
-            if prefix is None:
-                prefix = item[0]
-            elif item[0] != prefix:
-                break
-        else:
-            # all subitems start with a common "prefix".
-            # move it out of the branch
-            for item in items:
-                del item[0]
-            subpattern.append(prefix)
-            continue # check next one
-        break
+	prefix = None
+	for item in items:
+	    if not item:
+		break
+	    if prefix is None:
+		prefix = item[0]
+	    elif item[0] != prefix:
+		break
+	else:
+	    # all subitems start with a common "prefix".
+	    # move it out of the branch
+	    for item in items:
+		del item[0]
+	    subpattern.append(prefix)
+	    continue # check next one
+	break
 
     # check if the branch can be replaced by a character set
     for item in items:
-        if len(item) != 1 or item[0][0] != LITERAL:
-            break
+	if len(item) != 1 or item[0][0] != LITERAL:
+	    break
     else:
-        # we can store this as a character set instead of a
-        # branch (FIXME: use a range if possible)
-        set = []
-        for item in items:
-            set.append(item[0])
-        subpattern.append((IN, set))
-        return
+	# we can store this as a character set instead of a
+	# branch (FIXME: use a range if possible)
+	set = []
+	for item in items:
+	    set.append(item[0])
+	subpattern.append((IN, set))
+	return
 
     subpattern.append((BRANCH, (None, items)))
 
@@ -268,178 +271,180 @@ def _parse(source, pattern, flags=()):
 
     while 1:
 
-        if source.next in ("|", ")"):
-            break # end of subpattern
-        this = source.get()
-        if this is None:
-            break # end of pattern
-
-        if this and this[0] not in SPECIAL_CHARS:
-            subpattern.append((LITERAL, this))
-
-        elif this == "[":
-            # character set
-            set = []
-##          if source.match(":"):
-##              pass # handle character classes
-            if source.match("^"):
-                set.append((NEGATE, None))
-            # check remaining characters
-            start = set[:]
-            while 1:
-                this = source.get()
-                if this == "]" and set != start:
-                    break
-                elif this and this[0] == "\\":
-                    code1 = _fixescape(this, 1)
-                elif this:
-                    code1 = LITERAL, this
-                else:
-                    raise SyntaxError, "unexpected end of regular expression"
-                if source.match("-"):
-                    # potential range
-                    this = source.get()
-                    if this == "]":
-                        set.append(code1)
-                        set.append((LITERAL, "-"))
-                        break
-                    else:
-                        if this[0] == "\\":
-                            code2 = _fixescape(this, 1)
-                        else:
-                            code2 = LITERAL, this
-                        if code1[0] != LITERAL or code2[0] != LITERAL:
-                            raise SyntaxError, "illegal range"
-                        if len(code1[1]) != 1 or len(code2[1]) != 1:
-                            raise SyntaxError, "illegal range"
-                        set.append((RANGE, (code1[1], code2[1])))
-                else:
-                    if code1[0] is IN:
-                        code1 = code1[1][0]
-                    set.append(code1)
-
-            # FIXME: <fl> move set optimization to support function
-            if len(set)==1 and set[0][0] is LITERAL:
-                subpattern.append(set[0]) # optimization
-            elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
-                subpattern.append((NOT_LITERAL, set[1][1])) # optimization
-            else:
-                # FIXME: <fl> add charmap optimization
-                subpattern.append((IN, set))
-
-        elif this and this[0] in REPEAT_CHARS:
-            # repeat previous item
-            if this == "?":
-                min, max = 0, 1
-            elif this == "*":
-                min, max = 0, sys.maxint
-            elif this == "+":
-                min, max = 1, sys.maxint
-            elif this == "{":
-                min, max = 0, sys.maxint
-                lo = hi = ""
-                while source.next in string.digits:
-                    lo = lo + source.get()
-                if source.match(","):
-                    while source.next in string.digits:
-                        hi = hi + source.get()
-                else:
-                    hi = lo
-                if not source.match("}"):
-                    raise SyntaxError, "bogus range"
-                if lo:
-                    min = int(lo)
-                if hi:
-                    max = int(hi)
-                # FIXME: <fl> check that hi >= lo!
-            else:
-                raise SyntaxError, "not supported"
-            # figure out which item to repeat
-            # FIXME: should back up to the right mark, right?
-            if subpattern:
-                index = len(subpattern)-1
-                while subpattern[index][0] is MARK:
-                    index = index - 1
-                item = subpattern[index:index+1]
-            else:
-                raise SyntaxError, "nothing to repeat"
-            if source.match("?"):
-                subpattern[index] = (MIN_REPEAT, (min, max, item))
-            else:
-                subpattern[index] = (MAX_REPEAT, (min, max, item))
-        elif this == ".":
-            subpattern.append((ANY, None))
-        elif this == "(":
-            group = 1
-            name = None
-            if source.match("?"):
-                group = 0
-                # options
-                if source.match("P"):
-                    # named group: skip forward to end of name
-                    if source.match("<"):
-                        name = ""
-                        while 1:
-                            char = source.get()
-                            if char in (">", None):
-                                break
-                            name = name + char
-                        group = 1
-                elif source.match(":"):
-                    # non-capturing group
-                    group = 2
-                elif source.match_set("iI"):
-                    pattern.setflag("i")
-                elif source.match_set("lL"):
-                    pattern.setflag("l")
-                elif source.match_set("mM"):
-                    pattern.setflag("m")
-                elif source.match_set("sS"):
-                    pattern.setflag("s")
-                elif source.match_set("xX"):
-                    pattern.setflag("x")
-            if group:
-                # parse group contents
-                b = []
-                if group == 2:
-                    # anonymous group
-                    group = None
-                else:
-                    group = pattern.getgroup(name)
-                if group:
-                    subpattern.append((MARK, (group-1)*2))
-                while 1:
-                    p = _parse(source, pattern, flags)
-                    if source.match(")"):
-                        if b:
-                            b.append(p)
-                            _branch(subpattern, b)
-                        else:
-                            subpattern.append((SUBPATTERN, (group, p)))
-                        break
-                    elif source.match("|"):
-                        b.append(p)
-                    else:
-                        raise SyntaxError, "group not properly closed"
-                if group:
-                    subpattern.append((MARK, (group-1)*2+1))
-            else:
-                # FIXME: should this really be a while loop?
-                while source.get() not in (")", None):
-                    pass
-
-        elif this == "^":
-            subpattern.append((AT, AT_BEGINNING))
-
-        elif this == "$":
-            subpattern.append((AT, AT_END))
-
-        elif this and this[0] == "\\":
-            code =_fixescape(this)
-            subpattern.append(code)
-
-        else:
-            raise SyntaxError, "parser error"
+	if str(source.next) in ("|", ")"):
+	    break # end of subpattern
+	this = source.get()
+	if this is None:
+	    break # end of pattern
+
+	if this and this[0] not in SPECIAL_CHARS:
+	    subpattern.append((LITERAL, this))
+
+	elif this == "[":
+	    # character set
+	    set = []
+## 	    if source.match(":"):
+## 		pass # handle character classes
+	    if source.match("^"):
+		set.append((NEGATE, None))
+	    # check remaining characters
+	    start = set[:]
+	    while 1:
+		this = source.get()
+		if this == "]" and set != start:
+		    break
+		elif this and this[0] == "\\":
+		    code1 = _fixescape(this, 1)
+		elif this:
+		    code1 = LITERAL, this
+		else:
+		    raise SyntaxError, "unexpected end of regular expression"
+		if source.match("-"):
+		    # potential range
+		    this = source.get()
+		    if this == "]":
+			set.append(code1)
+			set.append((LITERAL, "-"))
+			break
+		    else:
+			if this[0] == "\\":
+			    code2 = _fixescape(this, 1)
+			else:
+			    code2 = LITERAL, this
+			if code1[0] != LITERAL or code2[0] != LITERAL:
+			    raise SyntaxError, "illegal range"
+			if len(code1[1]) != 1 or len(code2[1]) != 1:
+			    raise SyntaxError, "illegal range"
+			set.append((RANGE, (code1[1], code2[1])))
+		else:
+		    if code1[0] is IN:
+			code1 = code1[1][0]
+		    set.append(code1)
+
+	    # FIXME: <fl> move set optimization to support function
+	    if len(set)==1 and set[0][0] is LITERAL:
+		subpattern.append(set[0]) # optimization
+	    elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
+		subpattern.append((NOT_LITERAL, set[1][1])) # optimization
+	    else:
+		# FIXME: <fl> add charmap optimization
+		subpattern.append((IN, set))
+
+	elif this and this[0] in REPEAT_CHARS:
+	    # repeat previous item
+	    if this == "?":
+		min, max = 0, 1
+	    elif this == "*":
+		min, max = 0, sys.maxint
+	    elif this == "+":
+		min, max = 1, sys.maxint
+	    elif this == "{":
+		min, max = 0, sys.maxint
+		lo = hi = ""
+		while str(source.next) in DIGITS:
+		    lo = lo + source.get()
+		if source.match(","):
+		    while str(source.next) in DIGITS:
+			hi = hi + source.get()
+		else:
+		    hi = lo
+		if not source.match("}"):
+		    raise SyntaxError, "bogus range"
+		if lo:
+		    min = int(lo)
+		if hi:
+		    max = int(hi)
+		# FIXME: <fl> check that hi >= lo!
+	    else:
+		raise SyntaxError, "not supported"
+	    # figure out which item to repeat
+	    # FIXME: should back up to the right mark, right?
+	    if subpattern:
+		index = len(subpattern)-1
+		while subpattern[index][0] is MARK:
+		    index = index - 1
+		item = subpattern[index:index+1]
+	    else:
+		raise SyntaxError, "nothing to repeat"
+	    if source.match("?"):
+		subpattern[index] = (MIN_REPEAT, (min, max, item))
+	    else:
+		subpattern[index] = (MAX_REPEAT, (min, max, item))
+	elif this == ".":
+	    subpattern.append((ANY, None))
+	elif this == "(":
+	    group = 1
+	    name = None
+	    if source.match("?"):
+		group = 0
+		# options
+		if source.match("P"):
+		    # named group: skip forward to end of name
+		    if source.match("<"):
+			name = ""
+			while 1:
+			    char = source.get()
+			    if char is None or char == ">":
+				break
+			    name = name + char
+			group = 1
+		elif source.match(":"):
+		    # non-capturing group
+		    group = 2
+		elif source.match_set("iI"):
+		    pattern.setflag("i")
+		elif source.match_set("lL"):
+		    pattern.setflag("l")
+		elif source.match_set("mM"):
+		    pattern.setflag("m")
+		elif source.match_set("sS"):
+		    pattern.setflag("s")
+		elif source.match_set("xX"):
+		    pattern.setflag("x")
+	    if group:
+		# parse group contents
+		b = []
+		if group == 2:
+		    # anonymous group
+		    group = None
+		else:
+		    group = pattern.getgroup(name)
+ 		if group:
+ 		    subpattern.append((MARK, (group-1)*2))
+		while 1:
+		    p = _parse(source, pattern, flags)
+		    if source.match(")"):
+			if b:
+			    b.append(p)
+			    _branch(subpattern, b)
+			else:
+			    subpattern.append((SUBPATTERN, (group, p)))
+			break
+		    elif source.match("|"):
+			b.append(p)
+		    else:
+			raise SyntaxError, "group not properly closed"
+ 		if group:
+ 		    subpattern.append((MARK, (group-1)*2+1))
+	    else:
+		# FIXME: should this really be a while loop?
+		while 1:
+		    char = source.get()
+		    if char is None or char == ")":
+			break
+
+	elif this == "^":
+	    subpattern.append((AT, AT_BEGINNING))
+
+	elif this == "$":
+	    subpattern.append((AT, AT_END))
+
+	elif this and this[0] == "\\":
+	    code =_fixescape(this)
+	    subpattern.append(code)
+
+	else:
+	    raise SyntaxError, "parser error"
 
     return subpattern
 
@@ -448,20 +453,20 @@ def parse(source, flags=()):
     g = Pattern()
     b = []
     while 1:
-        p = _parse(s, g, flags)
-        tail = s.get()
-        if tail == "|":
-            b.append(p)
-        elif tail == ")":
-            raise SyntaxError, "unbalanced parenthesis"
-        elif tail is None:
-            if b:
-                b.append(p)
-                p = SubPattern(g)
-                _branch(p, b)
-            break
-        else:
-            raise SyntaxError, "bogus characters at end of regular expression"
+	p = _parse(s, g, flags)
+	tail = s.get()
+	if tail == "|":
+	    b.append(p)
+	elif tail == ")":
+	    raise SyntaxError, "unbalanced parenthesis"
+	elif tail is None:
+	    if b:
+		b.append(p)
+		p = SubPattern(g)
+		_branch(p, b)
+	    break
+	else:
+	    raise SyntaxError, "bogus characters at end of regular expression"
     return p
 
 if __name__ == "__main__":
@@ -469,23 +474,23 @@ if __name__ == "__main__":
     from testpatterns import PATTERNS
     a = b = c = 0
     for pattern, flags in PATTERNS:
-        if flags:
-            continue
-        print "-"*68
-        try:
-            p = parse(pattern)
-            print repr(pattern), "->"
-            pprint(p.data)
-            import sre_compile
-            try:
-                code = sre_compile.compile(p)
-                c = c + 1
-            except:
-                pass
-            a = a + 1
-        except SyntaxError, v:
-            print "**", repr(pattern), v
-        b = b + 1
+	if flags:
+	    continue
+	print "-"*68
+	try:
+	    p = parse(pattern)
+	    print repr(pattern), "->"
+	    pprint(p.data)
+	    import sre_compile
+	    try:
+		code = sre_compile.compile(p)
+		c = c + 1
+	    except:
+		pass
+	    a = a + 1
+	except SyntaxError, v:
+	    print "**", repr(pattern), v
+	b = b + 1
     print "-"*68
     print a, "of", b, "patterns successfully parsed"
     print c, "of", b, "patterns successfully compiled"
author	Guido van Rossum <guido@python.org>	2000-04-10 17:10:48 (GMT)
committer	Guido van Rossum <guido@python.org>	2000-04-10 17:10:48 (GMT)
commit	b81e70ebdb28246e427249d386518bc03d08c959 (patch)
tree	4f2ba435b4815d7ff7f4f6abab7505fb16f4c7c7 /Lib
parent	5de435a245fd7158b1a8db1201154ad73fd4bf13 (diff)
download	cpython-b81e70ebdb28246e427249d386518bc03d08c959.zip cpython-b81e70ebdb28246e427249d386518bc03d08c959.tar.gz cpython-b81e70ebdb28246e427249d386518bc03d08c959.tar.bz2