-- SRE 0.9.6 sync. this includes:

+ added "regs" attribute + fixed "pos" and "endpos" attributes + reset "lastindex" and "lastgroup" in scanner methods + removed (?P#id) syntax; the "lastindex" and "lastgroup" attributes are now always set + removed string module dependencies in sre_parse + better debugging support in sre_parse + various tweaks to build under 1.5.2
author: Fredrik Lundh <fredrik@pythonware.com> 2000-07-23 21:46:17 (GMT)
committer: Fredrik Lundh <fredrik@pythonware.com> 2000-07-23 21:46:17 (GMT)
commit: 8a3ebf8ca87a7d2989148d7c218974ab4235ca5d (patch)
tree: 1694bf24acdb9defbf770ad34e1181b84bf17428 /Lib
parent: 4f1b2081e9dde910048ef679f5afe252023a4031 (diff)
download: cpython-8a3ebf8ca87a7d2989148d7c218974ab4235ca5d.zip
cpython-8a3ebf8ca87a7d2989148d7c218974ab4235ca5d.tar.gz
cpython-8a3ebf8ca87a7d2989148d7c218974ab4235ca5d.tar.bz2
5 files changed, 118 insertions, 88 deletions
diff --git a/Lib/sre.py b/Lib/sre.py
index fbf44a5..6dd1df9 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -10,9 +10,13 @@
 # other compatibility work.
 #
 
+# FIXME: change all FIXME's to XXX ;-)
+
 import sre_compile
 import sre_parse
 
+import string
+
 # flags
 I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
 L = LOCALE = sre_compile.SRE_FLAG_LOCALE
@@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0):
 def compile(pattern, flags=0):
     return _compile(pattern, flags)
 
+def purge():
+    _cache.clear()
+
 def template(pattern, flags=0):
     return _compile(pattern, flags|T)
 
@@ -65,7 +72,7 @@ def escape(pattern):
                 s[i] = "\\000"
             else:
                 s[i] = "\\" + c
-    return pattern[:0].join(s)
+    return _join(s, pattern)
 
 # --------------------------------------------------------------------
 # internals
@@ -73,10 +80,14 @@ def escape(pattern):
 _cache = {}
 _MAXCACHE = 100
 
+def _join(seq, sep):
+    # internal: join into string having the same type as sep
+    return string.join(seq, sep[:0])
+
 def _compile(pattern, flags=0):
     # internal: compile pattern
     tp = type(pattern)
-    if tp not in (type(""), type(u"")):
+    if tp not in sre_compile.STRING_TYPES:
         return pattern
     key = (tp, pattern, flags)
     try:
@@ -89,10 +100,6 @@ def _compile(pattern, flags=0):
     _cache[key] = p
     return p
 
-def purge():
-    # clear pattern cache
-    _cache.clear()
-
 def _sub(pattern, template, string, count=0):
     # internal: pattern.sub implementation hook
     return _subn(pattern, template, string, count)[0]
@@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0):
         i = e
         n = n + 1
     append(string[i:])
-    return string[:0].join(s), n
+    return _join(s, string[:0]), n
 
 def _split(pattern, string, maxsplit=0):
     # internal: pattern.split implementation hook
@@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
 
 class Scanner:
     def __init__(self, lexicon):
+        from sre_constants import BRANCH, SUBPATTERN, INDEX
         self.lexicon = lexicon
+        # combine phrases into a compound pattern
         p = []
+        s = sre_parse.Pattern()
         for phrase, action in lexicon:
-            p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
-        self.scanner = _compile("|".join(p))
+            p.append(sre_parse.SubPattern(s, [
+                (SUBPATTERN, (None, sre_parse.parse(phrase))),
+                (INDEX, len(p))
+                ]))
+        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
+        s.groups = len(p)
+        self.scanner = sre_compile.compile(p)
     def scan(self, string):
         result = []
         append = result.append
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index d8d01ea..ef26e1c 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -197,10 +197,11 @@ def _compile(code, pattern, flags):
             else:
                 emit(ATCODES[av])
         elif op is BRANCH:
-            emit(OPCODES[op])
             tail = []
             for av in av[1]:
+                emit(OPCODES[op])
                 skip = len(code); emit(0)
+                emit(MAXCODE) # save mark
                 _compile(code, av, flags)
                 emit(OPCODES[JUMP])
                 tail.append(len(code)); emit(0)
@@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags):
         emit(OPCODES[FAILURE])
     code[skip] = len(code) - skip
 
+STRING_TYPES = [type("")]
+
+try:
+    STRING_TYPES.append(type(unicode("")))
+except NameError:
+    pass
+
 def compile(p, flags=0):
     # internal: convert pattern list to internal format
 
     # compile, as necessary
-    if type(p) in (type(""), type(u"")):
+    if type(p) in STRING_TYPES:
         import sre_parse
         pattern = p
         p = sre_parse.parse(p, flags)
@@ -308,6 +316,8 @@ def compile(p, flags=0):
 
     code.append(OPCODES[SUCCESS])
 
+    # print code
+
     # FIXME: <fl> get rid of this limitation!
     assert p.pattern.groups <= 100,\
            "sorry, but this version only supports 100 named groups"
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index c2cecdf..ef32c32 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -172,7 +172,7 @@ CH_UNICODE = {
 # flags
 SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
 SRE_FLAG_IGNORECASE = 2 # case insensitive
-SRE_FLAG_LOCALE = 4 # honor system locale
+SRE_FLAG_LOCALE = 4 # honour system locale
 SRE_FLAG_MULTILINE = 8 # treat target as multiline string
 SRE_FLAG_DOTALL = 16 # treat target as a single string
 SRE_FLAG_UNICODE = 32 # use unicode locale
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 053335a..1b56352 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -25,12 +25,12 @@ CHARMASK = 0xff
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS  = "*+?{"
 
-DIGITS = tuple(string.digits)
+DIGITS = tuple("012345689")
 
 OCTDIGITS = tuple("01234567")
 HEXDIGITS = tuple("0123456789abcdefABCDEF")
 
-WHITESPACE = tuple(string.whitespace)
+WHITESPACE = tuple(" \t\n\r\v\f")
 
 ESCAPES = {
     r"\a": (LITERAL, 7),
@@ -68,7 +68,8 @@ FLAGS = {
     "u": SRE_FLAG_UNICODE,
 }
 
-class State:
+class Pattern:
+    # master pattern object.  keeps track of global attributes
     def __init__(self):
         self.flags = 0
         self.groups = 1
@@ -88,6 +89,33 @@ class SubPattern:
             data = []
         self.data = data
         self.width = None
+    def dump(self, level=0):
+        nl = 1
+        for op, av in self.data:
+            print level*"  " + op,; nl = 0
+            if op == "in":
+                # member sublanguage
+                print; nl = 1
+                for op, a in av:
+                    print (level+1)*"  " + op, a
+            elif op == "branch":
+                print; nl = 1
+                i = 0
+                for a in av[1]:
+                    if i > 0:
+                        print level*"  " + "or"
+                    a.dump(level+1); nl = 1
+                    i = i + 1
+            elif type(av) in (type(()), type([])):
+                for a in av:
+                    if isinstance(a, SubPattern):
+                        if not nl: print
+                        a.dump(level+1); nl = 1
+                    else:
+                        print a, ; nl = 0
+            else:
+                print av, ; nl = 0
+            if not nl: print
     def __repr__(self):
         return repr(self.data)
     def __len__(self):
@@ -255,10 +283,25 @@ def _escape(source, escape, state):
         pass
     raise error, "bogus escape: %s" % repr(escape)
 
-def _branch(pattern, items):
-    # form a branch operator from a set of items
+def _parse_sub(source, state, nested=1):
+    # parse an alternation: a|b|c
 
-    subpattern = SubPattern(pattern)
+    items = []
+    while 1:
+        items.append(_parse(source, state))
+        if source.match("|"):
+            continue
+        if not nested:
+            break
+        if not source.next or source.match(")"):
+            break
+        else:
+            raise error, "pattern not properly closed"
+
+    if len(items) == 1:
+        return items[0]
+
+    subpattern = SubPattern(state)
 
     # check if all items share a common prefix
     while 1:
@@ -285,7 +328,7 @@ def _branch(pattern, items):
             break
     else:
         # we can store this as a character set instead of a
-        # branch (FIXME: use a range if possible)
+        # branch (the compiler may optimize this even more)
         set = []
         for item in items:
             set.append(item[0])
@@ -296,8 +339,7 @@ def _branch(pattern, items):
     return subpattern
 
 def _parse(source, state):
-
-    # parse regular expression pattern into an operator list.
+    # parse a simple pattern
 
     subpattern = SubPattern(state)
 
@@ -451,22 +493,6 @@ def _parse(source, state):
                         if gid is None:
                             raise error, "unknown group name"
                         subpattern.append((GROUPREF, gid))
-                    elif source.match("#"):
-                        index = ""
-                        while 1:
-                            char = source.get()
-                            if char is None:
-                                raise error, "unterminated index"
-                            if char == ")":
-                                break
-                            index = index + char
-                        try:
-                            index = int(index)
-                            if index < 0 or index > MAXREPEAT:
-                                raise ValueError
-                        except ValueError:
-                            raise error, "illegal index"
-                        subpattern.append((INDEX, index))
                         continue
                     else:
                         char = source.get()
@@ -491,48 +517,27 @@ def _parse(source, state):
                             raise error, "syntax error"
                         dir = -1 # lookbehind
                         char = source.get()
-                    b = []
-                    while 1:
-                        p = _parse(source, state)
-                        if source.next == ")":
-                            if b:
-                                b.append(p)
-                                p = _branch(state, b)
-                            if char == "=":
-                                subpattern.append((ASSERT, (dir, p)))
-                            else:
-                                subpattern.append((ASSERT_NOT, (dir, p)))
-                            break
-                        elif source.match("|"):
-                            b.append(p)
-                        else:
-                            raise error, "pattern not properly closed"
+                    p = _parse_sub(source, state)
+                    if char == "=":
+                        subpattern.append((ASSERT, (dir, p)))
+                    else:
+                        subpattern.append((ASSERT_NOT, (dir, p)))
+                    continue
                 else:
                     # flags
                     while FLAGS.has_key(source.next):
                         state.flags = state.flags | FLAGS[source.get()]
             if group:
                 # parse group contents
-                b = []
                 if group == 2:
                     # anonymous group
                     group = None
                 else:
                     group = state.getgroup(name)
-                while 1:
-                    p = _parse(source, state)
-                    if group is not None:
-                        p.append((INDEX, group))
-                    if source.match(")"):
-                        if b:
-                            b.append(p)
-                            p = _branch(state, b)
-                        subpattern.append((SUBPATTERN, (group, p)))
-                        break
-                    elif source.match("|"):
-                        b.append(p)
-                    else:
-                        raise error, "group not properly closed"
+                p = _parse_sub(source, state)
+                subpattern.append((SUBPATTERN, (group, p)))
+                if group is not None:
+                    p.append((INDEX, group))
             else:
                 while 1:
                     char = source.get()
@@ -555,26 +560,24 @@ def _parse(source, state):
 
     return subpattern
 
-def parse(pattern, flags=0):
+def parse(str, flags=0):
     # parse 're' pattern into list of (opcode, argument) tuples
-    source = Tokenizer(pattern)
-    state = State()
-    state.flags = flags
-    b = []
-    while 1:
-        p = _parse(source, state)
-        tail = source.get()
-        if tail == "|":
-            b.append(p)
-        elif tail == ")":
-            raise error, "unbalanced parenthesis"
-        elif tail is None:
-            if b:
-                b.append(p)
-                p = _branch(state, b)
-            break
-        else:
-            raise error, "bogus characters at end of regular expression"
+
+    source = Tokenizer(str)
+
+    pattern = Pattern()
+    pattern.flags = flags
+
+    p = _parse_sub(source, pattern, 0)
+
+    tail = source.get()
+    if tail == ")":
+        raise error, "unbalanced parenthesis"
+    elif tail:
+        raise error, "bogus characters at end of regular expression"
+
+    # p.dump()
+
     return p
 
 def parse_template(source, pattern):
@@ -656,4 +659,4 @@ def expand_template(template, match):
             if s is None:
                 raise error, "empty group"
             a(s)
-    return sep.join(p)
+    return string.join(p, sep)
diff --git a/Lib/test/output/test_sre b/Lib/test/output/test_sre
index d949f25..05bcead 100644
--- a/Lib/test/output/test_sre
+++ b/Lib/test/output/test_sre
@@ -1,4 +1,6 @@
 test_sre
 === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
 === Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
+=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None'
+=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None'
 === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
author	Fredrik Lundh <fredrik@pythonware.com>	2000-07-23 21:46:17 (GMT)
committer	Fredrik Lundh <fredrik@pythonware.com>	2000-07-23 21:46:17 (GMT)
commit	8a3ebf8ca87a7d2989148d7c218974ab4235ca5d (patch)
tree	1694bf24acdb9defbf770ad34e1181b84bf17428 /Lib
parent	4f1b2081e9dde910048ef679f5afe252023a4031 (diff)
download	cpython-8a3ebf8ca87a7d2989148d7c218974ab4235ca5d.zip cpython-8a3ebf8ca87a7d2989148d7c218974ab4235ca5d.tar.gz cpython-8a3ebf8ca87a7d2989148d7c218974ab4235ca5d.tar.bz2