summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
Diffstat (limited to 'Lib')
-rw-r--r--Lib/sre.py33
-rw-r--r--Lib/sre_compile.py14
-rw-r--r--Lib/sre_constants.py2
-rw-r--r--Lib/sre_parse.py155
-rw-r--r--Lib/test/output/test_sre2
5 files changed, 118 insertions, 88 deletions
diff --git a/Lib/sre.py b/Lib/sre.py
index fbf44a5..6dd1df9 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -10,9 +10,13 @@
# other compatibility work.
#
+# FIXME: change all FIXME's to XXX ;-)
+
import sre_compile
import sre_parse
+import string
+
# flags
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
L = LOCALE = sre_compile.SRE_FLAG_LOCALE
@@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0):
def compile(pattern, flags=0):
return _compile(pattern, flags)
+def purge():
+ _cache.clear()
+
def template(pattern, flags=0):
return _compile(pattern, flags|T)
@@ -65,7 +72,7 @@ def escape(pattern):
s[i] = "\\000"
else:
s[i] = "\\" + c
- return pattern[:0].join(s)
+ return _join(s, pattern)
# --------------------------------------------------------------------
# internals
@@ -73,10 +80,14 @@ def escape(pattern):
_cache = {}
_MAXCACHE = 100
+def _join(seq, sep):
+ # internal: join into string having the same type as sep
+ return string.join(seq, sep[:0])
+
def _compile(pattern, flags=0):
# internal: compile pattern
tp = type(pattern)
- if tp not in (type(""), type(u"")):
+ if tp not in sre_compile.STRING_TYPES:
return pattern
key = (tp, pattern, flags)
try:
@@ -89,10 +100,6 @@ def _compile(pattern, flags=0):
_cache[key] = p
return p
-def purge():
- # clear pattern cache
- _cache.clear()
-
def _sub(pattern, template, string, count=0):
# internal: pattern.sub implementation hook
return _subn(pattern, template, string, count)[0]
@@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0):
i = e
n = n + 1
append(string[i:])
- return string[:0].join(s), n
+ return _join(s, string[:0]), n
def _split(pattern, string, maxsplit=0):
# internal: pattern.split implementation hook
@@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
class Scanner:
def __init__(self, lexicon):
+ from sre_constants import BRANCH, SUBPATTERN, INDEX
self.lexicon = lexicon
+ # combine phrases into a compound pattern
p = []
+ s = sre_parse.Pattern()
for phrase, action in lexicon:
- p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
- self.scanner = _compile("|".join(p))
+ p.append(sre_parse.SubPattern(s, [
+ (SUBPATTERN, (None, sre_parse.parse(phrase))),
+ (INDEX, len(p))
+ ]))
+ p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
+ s.groups = len(p)
+ self.scanner = sre_compile.compile(p)
def scan(self, string):
result = []
append = result.append
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index d8d01ea..ef26e1c 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -197,10 +197,11 @@ def _compile(code, pattern, flags):
else:
emit(ATCODES[av])
elif op is BRANCH:
- emit(OPCODES[op])
tail = []
for av in av[1]:
+ emit(OPCODES[op])
skip = len(code); emit(0)
+ emit(MAXCODE) # save mark
_compile(code, av, flags)
emit(OPCODES[JUMP])
tail.append(len(code)); emit(0)
@@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags):
emit(OPCODES[FAILURE])
code[skip] = len(code) - skip
+STRING_TYPES = [type("")]
+
+try:
+ STRING_TYPES.append(type(unicode("")))
+except NameError:
+ pass
+
def compile(p, flags=0):
# internal: convert pattern list to internal format
# compile, as necessary
- if type(p) in (type(""), type(u"")):
+ if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
@@ -308,6 +316,8 @@ def compile(p, flags=0):
code.append(OPCODES[SUCCESS])
+ # print code
+
# FIXME: <fl> get rid of this limitation!
assert p.pattern.groups <= 100,\
"sorry, but this version only supports 100 named groups"
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index c2cecdf..ef32c32 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -172,7 +172,7 @@ CH_UNICODE = {
# flags
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive
-SRE_FLAG_LOCALE = 4 # honor system locale
+SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode locale
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 053335a..1b56352 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -25,12 +25,12 @@ CHARMASK = 0xff
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
-DIGITS = tuple(string.digits)
+DIGITS = tuple("012345689")
OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")
-WHITESPACE = tuple(string.whitespace)
+WHITESPACE = tuple(" \t\n\r\v\f")
ESCAPES = {
r"\a": (LITERAL, 7),
@@ -68,7 +68,8 @@ FLAGS = {
"u": SRE_FLAG_UNICODE,
}
-class State:
+class Pattern:
+ # master pattern object. keeps track of global attributes
def __init__(self):
self.flags = 0
self.groups = 1
@@ -88,6 +89,33 @@ class SubPattern:
data = []
self.data = data
self.width = None
+ def dump(self, level=0):
+ nl = 1
+ for op, av in self.data:
+ print level*" " + op,; nl = 0
+ if op == "in":
+ # member sublanguage
+ print; nl = 1
+ for op, a in av:
+ print (level+1)*" " + op, a
+ elif op == "branch":
+ print; nl = 1
+ i = 0
+ for a in av[1]:
+ if i > 0:
+ print level*" " + "or"
+ a.dump(level+1); nl = 1
+ i = i + 1
+ elif type(av) in (type(()), type([])):
+ for a in av:
+ if isinstance(a, SubPattern):
+ if not nl: print
+ a.dump(level+1); nl = 1
+ else:
+ print a, ; nl = 0
+ else:
+ print av, ; nl = 0
+ if not nl: print
def __repr__(self):
return repr(self.data)
def __len__(self):
@@ -255,10 +283,25 @@ def _escape(source, escape, state):
pass
raise error, "bogus escape: %s" % repr(escape)
-def _branch(pattern, items):
- # form a branch operator from a set of items
+def _parse_sub(source, state, nested=1):
+ # parse an alternation: a|b|c
- subpattern = SubPattern(pattern)
+ items = []
+ while 1:
+ items.append(_parse(source, state))
+ if source.match("|"):
+ continue
+ if not nested:
+ break
+ if not source.next or source.match(")"):
+ break
+ else:
+ raise error, "pattern not properly closed"
+
+ if len(items) == 1:
+ return items[0]
+
+ subpattern = SubPattern(state)
# check if all items share a common prefix
while 1:
@@ -285,7 +328,7 @@ def _branch(pattern, items):
break
else:
# we can store this as a character set instead of a
- # branch (FIXME: use a range if possible)
+ # branch (the compiler may optimize this even more)
set = []
for item in items:
set.append(item[0])
@@ -296,8 +339,7 @@ def _branch(pattern, items):
return subpattern
def _parse(source, state):
-
- # parse regular expression pattern into an operator list.
+ # parse a simple pattern
subpattern = SubPattern(state)
@@ -451,22 +493,6 @@ def _parse(source, state):
if gid is None:
raise error, "unknown group name"
subpattern.append((GROUPREF, gid))
- elif source.match("#"):
- index = ""
- while 1:
- char = source.get()
- if char is None:
- raise error, "unterminated index"
- if char == ")":
- break
- index = index + char
- try:
- index = int(index)
- if index < 0 or index > MAXREPEAT:
- raise ValueError
- except ValueError:
- raise error, "illegal index"
- subpattern.append((INDEX, index))
continue
else:
char = source.get()
@@ -491,48 +517,27 @@ def _parse(source, state):
raise error, "syntax error"
dir = -1 # lookbehind
char = source.get()
- b = []
- while 1:
- p = _parse(source, state)
- if source.next == ")":
- if b:
- b.append(p)
- p = _branch(state, b)
- if char == "=":
- subpattern.append((ASSERT, (dir, p)))
- else:
- subpattern.append((ASSERT_NOT, (dir, p)))
- break
- elif source.match("|"):
- b.append(p)
- else:
- raise error, "pattern not properly closed"
+ p = _parse_sub(source, state)
+ if char == "=":
+ subpattern.append((ASSERT, (dir, p)))
+ else:
+ subpattern.append((ASSERT_NOT, (dir, p)))
+ continue
else:
# flags
while FLAGS.has_key(source.next):
state.flags = state.flags | FLAGS[source.get()]
if group:
# parse group contents
- b = []
if group == 2:
# anonymous group
group = None
else:
group = state.getgroup(name)
- while 1:
- p = _parse(source, state)
- if group is not None:
- p.append((INDEX, group))
- if source.match(")"):
- if b:
- b.append(p)
- p = _branch(state, b)
- subpattern.append((SUBPATTERN, (group, p)))
- break
- elif source.match("|"):
- b.append(p)
- else:
- raise error, "group not properly closed"
+ p = _parse_sub(source, state)
+ subpattern.append((SUBPATTERN, (group, p)))
+ if group is not None:
+ p.append((INDEX, group))
else:
while 1:
char = source.get()
@@ -555,26 +560,24 @@ def _parse(source, state):
return subpattern
-def parse(pattern, flags=0):
+def parse(str, flags=0):
# parse 're' pattern into list of (opcode, argument) tuples
- source = Tokenizer(pattern)
- state = State()
- state.flags = flags
- b = []
- while 1:
- p = _parse(source, state)
- tail = source.get()
- if tail == "|":
- b.append(p)
- elif tail == ")":
- raise error, "unbalanced parenthesis"
- elif tail is None:
- if b:
- b.append(p)
- p = _branch(state, b)
- break
- else:
- raise error, "bogus characters at end of regular expression"
+
+ source = Tokenizer(str)
+
+ pattern = Pattern()
+ pattern.flags = flags
+
+ p = _parse_sub(source, pattern, 0)
+
+ tail = source.get()
+ if tail == ")":
+ raise error, "unbalanced parenthesis"
+ elif tail:
+ raise error, "bogus characters at end of regular expression"
+
+ # p.dump()
+
return p
def parse_template(source, pattern):
@@ -656,4 +659,4 @@ def expand_template(template, match):
if s is None:
raise error, "empty group"
a(s)
- return sep.join(p)
+ return string.join(p, sep)
diff --git a/Lib/test/output/test_sre b/Lib/test/output/test_sre
index d949f25..05bcead 100644
--- a/Lib/test/output/test_sre
+++ b/Lib/test/output/test_sre
@@ -1,4 +1,6 @@
test_sre
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
=== Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
+=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None'
+=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None'
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')