summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2017-05-14 05:32:33 (GMT)
committerGitHub <noreply@github.com>2017-05-14 05:32:33 (GMT)
commit821a9d146bc04a1bc1a9807962990a1f59d692b8 (patch)
treee981ba61ef49d7bcd83474cefe76ee3f18a6dc3f
parentcbddf58c797f850a5b06f317a4bb7ab69c6e9715 (diff)
downloadcpython-821a9d146bc04a1bc1a9807962990a1f59d692b8.zip
cpython-821a9d146bc04a1bc1a9807962990a1f59d692b8.tar.gz
cpython-821a9d146bc04a1bc1a9807962990a1f59d692b8.tar.bz2
bpo-30340: Enhanced regular expressions optimization. (#1542)
This increased the performance of matching some patterns up to 25 times.
-rw-r--r--Lib/sre_compile.py15
-rw-r--r--Lib/sre_parse.py105
-rw-r--r--Lib/test/test_re.py26
-rw-r--r--Misc/NEWS3
4 files changed, 95 insertions, 54 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index cebecb9..aeb89bc 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -20,6 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
+_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
# Sets of lowercase characters which have the same uppercase.
_equivalences = (
@@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,))
- elif _simple(av) and op is not REPEAT:
+ if _simple(av[2]):
if op is MAX_REPEAT:
emit(REPEAT_ONE)
else:
@@ -404,10 +405,14 @@ def _bytes_to_codes(b):
assert len(a) * a.itemsize == len(b)
return a.tolist()
-def _simple(av):
- # check if av is a "simple" operator
- lo, hi = av[2].getwidth()
- return lo == hi == 1 and av[2][0][0] != SUBPATTERN
+def _simple(p):
+ # check if this subpattern is a "simple" operator
+ if len(p) != 1:
+ return False
+ op, av = p[0]
+ if op is SUBPATTERN:
+ return av[0] is None and _simple(av[-1])
+ return op in _UNIT_CODES
def _generate_overlap_table(prefix):
"""
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index d8d1bd5..f72408f 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -114,6 +114,7 @@ class SubPattern:
data = []
self.data = data
self.width = None
+
def dump(self, level=0):
nl = True
seqtypes = (tuple, list)
@@ -404,6 +405,15 @@ def _escape(source, escape, state):
pass
raise source.error("bad escape %s" % escape, len(escape))
+def _uniq(items):
+ if len(set(items)) == len(items):
+ return items
+ newitems = []
+ for item in items:
+ if item not in newitems:
+ newitems.append(item)
+ return newitems
+
def _parse_sub(source, state, verbose, nested=True):
# parse an alternation: a|b|c
@@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
return items[0]
subpattern = SubPattern(state)
- subpatternappend = subpattern.append
# check if all items share a common prefix
while True:
@@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
# move it out of the branch
for item in items:
del item[0]
- subpatternappend(prefix)
+ subpattern.append(prefix)
continue # check next one
break
# check if the branch can be replaced by a character set
+ set = []
for item in items:
- if len(item) != 1 or item[0][0] is not LITERAL:
+ if len(item) != 1:
+ break
+ op, av = item[0]
+ if op is LITERAL:
+ set.append((op, av))
+ elif op is IN and av[0][0] is not NEGATE:
+ set.extend(av)
+ else:
break
else:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
- subpatternappend((IN, [item[0] for item in items]))
+ subpattern.append((IN, _uniq(set)))
return subpattern
subpattern.append((BRANCH, (None, items)))
return subpattern
-def _parse_sub_cond(source, state, condgroup, verbose):
- item_yes = _parse(source, state, verbose)
- if source.match("|"):
- item_no = _parse(source, state, verbose)
- if source.next == "|":
- raise source.error("conditional backref with more than two branches")
- else:
- item_no = None
- subpattern = SubPattern(state)
- subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
- return subpattern
-
def _parse(source, state, verbose, first=False):
# parse a simple pattern
subpattern = SubPattern(state)
@@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
setappend = set.append
## if sourcematch(":"):
## pass # handle character classes
- if sourcematch("^"):
- setappend((NEGATE, None))
+ negate = sourcematch("^")
# check remaining characters
- start = set[:]
while True:
this = sourceget()
if this is None:
raise source.error("unterminated character set",
source.tell() - here)
- if this == "]" and set != start:
+ if this == "]" and set:
break
elif this[0] == "\\":
code1 = _class_escape(source, this)
@@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
code1 = code1[1][0]
setappend(code1)
+ set = _uniq(set)
# XXX: <fl> should move set optimization to compiler!
- if _len(set)==1 and set[0][0] is LITERAL:
- subpatternappend(set[0]) # optimization
- elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
- subpatternappend((NOT_LITERAL, set[1][1])) # optimization
+ if _len(set) == 1 and set[0][0] is LITERAL:
+ # optimization
+ if negate:
+ subpatternappend((NOT_LITERAL, set[0][1]))
+ else:
+ subpatternappend(set[0])
else:
- # XXX: <fl> should add charmap optimization here
+ if negate:
+ set.insert(0, (NEGATE, None))
+ # charmap optimization can't be added here because
+ # global flags still are not known
subpatternappend((IN, set))
elif this in REPEAT_CHARS:
@@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
if source.next == "}":
subpatternappend((LITERAL, _ord(this)))
continue
+
min, max = 0, MAXREPEAT
lo = hi = ""
while source.next in DIGITS:
@@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
subpatternappend((LITERAL, _ord(this)))
source.seek(here)
continue
+
if lo:
min = int(lo)
if min >= MAXREPEAT:
@@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
item = subpattern[-1:]
else:
item = None
- if not item or (_len(item) == 1 and item[0][0] is AT):
+ if not item or item[0][0] is AT:
raise source.error("nothing to repeat",
source.tell() - here + len(this))
if item[0][0] in _REPEATCODES:
raise source.error("multiple repeat",
source.tell() - here + len(this))
+ if item[0][0] is SUBPATTERN:
+ group, add_flags, del_flags, p = item[0][1]
+ if group is None and not add_flags and not del_flags:
+ item = p
if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
@@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
start = source.tell() - 1
group = True
name = None
- condgroup = None
add_flags = 0
del_flags = 0
if sourcematch("?"):
@@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid))
continue
+
else:
char = sourceget()
if char is None:
@@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
if sourceget() == ")":
break
continue
+
elif char in "=!<":
# lookahead assertions
dir = 1
@@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
else:
subpatternappend((ASSERT_NOT, (dir, p)))
continue
+
elif char == "(":
# conditional backreference group
condname = source.getuntil(")")
- group = None
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
@@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
msg = "invalid group reference %d" % condgroup
raise source.error(msg, len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
+ item_yes = _parse(source, state, verbose)
+ if source.match("|"):
+ item_no = _parse(source, state, verbose)
+ if source.next == "|":
+ raise source.error("conditional backref with more than two branches")
+ else:
+ item_no = None
+ if not source.match(")"):
+ raise source.error("missing ), unterminated subpattern",
+ source.tell() - start)
+ subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
+ continue
+
elif char in FLAGS or char == "-":
# flags
flags = _parse_flags(source, state, char)
@@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
raise Verbose
continue
+
add_flags, del_flags = flags
group = None
else:
@@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1) from None
- if condgroup:
- p = _parse_sub_cond(source, state, condgroup, verbose)
- else:
- sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
- not (del_flags & SRE_FLAG_VERBOSE))
- p = _parse_sub(source, state, sub_verbose)
+ sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
+ not (del_flags & SRE_FLAG_VERBOSE))
+ p = _parse_sub(source, state, sub_verbose)
if not source.match(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
@@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
subpatternappend((AT, AT_BEGINNING))
elif this == "$":
- subpattern.append((AT, AT_END))
+ subpatternappend((AT, AT_END))
else:
raise AssertionError("unsupported special character %r" % (char,))
+ # unpack non-capturing groups
+ for i in range(len(subpattern))[::-1]:
+ op, av = subpattern[i]
+ if op is SUBPATTERN:
+ group, add_flags, del_flags, p = av
+ if group is None and not add_flags and not del_flags:
+ subpattern[i: i+1] = p
+
return subpattern
def _parse_flags(source, state, char):
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 4d71eea..5d36b54 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1695,20 +1695,18 @@ class ReTests(unittest.TestCase):
dump = '''\
SUBPATTERN 1 0 0
LITERAL 46
-SUBPATTERN None 0 0
- BRANCH
- IN
- LITERAL 99
- LITERAL 104
- OR
- LITERAL 112
- LITERAL 121
-SUBPATTERN None 0 0
- GROUPREF_EXISTS 1
- AT AT_END
- ELSE
- LITERAL 58
- LITERAL 32
+BRANCH
+ IN
+ LITERAL 99
+ LITERAL 104
+OR
+ LITERAL 112
+ LITERAL 121
+GROUPREF_EXISTS 1
+ AT AT_END
+ELSE
+ LITERAL 58
+ LITERAL 32
'''
self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing
diff --git a/Misc/NEWS b/Misc/NEWS
index 73cd82c..e6b4ced 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -326,6 +326,9 @@ Library
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
running coroutine and the coroutine returned without any more ``await``.
+- bpo-30340: Enhanced regular expressions optimization. This increased
+ the performance of matching some patterns up to 25 times.
+
- bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
Now allowed several subsequential inline modifiers at the start of the
pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments