diff options
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- | Lib/sre_parse.py | 105 |
1 files changed, 70 insertions, 35 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index d8d1bd5..f72408f 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -114,6 +114,7 @@ class SubPattern: data = [] self.data = data self.width = None + def dump(self, level=0): nl = True seqtypes = (tuple, list) @@ -404,6 +405,15 @@ def _escape(source, escape, state): pass raise source.error("bad escape %s" % escape, len(escape)) +def _uniq(items): + if len(set(items)) == len(items): + return items + newitems = [] + for item in items: + if item not in newitems: + newitems.append(item) + return newitems + def _parse_sub(source, state, verbose, nested=True): # parse an alternation: a|b|c @@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True): return items[0] subpattern = SubPattern(state) - subpatternappend = subpattern.append # check if all items share a common prefix while True: @@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True): # move it out of the branch for item in items: del item[0] - subpatternappend(prefix) + subpattern.append(prefix) continue # check next one break # check if the branch can be replaced by a character set + set = [] for item in items: - if len(item) != 1 or item[0][0] is not LITERAL: + if len(item) != 1: + break + op, av = item[0] + if op is LITERAL: + set.append((op, av)) + elif op is IN and av[0][0] is not NEGATE: + set.extend(av) + else: break else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more) - subpatternappend((IN, [item[0] for item in items])) + subpattern.append((IN, _uniq(set))) return subpattern subpattern.append((BRANCH, (None, items))) return subpattern -def _parse_sub_cond(source, state, condgroup, verbose): - item_yes = _parse(source, state, verbose) - if source.match("|"): - item_no = _parse(source, state, verbose) - if source.next == "|": - raise source.error("conditional backref with more than two branches") - else: - item_no = None - subpattern = SubPattern(state) - subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) - return subpattern - def _parse(source, state, verbose, first=False): # parse a simple pattern subpattern = SubPattern(state) @@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False): setappend = set.append ## if sourcematch(":"): ## pass # handle character classes - if sourcematch("^"): - setappend((NEGATE, None)) + negate = sourcematch("^") # check remaining characters - start = set[:] while True: this = sourceget() if this is None: raise source.error("unterminated character set", source.tell() - here) - if this == "]" and set != start: + if this == "]" and set: break elif this[0] == "\\": code1 = _class_escape(source, this) @@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False): code1 = code1[1][0] setappend(code1) + set = _uniq(set) # XXX: <fl> should move set optimization to compiler! - if _len(set)==1 and set[0][0] is LITERAL: - subpatternappend(set[0]) # optimization - elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpatternappend((NOT_LITERAL, set[1][1])) # optimization + if _len(set) == 1 and set[0][0] is LITERAL: + # optimization + if negate: + subpatternappend((NOT_LITERAL, set[0][1])) + else: + subpatternappend(set[0]) else: - # XXX: <fl> should add charmap optimization here + if negate: + set.insert(0, (NEGATE, None)) + # charmap optimization can't be added here because + # global flags still are not known subpatternappend((IN, set)) elif this in REPEAT_CHARS: @@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False): if source.next == "}": subpatternappend((LITERAL, _ord(this))) continue + min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: @@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False): subpatternappend((LITERAL, _ord(this))) source.seek(here) continue + if lo: min = int(lo) if min >= MAXREPEAT: @@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False): item = subpattern[-1:] else: item = None - if not item or (_len(item) == 1 and item[0][0] is AT): + if not item or item[0][0] is AT: raise source.error("nothing to repeat", source.tell() - here + len(this)) if item[0][0] in _REPEATCODES: raise source.error("multiple repeat", source.tell() - here + len(this)) + if item[0][0] is SUBPATTERN: + group, add_flags, del_flags, p = item[0][1] + if group is None and not add_flags and not del_flags: + item = p if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: @@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False): start = source.tell() - 1 group = True name = None - condgroup = None add_flags = 0 del_flags = 0 if sourcematch("?"): @@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False): state.checklookbehindgroup(gid, source) subpatternappend((GROUPREF, gid)) continue + else: char = sourceget() if char is None: @@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False): if sourceget() == ")": break continue + elif char in "=!<": # lookahead assertions dir = 1 @@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False): else: subpatternappend((ASSERT_NOT, (dir, p))) continue + elif char == "(": # conditional backreference group condname = source.getuntil(")") - group = None if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: @@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False): msg = "invalid group reference %d" % condgroup raise source.error(msg, len(condname) + 1) state.checklookbehindgroup(condgroup, source) + item_yes = _parse(source, state, verbose) + if source.match("|"): + item_no = _parse(source, state, verbose) + if source.next == "|": + raise source.error("conditional backref with more than two branches") + else: + item_no = None + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + continue + elif char in FLAGS or char == "-": # flags flags = _parse_flags(source, state, char) @@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False): if (state.flags & SRE_FLAG_VERBOSE) and not verbose: raise Verbose continue + add_flags, del_flags = flags group = None else: @@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False): group = state.opengroup(name) except error as err: raise source.error(err.msg, len(name) + 1) from None - if condgroup: - p = _parse_sub_cond(source, state, condgroup, verbose) - else: - sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and - not (del_flags & SRE_FLAG_VERBOSE)) - p = _parse_sub(source, state, sub_verbose) + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose) if not source.match(")"): raise source.error("missing ), unterminated subpattern", source.tell() - start) @@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False): subpatternappend((AT, AT_BEGINNING)) elif this == "$": - subpattern.append((AT, AT_END)) + subpatternappend((AT, AT_END)) else: raise AssertionError("unsupported special character %r" % (char,)) + # unpack non-capturing groups + for i in range(len(subpattern))[::-1]: + op, av = subpattern[i] + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group is None and not add_flags and not del_flags: + subpattern[i: i+1] = p + return subpattern def _parse_flags(source, state, char): |