diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-04-02 08:35:13 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-02 08:35:13 (GMT) |
commit | 1be3260a90f16aae334d993aecf7b70426f98013 (patch) | |
tree | 965c64fd457364b1e303953bced685e76c334d5f /Lib/sre_parse.py | |
parent | 4ed8a9a589d2eee7442e0c9417515a707e504faa (diff) | |
download | cpython-1be3260a90f16aae334d993aecf7b70426f98013.zip cpython-1be3260a90f16aae334d993aecf7b70426f98013.tar.gz cpython-1be3260a90f16aae334d993aecf7b70426f98013.tar.bz2 |
bpo-47152: Convert the re module into a package (GH-32177)
The sre_* modules are now deprecated.
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- | Lib/sre_parse.py | 1084 |
1 files changed, 6 insertions, 1078 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index b91082e..25a3f55 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -1,1079 +1,7 @@ -# -# Secret Labs' Regular Expression Engine -# -# convert re-style regular expression to sre pattern -# -# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. -# -# See the sre.py file for information on usage and redistribution. -# +import warnings +warnings.warn(f"module {__name__!r} is deprecated", + DeprecationWarning, + stacklevel=2) -"""Internal support module for sre""" - -# XXX: show string offset and offending character for all errors - -from sre_constants import * - -SPECIAL_CHARS = ".\\[{()*+?^$|" -REPEAT_CHARS = "*+?{" - -DIGITS = frozenset("0123456789") - -OCTDIGITS = frozenset("01234567") -HEXDIGITS = frozenset("0123456789abcdefABCDEF") -ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") - -WHITESPACE = frozenset(" \t\n\r\v\f") - -_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) -_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) - -ESCAPES = { - r"\a": (LITERAL, ord("\a")), - r"\b": (LITERAL, ord("\b")), - r"\f": (LITERAL, ord("\f")), - r"\n": (LITERAL, ord("\n")), - r"\r": (LITERAL, ord("\r")), - r"\t": (LITERAL, ord("\t")), - r"\v": (LITERAL, ord("\v")), - r"\\": (LITERAL, ord("\\")) -} - -CATEGORIES = { - r"\A": (AT, AT_BEGINNING_STRING), # start of string - r"\b": (AT, AT_BOUNDARY), - r"\B": (AT, AT_NON_BOUNDARY), - r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), - r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), - r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END_STRING), # end of string -} - -FLAGS = { - # standard flags - "i": SRE_FLAG_IGNORECASE, - "L": SRE_FLAG_LOCALE, - "m": SRE_FLAG_MULTILINE, - "s": SRE_FLAG_DOTALL, - "x": SRE_FLAG_VERBOSE, - # extensions - "a": SRE_FLAG_ASCII, - "t": SRE_FLAG_TEMPLATE, - "u": SRE_FLAG_UNICODE, -} - -TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE -GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE - -class Verbose(Exception): - pass - -class State: - # keeps track of state for parsing - def __init__(self): - self.flags = 0 - self.groupdict = {} - self.groupwidths = [None] # group 0 - self.lookbehindgroups = None - @property - def groups(self): - return len(self.groupwidths) - def opengroup(self, name=None): - gid = self.groups - self.groupwidths.append(None) - if self.groups > MAXGROUPS: - raise error("too many groups") - if name is not None: - ogid = self.groupdict.get(name, None) - if ogid is not None: - raise error("redefinition of group name %r as group %d; " - "was group %d" % (name, gid, ogid)) - self.groupdict[name] = gid - return gid - def closegroup(self, gid, p): - self.groupwidths[gid] = p.getwidth() - def checkgroup(self, gid): - return gid < self.groups and self.groupwidths[gid] is not None - - def checklookbehindgroup(self, gid, source): - if self.lookbehindgroups is not None: - if not self.checkgroup(gid): - raise source.error('cannot refer to an open group') - if gid >= self.lookbehindgroups: - raise source.error('cannot refer to group defined in the same ' - 'lookbehind subpattern') - -class SubPattern: - # a subpattern, in intermediate form - def __init__(self, state, data=None): - self.state = state - if data is None: - data = [] - self.data = data - self.width = None - - def dump(self, level=0): - nl = True - seqtypes = (tuple, list) - for op, av in self.data: - print(level*" " + str(op), end='') - if op is IN: - # member sublanguage - print() - for op, a in av: - print((level+1)*" " + str(op), a) - elif op is BRANCH: - print() - for i, a in enumerate(av[1]): - if i: - print(level*" " + "OR") - a.dump(level+1) - elif op is GROUPREF_EXISTS: - condgroup, item_yes, item_no = av - print('', condgroup) - item_yes.dump(level+1) - if item_no: - print(level*" " + "ELSE") - item_no.dump(level+1) - elif isinstance(av, seqtypes): - nl = False - for a in av: - if isinstance(a, SubPattern): - if not nl: - print() - a.dump(level+1) - nl = True - else: - if not nl: - print(' ', end='') - print(a, end='') - nl = False - if not nl: - print() - else: - print('', av) - def __repr__(self): - return repr(self.data) - def __len__(self): - return len(self.data) - def __delitem__(self, index): - del self.data[index] - def __getitem__(self, index): - if isinstance(index, slice): - return SubPattern(self.state, self.data[index]) - return self.data[index] - def __setitem__(self, index, code): - self.data[index] = code - def insert(self, index, code): - self.data.insert(index, code) - def append(self, code): - self.data.append(code) - def getwidth(self): - # determine the width (min, max) for this subpattern - if self.width is not None: - return self.width - lo = hi = 0 - for op, av in self.data: - if op is BRANCH: - i = MAXREPEAT - 1 - j = 0 - for av in av[1]: - l, h = av.getwidth() - i = min(i, l) - j = max(j, h) - lo = lo + i - hi = hi + j - elif op is CALL: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is ATOMIC_GROUP: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is SUBPATTERN: - i, j = av[-1].getwidth() - lo = lo + i - hi = hi + j - elif op in _REPEATCODES: - i, j = av[2].getwidth() - lo = lo + i * av[0] - hi = hi + j * av[1] - elif op in _UNITCODES: - lo = lo + 1 - hi = hi + 1 - elif op is GROUPREF: - i, j = self.state.groupwidths[av] - lo = lo + i - hi = hi + j - elif op is GROUPREF_EXISTS: - i, j = av[1].getwidth() - if av[2] is not None: - l, h = av[2].getwidth() - i = min(i, l) - j = max(j, h) - else: - i = 0 - lo = lo + i - hi = hi + j - elif op is SUCCESS: - break - self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) - return self.width - -class Tokenizer: - def __init__(self, string): - self.istext = isinstance(string, str) - self.string = string - if not self.istext: - string = str(string, 'latin1') - self.decoded_string = string - self.index = 0 - self.next = None - self.__next() - def __next(self): - index = self.index - try: - char = self.decoded_string[index] - except IndexError: - self.next = None - return - if char == "\\": - index += 1 - try: - char += self.decoded_string[index] - except IndexError: - raise error("bad escape (end of pattern)", - self.string, len(self.string) - 1) from None - self.index = index + 1 - self.next = char - def match(self, char): - if char == self.next: - self.__next() - return True - return False - def get(self): - this = self.next - self.__next() - return this - def getwhile(self, n, charset): - result = '' - for _ in range(n): - c = self.next - if c not in charset: - break - result += c - self.__next() - return result - def getuntil(self, terminator, name): - result = '' - while True: - c = self.next - self.__next() - if c is None: - if not result: - raise self.error("missing " + name) - raise self.error("missing %s, unterminated name" % terminator, - len(result)) - if c == terminator: - if not result: - raise self.error("missing " + name, 1) - break - result += c - return result - @property - def pos(self): - return self.index - len(self.next or '') - def tell(self): - return self.index - len(self.next or '') - def seek(self, index): - self.index = index - self.__next() - - def error(self, msg, offset=0): - return error(msg, self.string, self.tell() - offset) - -def _class_escape(source, escape): - # handle escape code inside character class - code = ESCAPES.get(escape) - if code: - return code - code = CATEGORIES.get(escape) - if code and code[0] is IN: - return code - try: - c = escape[1:2] - if c == "x": - # hexadecimal escape (exactly two digits) - escape += source.getwhile(2, HEXDIGITS) - if len(escape) != 4: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "u" and source.istext: - # unicode escape (exactly four digits) - escape += source.getwhile(4, HEXDIGITS) - if len(escape) != 6: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "U" and source.istext: - # unicode escape (exactly eight digits) - escape += source.getwhile(8, HEXDIGITS) - if len(escape) != 10: - raise source.error("incomplete escape %s" % escape, len(escape)) - c = int(escape[2:], 16) - chr(c) # raise ValueError for invalid code - return LITERAL, c - elif c == "N" and source.istext: - import unicodedata - # named unicode escape e.g. \N{EM DASH} - if not source.match('{'): - raise source.error("missing {") - charname = source.getuntil('}', 'character name') - try: - c = ord(unicodedata.lookup(charname)) - except KeyError: - raise source.error("undefined character name %r" % charname, - len(charname) + len(r'\N{}')) - return LITERAL, c - elif c in OCTDIGITS: - # octal escape (up to three digits) - escape += source.getwhile(2, OCTDIGITS) - c = int(escape[1:], 8) - if c > 0o377: - raise source.error('octal escape value %s outside of ' - 'range 0-0o377' % escape, len(escape)) - return LITERAL, c - elif c in DIGITS: - raise ValueError - if len(escape) == 2: - if c in ASCIILETTERS: - raise source.error('bad escape %s' % escape, len(escape)) - return LITERAL, ord(escape[1]) - except ValueError: - pass - raise source.error("bad escape %s" % escape, len(escape)) - -def _escape(source, escape, state): - # handle escape code in expression - code = CATEGORIES.get(escape) - if code: - return code - code = ESCAPES.get(escape) - if code: - return code - try: - c = escape[1:2] - if c == "x": - # hexadecimal escape - escape += source.getwhile(2, HEXDIGITS) - if len(escape) != 4: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "u" and source.istext: - # unicode escape (exactly four digits) - escape += source.getwhile(4, HEXDIGITS) - if len(escape) != 6: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "U" and source.istext: - # unicode escape (exactly eight digits) - escape += source.getwhile(8, HEXDIGITS) - if len(escape) != 10: - raise source.error("incomplete escape %s" % escape, len(escape)) - c = int(escape[2:], 16) - chr(c) # raise ValueError for invalid code - return LITERAL, c - elif c == "N" and source.istext: - import unicodedata - # named unicode escape e.g. \N{EM DASH} - if not source.match('{'): - raise source.error("missing {") - charname = source.getuntil('}', 'character name') - try: - c = ord(unicodedata.lookup(charname)) - except KeyError: - raise source.error("undefined character name %r" % charname, - len(charname) + len(r'\N{}')) - return LITERAL, c - elif c == "0": - # octal escape - escape += source.getwhile(2, OCTDIGITS) - return LITERAL, int(escape[1:], 8) - elif c in DIGITS: - # octal escape *or* decimal group reference (sigh) - if source.next in DIGITS: - escape += source.get() - if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and - source.next in OCTDIGITS): - # got three octal digits; this is an octal escape - escape += source.get() - c = int(escape[1:], 8) - if c > 0o377: - raise source.error('octal escape value %s outside of ' - 'range 0-0o377' % escape, - len(escape)) - return LITERAL, c - # not an octal escape, so this is a group reference - group = int(escape[1:]) - if group < state.groups: - if not state.checkgroup(group): - raise source.error("cannot refer to an open group", - len(escape)) - state.checklookbehindgroup(group, source) - return GROUPREF, group - raise source.error("invalid group reference %d" % group, len(escape) - 1) - if len(escape) == 2: - if c in ASCIILETTERS: - raise source.error("bad escape %s" % escape, len(escape)) - return LITERAL, ord(escape[1]) - except ValueError: - pass - raise source.error("bad escape %s" % escape, len(escape)) - -def _uniq(items): - return list(dict.fromkeys(items)) - -def _parse_sub(source, state, verbose, nested): - # parse an alternation: a|b|c - - items = [] - itemsappend = items.append - sourcematch = source.match - start = source.tell() - while True: - itemsappend(_parse(source, state, verbose, nested + 1, - not nested and not items)) - if not sourcematch("|"): - break - - if len(items) == 1: - return items[0] - - subpattern = SubPattern(state) - - # check if all items share a common prefix - while True: - prefix = None - for item in items: - if not item: - break - if prefix is None: - prefix = item[0] - elif item[0] != prefix: - break - else: - # all subitems start with a common "prefix". - # move it out of the branch - for item in items: - del item[0] - subpattern.append(prefix) - continue # check next one - break - - # check if the branch can be replaced by a character set - set = [] - for item in items: - if len(item) != 1: - break - op, av = item[0] - if op is LITERAL: - set.append((op, av)) - elif op is IN and av[0][0] is not NEGATE: - set.extend(av) - else: - break - else: - # we can store this as a character set instead of a - # branch (the compiler may optimize this even more) - subpattern.append((IN, _uniq(set))) - return subpattern - - subpattern.append((BRANCH, (None, items))) - return subpattern - -def _parse(source, state, verbose, nested, first=False): - # parse a simple pattern - subpattern = SubPattern(state) - - # precompute constants into local variables - subpatternappend = subpattern.append - sourceget = source.get - sourcematch = source.match - _len = len - _ord = ord - - while True: - - this = source.next - if this is None: - break # end of pattern - if this in "|)": - break # end of subpattern - sourceget() - - if verbose: - # skip whitespace and comments - if this in WHITESPACE: - continue - if this == "#": - while True: - this = sourceget() - if this is None or this == "\n": - break - continue - - if this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - - elif this not in SPECIAL_CHARS: - subpatternappend((LITERAL, _ord(this))) - - elif this == "[": - here = source.tell() - 1 - # character set - set = [] - setappend = set.append -## if sourcematch(":"): -## pass # handle character classes - if source.next == '[': - import warnings - warnings.warn( - 'Possible nested set at position %d' % source.tell(), - FutureWarning, stacklevel=nested + 6 - ) - negate = sourcematch("^") - # check remaining characters - while True: - this = sourceget() - if this is None: - raise source.error("unterminated character set", - source.tell() - here) - if this == "]" and set: - break - elif this[0] == "\\": - code1 = _class_escape(source, this) - else: - if set and this in '-&~|' and source.next == this: - import warnings - warnings.warn( - 'Possible set %s at position %d' % ( - 'difference' if this == '-' else - 'intersection' if this == '&' else - 'symmetric difference' if this == '~' else - 'union', - source.tell() - 1), - FutureWarning, stacklevel=nested + 6 - ) - code1 = LITERAL, _ord(this) - if sourcematch("-"): - # potential range - that = sourceget() - if that is None: - raise source.error("unterminated character set", - source.tell() - here) - if that == "]": - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - setappend((LITERAL, _ord("-"))) - break - if that[0] == "\\": - code2 = _class_escape(source, that) - else: - if that == '-': - import warnings - warnings.warn( - 'Possible set difference at position %d' % ( - source.tell() - 2), - FutureWarning, stacklevel=nested + 6 - ) - code2 = LITERAL, _ord(that) - if code1[0] != LITERAL or code2[0] != LITERAL: - msg = "bad character range %s-%s" % (this, that) - raise source.error(msg, len(this) + 1 + len(that)) - lo = code1[1] - hi = code2[1] - if hi < lo: - msg = "bad character range %s-%s" % (this, that) - raise source.error(msg, len(this) + 1 + len(that)) - setappend((RANGE, (lo, hi))) - else: - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - - set = _uniq(set) - # XXX: <fl> should move set optimization to compiler! - if _len(set) == 1 and set[0][0] is LITERAL: - # optimization - if negate: - subpatternappend((NOT_LITERAL, set[0][1])) - else: - subpatternappend(set[0]) - else: - if negate: - set.insert(0, (NEGATE, None)) - # charmap optimization can't be added here because - # global flags still are not known - subpatternappend((IN, set)) - - elif this in REPEAT_CHARS: - # repeat previous item - here = source.tell() - if this == "?": - min, max = 0, 1 - elif this == "*": - min, max = 0, MAXREPEAT - - elif this == "+": - min, max = 1, MAXREPEAT - elif this == "{": - if source.next == "}": - subpatternappend((LITERAL, _ord(this))) - continue - - min, max = 0, MAXREPEAT - lo = hi = "" - while source.next in DIGITS: - lo += sourceget() - if sourcematch(","): - while source.next in DIGITS: - hi += sourceget() - else: - hi = lo - if not sourcematch("}"): - subpatternappend((LITERAL, _ord(this))) - source.seek(here) - continue - - if lo: - min = int(lo) - if min >= MAXREPEAT: - raise OverflowError("the repetition number is too large") - if hi: - max = int(hi) - if max >= MAXREPEAT: - raise OverflowError("the repetition number is too large") - if max < min: - raise source.error("min repeat greater than max repeat", - source.tell() - here) - else: - raise AssertionError("unsupported quantifier %r" % (char,)) - # figure out which item to repeat - if subpattern: - item = subpattern[-1:] - else: - item = None - if not item or item[0][0] is AT: - raise source.error("nothing to repeat", - source.tell() - here + len(this)) - if item[0][0] in _REPEATCODES: - raise source.error("multiple repeat", - source.tell() - here + len(this)) - if item[0][0] is SUBPATTERN: - group, add_flags, del_flags, p = item[0][1] - if group is None and not add_flags and not del_flags: - item = p - if sourcematch("?"): - # Non-Greedy Match - subpattern[-1] = (MIN_REPEAT, (min, max, item)) - elif sourcematch("+"): - # Possessive Match (Always Greedy) - subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) - else: - # Greedy Match - subpattern[-1] = (MAX_REPEAT, (min, max, item)) - - elif this == ".": - subpatternappend((ANY, None)) - - elif this == "(": - start = source.tell() - 1 - capture = True - atomic = False - name = None - add_flags = 0 - del_flags = 0 - if sourcematch("?"): - # options - char = sourceget() - if char is None: - raise source.error("unexpected end of pattern") - if char == "P": - # python extensions - if sourcematch("<"): - # named group: skip forward to end of name - name = source.getuntil(">", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) - elif sourcematch("="): - # named backreference - name = source.getuntil(")", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) - gid = state.groupdict.get(name) - if gid is None: - msg = "unknown group name %r" % name - raise source.error(msg, len(name) + 1) - if not state.checkgroup(gid): - raise source.error("cannot refer to an open group", - len(name) + 1) - state.checklookbehindgroup(gid, source) - subpatternappend((GROUPREF, gid)) - continue - - else: - char = sourceget() - if char is None: - raise source.error("unexpected end of pattern") - raise source.error("unknown extension ?P" + char, - len(char) + 2) - elif char == ":": - # non-capturing group - capture = False - elif char == "#": - # comment - while True: - if source.next is None: - raise source.error("missing ), unterminated comment", - source.tell() - start) - if sourceget() == ")": - break - continue - - elif char in "=!<": - # lookahead assertions - dir = 1 - if char == "<": - char = sourceget() - if char is None: - raise source.error("unexpected end of pattern") - if char not in "=!": - raise source.error("unknown extension ?<" + char, - len(char) + 2) - dir = -1 # lookbehind - lookbehindgroups = state.lookbehindgroups - if lookbehindgroups is None: - state.lookbehindgroups = state.groups - p = _parse_sub(source, state, verbose, nested + 1) - if dir < 0: - if lookbehindgroups is None: - state.lookbehindgroups = None - if not sourcematch(")"): - raise source.error("missing ), unterminated subpattern", - source.tell() - start) - if char == "=": - subpatternappend((ASSERT, (dir, p))) - else: - subpatternappend((ASSERT_NOT, (dir, p))) - continue - - elif char == "(": - # conditional backreference group - condname = source.getuntil(")", "group name") - if condname.isidentifier(): - condgroup = state.groupdict.get(condname) - if condgroup is None: - msg = "unknown group name %r" % condname - raise source.error(msg, len(condname) + 1) - else: - try: - condgroup = int(condname) - if condgroup < 0: - raise ValueError - except ValueError: - msg = "bad character in group name %r" % condname - raise source.error(msg, len(condname) + 1) from None - if not condgroup: - raise source.error("bad group number", - len(condname) + 1) - if condgroup >= MAXGROUPS: - msg = "invalid group reference %d" % condgroup - raise source.error(msg, len(condname) + 1) - state.checklookbehindgroup(condgroup, source) - item_yes = _parse(source, state, verbose, nested + 1) - if source.match("|"): - item_no = _parse(source, state, verbose, nested + 1) - if source.next == "|": - raise source.error("conditional backref with more than two branches") - else: - item_no = None - if not source.match(")"): - raise source.error("missing ), unterminated subpattern", - source.tell() - start) - subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) - continue - - elif char == ">": - # non-capturing, atomic group - capture = False - atomic = True - elif char in FLAGS or char == "-": - # flags - flags = _parse_flags(source, state, char) - if flags is None: # global flags - if not first or subpattern: - raise source.error('global flags not at the start ' - 'of the expression', - source.tell() - start) - if (state.flags & SRE_FLAG_VERBOSE) and not verbose: - raise Verbose - continue - - add_flags, del_flags = flags - capture = False - else: - raise source.error("unknown extension ?" + char, - len(char) + 1) - - # parse group contents - if capture: - try: - group = state.opengroup(name) - except error as err: - raise source.error(err.msg, len(name) + 1) from None - else: - group = None - sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and - not (del_flags & SRE_FLAG_VERBOSE)) - p = _parse_sub(source, state, sub_verbose, nested + 1) - if not source.match(")"): - raise source.error("missing ), unterminated subpattern", - source.tell() - start) - if group is not None: - state.closegroup(group, p) - if atomic: - assert group is None - subpatternappend((ATOMIC_GROUP, p)) - else: - subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) - - elif this == "^": - subpatternappend((AT, AT_BEGINNING)) - - elif this == "$": - subpatternappend((AT, AT_END)) - - else: - raise AssertionError("unsupported special character %r" % (char,)) - - # unpack non-capturing groups - for i in range(len(subpattern))[::-1]: - op, av = subpattern[i] - if op is SUBPATTERN: - group, add_flags, del_flags, p = av - if group is None and not add_flags and not del_flags: - subpattern[i: i+1] = p - - return subpattern - -def _parse_flags(source, state, char): - sourceget = source.get - add_flags = 0 - del_flags = 0 - if char != "-": - while True: - flag = FLAGS[char] - if source.istext: - if char == 'L': - msg = "bad inline flags: cannot use 'L' flag with a str pattern" - raise source.error(msg) - else: - if char == 'u': - msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" - raise source.error(msg) - add_flags |= flag - if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: - msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" - raise source.error(msg) - char = sourceget() - if char is None: - raise source.error("missing -, : or )") - if char in ")-:": - break - if char not in FLAGS: - msg = "unknown flag" if char.isalpha() else "missing -, : or )" - raise source.error(msg, len(char)) - if char == ")": - state.flags |= add_flags - return None - if add_flags & GLOBAL_FLAGS: - raise source.error("bad inline flags: cannot turn on global flag", 1) - if char == "-": - char = sourceget() - if char is None: - raise source.error("missing flag") - if char not in FLAGS: - msg = "unknown flag" if char.isalpha() else "missing flag" - raise source.error(msg, len(char)) - while True: - flag = FLAGS[char] - if flag & TYPE_FLAGS: - msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" - raise source.error(msg) - del_flags |= flag - char = sourceget() - if char is None: - raise source.error("missing :") - if char == ":": - break - if char not in FLAGS: - msg = "unknown flag" if char.isalpha() else "missing :" - raise source.error(msg, len(char)) - assert char == ":" - if del_flags & GLOBAL_FLAGS: - raise source.error("bad inline flags: cannot turn off global flag", 1) - if add_flags & del_flags: - raise source.error("bad inline flags: flag turned on and off", 1) - return add_flags, del_flags - -def fix_flags(src, flags): - # Check and fix flags according to the type of pattern (str or bytes) - if isinstance(src, str): - if flags & SRE_FLAG_LOCALE: - raise ValueError("cannot use LOCALE flag with a str pattern") - if not flags & SRE_FLAG_ASCII: - flags |= SRE_FLAG_UNICODE - elif flags & SRE_FLAG_UNICODE: - raise ValueError("ASCII and UNICODE flags are incompatible") - else: - if flags & SRE_FLAG_UNICODE: - raise ValueError("cannot use UNICODE flag with a bytes pattern") - if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: - raise ValueError("ASCII and LOCALE flags are incompatible") - return flags - -def parse(str, flags=0, state=None): - # parse 're' pattern into list of (opcode, argument) tuples - - source = Tokenizer(str) - - if state is None: - state = State() - state.flags = flags - state.str = str - - try: - p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) - except Verbose: - # the VERBOSE flag was switched on inside the pattern. to be - # on the safe side, we'll parse the whole thing again... - state = State() - state.flags = flags | SRE_FLAG_VERBOSE - state.str = str - source.seek(0) - p = _parse_sub(source, state, True, 0) - - p.state.flags = fix_flags(str, p.state.flags) - - if source.next is not None: - assert source.next == ")" - raise source.error("unbalanced parenthesis") - - if flags & SRE_FLAG_DEBUG: - p.dump() - - return p - -def parse_template(source, state): - # parse 're' replacement string into list of literals and - # group references - s = Tokenizer(source) - sget = s.get - groups = [] - literals = [] - literal = [] - lappend = literal.append - def addgroup(index, pos): - if index > state.groups: - raise s.error("invalid group reference %d" % index, pos) - if literal: - literals.append(''.join(literal)) - del literal[:] - groups.append((len(literals), index)) - literals.append(None) - groupindex = state.groupindex - while True: - this = sget() - if this is None: - break # end of replacement string - if this[0] == "\\": - # group - c = this[1] - if c == "g": - name = "" - if not s.match("<"): - raise s.error("missing <") - name = s.getuntil(">", "group name") - if name.isidentifier(): - try: - index = groupindex[name] - except KeyError: - raise IndexError("unknown group name %r" % name) - else: - try: - index = int(name) - if index < 0: - raise ValueError - except ValueError: - raise s.error("bad character in group name %r" % name, - len(name) + 1) from None - if index >= MAXGROUPS: - raise s.error("invalid group reference %d" % index, - len(name) + 1) - addgroup(index, len(name) + 1) - elif c == "0": - if s.next in OCTDIGITS: - this += sget() - if s.next in OCTDIGITS: - this += sget() - lappend(chr(int(this[1:], 8) & 0xff)) - elif c in DIGITS: - isoctal = False - if s.next in DIGITS: - this += sget() - if (c in OCTDIGITS and this[2] in OCTDIGITS and - s.next in OCTDIGITS): - this += sget() - isoctal = True - c = int(this[1:], 8) - if c > 0o377: - raise s.error('octal escape value %s outside of ' - 'range 0-0o377' % this, len(this)) - lappend(chr(c)) - if not isoctal: - addgroup(int(this[1:]), len(this) - 1) - else: - try: - this = chr(ESCAPES[this][1]) - except KeyError: - if c in ASCIILETTERS: - raise s.error('bad escape %s' % this, len(this)) - lappend(this) - else: - lappend(this) - if literal: - literals.append(''.join(literal)) - if not isinstance(source, str): - # The tokenizer implicitly decodes bytes objects as latin-1, we must - # therefore re-encode the final representation. - literals = [None if s is None else s.encode('latin-1') for s in literals] - return groups, literals - -def expand_template(template, match): - g = match.group - empty = match.string[:0] - groups, literals = template - literals = literals[:] - try: - for index, group in groups: - literals[index] = g(group) or empty - except IndexError: - raise error("invalid group reference %d" % index) - return empty.join(literals) +from re import _parser as _ +globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'}) |