diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-04-02 08:35:13 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-02 08:35:13 (GMT) |
commit | 1be3260a90f16aae334d993aecf7b70426f98013 (patch) | |
tree | 965c64fd457364b1e303953bced685e76c334d5f /Lib/re | |
parent | 4ed8a9a589d2eee7442e0c9417515a707e504faa (diff) | |
download | cpython-1be3260a90f16aae334d993aecf7b70426f98013.zip cpython-1be3260a90f16aae334d993aecf7b70426f98013.tar.gz cpython-1be3260a90f16aae334d993aecf7b70426f98013.tar.bz2 |
bpo-47152: Convert the re module into a package (GH-32177)
The sre_* modules are now deprecated.
Diffstat (limited to 'Lib/re')
-rw-r--r-- | Lib/re/__init__.py | 363 | ||||
-rw-r--r-- | Lib/re/_compiler.py | 800 | ||||
-rw-r--r-- | Lib/re/_constants.py | 262 | ||||
-rw-r--r-- | Lib/re/_parser.py | 1079 |
4 files changed, 2504 insertions, 0 deletions
diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py new file mode 100644 index 0000000..c47a265 --- /dev/null +++ b/Lib/re/__init__.py @@ -0,0 +1,363 @@ +# +# Secret Labs' Regular Expression Engine +# +# re-compatible interface for the sre matching engine +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# This version of the SRE library can be redistributed under CNRI's +# Python 1.6 license. For any other use, please contact Secret Labs +# AB (info@pythonware.com). +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. +# + +r"""Support for regular expressions (RE). + +This module provides regular expression matching operations similar to +those found in Perl. It supports both 8-bit and Unicode strings; both +the pattern and the strings being processed can contain null bytes and +characters outside the US ASCII range. + +Regular expressions can contain both special and ordinary characters. +Most ordinary characters, like "A", "a", or "0", are the simplest +regular expressions; they simply match themselves. You can +concatenate ordinary characters, so last matches the string 'last'. + +The special characters are: + "." Matches any character except a newline. + "^" Matches the start of the string. + "$" Matches the end of the string or just before the newline at + the end of the string. + "*" Matches 0 or more (greedy) repetitions of the preceding RE. + Greedy means that it will match as many repetitions as possible. + "+" Matches 1 or more (greedy) repetitions of the preceding RE. + "?" Matches 0 or 1 (greedy) of the preceding RE. + *?,+?,?? Non-greedy versions of the previous three special characters. + {m,n} Matches from m to n repetitions of the preceding RE. + {m,n}? Non-greedy version of the above. + "\\" Either escapes special characters or signals a special sequence. + [] Indicates a set of characters. + A "^" as the first character indicates a complementing set. + "|" A|B, creates an RE that will match either A or B. + (...) Matches the RE inside the parentheses. + The contents can be retrieved or matched later in the string. + (?aiLmsux) The letters set the corresponding flags defined below. + (?:...) Non-grouping version of regular parentheses. + (?P<name>...) The substring matched by the group is accessible by name. + (?P=name) Matches the text matched earlier by the group named name. + (?#...) A comment; ignored. + (?=...) Matches if ... matches next, but doesn't consume the string. + (?!...) Matches if ... doesn't match next. + (?<=...) Matches if preceded by ... (must be fixed length). + (?<!...) Matches if not preceded by ... (must be fixed length). + (?(id/name)yes|no) Matches yes pattern if the group with id/name matched, + the (optional) no pattern otherwise. + +The special sequences consist of "\\" and a character from the list +below. If the ordinary character is not on the list, then the +resulting RE will match the second character. + \number Matches the contents of the group of the same number. + \A Matches only at the start of the string. + \Z Matches only at the end of the string. + \b Matches the empty string, but only at the start or end of a word. + \B Matches the empty string, but not at the start or end of a word. + \d Matches any decimal digit; equivalent to the set [0-9] in + bytes patterns or string patterns with the ASCII flag. + In string patterns without the ASCII flag, it will match the whole + range of Unicode digits. + \D Matches any non-digit character; equivalent to [^\d]. + \s Matches any whitespace character; equivalent to [ \t\n\r\f\v] in + bytes patterns or string patterns with the ASCII flag. + In string patterns without the ASCII flag, it will match the whole + range of Unicode whitespace characters. + \S Matches any non-whitespace character; equivalent to [^\s]. + \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_] + in bytes patterns or string patterns with the ASCII flag. + In string patterns without the ASCII flag, it will match the + range of Unicode alphanumeric characters (letters plus digits + plus underscore). + With LOCALE, it will match the set [0-9_] plus characters defined + as letters for the current locale. + \W Matches the complement of \w. + \\ Matches a literal backslash. + +This module exports the following functions: + match Match a regular expression pattern to the beginning of a string. + fullmatch Match a regular expression pattern to all of a string. + search Search a string for the presence of a pattern. + sub Substitute occurrences of a pattern found in a string. + subn Same as sub, but also return the number of substitutions made. + split Split a string by the occurrences of a pattern. + findall Find all occurrences of a pattern in a string. + finditer Return an iterator yielding a Match object for each match. + compile Compile a pattern into a Pattern object. + purge Clear the regular expression cache. + escape Backslash all non-alphanumerics in a string. + +Each function other than purge and escape can take an optional 'flags' argument +consisting of one or more of the following module constants, joined by "|". +A, L, and U are mutually exclusive. + A ASCII For string patterns, make \w, \W, \b, \B, \d, \D + match the corresponding ASCII character categories + (rather than the whole Unicode categories, which is the + default). + For bytes patterns, this flag is the only available + behaviour and needn't be specified. + I IGNORECASE Perform case-insensitive matching. + L LOCALE Make \w, \W, \b, \B, dependent on the current locale. + M MULTILINE "^" matches the beginning of lines (after a newline) + as well as the string. + "$" matches the end of lines (before a newline) as well + as the end of the string. + S DOTALL "." matches any character at all, including the newline. + X VERBOSE Ignore whitespace and comments for nicer looking RE's. + U UNICODE For compatibility only. Ignored for string patterns (it + is the default), and forbidden for bytes patterns. + +This module also defines an exception 'error'. + +""" + +import enum +from . import _compiler, _parser +import functools +try: + import _locale +except ImportError: + _locale = None + + +# public symbols +__all__ = [ + "match", "fullmatch", "search", "sub", "subn", "split", + "findall", "finditer", "compile", "purge", "template", "escape", + "error", "Pattern", "Match", "A", "I", "L", "M", "S", "X", "U", + "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", + "UNICODE", "NOFLAG", "RegexFlag", +] + +__version__ = "2.2.1" + +@enum.global_enum +@enum._simple_enum(enum.IntFlag, boundary=enum.KEEP) +class RegexFlag: + NOFLAG = 0 + ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale" + IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case + LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale + UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale" + MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline + DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline + VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments + # sre extensions (experimental, don't rely on these) + TEMPLATE = T = _compiler.SRE_FLAG_TEMPLATE # disable backtracking + DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation + __str__ = object.__str__ + _numeric_repr_ = hex + +# sre exception +error = _compiler.error + +# -------------------------------------------------------------------- +# public interface + +def match(pattern, string, flags=0): + """Try to apply the pattern at the start of the string, returning + a Match object, or None if no match was found.""" + return _compile(pattern, flags).match(string) + +def fullmatch(pattern, string, flags=0): + """Try to apply the pattern to all of the string, returning + a Match object, or None if no match was found.""" + return _compile(pattern, flags).fullmatch(string) + +def search(pattern, string, flags=0): + """Scan through string looking for a match to the pattern, returning + a Match object, or None if no match was found.""" + return _compile(pattern, flags).search(string) + +def sub(pattern, repl, string, count=0, flags=0): + """Return the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in string by the + replacement repl. repl can be either a string or a callable; + if a string, backslash escapes in it are processed. If it is + a callable, it's passed the Match object and must return + a replacement string to be used.""" + return _compile(pattern, flags).sub(repl, string, count) + +def subn(pattern, repl, string, count=0, flags=0): + """Return a 2-tuple containing (new_string, number). + new_string is the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in the source + string by the replacement repl. number is the number of + substitutions that were made. repl can be either a string or a + callable; if a string, backslash escapes in it are processed. + If it is a callable, it's passed the Match object and must + return a replacement string to be used.""" + return _compile(pattern, flags).subn(repl, string, count) + +def split(pattern, string, maxsplit=0, flags=0): + """Split the source string by the occurrences of the pattern, + returning a list containing the resulting substrings. If + capturing parentheses are used in pattern, then the text of all + groups in the pattern are also returned as part of the resulting + list. If maxsplit is nonzero, at most maxsplit splits occur, + and the remainder of the string is returned as the final element + of the list.""" + return _compile(pattern, flags).split(string, maxsplit) + +def findall(pattern, string, flags=0): + """Return a list of all non-overlapping matches in the string. + + If one or more capturing groups are present in the pattern, return + a list of groups; this will be a list of tuples if the pattern + has more than one group. + + Empty matches are included in the result.""" + return _compile(pattern, flags).findall(string) + +def finditer(pattern, string, flags=0): + """Return an iterator over all non-overlapping matches in the + string. For each match, the iterator returns a Match object. + + Empty matches are included in the result.""" + return _compile(pattern, flags).finditer(string) + +def compile(pattern, flags=0): + "Compile a regular expression pattern, returning a Pattern object." + return _compile(pattern, flags) + +def purge(): + "Clear the regular expression caches" + _cache.clear() + _compile_repl.cache_clear() + +def template(pattern, flags=0): + "Compile a template pattern, returning a Pattern object" + return _compile(pattern, flags|T) + +# SPECIAL_CHARS +# closing ')', '}' and ']' +# '-' (a range in character set) +# '&', '~', (extended character set operations) +# '#' (comment) and WHITESPACE (ignored) in verbose mode +_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'} + +def escape(pattern): + """ + Escape special characters in a string. + """ + if isinstance(pattern, str): + return pattern.translate(_special_chars_map) + else: + pattern = str(pattern, 'latin1') + return pattern.translate(_special_chars_map).encode('latin1') + +Pattern = type(_compiler.compile('', 0)) +Match = type(_compiler.compile('', 0).match('')) + +# -------------------------------------------------------------------- +# internals + +_cache = {} # ordered! + +_MAXCACHE = 512 +def _compile(pattern, flags): + # internal: compile pattern + if isinstance(flags, RegexFlag): + flags = flags.value + try: + return _cache[type(pattern), pattern, flags] + except KeyError: + pass + if isinstance(pattern, Pattern): + if flags: + raise ValueError( + "cannot process flags argument with a compiled pattern") + return pattern + if not _compiler.isstring(pattern): + raise TypeError("first argument must be string or compiled pattern") + p = _compiler.compile(pattern, flags) + if not (flags & DEBUG): + if len(_cache) >= _MAXCACHE: + # Drop the oldest item + try: + del _cache[next(iter(_cache))] + except (StopIteration, RuntimeError, KeyError): + pass + _cache[type(pattern), pattern, flags] = p + return p + +@functools.lru_cache(_MAXCACHE) +def _compile_repl(repl, pattern): + # internal: compile replacement pattern + return _parser.parse_template(repl, pattern) + +def _expand(pattern, match, template): + # internal: Match.expand implementation hook + template = _parser.parse_template(template, pattern) + return _parser.expand_template(template, match) + +def _subx(pattern, template): + # internal: Pattern.sub/subn implementation helper + template = _compile_repl(template, pattern) + if not template[0] and len(template[1]) == 1: + # literal replacement + return template[1][0] + def filter(match, template=template): + return _parser.expand_template(template, match) + return filter + +# register myself for pickling + +import copyreg + +def _pickle(p): + return _compile, (p.pattern, p.flags) + +copyreg.pickle(Pattern, _pickle, _compile) + +# -------------------------------------------------------------------- +# experimental stuff (see python-dev discussions for details) + +class Scanner: + def __init__(self, lexicon, flags=0): + from ._constants import BRANCH, SUBPATTERN + if isinstance(flags, RegexFlag): + flags = flags.value + self.lexicon = lexicon + # combine phrases into a compound pattern + p = [] + s = _parser.State() + s.flags = flags + for phrase, action in lexicon: + gid = s.opengroup() + p.append(_parser.SubPattern(s, [ + (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))), + ])) + s.closegroup(gid, p[-1]) + p = _parser.SubPattern(s, [(BRANCH, (None, p))]) + self.scanner = _compiler.compile(p) + def scan(self, string): + result = [] + append = result.append + match = self.scanner.scanner(string).match + i = 0 + while True: + m = match() + if not m: + break + j = m.end() + if i == j: + break + action = self.lexicon[m.lastindex-1][1] + if callable(action): + self.match = m + action = action(self, m.group()) + if action is not None: + append(action) + i = j + return result, string[i:] diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py new file mode 100644 index 0000000..62da8e5 --- /dev/null +++ b/Lib/re/_compiler.py @@ -0,0 +1,800 @@ +# +# Secret Labs' Regular Expression Engine +# +# convert template to internal format +# +# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. +# +# See the __init__.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +import _sre +from . import _parser +from ._constants import * + +assert _sre.MAGIC == MAGIC, "SRE module mismatch" + +_LITERAL_CODES = {LITERAL, NOT_LITERAL} +_SUCCESS_CODES = {SUCCESS, FAILURE} +_ASSERT_CODES = {ASSERT, ASSERT_NOT} +_UNIT_CODES = _LITERAL_CODES | {ANY, IN} + +_REPEATING_CODES = { + MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), + MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), + POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), +} + +# Sets of lowercase characters which have the same uppercase. +_equivalences = ( + # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I + (0x69, 0x131), # iı + # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S + (0x73, 0x17f), # sſ + # MICRO SIGN, GREEK SMALL LETTER MU + (0xb5, 0x3bc), # µμ + # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI + (0x345, 0x3b9, 0x1fbe), # \u0345ιι + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + (0x390, 0x1fd3), # ΐΐ + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + (0x3b0, 0x1fe3), # ΰΰ + # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL + (0x3b2, 0x3d0), # βϐ + # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL + (0x3b5, 0x3f5), # εϵ + # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL + (0x3b8, 0x3d1), # θϑ + # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL + (0x3ba, 0x3f0), # κϰ + # GREEK SMALL LETTER PI, GREEK PI SYMBOL + (0x3c0, 0x3d6), # πϖ + # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL + (0x3c1, 0x3f1), # ρϱ + # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA + (0x3c2, 0x3c3), # ςσ + # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL + (0x3c6, 0x3d5), # φϕ + # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE + (0x1e61, 0x1e9b), # ṡẛ + # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST + (0xfb05, 0xfb06), # ſtst +) + +# Maps the lowercase code to lowercase codes which have the same uppercase. +_ignorecase_fixes = {i: tuple(j for j in t if i != j) + for t in _equivalences for i in t} + +def _combine_flags(flags, add_flags, del_flags, + TYPE_FLAGS=_parser.TYPE_FLAGS): + if add_flags & TYPE_FLAGS: + flags &= ~TYPE_FLAGS + return (flags | add_flags) & ~del_flags + +def _compile(code, pattern, flags): + # internal: compile a (sub)pattern + emit = code.append + _len = len + LITERAL_CODES = _LITERAL_CODES + REPEATING_CODES = _REPEATING_CODES + SUCCESS_CODES = _SUCCESS_CODES + ASSERT_CODES = _ASSERT_CODES + iscased = None + tolower = None + fixes = None + if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: + if flags & SRE_FLAG_UNICODE: + iscased = _sre.unicode_iscased + tolower = _sre.unicode_tolower + fixes = _ignorecase_fixes + else: + iscased = _sre.ascii_iscased + tolower = _sre.ascii_tolower + for op, av in pattern: + if op in LITERAL_CODES: + if not flags & SRE_FLAG_IGNORECASE: + emit(op) + emit(av) + elif flags & SRE_FLAG_LOCALE: + emit(OP_LOCALE_IGNORE[op]) + emit(av) + elif not iscased(av): + emit(op) + emit(av) + else: + lo = tolower(av) + if not fixes: # ascii + emit(OP_IGNORE[op]) + emit(lo) + elif lo not in fixes: + emit(OP_UNICODE_IGNORE[op]) + emit(lo) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + if op is NOT_LITERAL: + emit(NEGATE) + for k in (lo,) + fixes[lo]: + emit(LITERAL) + emit(k) + emit(FAILURE) + code[skip] = _len(code) - skip + elif op is IN: + charset, hascased = _optimize_charset(av, iscased, tolower, fixes) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + emit(IN_LOC_IGNORE) + elif not hascased: + emit(IN) + elif not fixes: # ascii + emit(IN_IGNORE) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + _compile_charset(charset, flags, code) + code[skip] = _len(code) - skip + elif op is ANY: + if flags & SRE_FLAG_DOTALL: + emit(ANY_ALL) + else: + emit(ANY) + elif op in REPEATING_CODES: + if flags & SRE_FLAG_TEMPLATE: + raise error("internal: unsupported template operator %r" % (op,)) + if _simple(av[2]): + emit(REPEATING_CODES[op][2]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + else: + emit(REPEATING_CODES[op][0]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + code[skip] = _len(code) - skip + emit(REPEATING_CODES[op][1]) + elif op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group: + emit(MARK) + emit((group-1)*2) + # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) + _compile(code, p, _combine_flags(flags, add_flags, del_flags)) + if group: + emit(MARK) + emit((group-1)*2+1) + elif op is ATOMIC_GROUP: + # Atomic Groups are handled by starting with an Atomic + # Group op code, then putting in the atomic group pattern + # and finally a success op code to tell any repeat + # operations within the Atomic Group to stop eating and + # pop their stack if they reach it + emit(ATOMIC_GROUP) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + elif op in SUCCESS_CODES: + emit(op) + elif op in ASSERT_CODES: + emit(op) + skip = _len(code); emit(0) + if av[0] >= 0: + emit(0) # look ahead + else: + lo, hi = av[1].getwidth() + if lo != hi: + raise error("look-behind requires fixed-width pattern") + emit(lo) # look behind + _compile(code, av[1], flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + elif op is CALL: + emit(op) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + elif op is AT: + emit(op) + if flags & SRE_FLAG_MULTILINE: + av = AT_MULTILINE.get(av, av) + if flags & SRE_FLAG_LOCALE: + av = AT_LOCALE.get(av, av) + elif flags & SRE_FLAG_UNICODE: + av = AT_UNICODE.get(av, av) + emit(av) + elif op is BRANCH: + emit(op) + tail = [] + tailappend = tail.append + for av in av[1]: + skip = _len(code); emit(0) + # _compile_info(code, av, flags) + _compile(code, av, flags) + emit(JUMP) + tailappend(_len(code)); emit(0) + code[skip] = _len(code) - skip + emit(FAILURE) # end of branch + for tail in tail: + code[tail] = _len(code) - tail + elif op is CATEGORY: + emit(op) + if flags & SRE_FLAG_LOCALE: + av = CH_LOCALE[av] + elif flags & SRE_FLAG_UNICODE: + av = CH_UNICODE[av] + emit(av) + elif op is GROUPREF: + if not flags & SRE_FLAG_IGNORECASE: + emit(op) + elif flags & SRE_FLAG_LOCALE: + emit(GROUPREF_LOC_IGNORE) + elif not fixes: # ascii + emit(GROUPREF_IGNORE) + else: + emit(GROUPREF_UNI_IGNORE) + emit(av-1) + elif op is GROUPREF_EXISTS: + emit(op) + emit(av[0]-1) + skipyes = _len(code); emit(0) + _compile(code, av[1], flags) + if av[2]: + emit(JUMP) + skipno = _len(code); emit(0) + code[skipyes] = _len(code) - skipyes + 1 + _compile(code, av[2], flags) + code[skipno] = _len(code) - skipno + else: + code[skipyes] = _len(code) - skipyes + 1 + else: + raise error("internal: unsupported operand type %r" % (op,)) + +def _compile_charset(charset, flags, code): + # compile charset subprogram + emit = code.append + for op, av in charset: + emit(op) + if op is NEGATE: + pass + elif op is LITERAL: + emit(av) + elif op is RANGE or op is RANGE_UNI_IGNORE: + emit(av[0]) + emit(av[1]) + elif op is CHARSET: + code.extend(av) + elif op is BIGCHARSET: + code.extend(av) + elif op is CATEGORY: + if flags & SRE_FLAG_LOCALE: + emit(CH_LOCALE[av]) + elif flags & SRE_FLAG_UNICODE: + emit(CH_UNICODE[av]) + else: + emit(av) + else: + raise error("internal: unsupported set operator %r" % (op,)) + emit(FAILURE) + +def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): + # internal: optimize character set + out = [] + tail = [] + charmap = bytearray(256) + hascased = False + for op, av in charset: + while True: + try: + if op is LITERAL: + if fixup: + lo = fixup(av) + charmap[lo] = 1 + if fixes and lo in fixes: + for k in fixes[lo]: + charmap[k] = 1 + if not hascased and iscased(av): + hascased = True + else: + charmap[av] = 1 + elif op is RANGE: + r = range(av[0], av[1]+1) + if fixup: + if fixes: + for i in map(fixup, r): + charmap[i] = 1 + if i in fixes: + for k in fixes[i]: + charmap[k] = 1 + else: + for i in map(fixup, r): + charmap[i] = 1 + if not hascased: + hascased = any(map(iscased, r)) + else: + for i in r: + charmap[i] = 1 + elif op is NEGATE: + out.append((op, av)) + else: + tail.append((op, av)) + except IndexError: + if len(charmap) == 256: + # character set contains non-UCS1 character codes + charmap += b'\0' * 0xff00 + continue + # Character set contains non-BMP character codes. + if fixup: + hascased = True + # There are only two ranges of cased non-BMP characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), + # and for both ranges RANGE_UNI_IGNORE works. + if op is RANGE: + op = RANGE_UNI_IGNORE + tail.append((op, av)) + break + + # compress character map + runs = [] + q = 0 + while True: + p = charmap.find(1, q) + if p < 0: + break + if len(runs) >= 2: + runs = None + break + q = charmap.find(0, p) + if q < 0: + runs.append((p, len(charmap))) + break + runs.append((p, q)) + if runs is not None: + # use literal/range + for p, q in runs: + if q - p == 1: + out.append((LITERAL, p)) + else: + out.append((RANGE, (p, q - 1))) + out += tail + # if the case was changed or new representation is more compact + if hascased or len(out) < len(charset): + return out, hascased + # else original character set is good enough + return charset, hascased + + # use bitmap + if len(charmap) == 256: + data = _mk_bitmap(charmap) + out.append((CHARSET, data)) + out += tail + return out, hascased + + # To represent a big charset, first a bitmap of all characters in the + # set is constructed. Then, this bitmap is sliced into chunks of 256 + # characters, duplicate chunks are eliminated, and each chunk is + # given a number. In the compiled expression, the charset is + # represented by a 32-bit word sequence, consisting of one word for + # the number of different chunks, a sequence of 256 bytes (64 words) + # of chunk numbers indexed by their original chunk position, and a + # sequence of 256-bit chunks (8 words each). + + # Compression is normally good: in a typical charset, large ranges of + # Unicode will be either completely excluded (e.g. if only cyrillic + # letters are to be matched), or completely included (e.g. if large + # subranges of Kanji match). These ranges will be represented by + # chunks of all one-bits or all zero-bits. + + # Matching can be also done efficiently: the more significant byte of + # the Unicode character is an index into the chunk number, and the + # less significant byte is a bit index in the chunk (just like the + # CHARSET matching). + + charmap = bytes(charmap) # should be hashable + comps = {} + mapping = bytearray(256) + block = 0 + data = bytearray() + for i in range(0, 65536, 256): + chunk = charmap[i: i + 256] + if chunk in comps: + mapping[i // 256] = comps[chunk] + else: + mapping[i // 256] = comps[chunk] = block + block += 1 + data += chunk + data = _mk_bitmap(data) + data[0:0] = [block] + _bytes_to_codes(mapping) + out.append((BIGCHARSET, data)) + out += tail + return out, hascased + +_CODEBITS = _sre.CODESIZE * 8 +MAXCODE = (1 << _CODEBITS) - 1 +_BITS_TRANS = b'0' + b'1' * 255 +def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): + s = bits.translate(_BITS_TRANS)[::-1] + return [_int(s[i - _CODEBITS: i], 2) + for i in range(len(s), 0, -_CODEBITS)] + +def _bytes_to_codes(b): + # Convert block indices to word array + a = memoryview(b).cast('I') + assert a.itemsize == _sre.CODESIZE + assert len(a) * a.itemsize == len(b) + return a.tolist() + +def _simple(p): + # check if this subpattern is a "simple" operator + if len(p) != 1: + return False + op, av = p[0] + if op is SUBPATTERN: + return av[0] is None and _simple(av[-1]) + return op in _UNIT_CODES + +def _generate_overlap_table(prefix): + """ + Generate an overlap table for the following prefix. + An overlap table is a table of the same size as the prefix which + informs about the potential self-overlap for each index in the prefix: + - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] + - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with + prefix[0:k] + """ + table = [0] * len(prefix) + for i in range(1, len(prefix)): + idx = table[i - 1] + while prefix[i] != prefix[idx]: + if idx == 0: + table[i] = 0 + break + idx = table[idx - 1] + else: + table[i] = idx + 1 + return table + +def _get_iscased(flags): + if not flags & SRE_FLAG_IGNORECASE: + return None + elif flags & SRE_FLAG_UNICODE: + return _sre.unicode_iscased + else: + return _sre.ascii_iscased + +def _get_literal_prefix(pattern, flags): + # look for literal prefix + prefix = [] + prefixappend = prefix.append + prefix_skip = None + iscased = _get_iscased(flags) + for op, av in pattern.data: + if op is LITERAL: + if iscased and iscased(av): + break + prefixappend(av) + elif op is SUBPATTERN: + group, add_flags, del_flags, p = av + flags1 = _combine_flags(flags, add_flags, del_flags) + if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: + break + prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) + if prefix_skip is None: + if group is not None: + prefix_skip = len(prefix) + elif prefix_skip1 is not None: + prefix_skip = len(prefix) + prefix_skip1 + prefix.extend(prefix1) + if not got_all: + break + else: + break + else: + return prefix, prefix_skip, True + return prefix, prefix_skip, False + +def _get_charset_prefix(pattern, flags): + while True: + if not pattern.data: + return None + op, av = pattern.data[0] + if op is not SUBPATTERN: + break + group, add_flags, del_flags, pattern = av + flags = _combine_flags(flags, add_flags, del_flags) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + return None + + iscased = _get_iscased(flags) + if op is LITERAL: + if iscased and iscased(av): + return None + return [(op, av)] + elif op is BRANCH: + charset = [] + charsetappend = charset.append + for p in av[1]: + if not p: + return None + op, av = p[0] + if op is LITERAL and not (iscased and iscased(av)): + charsetappend((op, av)) + else: + return None + return charset + elif op is IN: + charset = av + if iscased: + for op, av in charset: + if op is LITERAL: + if iscased(av): + return None + elif op is RANGE: + if av[1] > 0xffff: + return None + if any(map(iscased, range(av[0], av[1]+1))): + return None + return charset + return None + +def _compile_info(code, pattern, flags): + # internal: compile an info block. in the current version, + # this contains min/max pattern width, and an optional literal + # prefix or a character map + lo, hi = pattern.getwidth() + if hi > MAXCODE: + hi = MAXCODE + if lo == 0: + code.extend([INFO, 4, 0, lo, hi]) + return + # look for a literal prefix + prefix = [] + prefix_skip = 0 + charset = [] # not used + if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): + # look for literal prefix + prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) + # if no prefix, look for charset prefix + if not prefix: + charset = _get_charset_prefix(pattern, flags) +## if prefix: +## print("*** PREFIX", prefix, prefix_skip) +## if charset: +## print("*** CHARSET", charset) + # add an info block + emit = code.append + emit(INFO) + skip = len(code); emit(0) + # literal flag + mask = 0 + if prefix: + mask = SRE_INFO_PREFIX + if prefix_skip is None and got_all: + mask = mask | SRE_INFO_LITERAL + elif charset: + mask = mask | SRE_INFO_CHARSET + emit(mask) + # pattern length + if lo < MAXCODE: + emit(lo) + else: + emit(MAXCODE) + prefix = prefix[:MAXCODE] + emit(min(hi, MAXCODE)) + # add literal prefix + if prefix: + emit(len(prefix)) # length + if prefix_skip is None: + prefix_skip = len(prefix) + emit(prefix_skip) # skip + code.extend(prefix) + # generate overlap table + code.extend(_generate_overlap_table(prefix)) + elif charset: + charset, hascased = _optimize_charset(charset) + assert not hascased + _compile_charset(charset, flags, code) + code[skip] = len(code) - skip + +def isstring(obj): + return isinstance(obj, (str, bytes)) + +def _code(p, flags): + + flags = p.state.flags | flags + code = [] + + # compile info block + _compile_info(code, p, flags) + + # compile the pattern + _compile(code, p.data, flags) + + code.append(SUCCESS) + + return code + +def _hex_code(code): + return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) + +def dis(code): + import sys + + labels = set() + level = 0 + offset_width = len(str(len(code) - 1)) + + def dis_(start, end): + def print_(*args, to=None): + if to is not None: + labels.add(to) + args += ('(to %d)' % (to,),) + print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'), + end=' '*(level-1)) + print(*args) + + def print_2(*args): + print(end=' '*(offset_width + 2*level)) + print(*args) + + nonlocal level + level += 1 + i = start + while i < end: + start = i + op = code[i] + i += 1 + op = OPCODES[op] + if op in (SUCCESS, FAILURE, ANY, ANY_ALL, + MAX_UNTIL, MIN_UNTIL, NEGATE): + print_(op) + elif op in (LITERAL, NOT_LITERAL, + LITERAL_IGNORE, NOT_LITERAL_IGNORE, + LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, + LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): + arg = code[i] + i += 1 + print_(op, '%#02x (%r)' % (arg, chr(arg))) + elif op is AT: + arg = code[i] + i += 1 + arg = str(ATCODES[arg]) + assert arg[:3] == 'AT_' + print_(op, arg[3:]) + elif op is CATEGORY: + arg = code[i] + i += 1 + arg = str(CHCODES[arg]) + assert arg[:9] == 'CATEGORY_' + print_(op, arg[9:]) + elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip + elif op in (RANGE, RANGE_UNI_IGNORE): + lo, hi = code[i: i+2] + i += 2 + print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) + elif op is CHARSET: + print_(op, _hex_code(code[i: i + 256//_CODEBITS])) + i += 256//_CODEBITS + elif op is BIGCHARSET: + arg = code[i] + i += 1 + mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder) + for x in code[i: i + 256//_sre.CODESIZE])) + print_(op, arg, mapping) + i += 256//_sre.CODESIZE + level += 1 + for j in range(arg): + print_2(_hex_code(code[i: i + 256//_CODEBITS])) + i += 256//_CODEBITS + level -= 1 + elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, + GROUPREF_LOC_IGNORE): + arg = code[i] + i += 1 + print_(op, arg) + elif op is JUMP: + skip = code[i] + print_(op, skip, to=i+skip) + i += 1 + elif op is BRANCH: + skip = code[i] + print_(op, skip, to=i+skip) + while skip: + dis_(i+1, i+skip) + i += skip + start = i + skip = code[i] + if skip: + print_('branch', skip, to=i+skip) + else: + print_(FAILURE) + i += 1 + elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, + POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): + skip, min, max = code[i: i+3] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, min, max, to=i+skip) + dis_(i+3, i+skip) + i += skip + elif op is GROUPREF_EXISTS: + arg, skip = code[i: i+2] + print_(op, arg, skip, to=i+skip) + i += 2 + elif op in (ASSERT, ASSERT_NOT): + skip, arg = code[i: i+2] + print_(op, skip, arg, to=i+skip) + dis_(i+2, i+skip) + i += skip + elif op is ATOMIC_GROUP: + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip + elif op is INFO: + skip, flags, min, max = code[i: i+4] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, bin(flags), min, max, to=i+skip) + start = i+4 + if flags & SRE_INFO_PREFIX: + prefix_len, prefix_skip = code[i+4: i+6] + print_2(' prefix_skip', prefix_skip) + start = i + 6 + prefix = code[start: start+prefix_len] + print_2(' prefix', + '[%s]' % ', '.join('%#02x' % x for x in prefix), + '(%r)' % ''.join(map(chr, prefix))) + start += prefix_len + print_2(' overlap', code[start: start+prefix_len]) + start += prefix_len + if flags & SRE_INFO_CHARSET: + level += 1 + print_2('in') + dis_(start, i+skip) + level -= 1 + i += skip + else: + raise ValueError(op) + + level -= 1 + + dis_(0, len(code)) + + +def compile(p, flags=0): + # internal: convert pattern list to internal format + + if isstring(p): + pattern = p + p = _parser.parse(p, flags) + else: + pattern = None + + code = _code(p, flags) + + if flags & SRE_FLAG_DEBUG: + print() + dis(code) + + # map in either direction + groupindex = p.state.groupdict + indexgroup = [None] * p.state.groups + for k, i in groupindex.items(): + indexgroup[i] = k + + return _sre.compile( + pattern, flags | p.state.flags, code, + p.state.groups-1, + groupindex, tuple(indexgroup) + ) diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py new file mode 100644 index 0000000..c735edf --- /dev/null +++ b/Lib/re/_constants.py @@ -0,0 +1,262 @@ +# +# Secret Labs' Regular Expression Engine +# +# various symbols used by the regular expression engine. +# run this script to update the _sre include files! +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# See the __init__.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +# update when constants are added or removed + +MAGIC = 20220318 + +from _sre import MAXREPEAT, MAXGROUPS + +# SRE standard exception (access as sre.error) +# should this really be here? + +class error(Exception): + """Exception raised for invalid regular expressions. + + Attributes: + + msg: The unformatted error message + pattern: The regular expression pattern + pos: The index in the pattern where compilation failed (may be None) + lineno: The line corresponding to pos (may be None) + colno: The column corresponding to pos (may be None) + """ + + __module__ = 're' + + def __init__(self, msg, pattern=None, pos=None): + self.msg = msg + self.pattern = pattern + self.pos = pos + if pattern is not None and pos is not None: + msg = '%s at position %d' % (msg, pos) + if isinstance(pattern, str): + newline = '\n' + else: + newline = b'\n' + self.lineno = pattern.count(newline, 0, pos) + 1 + self.colno = pos - pattern.rfind(newline, 0, pos) + if newline in pattern: + msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno) + else: + self.lineno = self.colno = None + super().__init__(msg) + + +class _NamedIntConstant(int): + def __new__(cls, value, name): + self = super(_NamedIntConstant, cls).__new__(cls, value) + self.name = name + return self + + def __repr__(self): + return self.name + +MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT') + +def _makecodes(names): + names = names.strip().split() + items = [_NamedIntConstant(i, name) for i, name in enumerate(names)] + globals().update({item.name: item for item in items}) + return items + +# operators +# failure=0 success=1 (just because it looks better that way :-) +OPCODES = _makecodes(""" + FAILURE SUCCESS + + ANY ANY_ALL + ASSERT ASSERT_NOT + AT + BRANCH + CALL + CATEGORY + CHARSET BIGCHARSET + GROUPREF GROUPREF_EXISTS + IN + INFO + JUMP + LITERAL + MARK + MAX_UNTIL + MIN_UNTIL + NOT_LITERAL + NEGATE + RANGE + REPEAT + REPEAT_ONE + SUBPATTERN + MIN_REPEAT_ONE + ATOMIC_GROUP + POSSESSIVE_REPEAT + POSSESSIVE_REPEAT_ONE + + GROUPREF_IGNORE + IN_IGNORE + LITERAL_IGNORE + NOT_LITERAL_IGNORE + + GROUPREF_LOC_IGNORE + IN_LOC_IGNORE + LITERAL_LOC_IGNORE + NOT_LITERAL_LOC_IGNORE + + GROUPREF_UNI_IGNORE + IN_UNI_IGNORE + LITERAL_UNI_IGNORE + NOT_LITERAL_UNI_IGNORE + RANGE_UNI_IGNORE + + MIN_REPEAT MAX_REPEAT +""") +del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT + +# positions +ATCODES = _makecodes(""" + AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING + AT_BOUNDARY AT_NON_BOUNDARY + AT_END AT_END_LINE AT_END_STRING + + AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY + + AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY +""") + +# categories +CHCODES = _makecodes(""" + CATEGORY_DIGIT CATEGORY_NOT_DIGIT + CATEGORY_SPACE CATEGORY_NOT_SPACE + CATEGORY_WORD CATEGORY_NOT_WORD + CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK + + CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD + + CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT + CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE + CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD + CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK +""") + + +# replacement operations for "ignore case" mode +OP_IGNORE = { + LITERAL: LITERAL_IGNORE, + NOT_LITERAL: NOT_LITERAL_IGNORE, +} + +OP_LOCALE_IGNORE = { + LITERAL: LITERAL_LOC_IGNORE, + NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, +} + +OP_UNICODE_IGNORE = { + LITERAL: LITERAL_UNI_IGNORE, + NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, +} + +AT_MULTILINE = { + AT_BEGINNING: AT_BEGINNING_LINE, + AT_END: AT_END_LINE +} + +AT_LOCALE = { + AT_BOUNDARY: AT_LOC_BOUNDARY, + AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY +} + +AT_UNICODE = { + AT_BOUNDARY: AT_UNI_BOUNDARY, + AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY +} + +CH_LOCALE = { + CATEGORY_DIGIT: CATEGORY_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, + CATEGORY_WORD: CATEGORY_LOC_WORD, + CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK +} + +CH_UNICODE = { + CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_UNI_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, + CATEGORY_WORD: CATEGORY_UNI_WORD, + CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK +} + +# flags +SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) +SRE_FLAG_IGNORECASE = 2 # case insensitive +SRE_FLAG_LOCALE = 4 # honour system locale +SRE_FLAG_MULTILINE = 8 # treat target as multiline string +SRE_FLAG_DOTALL = 16 # treat target as a single string +SRE_FLAG_UNICODE = 32 # use unicode "locale" +SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments +SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_ASCII = 256 # use ascii "locale" + +# flags for INFO primitive +SRE_INFO_PREFIX = 1 # has prefix +SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) +SRE_INFO_CHARSET = 4 # pattern starts with character from given set + +if __name__ == "__main__": + def dump(f, d, prefix): + items = sorted(d) + for item in items: + f.write("#define %s_%s %d\n" % (prefix, item, item)) + with open("sre_constants.h", "w") as f: + f.write("""\ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by Lib/re/_constants.py. If you need + * to change anything in here, edit Lib/re/_constants.py and run it. + * + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +""") + + f.write("#define SRE_MAGIC %d\n" % MAGIC) + + dump(f, OPCODES, "SRE_OP") + dump(f, ATCODES, "SRE") + dump(f, CHCODES, "SRE") + + f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE) + f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) + f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) + f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE) + f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) + f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) + f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) + f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG) + f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII) + + f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) + f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) + f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET) + + print("done") diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py new file mode 100644 index 0000000..ae44118 --- /dev/null +++ b/Lib/re/_parser.py @@ -0,0 +1,1079 @@ +# +# Secret Labs' Regular Expression Engine +# +# convert re-style regular expression to sre pattern +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# See the __init__.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +# XXX: show string offset and offending character for all errors + +from ._constants import * + +SPECIAL_CHARS = ".\\[{()*+?^$|" +REPEAT_CHARS = "*+?{" + +DIGITS = frozenset("0123456789") + +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") +ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + +WHITESPACE = frozenset(" \t\n\r\v\f") + +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) +_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) + +ESCAPES = { + r"\a": (LITERAL, ord("\a")), + r"\b": (LITERAL, ord("\b")), + r"\f": (LITERAL, ord("\f")), + r"\n": (LITERAL, ord("\n")), + r"\r": (LITERAL, ord("\r")), + r"\t": (LITERAL, ord("\t")), + r"\v": (LITERAL, ord("\v")), + r"\\": (LITERAL, ord("\\")) +} + +CATEGORIES = { + r"\A": (AT, AT_BEGINNING_STRING), # start of string + r"\b": (AT, AT_BOUNDARY), + r"\B": (AT, AT_NON_BOUNDARY), + r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + r"\Z": (AT, AT_END_STRING), # end of string +} + +FLAGS = { + # standard flags + "i": SRE_FLAG_IGNORECASE, + "L": SRE_FLAG_LOCALE, + "m": SRE_FLAG_MULTILINE, + "s": SRE_FLAG_DOTALL, + "x": SRE_FLAG_VERBOSE, + # extensions + "a": SRE_FLAG_ASCII, + "t": SRE_FLAG_TEMPLATE, + "u": SRE_FLAG_UNICODE, +} + +TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE +GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE + +class Verbose(Exception): + pass + +class State: + # keeps track of state for parsing + def __init__(self): + self.flags = 0 + self.groupdict = {} + self.groupwidths = [None] # group 0 + self.lookbehindgroups = None + @property + def groups(self): + return len(self.groupwidths) + def opengroup(self, name=None): + gid = self.groups + self.groupwidths.append(None) + if self.groups > MAXGROUPS: + raise error("too many groups") + if name is not None: + ogid = self.groupdict.get(name, None) + if ogid is not None: + raise error("redefinition of group name %r as group %d; " + "was group %d" % (name, gid, ogid)) + self.groupdict[name] = gid + return gid + def closegroup(self, gid, p): + self.groupwidths[gid] = p.getwidth() + def checkgroup(self, gid): + return gid < self.groups and self.groupwidths[gid] is not None + + def checklookbehindgroup(self, gid, source): + if self.lookbehindgroups is not None: + if not self.checkgroup(gid): + raise source.error('cannot refer to an open group') + if gid >= self.lookbehindgroups: + raise source.error('cannot refer to group defined in the same ' + 'lookbehind subpattern') + +class SubPattern: + # a subpattern, in intermediate form + def __init__(self, state, data=None): + self.state = state + if data is None: + data = [] + self.data = data + self.width = None + + def dump(self, level=0): + nl = True + seqtypes = (tuple, list) + for op, av in self.data: + print(level*" " + str(op), end='') + if op is IN: + # member sublanguage + print() + for op, a in av: + print((level+1)*" " + str(op), a) + elif op is BRANCH: + print() + for i, a in enumerate(av[1]): + if i: + print(level*" " + "OR") + a.dump(level+1) + elif op is GROUPREF_EXISTS: + condgroup, item_yes, item_no = av + print('', condgroup) + item_yes.dump(level+1) + if item_no: + print(level*" " + "ELSE") + item_no.dump(level+1) + elif isinstance(av, seqtypes): + nl = False + for a in av: + if isinstance(a, SubPattern): + if not nl: + print() + a.dump(level+1) + nl = True + else: + if not nl: + print(' ', end='') + print(a, end='') + nl = False + if not nl: + print() + else: + print('', av) + def __repr__(self): + return repr(self.data) + def __len__(self): + return len(self.data) + def __delitem__(self, index): + del self.data[index] + def __getitem__(self, index): + if isinstance(index, slice): + return SubPattern(self.state, self.data[index]) + return self.data[index] + def __setitem__(self, index, code): + self.data[index] = code + def insert(self, index, code): + self.data.insert(index, code) + def append(self, code): + self.data.append(code) + def getwidth(self): + # determine the width (min, max) for this subpattern + if self.width is not None: + return self.width + lo = hi = 0 + for op, av in self.data: + if op is BRANCH: + i = MAXREPEAT - 1 + j = 0 + for av in av[1]: + l, h = av.getwidth() + i = min(i, l) + j = max(j, h) + lo = lo + i + hi = hi + j + elif op is CALL: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[-1].getwidth() + lo = lo + i + hi = hi + j + elif op in _REPEATCODES: + i, j = av[2].getwidth() + lo = lo + i * av[0] + hi = hi + j * av[1] + elif op in _UNITCODES: + lo = lo + 1 + hi = hi + 1 + elif op is GROUPREF: + i, j = self.state.groupwidths[av] + lo = lo + i + hi = hi + j + elif op is GROUPREF_EXISTS: + i, j = av[1].getwidth() + if av[2] is not None: + l, h = av[2].getwidth() + i = min(i, l) + j = max(j, h) + else: + i = 0 + lo = lo + i + hi = hi + j + elif op is SUCCESS: + break + self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) + return self.width + +class Tokenizer: + def __init__(self, string): + self.istext = isinstance(string, str) + self.string = string + if not self.istext: + string = str(string, 'latin1') + self.decoded_string = string + self.index = 0 + self.next = None + self.__next() + def __next(self): + index = self.index + try: + char = self.decoded_string[index] + except IndexError: + self.next = None + return + if char == "\\": + index += 1 + try: + char += self.decoded_string[index] + except IndexError: + raise error("bad escape (end of pattern)", + self.string, len(self.string) - 1) from None + self.index = index + 1 + self.next = char + def match(self, char): + if char == self.next: + self.__next() + return True + return False + def get(self): + this = self.next + self.__next() + return this + def getwhile(self, n, charset): + result = '' + for _ in range(n): + c = self.next + if c not in charset: + break + result += c + self.__next() + return result + def getuntil(self, terminator, name): + result = '' + while True: + c = self.next + self.__next() + if c is None: + if not result: + raise self.error("missing " + name) + raise self.error("missing %s, unterminated name" % terminator, + len(result)) + if c == terminator: + if not result: + raise self.error("missing " + name, 1) + break + result += c + return result + @property + def pos(self): + return self.index - len(self.next or '') + def tell(self): + return self.index - len(self.next or '') + def seek(self, index): + self.index = index + self.__next() + + def error(self, msg, offset=0): + return error(msg, self.string, self.tell() - offset) + +def _class_escape(source, escape): + # handle escape code inside character class + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) + if code and code[0] is IN: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape (exactly two digits) + escape += source.getwhile(2, HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except KeyError: + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) + return LITERAL, c + elif c in OCTDIGITS: + # octal escape (up to three digits) + escape += source.getwhile(2, OCTDIGITS) + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, len(escape)) + return LITERAL, c + elif c in DIGITS: + raise ValueError + if len(escape) == 2: + if c in ASCIILETTERS: + raise source.error('bad escape %s' % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _escape(source, escape, state): + # handle escape code in expression + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) + if code: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape + escape += source.getwhile(2, HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except KeyError: + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) + return LITERAL, c + elif c == "0": + # octal escape + escape += source.getwhile(2, OCTDIGITS) + return LITERAL, int(escape[1:], 8) + elif c in DIGITS: + # octal escape *or* decimal group reference (sigh) + if source.next in DIGITS: + escape += source.get() + if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and + source.next in OCTDIGITS): + # got three octal digits; this is an octal escape + escape += source.get() + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, + len(escape)) + return LITERAL, c + # not an octal escape, so this is a group reference + group = int(escape[1:]) + if group < state.groups: + if not state.checkgroup(group): + raise source.error("cannot refer to an open group", + len(escape)) + state.checklookbehindgroup(group, source) + return GROUPREF, group + raise source.error("invalid group reference %d" % group, len(escape) - 1) + if len(escape) == 2: + if c in ASCIILETTERS: + raise source.error("bad escape %s" % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _uniq(items): + return list(dict.fromkeys(items)) + +def _parse_sub(source, state, verbose, nested): + # parse an alternation: a|b|c + + items = [] + itemsappend = items.append + sourcematch = source.match + start = source.tell() + while True: + itemsappend(_parse(source, state, verbose, nested + 1, + not nested and not items)) + if not sourcematch("|"): + break + + if len(items) == 1: + return items[0] + + subpattern = SubPattern(state) + + # check if all items share a common prefix + while True: + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". + # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break + + # check if the branch can be replaced by a character set + set = [] + for item in items: + if len(item) != 1: + break + op, av = item[0] + if op is LITERAL: + set.append((op, av)) + elif op is IN and av[0][0] is not NEGATE: + set.extend(av) + else: + break + else: + # we can store this as a character set instead of a + # branch (the compiler may optimize this even more) + subpattern.append((IN, _uniq(set))) + return subpattern + + subpattern.append((BRANCH, (None, items))) + return subpattern + +def _parse(source, state, verbose, nested, first=False): + # parse a simple pattern + subpattern = SubPattern(state) + + # precompute constants into local variables + subpatternappend = subpattern.append + sourceget = source.get + sourcematch = source.match + _len = len + _ord = ord + + while True: + + this = source.next + if this is None: + break # end of pattern + if this in "|)": + break # end of subpattern + sourceget() + + if verbose: + # skip whitespace and comments + if this in WHITESPACE: + continue + if this == "#": + while True: + this = sourceget() + if this is None or this == "\n": + break + continue + + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) + + elif this == "[": + here = source.tell() - 1 + # character set + set = [] + setappend = set.append +## if sourcematch(":"): +## pass # handle character classes + if source.next == '[': + import warnings + warnings.warn( + 'Possible nested set at position %d' % source.tell(), + FutureWarning, stacklevel=nested + 6 + ) + negate = sourcematch("^") + # check remaining characters + while True: + this = sourceget() + if this is None: + raise source.error("unterminated character set", + source.tell() - here) + if this == "]" and set: + break + elif this[0] == "\\": + code1 = _class_escape(source, this) + else: + if set and this in '-&~|' and source.next == this: + import warnings + warnings.warn( + 'Possible set %s at position %d' % ( + 'difference' if this == '-' else + 'intersection' if this == '&' else + 'symmetric difference' if this == '~' else + 'union', + source.tell() - 1), + FutureWarning, stacklevel=nested + 6 + ) + code1 = LITERAL, _ord(this) + if sourcematch("-"): + # potential range + that = sourceget() + if that is None: + raise source.error("unterminated character set", + source.tell() - here) + if that == "]": + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + setappend((LITERAL, _ord("-"))) + break + if that[0] == "\\": + code2 = _class_escape(source, that) + else: + if that == '-': + import warnings + warnings.warn( + 'Possible set difference at position %d' % ( + source.tell() - 2), + FutureWarning, stacklevel=nested + 6 + ) + code2 = LITERAL, _ord(that) + if code1[0] != LITERAL or code2[0] != LITERAL: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + lo = code1[1] + hi = code2[1] + if hi < lo: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + setappend((RANGE, (lo, hi))) + else: + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + + set = _uniq(set) + # XXX: <fl> should move set optimization to compiler! + if _len(set) == 1 and set[0][0] is LITERAL: + # optimization + if negate: + subpatternappend((NOT_LITERAL, set[0][1])) + else: + subpatternappend(set[0]) + else: + if negate: + set.insert(0, (NEGATE, None)) + # charmap optimization can't be added here because + # global flags still are not known + subpatternappend((IN, set)) + + elif this in REPEAT_CHARS: + # repeat previous item + here = source.tell() + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, MAXREPEAT + + elif this == "+": + min, max = 1, MAXREPEAT + elif this == "{": + if source.next == "}": + subpatternappend((LITERAL, _ord(this))) + continue + + min, max = 0, MAXREPEAT + lo = hi = "" + while source.next in DIGITS: + lo += sourceget() + if sourcematch(","): + while source.next in DIGITS: + hi += sourceget() + else: + hi = lo + if not sourcematch("}"): + subpatternappend((LITERAL, _ord(this))) + source.seek(here) + continue + + if lo: + min = int(lo) + if min >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if hi: + max = int(hi) + if max >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if max < min: + raise source.error("min repeat greater than max repeat", + source.tell() - here) + else: + raise AssertionError("unsupported quantifier %r" % (char,)) + # figure out which item to repeat + if subpattern: + item = subpattern[-1:] + else: + item = None + if not item or item[0][0] is AT: + raise source.error("nothing to repeat", + source.tell() - here + len(this)) + if item[0][0] in _REPEATCODES: + raise source.error("multiple repeat", + source.tell() - here + len(this)) + if item[0][0] is SUBPATTERN: + group, add_flags, del_flags, p = item[0][1] + if group is None and not add_flags and not del_flags: + item = p + if sourcematch("?"): + # Non-Greedy Match + subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) + else: + # Greedy Match + subpattern[-1] = (MAX_REPEAT, (min, max, item)) + + elif this == ".": + subpatternappend((ANY, None)) + + elif this == "(": + start = source.tell() - 1 + capture = True + atomic = False + name = None + add_flags = 0 + del_flags = 0 + if sourcematch("?"): + # options + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + if char == "P": + # python extensions + if sourcematch("<"): + # named group: skip forward to end of name + name = source.getuntil(">", "group name") + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise source.error(msg, len(name) + 1) + elif sourcematch("="): + # named backreference + name = source.getuntil(")", "group name") + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise source.error(msg, len(name) + 1) + gid = state.groupdict.get(name) + if gid is None: + msg = "unknown group name %r" % name + raise source.error(msg, len(name) + 1) + if not state.checkgroup(gid): + raise source.error("cannot refer to an open group", + len(name) + 1) + state.checklookbehindgroup(gid, source) + subpatternappend((GROUPREF, gid)) + continue + + else: + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + raise source.error("unknown extension ?P" + char, + len(char) + 2) + elif char == ":": + # non-capturing group + capture = False + elif char == "#": + # comment + while True: + if source.next is None: + raise source.error("missing ), unterminated comment", + source.tell() - start) + if sourceget() == ")": + break + continue + + elif char in "=!<": + # lookahead assertions + dir = 1 + if char == "<": + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + if char not in "=!": + raise source.error("unknown extension ?<" + char, + len(char) + 2) + dir = -1 # lookbehind + lookbehindgroups = state.lookbehindgroups + if lookbehindgroups is None: + state.lookbehindgroups = state.groups + p = _parse_sub(source, state, verbose, nested + 1) + if dir < 0: + if lookbehindgroups is None: + state.lookbehindgroups = None + if not sourcematch(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if char == "=": + subpatternappend((ASSERT, (dir, p))) + else: + subpatternappend((ASSERT_NOT, (dir, p))) + continue + + elif char == "(": + # conditional backreference group + condname = source.getuntil(")", "group name") + if condname.isidentifier(): + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) + else: + try: + condgroup = int(condname) + if condgroup < 0: + raise ValueError + except ValueError: + msg = "bad character in group name %r" % condname + raise source.error(msg, len(condname) + 1) from None + if not condgroup: + raise source.error("bad group number", + len(condname) + 1) + if condgroup >= MAXGROUPS: + msg = "invalid group reference %d" % condgroup + raise source.error(msg, len(condname) + 1) + state.checklookbehindgroup(condgroup, source) + item_yes = _parse(source, state, verbose, nested + 1) + if source.match("|"): + item_no = _parse(source, state, verbose, nested + 1) + if source.next == "|": + raise source.error("conditional backref with more than two branches") + else: + item_no = None + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + continue + + elif char == ">": + # non-capturing, atomic group + capture = False + atomic = True + elif char in FLAGS or char == "-": + # flags + flags = _parse_flags(source, state, char) + if flags is None: # global flags + if not first or subpattern: + raise source.error('global flags not at the start ' + 'of the expression', + source.tell() - start) + if (state.flags & SRE_FLAG_VERBOSE) and not verbose: + raise Verbose + continue + + add_flags, del_flags = flags + capture = False + else: + raise source.error("unknown extension ?" + char, + len(char) + 1) + + # parse group contents + if capture: + try: + group = state.opengroup(name) + except error as err: + raise source.error(err.msg, len(name) + 1) from None + else: + group = None + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose, nested + 1) + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if group is not None: + state.closegroup(group, p) + if atomic: + assert group is None + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) + + elif this == "^": + subpatternappend((AT, AT_BEGINNING)) + + elif this == "$": + subpatternappend((AT, AT_END)) + + else: + raise AssertionError("unsupported special character %r" % (char,)) + + # unpack non-capturing groups + for i in range(len(subpattern))[::-1]: + op, av = subpattern[i] + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group is None and not add_flags and not del_flags: + subpattern[i: i+1] = p + + return subpattern + +def _parse_flags(source, state, char): + sourceget = source.get + add_flags = 0 + del_flags = 0 + if char != "-": + while True: + flag = FLAGS[char] + if source.istext: + if char == 'L': + msg = "bad inline flags: cannot use 'L' flag with a str pattern" + raise source.error(msg) + else: + if char == 'u': + msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" + raise source.error(msg) + add_flags |= flag + if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: + msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" + raise source.error(msg) + char = sourceget() + if char is None: + raise source.error("missing -, : or )") + if char in ")-:": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing -, : or )" + raise source.error(msg, len(char)) + if char == ")": + state.flags |= add_flags + return None + if add_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn on global flag", 1) + if char == "-": + char = sourceget() + if char is None: + raise source.error("missing flag") + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing flag" + raise source.error(msg, len(char)) + while True: + flag = FLAGS[char] + if flag & TYPE_FLAGS: + msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" + raise source.error(msg) + del_flags |= flag + char = sourceget() + if char is None: + raise source.error("missing :") + if char == ":": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing :" + raise source.error(msg, len(char)) + assert char == ":" + if del_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn off global flag", 1) + if add_flags & del_flags: + raise source.error("bad inline flags: flag turned on and off", 1) + return add_flags, del_flags + +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + if flags & SRE_FLAG_LOCALE: + raise ValueError("cannot use LOCALE flag with a str pattern") + if not flags & SRE_FLAG_ASCII: + flags |= SRE_FLAG_UNICODE + elif flags & SRE_FLAG_UNICODE: + raise ValueError("ASCII and UNICODE flags are incompatible") + else: + if flags & SRE_FLAG_UNICODE: + raise ValueError("cannot use UNICODE flag with a bytes pattern") + if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: + raise ValueError("ASCII and LOCALE flags are incompatible") + return flags + +def parse(str, flags=0, state=None): + # parse 're' pattern into list of (opcode, argument) tuples + + source = Tokenizer(str) + + if state is None: + state = State() + state.flags = flags + state.str = str + + try: + p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) + except Verbose: + # the VERBOSE flag was switched on inside the pattern. to be + # on the safe side, we'll parse the whole thing again... + state = State() + state.flags = flags | SRE_FLAG_VERBOSE + state.str = str + source.seek(0) + p = _parse_sub(source, state, True, 0) + + p.state.flags = fix_flags(str, p.state.flags) + + if source.next is not None: + assert source.next == ")" + raise source.error("unbalanced parenthesis") + + if flags & SRE_FLAG_DEBUG: + p.dump() + + return p + +def parse_template(source, state): + # parse 're' replacement string into list of literals and + # group references + s = Tokenizer(source) + sget = s.get + groups = [] + literals = [] + literal = [] + lappend = literal.append + def addgroup(index, pos): + if index > state.groups: + raise s.error("invalid group reference %d" % index, pos) + if literal: + literals.append(''.join(literal)) + del literal[:] + groups.append((len(literals), index)) + literals.append(None) + groupindex = state.groupindex + while True: + this = sget() + if this is None: + break # end of replacement string + if this[0] == "\\": + # group + c = this[1] + if c == "g": + name = "" + if not s.match("<"): + raise s.error("missing <") + name = s.getuntil(">", "group name") + if name.isidentifier(): + try: + index = groupindex[name] + except KeyError: + raise IndexError("unknown group name %r" % name) + else: + try: + index = int(name) + if index < 0: + raise ValueError + except ValueError: + raise s.error("bad character in group name %r" % name, + len(name) + 1) from None + if index >= MAXGROUPS: + raise s.error("invalid group reference %d" % index, + len(name) + 1) + addgroup(index, len(name) + 1) + elif c == "0": + if s.next in OCTDIGITS: + this += sget() + if s.next in OCTDIGITS: + this += sget() + lappend(chr(int(this[1:], 8) & 0xff)) + elif c in DIGITS: + isoctal = False + if s.next in DIGITS: + this += sget() + if (c in OCTDIGITS and this[2] in OCTDIGITS and + s.next in OCTDIGITS): + this += sget() + isoctal = True + c = int(this[1:], 8) + if c > 0o377: + raise s.error('octal escape value %s outside of ' + 'range 0-0o377' % this, len(this)) + lappend(chr(c)) + if not isoctal: + addgroup(int(this[1:]), len(this) - 1) + else: + try: + this = chr(ESCAPES[this][1]) + except KeyError: + if c in ASCIILETTERS: + raise s.error('bad escape %s' % this, len(this)) + lappend(this) + else: + lappend(this) + if literal: + literals.append(''.join(literal)) + if not isinstance(source, str): + # The tokenizer implicitly decodes bytes objects as latin-1, we must + # therefore re-encode the final representation. + literals = [None if s is None else s.encode('latin-1') for s in literals] + return groups, literals + +def expand_template(template, match): + g = match.group + empty = match.string[:0] + groups, literals = template + literals = literals[:] + try: + for index, group in groups: + literals[index] = g(group) or empty + except IndexError: + raise error("invalid group reference %d" % index) + return empty.join(literals) |