From be9a4e5c855188cf146962483e6de942bf154d95 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 10 Sep 2016 00:57:55 +0300 Subject: Issue #433028: Added support of modifier spans in regular expressions. --- Doc/library/re.rst | 10 +++++ Doc/whatsnew/3.6.rst | 9 ++++ Lib/re.py | 2 +- Lib/sre_compile.py | 69 +++++++++++++++++-------------- Lib/sre_parse.py | 114 +++++++++++++++++++++++++++++++++++++-------------- Lib/test/test_re.py | 40 ++++++++++++++++-- Misc/NEWS | 2 + 7 files changed, 180 insertions(+), 66 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index dfbedd4..df5b547 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -237,6 +237,16 @@ The special characters are: *cannot* be retrieved after performing a match or referenced later in the pattern. +``(?imsx-imsx:...)`` + (Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``, + optionally followed by ``'-'`` followed by one or more letters from the + same set.) The letters set or remove the corresponding flags: + :const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S` + (dot matches all), and :const:`re.X` (verbose), for the part of the + expression. (The flags are described in :ref:`contents-of-module-re`.) + + .. versionadded:: 3.6 + ``(?P<name>...)`` Similar to regular parentheses, but the substring matched by the group is accessible via the symbolic group name *name*. Group names must be valid diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index f15bf4d..8a57110 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -645,6 +645,15 @@ Protocol version 4 already supports this case. (Contributed by Serhiy Storchaka in :issue:`24164`.) +re +-- + +Added support of modifier spans in regular expressions. Examples: +``'(?i:p)ython'`` matches ``'python'`` and ``'Python'``, but not ``'PYTHON'``; +``'(?i)g(?-i:v)r'`` matches ``'GvR'`` and ``'gvr'``, but not ``'GVR'``. +(Contributed by Serhiy Storchaka in :issue:`433028`.) 
+ + readline -------- diff --git a/Lib/re.py b/Lib/re.py index 661929e..b78da89 100644 --- a/Lib/re.py +++ b/Lib/re.py @@ -352,7 +352,7 @@ class Scanner: for phrase, action in lexicon: gid = s.opengroup() p.append(sre_parse.SubPattern(s, [ - (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))), + (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))), ])) s.closegroup(gid, p[-1]) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 4edb03f..420d83d 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -71,7 +71,8 @@ def _compile(code, pattern, flags): ASSERT_CODES = _ASSERT_CODES if (flags & SRE_FLAG_IGNORECASE and not (flags & SRE_FLAG_LOCALE) and - flags & SRE_FLAG_UNICODE): + flags & SRE_FLAG_UNICODE and + not (flags & SRE_FLAG_ASCII)): fixes = _ignorecase_fixes else: fixes = None @@ -137,14 +138,15 @@ def _compile(code, pattern, flags): else: emit(MIN_UNTIL) elif op is SUBPATTERN: - if av[0]: + group, add_flags, del_flags, p = av + if group: emit(MARK) - emit((av[0]-1)*2) - # _compile_info(code, av[1], flags) - _compile(code, av[1], flags) - if av[0]: + emit((group-1)*2) + # _compile_info(code, p, (flags | add_flags) & ~del_flags) + _compile(code, p, (flags | add_flags) & ~del_flags) + if group: emit(MARK) - emit((av[0]-1)*2+1) + emit((group-1)*2+1) elif op in SUCCESS_CODES: emit(op) elif op in ASSERT_CODES: @@ -172,7 +174,7 @@ def _compile(code, pattern, flags): av = AT_MULTILINE.get(av, av) if flags & SRE_FLAG_LOCALE: av = AT_LOCALE.get(av, av) - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): av = AT_UNICODE.get(av, av) emit(av) elif op is BRANCH: @@ -193,7 +195,7 @@ def _compile(code, pattern, flags): emit(op) if flags & SRE_FLAG_LOCALE: av = CH_LOCALE[av] - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): av = CH_UNICODE[av] emit(av) elif op is GROUPREF: @@ -237,7 +239,7 @@ def 
_compile_charset(charset, flags, code, fixup=None, fixes=None): elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: emit(CH_LOCALE[av]) - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): emit(CH_UNICODE[av]) else: emit(av) @@ -414,14 +416,16 @@ def _get_literal_prefix(pattern): prefix = [] prefixappend = prefix.append prefix_skip = None - got_all = True for op, av in pattern.data: if op is LITERAL: prefixappend(av) elif op is SUBPATTERN: - prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1]) + group, add_flags, del_flags, p = av + if add_flags & SRE_FLAG_IGNORECASE: + break + prefix1, prefix_skip1, got_all = _get_literal_prefix(p) if prefix_skip is None: - if av[0] is not None: + if group is not None: prefix_skip = len(prefix) elif prefix_skip1 is not None: prefix_skip = len(prefix) + prefix_skip1 @@ -429,32 +433,35 @@ def _get_literal_prefix(pattern): if not got_all: break else: - got_all = False break - return prefix, prefix_skip, got_all + else: + return prefix, prefix_skip, True + return prefix, prefix_skip, False def _get_charset_prefix(pattern): charset = [] # not used charsetappend = charset.append if pattern.data: op, av = pattern.data[0] - if op is SUBPATTERN and av[1]: - op, av = av[1][0] - if op is LITERAL: - charsetappend((op, av)) - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if p and not (add_flags & SRE_FLAG_IGNORECASE): + op, av = p[0] + if op is LITERAL: + charsetappend((op, av)) + elif op is BRANCH: + c = [] + cappend = c.append + for p in av[1]: + if not p: + break + op, av = p[0] + if op is LITERAL: + cappend((op, av)) + else: + break else: - break - else: - charset = c + charset = c elif op is BRANCH: c = [] cappend = c.append diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 521e379..09f3be2 100644 --- a/Lib/sre_parse.py +++ 
b/Lib/sre_parse.py @@ -65,6 +65,12 @@ FLAGS = { "u": SRE_FLAG_UNICODE, } +GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | + SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE) + +class Verbose(Exception): + pass + class Pattern: # master pattern object. keeps track of global attributes def __init__(self): @@ -184,7 +190,7 @@ class SubPattern: lo = lo + i hi = hi + j elif op is SUBPATTERN: - i, j = av[1].getwidth() + i, j = av[-1].getwidth() lo = lo + i hi = hi + j elif op in _REPEATCODES: @@ -395,7 +401,7 @@ def _escape(source, escape, state): pass raise source.error("bad escape %s" % escape, len(escape)) -def _parse_sub(source, state, nested=True): +def _parse_sub(source, state, verbose, nested=True): # parse an alternation: a|b|c items = [] @@ -403,7 +409,7 @@ def _parse_sub(source, state, nested=True): sourcematch = source.match start = source.tell() while True: - itemsappend(_parse(source, state)) + itemsappend(_parse(source, state, verbose)) if not sourcematch("|"): break @@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=True): subpattern.append((BRANCH, (None, items))) return subpattern -def _parse_sub_cond(source, state, condgroup): - item_yes = _parse(source, state) +def _parse_sub_cond(source, state, condgroup, verbose): + item_yes = _parse(source, state, verbose) if source.match("|"): - item_no = _parse(source, state) + item_no = _parse(source, state, verbose) if source.next == "|": raise source.error("conditional backref with more than two branches") else: @@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condgroup): subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern -def _parse(source, state): +def _parse(source, state, verbose): # parse a simple pattern subpattern = SubPattern(state) @@ -467,7 +473,6 @@ def _parse(source, state): sourcematch = source.match _len = len _ord = ord - verbose = state.flags & SRE_FLAG_VERBOSE while True: @@ -621,6 +626,8 @@ def _parse(source, state): group = True name = 
None condgroup = None + add_flags = 0 + del_flags = 0 if sourcematch("?"): # options char = sourceget() @@ -682,7 +689,7 @@ def _parse(source, state): lookbehindgroups = state.lookbehindgroups if lookbehindgroups is None: state.lookbehindgroups = state.groups - p = _parse_sub(source, state) + p = _parse_sub(source, state, verbose) if dir < 0: if lookbehindgroups is None: state.lookbehindgroups = None @@ -718,19 +725,13 @@ def _parse(source, state): raise source.error("invalid group reference", len(condname) + 1) state.checklookbehindgroup(condgroup, source) - elif char in FLAGS: + elif char in FLAGS or char == "-": # flags - while True: - state.flags |= FLAGS[char] - char = sourceget() - if char is None: - raise source.error("missing )") - if char == ")": - break - if char not in FLAGS: - raise source.error("unknown flag", len(char)) - verbose = state.flags & SRE_FLAG_VERBOSE - continue + flags = _parse_flags(source, state, char) + if flags is None: # global flags + continue + add_flags, del_flags = flags + group = None else: raise source.error("unknown extension ?" 
+ char, len(char) + 1) @@ -742,15 +743,17 @@ def _parse(source, state): except error as err: raise source.error(err.msg, len(name) + 1) from None if condgroup: - p = _parse_sub_cond(source, state, condgroup) + p = _parse_sub_cond(source, state, condgroup, verbose) else: - p = _parse_sub(source, state) + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose) if not source.match(")"): raise source.error("missing ), unterminated subpattern", source.tell() - start) if group is not None: state.closegroup(group, p) - subpatternappend((SUBPATTERN, (group, p))) + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) elif this == "^": subpatternappend((AT, AT_BEGINNING)) @@ -763,6 +766,53 @@ def _parse(source, state): return subpattern +def _parse_flags(source, state, char): + sourceget = source.get + add_flags = 0 + del_flags = 0 + if char != "-": + while True: + add_flags |= FLAGS[char] + char = sourceget() + if char is None: + raise source.error("missing -, : or )") + if char in ")-:": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing -, : or )" + raise source.error(msg, len(char)) + if char == ")": + if ((add_flags & SRE_FLAG_VERBOSE) and + not (state.flags & SRE_FLAG_VERBOSE)): + raise Verbose + state.flags |= add_flags + return None + if add_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn on global flag", 1) + if char == "-": + char = sourceget() + if char is None: + raise source.error("missing flag") + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing flag" + raise source.error(msg, len(char)) + while True: + del_flags |= FLAGS[char] + char = sourceget() + if char is None: + raise source.error("missing :") + if char == ":": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing :" + raise source.error(msg, len(char)) + assert char == ":" + if 
del_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn off global flag", 1) + if add_flags & del_flags: + raise source.error("bad inline flags: flag turned on and off", 1) + return add_flags, del_flags + def fix_flags(src, flags): # Check and fix flags according to the type of pattern (str or bytes) if isinstance(src, str): @@ -789,18 +839,22 @@ def parse(str, flags=0, pattern=None): pattern.flags = flags pattern.str = str - p = _parse_sub(source, pattern, 0) + try: + p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False) + except Verbose: + # the VERBOSE flag was switched on inside the pattern. to be + # on the safe side, we'll parse the whole thing again... + pattern = Pattern() + pattern.flags = flags | SRE_FLAG_VERBOSE + pattern.str = str + p = _parse_sub(source, pattern, True, False) + p.pattern.flags = fix_flags(str, p.pattern.flags) if source.next is not None: assert source.next == ")" raise source.error("unbalanced parenthesis") - if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: - # the VERBOSE flag was switched on inside the pattern. to be - # on the safe side, we'll parse the whole thing again... 
- return parse(str, p.pattern.flags) - if flags & SRE_FLAG_DEBUG: p.dump() diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 02fed21..2322ca9 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1376,6 +1376,38 @@ class ReTests(unittest.TestCase): self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) self.assertRaises(ValueError, re.compile, b'(?aL)') + def test_scoped_flags(self): + self.assertTrue(re.match(r'(?i:a)b', 'Ab')) + self.assertIsNone(re.match(r'(?i:a)b', 'aB')) + self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE)) + self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE)) + self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab')) + self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB')) + + self.assertTrue(re.match(r'(?x: a) b', 'a b')) + self.assertIsNone(re.match(r'(?x: a) b', ' a b')) + self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) + self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) + + self.checkPatternError(r'(?a:\w)', + 'bad inline flags: cannot turn on global flag', 3) + self.checkPatternError(r'(?a)(?-a:\w)', + 'bad inline flags: cannot turn off global flag', 8) + self.checkPatternError(r'(?i-i:a)', + 'bad inline flags: flag turned on and off', 5) + + self.checkPatternError(r'(?-', 'missing flag', 3) + self.checkPatternError(r'(?-+', 'missing flag', 3) + self.checkPatternError(r'(?-z', 'unknown flag', 3) + self.checkPatternError(r'(?-i', 'missing :', 4) + self.checkPatternError(r'(?-i)', 'missing :', 4) + self.checkPatternError(r'(?-i+', 'missing :', 4) + self.checkPatternError(r'(?-iz', 'unknown flag', 4) + self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0) + self.checkPatternError(r'(?i', 'missing -, : or )', 3) + self.checkPatternError(r'(?i+', 'missing -, : or )', 3) + self.checkPatternError(r'(?iz', 'unknown flag', 3) + def test_bug_6509(self): # Replacement strings of both types must parse properly. 
# all strings @@ -1538,9 +1570,9 @@ class ReTests(unittest.TestCase): with captured_stdout() as out: re.compile(pat, re.DEBUG) dump = '''\ -SUBPATTERN 1 +SUBPATTERN 1 0 0 LITERAL 46 -SUBPATTERN None +SUBPATTERN None 0 0 BRANCH IN LITERAL 99 @@ -1548,7 +1580,7 @@ SUBPATTERN None OR LITERAL 112 LITERAL 121 -SUBPATTERN None +SUBPATTERN None 0 0 GROUPREF_EXISTS 1 AT AT_END ELSE @@ -1664,7 +1696,7 @@ SUBPATTERN None self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) self.checkPatternError(r'(?iz)', 'unknown flag', 3) - self.checkPatternError(r'(?i', 'missing )', 3) + self.checkPatternError(r'(?i', 'missing -, : or )', 3) self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) diff --git a/Misc/NEWS b/Misc/NEWS index c47c4bf..1075769 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -120,6 +120,8 @@ Core and Builtins Library ------- +- Issue #433028: Added support of modifier spans in regular expressions. + - Issue #24594: Validates persist parameter when opening MSI database - Issue #28047: Fixed calculation of line length used for the base64 CTE -- cgit v0.12