diff options
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sre_compile.py | 38 | ||||
-rw-r--r-- | Lib/sre_constants.py | 5 | ||||
-rw-r--r-- | Lib/sre_parse.py | 32 | ||||
-rw-r--r-- | Lib/test/test_re.py | 294 |
4 files changed, 293 insertions, 76 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c6398bf..0867200 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -17,11 +17,16 @@ from sre_constants import * assert _sre.MAGIC == MAGIC, "SRE module mismatch" _LITERAL_CODES = {LITERAL, NOT_LITERAL} -_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} _UNIT_CODES = _LITERAL_CODES | {ANY, IN} +_REPEATING_CODES = { + MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), + MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), + POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), +} + # Sets of lowercase characters which have the same uppercase. _equivalences = ( # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I @@ -138,10 +143,7 @@ def _compile(code, pattern, flags): if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator %r" % (op,)) if _simple(av[2]): - if op is MAX_REPEAT: - emit(REPEAT_ONE) - else: - emit(MIN_REPEAT_ONE) + emit(REPEATING_CODES[op][2]) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) @@ -149,16 +151,13 @@ def _compile(code, pattern, flags): emit(SUCCESS) code[skip] = _len(code) - skip else: - emit(REPEAT) + emit(REPEATING_CODES[op][0]) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip - if op is MAX_REPEAT: - emit(MAX_UNTIL) - else: - emit(MIN_UNTIL) + emit(REPEATING_CODES[op][1]) elif op is SUBPATTERN: group, add_flags, del_flags, p = av if group: @@ -169,6 +168,17 @@ def _compile(code, pattern, flags): if group: emit(MARK) emit((group-1)*2+1) + elif op is ATOMIC_GROUP: + # Atomic Groups are handled by starting with an Atomic + # Group op code, then putting in the atomic group pattern + # and finally a success op code to tell any repeat + # operations within the Atomic Group to stop eating and + # pop their stack if they reach it + emit(ATOMIC_GROUP) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(SUCCESS) + code[skip] = _len(code) - skip elif op in SUCCESS_CODES: emit(op) elif op in ASSERT_CODES: @@ -709,7 +719,8 @@ def dis(code): else: print_(FAILURE) i += 1 - elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE): + elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, + POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): skip, min, max = code[i: i+3] if max == MAXREPEAT: max = 'MAXREPEAT' @@ -725,6 +736,11 @@ def dis(code): print_(op, skip, arg, to=i+skip) dis_(i+2, i+skip) i += skip + elif op is ATOMIC_GROUP: + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip elif op is INFO: skip, flags, min, max = code[i: i+4] if max == MAXREPEAT: diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 8e613cb..a00b017 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20171005 +MAGIC = 20220318 from _sre import MAXREPEAT, MAXGROUPS @@ -97,6 +97,9 @@ OPCODES = _makecodes(""" REPEAT_ONE SUBPATTERN MIN_REPEAT_ONE + ATOMIC_GROUP + POSSESSIVE_REPEAT + POSSESSIVE_REPEAT_ONE GROUPREF_IGNORE IN_IGNORE diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index bb95107..b91082e 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -25,7 +25,7 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") WHITESPACE = frozenset(" \t\n\r\v\f") -_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) ESCAPES = { @@ -190,6 +190,10 @@ class SubPattern: i, j = av.getwidth() lo = lo + i hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j elif op is SUBPATTERN: i, j = av[-1].getwidth() lo = lo + i @@ -675,8 +679,13 @@ def _parse(source, state, verbose, nested, first=False): if group is None and not add_flags and not del_flags: item = p if sourcematch("?"): + # Non-Greedy Match subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) else: + # Greedy Match subpattern[-1] = (MAX_REPEAT, (min, max, item)) elif this == ".": @@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False): elif this == "(": start = source.tell() - 1 - group = True + capture = True + atomic = False name = None add_flags = 0 del_flags = 0 @@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False): len(char) + 2) elif char == ":": # non-capturing group - group = None + capture = False elif char == "#": # comment while True: @@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False): subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) continue + elif char == ">": + # non-capturing, atomic group + capture = False + atomic = True elif char in FLAGS or char == "-": # flags flags = _parse_flags(source, state, char) @@ -813,17 +827,19 @@ def _parse(source, state, verbose, nested, first=False): continue add_flags, del_flags = flags - group = None + capture = False else: raise source.error("unknown extension ?" + char, len(char) + 1) # parse group contents - if group is not None: + if capture: try: group = state.opengroup(name) except error as err: raise source.error(err.msg, len(name) + 1) from None + else: + group = None sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and not (del_flags & SRE_FLAG_VERBOSE)) p = _parse_sub(source, state, sub_verbose, nested + 1) @@ -832,7 +848,11 @@ def _parse(source, state, verbose, nested, first=False): source.tell() - start) if group is not None: state.closegroup(group, p) - subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) + if atomic: + assert group is None + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) elif this == "^": subpatternappend((AT, AT_BEGINNING)) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index f8bbe51..bde7509 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -83,6 +83,23 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) self.assertIsNone(re.match('a+', 'xxx')) + def test_branching(self): + """Test Branching + Test expressions using the OR ('|') operator.""" + self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2)) + self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(), + (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(), + (0, 2)) + self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1)) + def bump_num(self, matchobj): int_value = int(matchobj.group(0)) return str(int_value + 1) @@ -1239,11 +1256,13 @@ class ReTests(unittest.TestCase): 'nothing to repeat', 3) def test_multiple_repeat(self): - for outer_reps in '*', '+', '{1,2}': - for outer_mod in '', '?': + for outer_reps in '*', '+', '?', '{1,2}': + for outer_mod in '', '?', '+': outer_op = outer_reps + outer_mod for inner_reps in '*', '+', '?', '{1,2}': - for inner_mod in '', '?': + for inner_mod in '', '?', '+': + if inner_mod + outer_reps in ('?', '+'): + continue inner_op = inner_reps + inner_mod self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 'multiple repeat', 1 + len(inner_op)) @@ -1458,7 +1477,8 @@ class ReTests(unittest.TestCase): def test_dollar_matches_twice(self): - "$ matches the end of string, and just before the terminating \n" + r"""Test that $ does not include \n + $ matches the end of string, and just before the terminating \n""" pattern = re.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') @@ -1774,60 +1794,6 @@ class ReTests(unittest.TestCase): self.assertEqual(m.group(1), "") self.assertEqual(m.group(2), "y") - @cpython_only - def test_debug_flag(self): - pat = r'(\.)(?:[ch]|py)(?(1)$|: )' - with captured_stdout() as out: - re.compile(pat, re.DEBUG) - self.maxDiff = None - dump = '''\ -SUBPATTERN 1 0 0 - LITERAL 46 -BRANCH - IN - LITERAL 99 - LITERAL 104 -OR - LITERAL 112 - LITERAL 121 -GROUPREF_EXISTS 1 - AT AT_END -ELSE - LITERAL 58 - LITERAL 32 - - 0. INFO 8 0b1 2 5 (to 9) - prefix_skip 0 - prefix [0x2e] ('.') - overlap [0] - 9: MARK 0 -11. LITERAL 0x2e ('.') -13. MARK 1 -15. BRANCH 10 (to 26) -17. IN 6 (to 24) -19. LITERAL 0x63 ('c') -21. LITERAL 0x68 ('h') -23. FAILURE -24: JUMP 9 (to 34) -26: branch 7 (to 33) -27. LITERAL 0x70 ('p') -29. LITERAL 0x79 ('y') -31. JUMP 2 (to 34) -33: FAILURE -34: GROUPREF_EXISTS 0 6 (to 41) -37. AT END -39. JUMP 5 (to 45) -41: LITERAL 0x3a (':') -43. LITERAL 0x20 (' ') -45: SUCCESS -''' - self.assertEqual(out.getvalue(), dump) - # Debug output is output again even a second time (bypassing - # the cache -- issue #20426). - with captured_stdout() as out: - re.compile(pat, re.DEBUG) - self.assertEqual(out.getvalue(), dump) - def test_keyword_parameters(self): # Issue #20283: Accepting the string keyword parameter. pat = re.compile(r'(ab)') @@ -2072,6 +2038,218 @@ ELSE with self.assertRaisesRegex(TypeError, "got 'type'"): re.search("x*", type) + def test_possessive_qualifiers(self): + """Test Possessive Qualifiers + Test qualifiers of the form @+ for some repetition operator @, + e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed + without creating a stack frame for rolling the stack back and + trying 1 or more fewer matches.""" + self.assertIsNone(re.match('e*+e', 'eeee')) + self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea') + self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea') + self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea') + self.assertIsNone(re.match('(.)++.', 'ee')) + self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',)) + self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(), + ('ae',)) + self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(), + ('',)) + self.assertEqual(re.match('()*+a', 'a').groups(), ('',)) + self.assertEqual(re.search('x*+', 'axx').span(), (0, 0)) + self.assertEqual(re.search('x++', 'axx').span(), (1, 3)) + self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0)) + self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3)) + self.assertIsNone(re.match('a++', 'xxx')) + self.assertIsNone(re.match(r"^(\w){1}+$", "abc")) + self.assertIsNone(re.match(r"^(\w){1,2}+$", "abc")) + + self.assertEqual(re.match(r"^(\w){3}+$", "abc").group(1), "c") + self.assertEqual(re.match(r"^(\w){1,3}+$", "abc").group(1), "c") + self.assertEqual(re.match(r"^(\w){1,4}+$", "abc").group(1), "c") + + self.assertIsNone(re.match("^x{1}+$", "xxx")) + self.assertIsNone(re.match("^x{1,2}+$", "xxx")) + + self.assertTrue(re.match("^x{3}+$", "xxx")) + self.assertTrue(re.match("^x{1,3}+$", "xxx")) + self.assertTrue(re.match("^x{1,4}+$", "xxx")) + + self.assertIsNone(re.match("^x{}+$", "xxx")) + self.assertTrue(re.match("^x{}+$", "x{}")) + + def test_fullmatch_possessive_qualifiers(self): + self.assertTrue(re.fullmatch(r'a++', 'a')) + self.assertTrue(re.fullmatch(r'a*+', 'a')) + self.assertTrue(re.fullmatch(r'a?+', 'a')) + self.assertTrue(re.fullmatch(r'a{1,3}+', 'a')) + self.assertIsNone(re.fullmatch(r'a++', 'ab')) + self.assertIsNone(re.fullmatch(r'a*+', 'ab')) + self.assertIsNone(re.fullmatch(r'a?+', 'ab')) + self.assertIsNone(re.fullmatch(r'a{1,3}+', 'ab')) + + self.assertTrue(re.fullmatch(r'(?:ab)++', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab)*+', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab)?+', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab){1,3}+', 'ab')) + self.assertIsNone(re.fullmatch(r'(?:ab)++', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab)*+', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab)?+', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab){1,3}+', 'abc')) + + def test_findall_possessive_qualifiers(self): + self.assertEqual(re.findall(r'a++', 'aab'), ['aa']) + self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', '']) + self.assertEqual(re.findall(r'a?+', 'aab'), ['a', 'a', '', '']) + self.assertEqual(re.findall(r'a{1,3}+', 'aab'), ['aa']) + + self.assertEqual(re.findall(r'(?:ab)++', 'ababc'), ['abab']) + self.assertEqual(re.findall(r'(?:ab)*+', 'ababc'), ['abab', '', '']) + self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', '']) + self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab']) + + def test_atomic_grouping(self): + """Test Atomic Grouping + Test non-capturing groups of the form (?>...), which does + not maintain any stack point created within the group once the + group is finished being evaluated.""" + pattern1 = re.compile(r'a(?>bc|b)c') + self.assertIsNone(pattern1.match('abc')) + self.assertTrue(pattern1.match('abcc')) + self.assertIsNone(re.match(r'(?>.*).', 'abc')) + self.assertTrue(re.match(r'(?>x)++', 'xxx')) + self.assertTrue(re.match(r'(?>x++)', 'xxx')) + self.assertIsNone(re.match(r'(?>x)++x', 'xxx')) + self.assertIsNone(re.match(r'(?>x++)x', 'xxx')) + + def test_fullmatch_atomic_grouping(self): + self.assertTrue(re.fullmatch(r'(?>a+)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a*)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a?)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a')) + self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab')) + + self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc')) + + def test_findall_atomic_grouping(self): + self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa']) + self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', '']) + self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 'a', '', '']) + self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa']) + + self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab']) + self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab']) + + +def get_debug_out(pat): + with captured_stdout() as out: + re.compile(pat, re.DEBUG) + return out.getvalue() + + +@cpython_only +class DebugTests(unittest.TestCase): + maxDiff = None + + def test_debug_flag(self): + pat = r'(\.)(?:[ch]|py)(?(1)$|: )' + dump = '''\ +SUBPATTERN 1 0 0 + LITERAL 46 +BRANCH + IN + LITERAL 99 + LITERAL 104 +OR + LITERAL 112 + LITERAL 121 +GROUPREF_EXISTS 1 + AT AT_END +ELSE + LITERAL 58 + LITERAL 32 + + 0. INFO 8 0b1 2 5 (to 9) + prefix_skip 0 + prefix [0x2e] ('.') + overlap [0] + 9: MARK 0 +11. LITERAL 0x2e ('.') +13. MARK 1 +15. BRANCH 10 (to 26) +17. IN 6 (to 24) +19. LITERAL 0x63 ('c') +21. LITERAL 0x68 ('h') +23. FAILURE +24: JUMP 9 (to 34) +26: branch 7 (to 33) +27. LITERAL 0x70 ('p') +29. LITERAL 0x79 ('y') +31. JUMP 2 (to 34) +33: FAILURE +34: GROUPREF_EXISTS 0 6 (to 41) +37. AT END +39. JUMP 5 (to 45) +41: LITERAL 0x3a (':') +43. LITERAL 0x20 (' ') +45: SUCCESS +''' + self.assertEqual(get_debug_out(pat), dump) + # Debug output is output again even a second time (bypassing + # the cache -- issue #20426). + self.assertEqual(get_debug_out(pat), dump) + + def test_atomic_group(self): + self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ +ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))] + + 0. INFO 4 0b0 1 2 (to 5) + 5: ATOMIC_GROUP 11 (to 17) + 7. LITERAL 0x61 ('a') + 9. REPEAT_ONE 6 0 1 (to 16) +13. LITERAL 0x62 ('b') +15. SUCCESS +16: SUCCESS +17: SUCCESS +''') + + def test_possesive_repeat_one(self): + self.assertEqual(get_debug_out(r'a?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + + 0. INFO 4 0b0 0 1 (to 5) + 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12) + 9. LITERAL 0x61 ('a') +11. SUCCESS +12: SUCCESS +''') + + def test_possesive_repeat(self): + self.assertEqual(get_debug_out(r'(?:ab)?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + LITERAL 98 + + 0. INFO 4 0b0 0 2 (to 5) + 5: POSSESSIVE_REPEAT 7 0 1 (to 13) + 9. LITERAL 0x61 ('a') +11. LITERAL 0x62 ('b') +13: SUCCESS +14. SUCCESS +''') + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): |