From 356997cccc21a3391175d20e9ef03d434675b496 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Tue, 29 Mar 2022 22:31:01 +0800 Subject: bpo-35859: Fix a few long-standing bugs in re engine (GH-12427) In rare cases, capturing group could get wrong result. Regular expression engines in Perl and Java have similar bugs. The new behavior now matches the behavior of more modern RE engines: in the regex module and in PHP, Ruby and Node.js. --- Doc/whatsnew/3.11.rst | 5 ++ Lib/test/test_re.py | 69 +++++++++++++++++++ .../2019-03-14-09-08-25.bpo-35859.8lFdLe.rst | 2 + Modules/_sre.c | 17 +++++ Modules/sre_lib.h | 78 ++++++++++++++++------ 5 files changed, 152 insertions(+), 19 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 41e4659..837d8c8 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -718,6 +718,11 @@ Changes in the Python API deprecated since Python 3.6. (Contributed by Serhiy Storchaka in :issue:`47066`.) +* :mod:`re` module: Fix a few long-standing bugs where, in rare cases, + capturing group could get wrong result. So the result may be different than + before. + (Contributed by Ma Lin in :issue:`35859`.) + * The *population* parameter of :func:`random.sample` must be a sequence. Automatic conversion of sets to lists is no longer supported. If the sample size is larger than the population size, a :exc:`ValueError` is raised. diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index fd6db6a..85716fb 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2033,6 +2033,75 @@ class ReTests(unittest.TestCase): {'tag': 'foo', 'text': None}, {'tag': 'foo', 'text': None}]) + def test_MARK_PUSH_macro_bug(self): + # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it + # was the only available mark. + self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',)) + + def test_MIN_UNTIL_mark_bug(self): + # Fixed in issue35859, reported in issue9134. + # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat + s = 'axxzbcz' + p = r'(?:(?:a|bc)*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('xx',)) + + # test-case provided by issue9134 + s = 'xtcxyzxc' + p = r'((x|yz)+?(t)??c)*' + m = re.match(p, s) + self.assertEqual(m.span(), (0, 8)) + self.assertEqual(m.span(2), (6, 7)) + self.assertEqual(m.groups(), ('xyzxc', 'x', 't')) + + def test_REPEAT_ONE_mark_bug(self): + # issue35859 + # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat + s = 'aabaab' + p = r'(?:[^b]*a(?=(b)|(a))ab)*' + m = re.match(p, s) + self.assertEqual(m.span(), (0, 6)) + self.assertEqual(m.span(2), (4, 5)) + self.assertEqual(m.groups(), (None, 'a')) + + # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat + s = 'abab' + p = r'(?:[^b]*(?=(b)|(a))ab)*' + m = re.match(p, s) + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.span(2), (2, 3)) + self.assertEqual(m.groups(), (None, 'a')) + + self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',)) + + def test_MIN_REPEAT_ONE_mark_bug(self): + # issue35859 + # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat + s = 'abab' + p = r'(?:.*?(?=(a)|(b))b)*' + m = re.match(p, s) + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.span(2), (3, 4)) + self.assertEqual(m.groups(), (None, 'b')) + + s = 'axxzaz' + p = r'(?:a*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('xx',)) + + def test_ASSERT_NOT_mark_bug(self): + # Fixed in issue35859, reported in issue725149. + # JUMP_ASSERT_NOT should LASTMARK_SAVE() + self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,)) + + # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat + m = re.match(r'((?!(ab)c)(.))*', 'abab') + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.span(1), (3, 4)) + self.assertEqual(m.span(3), (3, 4)) + self.assertEqual(m.groups(), ('b', None, 'b')) + def test_bug_40736(self): with self.assertRaisesRegex(TypeError, "got 'int'"): re.search("x*", 5) diff --git a/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst new file mode 100644 index 0000000..8c88ef0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst @@ -0,0 +1,2 @@ +:mod:`re` module, fix a few bugs about capturing group. In rare cases, +capturing group gets an incorrect string. Patch by Ma Lin. diff --git a/Modules/_sre.c b/Modules/_sre.c index 35bdb4f..48193f8 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -532,6 +532,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) } else { i = STATE_OFFSET(state, state->mark[index]); j = STATE_OFFSET(state, state->mark[index+1]); + + /* check wrong span */ + if (i > j) { + PyErr_SetString(PyExc_SystemError, + "The span of capturing group is wrong," + " please report a bug for the re module."); + return NULL; + } } return getslice(state->isbytes, state->beginning, string, i, j); @@ -2477,6 +2485,15 @@ pattern_new_match(_sremodulestate* module_state, if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { match->mark[j+2] = ((char*) state->mark[j] - base) / n; match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; + + /* check wrong span */ + if (match->mark[j+2] > match->mark[j+3]) { + PyErr_SetString(PyExc_SystemError, + "The span of capturing group is wrong," + " please report a bug for the re module."); + Py_DECREF(match); + return NULL; + } } else match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index a82210f..8e4e714 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -449,20 +449,20 @@ do { \ DATA_STACK_LOOKUP_AT(state,t,p,pos) #define MARK_PUSH(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ i = lastmark; /* ctx->lastmark may change if reallocated */ \ DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ } while (0) #define MARK_POP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ } while (0) #define MARK_POP_KEEP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ } while (0) #define MARK_POP_DISCARD(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ } while (0) @@ -770,8 +770,7 @@ entrance: /* <0=skip> code ... */ TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); LASTMARK_SAVE(); - ctx->u.rep = state->repeat; - if (ctx->u.rep) + if (state->repeat) MARK_PUSH(ctx->lastmark); for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { if (ctx->pattern[1] == SRE_OP_LITERAL && @@ -786,16 +785,16 @@ entrance: state->ptr = ctx->ptr; DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); if (ret) { - if (ctx->u.rep) + if (state->repeat) MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - if (ctx->u.rep) + if (state->repeat) MARK_POP_KEEP(ctx->lastmark); LASTMARK_RESTORE(); } - if (ctx->u.rep) + if (state->repeat) MARK_POP_DISCARD(ctx->lastmark); RETURN_FAILURE; @@ -841,6 +840,8 @@ entrance: } LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { /* tail starts with a literal. skip positions where @@ -858,16 +859,20 @@ entrance: DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); LASTMARK_RESTORE(); ctx->ptr--; ctx->count--; } - + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } else { /* general case */ while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { @@ -875,13 +880,20 @@ entrance: DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); + LASTMARK_RESTORE(); + ctx->ptr--; ctx->count--; - LASTMARK_RESTORE(); } + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } RETURN_FAILURE; @@ -930,15 +942,24 @@ entrance: } else { /* general case */ LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { state->ptr = ctx->ptr; DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); + LASTMARK_RESTORE(); + state->ptr = ctx->ptr; ret = SRE(count)(state, ctx->pattern+3, 1); RETURN_ON_ERROR(ret); @@ -948,8 +969,9 @@ entrance: assert(ret == 1); ctx->ptr++; ctx->count++; - LASTMARK_RESTORE(); } + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } RETURN_FAILURE; @@ -1098,8 +1120,9 @@ entrance: tail matches */ state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); + state->repeat = ctx->u.rep; // restore repeat before return + RETURN_ON_SUCCESS(ret); - state->repeat = ctx->u.rep; state->ptr = ctx->ptr; RETURN_FAILURE; @@ -1132,21 +1155,29 @@ entrance: RETURN_FAILURE; } - LASTMARK_SAVE(); - /* see if the tail matches */ state->repeat = ctx->u.rep->prev; + + LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); + SRE_REPEAT *repeat_of_tail = state->repeat; + state->repeat = ctx->u.rep; // restore repeat before return + if (ret) { + if (repeat_of_tail) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (repeat_of_tail) + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); - state->repeat = ctx->u.rep; state->ptr = ctx->ptr; - LASTMARK_RESTORE(); - if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2] && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) || state->ptr == ctx->u.rep->last_ptr) @@ -1444,11 +1475,20 @@ entrance: ctx->ptr, ctx->pattern[1])); if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { state->ptr = ctx->ptr - ctx->pattern[1]; + LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_FAILURE; } + if (state->repeat) + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); } ctx->pattern += ctx->pattern[0]; break; -- cgit v0.12