diff options
author | Ma Lin <animalize@users.noreply.github.com> | 2022-03-29 14:31:01 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-03-29 14:31:01 (GMT) |
commit | 356997cccc21a3391175d20e9ef03d434675b496 (patch) | |
tree | 16392c0b0212d7680d04f0ccb85fa6e13d812a9a /Modules | |
parent | 788154919c2d843a0a995994bf2aed2d074761ec (diff) | |
download | cpython-356997cccc21a3391175d20e9ef03d434675b496.zip cpython-356997cccc21a3391175d20e9ef03d434675b496.tar.gz cpython-356997cccc21a3391175d20e9ef03d434675b496.tar.bz2 |
bpo-35859: Fix a few long-standing bugs in re engine (GH-12427)
In rare cases, capturing group could get wrong result.
Regular expression engines in Perl and Java have similar bugs.
The new behavior now matches the behavior of more modern
RE engines: in the regex module and in PHP, Ruby and Node.js.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_sre.c | 17 | ||||
-rw-r--r-- | Modules/sre_lib.h | 78 |
2 files changed, 76 insertions, 19 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c index 35bdb4f..48193f8 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -532,6 +532,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) } else { i = STATE_OFFSET(state, state->mark[index]); j = STATE_OFFSET(state, state->mark[index+1]); + + /* check wrong span */ + if (i > j) { + PyErr_SetString(PyExc_SystemError, + "The span of capturing group is wrong," + " please report a bug for the re module."); + return NULL; + } } return getslice(state->isbytes, state->beginning, string, i, j); @@ -2477,6 +2485,15 @@ pattern_new_match(_sremodulestate* module_state, if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { match->mark[j+2] = ((char*) state->mark[j] - base) / n; match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; + + /* check wrong span */ + if (match->mark[j+2] > match->mark[j+3]) { + PyErr_SetString(PyExc_SystemError, + "The span of capturing group is wrong," + " please report a bug for the re module."); + Py_DECREF(match); + return NULL; + } } else match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index a82210f..8e4e714 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -449,20 +449,20 @@ do { \ DATA_STACK_LOOKUP_AT(state,t,p,pos) #define MARK_PUSH(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ i = lastmark; /* ctx->lastmark may change if reallocated */ \ DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ } while (0) #define MARK_POP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ } while (0) #define MARK_POP_KEEP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ } while (0) #define MARK_POP_DISCARD(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ } while (0) @@ -770,8 +770,7 @@ entrance: /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */ TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); LASTMARK_SAVE(); - ctx->u.rep = state->repeat; - if (ctx->u.rep) + if (state->repeat) MARK_PUSH(ctx->lastmark); for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { if (ctx->pattern[1] == SRE_OP_LITERAL && @@ -786,16 +785,16 @@ entrance: state->ptr = ctx->ptr; DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); if (ret) { - if (ctx->u.rep) + if (state->repeat) MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - if (ctx->u.rep) + if (state->repeat) MARK_POP_KEEP(ctx->lastmark); LASTMARK_RESTORE(); } - if (ctx->u.rep) + if (state->repeat) MARK_POP_DISCARD(ctx->lastmark); RETURN_FAILURE; @@ -841,6 +840,8 @@ entrance: } LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { /* tail starts with a literal. skip positions where @@ -858,16 +859,20 @@ entrance: DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); LASTMARK_RESTORE(); ctx->ptr--; ctx->count--; } - + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } else { /* general case */ while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { @@ -875,13 +880,20 @@ entrance: DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); + LASTMARK_RESTORE(); + ctx->ptr--; ctx->count--; - LASTMARK_RESTORE(); } + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } RETURN_FAILURE; @@ -930,15 +942,24 @@ entrance: } else { /* general case */ LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { state->ptr = ctx->ptr; DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); + LASTMARK_RESTORE(); + state->ptr = ctx->ptr; ret = SRE(count)(state, ctx->pattern+3, 1); RETURN_ON_ERROR(ret); @@ -948,8 +969,9 @@ entrance: assert(ret == 1); ctx->ptr++; ctx->count++; - LASTMARK_RESTORE(); } + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } RETURN_FAILURE; @@ -1098,8 +1120,9 @@ entrance: tail matches */ state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); + state->repeat = ctx->u.rep; // restore repeat before return + RETURN_ON_SUCCESS(ret); - state->repeat = ctx->u.rep; state->ptr = ctx->ptr; RETURN_FAILURE; @@ -1132,21 +1155,29 @@ entrance: RETURN_FAILURE; } - LASTMARK_SAVE(); - /* see if the tail matches */ state->repeat = ctx->u.rep->prev; + + LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); + SRE_REPEAT *repeat_of_tail = state->repeat; + state->repeat = ctx->u.rep; // restore repeat before return + if (ret) { + if (repeat_of_tail) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (repeat_of_tail) + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); - state->repeat = ctx->u.rep; state->ptr = ctx->ptr; - LASTMARK_RESTORE(); - if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2] && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) || state->ptr == ctx->u.rep->last_ptr) @@ -1444,11 +1475,20 @@ entrance: ctx->ptr, ctx->pattern[1])); if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { state->ptr = ctx->ptr - ctx->pattern[1]; + LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_FAILURE; } + if (state->repeat) + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); } ctx->pattern += ctx->pattern[0]; break; |