summary refs log tree commit diff stats
path: root/Modules
diff options
context:
space:
mode:
author Ma Lin <animalize@users.noreply.github.com> 2022-03-29 14:31:01 (GMT)
committer GitHub <noreply@github.com> 2022-03-29 14:31:01 (GMT)
commit 356997cccc21a3391175d20e9ef03d434675b496 (patch)
tree 16392c0b0212d7680d04f0ccb85fa6e13d812a9a /Modules
parent 788154919c2d843a0a995994bf2aed2d074761ec (diff)
download cpython-356997cccc21a3391175d20e9ef03d434675b496.zip
cpython-356997cccc21a3391175d20e9ef03d434675b496.tar.gz
cpython-356997cccc21a3391175d20e9ef03d434675b496.tar.bz2
bpo-35859: Fix a few long-standing bugs in re engine (GH-12427)
In rare cases, a capturing group could get the wrong result. Regular expression engines in Perl and Java have similar bugs. The new behavior matches that of more modern RE engines: the regex module, and the engines in PHP, Ruby and Node.js.
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_sre.c 17
-rw-r--r-- Modules/sre_lib.h 78
2 files changed, 76 insertions, 19 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 35bdb4f..48193f8 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -532,6 +532,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
} else {
i = STATE_OFFSET(state, state->mark[index]);
j = STATE_OFFSET(state, state->mark[index+1]);
+
+ /* check wrong span */
+ if (i > j) {
+ PyErr_SetString(PyExc_SystemError,
+ "The span of capturing group is wrong,"
+ " please report a bug for the re module.");
+ return NULL;
+ }
}
return getslice(state->isbytes, state->beginning, string, i, j);
@@ -2477,6 +2485,15 @@ pattern_new_match(_sremodulestate* module_state,
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
+
+ /* check wrong span */
+ if (match->mark[j+2] > match->mark[j+3]) {
+ PyErr_SetString(PyExc_SystemError,
+ "The span of capturing group is wrong,"
+ " please report a bug for the re module.");
+ Py_DECREF(match);
+ return NULL;
+ }
} else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index a82210f..8e4e714 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -449,20 +449,20 @@ do { \
DATA_STACK_LOOKUP_AT(state,t,p,pos)
#define MARK_PUSH(lastmark) \
- do if (lastmark > 0) { \
+ do if (lastmark >= 0) { \
i = lastmark; /* ctx->lastmark may change if reallocated */ \
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
} while (0)
#define MARK_POP(lastmark) \
- do if (lastmark > 0) { \
+ do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
} while (0)
#define MARK_POP_KEEP(lastmark) \
- do if (lastmark > 0) { \
+ do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
} while (0)
#define MARK_POP_DISCARD(lastmark) \
- do if (lastmark > 0) { \
+ do if (lastmark >= 0) { \
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
} while (0)
@@ -770,8 +770,7 @@ entrance:
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
LASTMARK_SAVE();
- ctx->u.rep = state->repeat;
- if (ctx->u.rep)
+ if (state->repeat)
MARK_PUSH(ctx->lastmark);
for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
if (ctx->pattern[1] == SRE_OP_LITERAL &&
@@ -786,16 +785,16 @@ entrance:
state->ptr = ctx->ptr;
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
if (ret) {
- if (ctx->u.rep)
+ if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
- if (ctx->u.rep)
+ if (state->repeat)
MARK_POP_KEEP(ctx->lastmark);
LASTMARK_RESTORE();
}
- if (ctx->u.rep)
+ if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_FAILURE;
@@ -841,6 +840,8 @@ entrance:
}
LASTMARK_SAVE();
+ if (state->repeat)
+ MARK_PUSH(ctx->lastmark);
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. skip positions where
@@ -858,16 +859,20 @@ entrance:
DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
ctx->pattern+ctx->pattern[0]);
if (ret) {
+ if (state->repeat)
+ MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
-
+ if (state->repeat)
+ MARK_POP_KEEP(ctx->lastmark);
LASTMARK_RESTORE();
ctx->ptr--;
ctx->count--;
}
-
+ if (state->repeat)
+ MARK_POP_DISCARD(ctx->lastmark);
} else {
/* general case */
while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
@@ -875,13 +880,20 @@ entrance:
DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
ctx->pattern+ctx->pattern[0]);
if (ret) {
+ if (state->repeat)
+ MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
+ if (state->repeat)
+ MARK_POP_KEEP(ctx->lastmark);
+ LASTMARK_RESTORE();
+
ctx->ptr--;
ctx->count--;
- LASTMARK_RESTORE();
}
+ if (state->repeat)
+ MARK_POP_DISCARD(ctx->lastmark);
}
RETURN_FAILURE;
@@ -930,15 +942,24 @@ entrance:
} else {
/* general case */
LASTMARK_SAVE();
+ if (state->repeat)
+ MARK_PUSH(ctx->lastmark);
+
while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
|| ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
state->ptr = ctx->ptr;
DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
ctx->pattern+ctx->pattern[0]);
if (ret) {
+ if (state->repeat)
+ MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
+ if (state->repeat)
+ MARK_POP_KEEP(ctx->lastmark);
+ LASTMARK_RESTORE();
+
state->ptr = ctx->ptr;
ret = SRE(count)(state, ctx->pattern+3, 1);
RETURN_ON_ERROR(ret);
@@ -948,8 +969,9 @@ entrance:
assert(ret == 1);
ctx->ptr++;
ctx->count++;
- LASTMARK_RESTORE();
}
+ if (state->repeat)
+ MARK_POP_DISCARD(ctx->lastmark);
}
RETURN_FAILURE;
@@ -1098,8 +1120,9 @@ entrance:
tail matches */
state->repeat = ctx->u.rep->prev;
DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
+ state->repeat = ctx->u.rep; // restore repeat before return
+
RETURN_ON_SUCCESS(ret);
- state->repeat = ctx->u.rep;
state->ptr = ctx->ptr;
RETURN_FAILURE;
@@ -1132,21 +1155,29 @@ entrance:
RETURN_FAILURE;
}
- LASTMARK_SAVE();
-
/* see if the tail matches */
state->repeat = ctx->u.rep->prev;
+
+ LASTMARK_SAVE();
+ if (state->repeat)
+ MARK_PUSH(ctx->lastmark);
+
DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
+ SRE_REPEAT *repeat_of_tail = state->repeat;
+ state->repeat = ctx->u.rep; // restore repeat before return
+
if (ret) {
+ if (repeat_of_tail)
+ MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
+ if (repeat_of_tail)
+ MARK_POP(ctx->lastmark);
+ LASTMARK_RESTORE();
- state->repeat = ctx->u.rep;
state->ptr = ctx->ptr;
- LASTMARK_RESTORE();
-
if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
&& ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
state->ptr == ctx->u.rep->last_ptr)
@@ -1444,11 +1475,20 @@ entrance:
ctx->ptr, ctx->pattern[1]));
if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
state->ptr = ctx->ptr - ctx->pattern[1];
+ LASTMARK_SAVE();
+ if (state->repeat)
+ MARK_PUSH(ctx->lastmark);
+
DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
if (ret) {
+ if (state->repeat)
+ MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_FAILURE;
}
+ if (state->repeat)
+ MARK_POP(ctx->lastmark);
+ LASTMARK_RESTORE();
}
ctx->pattern += ctx->pattern[0];
break;