diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-03-21 16:28:22 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-03-21 16:28:22 (GMT) |
commit | 345b390ed69f36681dbc41187bc8f49cd9135b54 (patch) | |
tree | 31ce6451bed718405b29bdb32c7eb4ff96fe5697 /Modules | |
parent | 2bde6827ea4f136297b2d882480b981ff26262b6 (diff) | |
download | cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.zip cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.gz cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.bz2 |
bpo-433030: Add support of atomic grouping in regular expressions (GH-31982)
* Atomic grouping: (?>...).
* Possessive quantifiers: x++, x*+, x?+, x{m,n}+.
Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}).
Co-authored-by: Jeffrey C. Jacobs <timehorse@users.sourceforge.net>
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_sre.c | 24 | ||||
-rw-r--r-- | Modules/sre_constants.h | 31 | ||||
-rw-r--r-- | Modules/sre_lib.h | 198 |
3 files changed, 237 insertions, 16 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c index ab321ea..35bdb4f 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1807,6 +1807,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_REPEAT_ONE: case SRE_OP_MIN_REPEAT_ONE: + case SRE_OP_POSSESSIVE_REPEAT_ONE: { SRE_CODE min, max; GET_SKIP; @@ -1826,8 +1827,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) break; case SRE_OP_REPEAT: + case SRE_OP_POSSESSIVE_REPEAT: { - SRE_CODE min, max; + SRE_CODE op1 = op, min, max; GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; @@ -1839,7 +1841,25 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) FAIL; code += skip-3; GET_OP; - if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) + if (op1 == SRE_OP_POSSESSIVE_REPEAT) { + if (op != SRE_OP_SUCCESS) + FAIL; + } + else { + if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) + FAIL; + } + } + break; + + case SRE_OP_ATOMIC_GROUP: + { + GET_SKIP; + if (!_validate_inner(code, code+skip-2, groups)) + FAIL; + code += skip-2; + GET_OP; + if (op != SRE_OP_SUCCESS) FAIL; } break; diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index c8ccb32..8b9125b 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20171005 +#define SRE_MAGIC 20220318 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -40,19 +40,22 @@ #define SRE_OP_REPEAT_ONE 25 #define SRE_OP_SUBPATTERN 26 #define SRE_OP_MIN_REPEAT_ONE 27 -#define SRE_OP_GROUPREF_IGNORE 28 -#define SRE_OP_IN_IGNORE 29 -#define SRE_OP_LITERAL_IGNORE 30 -#define SRE_OP_NOT_LITERAL_IGNORE 31 -#define SRE_OP_GROUPREF_LOC_IGNORE 32 -#define SRE_OP_IN_LOC_IGNORE 33 -#define SRE_OP_LITERAL_LOC_IGNORE 34 -#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35 -#define SRE_OP_GROUPREF_UNI_IGNORE 36 -#define SRE_OP_IN_UNI_IGNORE 37 -#define SRE_OP_LITERAL_UNI_IGNORE 38 -#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39 -#define SRE_OP_RANGE_UNI_IGNORE 40 +#define SRE_OP_ATOMIC_GROUP 28 +#define SRE_OP_POSSESSIVE_REPEAT 29 +#define SRE_OP_POSSESSIVE_REPEAT_ONE 30 +#define SRE_OP_GROUPREF_IGNORE 31 +#define SRE_OP_IN_IGNORE 32 +#define SRE_OP_LITERAL_IGNORE 33 +#define SRE_OP_NOT_LITERAL_IGNORE 34 +#define SRE_OP_GROUPREF_LOC_IGNORE 35 +#define SRE_OP_IN_LOC_IGNORE 36 +#define SRE_OP_LITERAL_LOC_IGNORE 37 +#define SRE_OP_NOT_LITERAL_LOC_IGNORE 38 +#define SRE_OP_GROUPREF_UNI_IGNORE 39 +#define SRE_OP_IN_UNI_IGNORE 40 +#define SRE_OP_LITERAL_UNI_IGNORE 41 +#define SRE_OP_NOT_LITERAL_UNI_IGNORE 42 +#define SRE_OP_RANGE_UNI_IGNORE 43 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 32469cd..956fd3f 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -480,6 +480,9 @@ do { \ #define JUMP_BRANCH 11 #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 +#define JUMP_POSS_REPEAT_1 14 +#define JUMP_POSS_REPEAT_2 15 +#define JUMP_ATOMIC_GROUP 16 #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \ DATA_ALLOC(SRE(match_context), nextctx); \ @@ -950,6 +953,60 @@ entrance: } RETURN_FAILURE; + case SRE_OP_POSSESSIVE_REPEAT_ONE: + /* match repeated sequence (maximizing regexp) without + backtracking */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. for other cases, + use the MAX_REPEAT operator */ + + /* <POSSESSIVE_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> + tail */ + + TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + if (ctx->ptr + ctx->pattern[1] > end) { + RETURN_FAILURE; /* cannot match */ + } + + state->ptr = ctx->ptr; + + ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]); + RETURN_ON_ERROR(ret); + DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); + ctx->count = ret; + ctx->ptr += ctx->count; + + /* when we arrive here, count contains the number of + matches, and ctx->ptr points to the tail of the target + string. check if the rest of the pattern matches, + and fail if not. */ + + /* Test for not enough repetitions in match */ + if (ctx->count < (Py_ssize_t) ctx->pattern[1]) { + RETURN_FAILURE; + } + + /* Update the pattern to point to the next op code */ + ctx->pattern += ctx->pattern[0]; + + /* Let the tail be evaluated separately and consider this + match successful. */ + if (*ctx->pattern == SRE_OP_SUCCESS && + ctx->ptr == state->end && + !(ctx->toplevel && state->must_advance && ctx->ptr == state->start)) + { + /* tail is empty. we're finished */ + state->ptr = ctx->ptr; + RETURN_SUCCESS; + } + + /* Attempt to match the rest of the string */ + break; + case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ @@ -1110,6 +1167,138 @@ entrance: state->ptr = ctx->ptr; RETURN_FAILURE; + case SRE_OP_POSSESSIVE_REPEAT: + /* create possessive repeat contexts. */ + /* <POSSESSIVE_REPEAT> <skip> <1=min> <2=max> pattern + <SUCCESS> tail */ + TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Initialize Count to 0 */ + ctx->count = 0; + + /* Check for minimum required matches. */ + while (ctx->count < (Py_ssize_t)ctx->pattern[1]) { + /* not enough matches */ + DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1, + &ctx->pattern[3]); + if (ret) { + RETURN_ON_ERROR(ret); + ctx->count++; + } + else { + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + } + + /* Clear the context's Input stream pointer so that it + doesn't match the global state so that the while loop can + be entered. */ + ctx->ptr = NULL; + + /* Keep trying to parse the <pattern> sub-pattern until the + end is reached, creating a new context each time. */ + while ((ctx->count < (Py_ssize_t)ctx->pattern[2] || + (Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT) && + state->ptr != ctx->ptr) { + /* Save the Capture Group Marker state into the current + Context and back up the current highest number + Capture Group marker. */ + LASTMARK_SAVE(); + MARK_PUSH(ctx->lastmark); + + /* zero-width match protection */ + /* Set the context's Input Stream pointer to be the + current Input Stream pointer from the global + state. When the loop reaches the next iteration, + the context will then store the last known good + position with the global state holding the Input + Input Stream position that has been updated with + the most recent match. Thus, if state's Input + stream remains the same as the one stored in the + current Context, we know we have successfully + matched an empty string and that all subsequent + matches will also be the empty string until the + maximum number of matches are counted, and because + of this, we could immediately stop at that point and + consider this match successful. */ + ctx->ptr = state->ptr; + + /* We have not reached the maximin matches, so try to + match once more. */ + DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2, + &ctx->pattern[3]); + + /* Check to see if the last attempted match + succeeded. */ + if (ret) { + /* Drop the saved highest number Capture Group + marker saved above and use the newly updated + value. */ + MARK_POP_DISCARD(ctx->lastmark); + RETURN_ON_ERROR(ret); + + /* Success, increment the count. */ + ctx->count++; + } + /* Last attempted match failed. */ + else { + /* Restore the previously saved highest number + Capture Group marker since the last iteration + did not match, then restore that to the global + state. */ + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); + + /* We have sufficient matches, so exit loop. */ + break; + } + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it. */ + ctx->pattern += ctx->pattern[0] + 1; + ctx->ptr = state->ptr; + break; + + case SRE_OP_ATOMIC_GROUP: + /* Atomic Group Sub Pattern */ + /* <ATOMIC_GROUP> <skip> pattern <SUCCESS> tail */ + TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Evaluate the Atomic Group in a new context, terminating + when the end of the group, represented by a SUCCESS op + code, is reached. */ + /* Group Pattern begins at an offset of 1 code. */ + DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group, + &ctx->pattern[1]); + + /* Test Exit Condition */ + RETURN_ON_ERROR(ret); + + if (ret == 0) { + /* Atomic Group failed to Match. */ + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it. */ + ctx->pattern += ctx->pattern[0]; + ctx->ptr = state->ptr; + break; + case SRE_OP_GROUPREF: /* match backreference */ TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, @@ -1306,6 +1495,12 @@ exit: case JUMP_MIN_UNTIL_1: TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); goto jump_min_until_1; + case JUMP_POSS_REPEAT_1: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_1; + case JUMP_POSS_REPEAT_2: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_2; case JUMP_REPEAT: TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); goto jump_repeat; @@ -1318,6 +1513,9 @@ exit: case JUMP_MIN_REPEAT_ONE: TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); goto jump_min_repeat_one; + case JUMP_ATOMIC_GROUP: + TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + goto jump_atomic_group; case JUMP_ASSERT: TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); goto jump_assert; |