summaryrefslogtreecommitdiffstats
path: root/Modules
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2022-03-21 16:28:22 (GMT)
committerGitHub <noreply@github.com>2022-03-21 16:28:22 (GMT)
commit345b390ed69f36681dbc41187bc8f49cd9135b54 (patch)
tree31ce6451bed718405b29bdb32c7eb4ff96fe5697 /Modules
parent2bde6827ea4f136297b2d882480b981ff26262b6 (diff)
downloadcpython-345b390ed69f36681dbc41187bc8f49cd9135b54.zip
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.gz
cpython-345b390ed69f36681dbc41187bc8f49cd9135b54.tar.bz2
bpo-433030: Add support of atomic grouping in regular expressions (GH-31982)
* Atomic grouping: (?>...). * Possessive quantifiers: x++, x*+, x?+, x{m,n}+. Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}). Co-authored-by: Jeffrey C. Jacobs <timehorse@users.sourceforge.net>
Diffstat (limited to 'Modules')
-rw-r--r--Modules/_sre.c24
-rw-r--r--Modules/sre_constants.h31
-rw-r--r--Modules/sre_lib.h198
3 files changed, 237 insertions, 16 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index ab321ea..35bdb4f 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -1807,6 +1807,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_REPEAT_ONE:
case SRE_OP_MIN_REPEAT_ONE:
+ case SRE_OP_POSSESSIVE_REPEAT_ONE:
{
SRE_CODE min, max;
GET_SKIP;
@@ -1826,8 +1827,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
break;
case SRE_OP_REPEAT:
+ case SRE_OP_POSSESSIVE_REPEAT:
{
- SRE_CODE min, max;
+ SRE_CODE op1 = op, min, max;
GET_SKIP;
GET_ARG; min = arg;
GET_ARG; max = arg;
@@ -1839,7 +1841,25 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
FAIL;
code += skip-3;
GET_OP;
- if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
+ if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
+ if (op != SRE_OP_SUCCESS)
+ FAIL;
+ }
+ else {
+ if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
+ FAIL;
+ }
+ }
+ break;
+
+ case SRE_OP_ATOMIC_GROUP:
+ {
+ GET_SKIP;
+ if (!_validate_inner(code, code+skip-2, groups))
+ FAIL;
+ code += skip-2;
+ GET_OP;
+ if (op != SRE_OP_SUCCESS)
FAIL;
}
break;
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index c8ccb32..8b9125b 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20171005
+#define SRE_MAGIC 20220318
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -40,19 +40,22 @@
#define SRE_OP_REPEAT_ONE 25
#define SRE_OP_SUBPATTERN 26
#define SRE_OP_MIN_REPEAT_ONE 27
-#define SRE_OP_GROUPREF_IGNORE 28
-#define SRE_OP_IN_IGNORE 29
-#define SRE_OP_LITERAL_IGNORE 30
-#define SRE_OP_NOT_LITERAL_IGNORE 31
-#define SRE_OP_GROUPREF_LOC_IGNORE 32
-#define SRE_OP_IN_LOC_IGNORE 33
-#define SRE_OP_LITERAL_LOC_IGNORE 34
-#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35
-#define SRE_OP_GROUPREF_UNI_IGNORE 36
-#define SRE_OP_IN_UNI_IGNORE 37
-#define SRE_OP_LITERAL_UNI_IGNORE 38
-#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39
-#define SRE_OP_RANGE_UNI_IGNORE 40
+#define SRE_OP_ATOMIC_GROUP 28
+#define SRE_OP_POSSESSIVE_REPEAT 29
+#define SRE_OP_POSSESSIVE_REPEAT_ONE 30
+#define SRE_OP_GROUPREF_IGNORE 31
+#define SRE_OP_IN_IGNORE 32
+#define SRE_OP_LITERAL_IGNORE 33
+#define SRE_OP_NOT_LITERAL_IGNORE 34
+#define SRE_OP_GROUPREF_LOC_IGNORE 35
+#define SRE_OP_IN_LOC_IGNORE 36
+#define SRE_OP_LITERAL_LOC_IGNORE 37
+#define SRE_OP_NOT_LITERAL_LOC_IGNORE 38
+#define SRE_OP_GROUPREF_UNI_IGNORE 39
+#define SRE_OP_IN_UNI_IGNORE 40
+#define SRE_OP_LITERAL_UNI_IGNORE 41
+#define SRE_OP_NOT_LITERAL_UNI_IGNORE 42
+#define SRE_OP_RANGE_UNI_IGNORE 43
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 32469cd..956fd3f 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -480,6 +480,9 @@ do { \
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
+#define JUMP_POSS_REPEAT_1 14
+#define JUMP_POSS_REPEAT_2 15
+#define JUMP_ATOMIC_GROUP 16
#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
DATA_ALLOC(SRE(match_context), nextctx); \
@@ -950,6 +953,60 @@ entrance:
}
RETURN_FAILURE;
+ case SRE_OP_POSSESSIVE_REPEAT_ONE:
+ /* match repeated sequence (maximizing regexp) without
+ backtracking */
+
+ /* this operator only works if the repeated item is
+ exactly one character wide, and we're not already
+ collecting backtracking points. for other cases,
+ use the MAX_REPEAT operator */
+
+ /* <POSSESSIVE_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS>
+ tail */
+
+ TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+ if (ctx->ptr + ctx->pattern[1] > end) {
+ RETURN_FAILURE; /* cannot match */
+ }
+
+ state->ptr = ctx->ptr;
+
+ ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]);
+ RETURN_ON_ERROR(ret);
+ DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
+ ctx->count = ret;
+ ctx->ptr += ctx->count;
+
+ /* when we arrive here, count contains the number of
+ matches, and ctx->ptr points to the tail of the target
+ string. check if the rest of the pattern matches,
+ and fail if not. */
+
+ /* Test for not enough repetitions in match */
+ if (ctx->count < (Py_ssize_t) ctx->pattern[1]) {
+ RETURN_FAILURE;
+ }
+
+ /* Update the pattern to point to the next op code */
+ ctx->pattern += ctx->pattern[0];
+
+ /* Let the tail be evaluated separately and consider this
+ match successful. */
+ if (*ctx->pattern == SRE_OP_SUCCESS &&
+ ctx->ptr == state->end &&
+ !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
+ {
+ /* tail is empty. we're finished */
+ state->ptr = ctx->ptr;
+ RETURN_SUCCESS;
+ }
+
+ /* Attempt to match the rest of the string */
+ break;
+
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
@@ -1110,6 +1167,138 @@ entrance:
state->ptr = ctx->ptr;
RETURN_FAILURE;
+ case SRE_OP_POSSESSIVE_REPEAT:
+ /* create possessive repeat contexts. */
+ /* <POSSESSIVE_REPEAT> <skip> <1=min> <2=max> pattern
+ <SUCCESS> tail */
+ TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+ /* Set the global Input pointer to this context's Input
+ pointer */
+ state->ptr = ctx->ptr;
+
+ /* Initialize Count to 0 */
+ ctx->count = 0;
+
+ /* Check for minimum required matches. */
+ while (ctx->count < (Py_ssize_t)ctx->pattern[1]) {
+ /* not enough matches */
+ DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1,
+ &ctx->pattern[3]);
+ if (ret) {
+ RETURN_ON_ERROR(ret);
+ ctx->count++;
+ }
+ else {
+ state->ptr = ctx->ptr;
+ RETURN_FAILURE;
+ }
+ }
+
+ /* Clear the context's Input stream pointer so that it
+ doesn't match the global state so that the while loop can
+ be entered. */
+ ctx->ptr = NULL;
+
+ /* Keep trying to parse the <pattern> sub-pattern until the
+ end is reached, creating a new context each time. */
+ while ((ctx->count < (Py_ssize_t)ctx->pattern[2] ||
+ (Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT) &&
+ state->ptr != ctx->ptr) {
+ /* Save the Capture Group Marker state into the current
+ Context and back up the current highest number
+ Capture Group marker. */
+ LASTMARK_SAVE();
+ MARK_PUSH(ctx->lastmark);
+
+ /* zero-width match protection */
+ /* Set the context's Input Stream pointer to be the
+ current Input Stream pointer from the global
+ state. When the loop reaches the next iteration,
+ the context will then store the last known good
+ position with the global state holding the Input
+ Input Stream position that has been updated with
+ the most recent match. Thus, if state's Input
+ stream remains the same as the one stored in the
+ current Context, we know we have successfully
+ matched an empty string and that all subsequent
+ matches will also be the empty string until the
+ maximum number of matches are counted, and because
+ of this, we could immediately stop at that point and
+ consider this match successful. */
+ ctx->ptr = state->ptr;
+
+ /* We have not reached the maximin matches, so try to
+ match once more. */
+ DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2,
+ &ctx->pattern[3]);
+
+ /* Check to see if the last attempted match
+ succeeded. */
+ if (ret) {
+ /* Drop the saved highest number Capture Group
+ marker saved above and use the newly updated
+ value. */
+ MARK_POP_DISCARD(ctx->lastmark);
+ RETURN_ON_ERROR(ret);
+
+ /* Success, increment the count. */
+ ctx->count++;
+ }
+ /* Last attempted match failed. */
+ else {
+ /* Restore the previously saved highest number
+ Capture Group marker since the last iteration
+ did not match, then restore that to the global
+ state. */
+ MARK_POP(ctx->lastmark);
+ LASTMARK_RESTORE();
+
+ /* We have sufficient matches, so exit loop. */
+ break;
+ }
+ }
+
+ /* Evaluate Tail */
+ /* Jump to end of pattern indicated by skip, and then skip
+ the SUCCESS op code that follows it. */
+ ctx->pattern += ctx->pattern[0] + 1;
+ ctx->ptr = state->ptr;
+ break;
+
+ case SRE_OP_ATOMIC_GROUP:
+ /* Atomic Group Sub Pattern */
+ /* <ATOMIC_GROUP> <skip> pattern <SUCCESS> tail */
+ TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+
+ /* Set the global Input pointer to this context's Input
+ pointer */
+ state->ptr = ctx->ptr;
+
+ /* Evaluate the Atomic Group in a new context, terminating
+ when the end of the group, represented by a SUCCESS op
+ code, is reached. */
+ /* Group Pattern begins at an offset of 1 code. */
+ DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group,
+ &ctx->pattern[1]);
+
+ /* Test Exit Condition */
+ RETURN_ON_ERROR(ret);
+
+ if (ret == 0) {
+ /* Atomic Group failed to Match. */
+ state->ptr = ctx->ptr;
+ RETURN_FAILURE;
+ }
+
+ /* Evaluate Tail */
+ /* Jump to end of pattern indicated by skip, and then skip
+ the SUCCESS op code that follows it. */
+ ctx->pattern += ctx->pattern[0];
+ ctx->ptr = state->ptr;
+ break;
+
case SRE_OP_GROUPREF:
/* match backreference */
TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
@@ -1306,6 +1495,12 @@ exit:
case JUMP_MIN_UNTIL_1:
TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
goto jump_min_until_1;
+ case JUMP_POSS_REPEAT_1:
+ TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr));
+ goto jump_poss_repeat_1;
+ case JUMP_POSS_REPEAT_2:
+ TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr));
+ goto jump_poss_repeat_2;
case JUMP_REPEAT:
TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
goto jump_repeat;
@@ -1318,6 +1513,9 @@ exit:
case JUMP_MIN_REPEAT_ONE:
TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
goto jump_min_repeat_one;
+ case JUMP_ATOMIC_GROUP:
+ TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+ goto jump_atomic_group;
case JUMP_ASSERT:
TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
goto jump_assert;