summary refs log tree commit diff stats
path: root/Modules
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2017-12-04 12:29:05 (GMT)
committer GitHub <noreply@github.com> 2017-12-04 12:29:05 (GMT)
commit 70d56fb52582d9d3f7c00860d6e90570c6259371 (patch)
tree 61e54b78f19535bfcf41d521b98def725de63497 /Modules
parent e69fbb6a560a02d0587b9075afd338a1e9073af0 (diff)
download cpython-70d56fb52582d9d3f7c00860d6e90570c6259371.zip
cpython-70d56fb52582d9d3f7c00860d6e90570c6259371.tar.gz
cpython-70d56fb52582d9d3f7c00860d6e90570c6259371.tar.bz2
bpo-25054, bpo-1647489: Added support for splitting on zero-width patterns. (#4471)
Also fixed searching patterns that could match an empty string.
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_sre.c 77
-rw-r--r-- Modules/sre.h 4
-rw-r--r-- Modules/sre_lib.h 47
3 files changed, 55 insertions, 73 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index a9b6b50..68fc523 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -446,6 +446,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->isbytes = isbytes;
state->charsize = charsize;
+ state->match_all = 0;
+ state->must_advance = 0;
state->beginning = ptr;
@@ -559,14 +561,14 @@ pattern_dealloc(PatternObject* self)
}
LOCAL(Py_ssize_t)
-sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+sre_match(SRE_STATE* state, SRE_CODE* pattern)
{
if (state->charsize == 1)
- return sre_ucs1_match(state, pattern, match_all);
+ return sre_ucs1_match(state, pattern, 1);
if (state->charsize == 2)
- return sre_ucs2_match(state, pattern, match_all);
+ return sre_ucs2_match(state, pattern, 1);
assert(state->charsize == 4);
- return sre_ucs4_match(state, pattern, match_all);
+ return sre_ucs4_match(state, pattern, 1);
}
LOCAL(Py_ssize_t)
@@ -606,7 +608,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string,
TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
- status = sre_match(&state, PatternObject_GetCode(self), 0);
+ status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@@ -645,7 +647,8 @@ _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string,
TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
- status = sre_match(&state, PatternObject_GetCode(self), 1);
+ state.match_all = 1;
+ status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@@ -808,11 +811,8 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
if (status < 0)
goto error;
- if (state.ptr == state.start)
- state.start = (void*) ((char*) state.ptr + state.charsize);
- else
- state.start = state.ptr;
-
+ state.must_advance = (state.ptr == state.start);
+ state.start = state.ptr;
}
state_fini(&state);
@@ -901,17 +901,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
void* last;
assert(self->codesize != 0);
- if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
- if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
- PyErr_SetString(PyExc_ValueError,
- "split() requires a non-empty pattern match.");
- return NULL;
- }
- if (PyErr_WarnEx(PyExc_FutureWarning,
- "split() requires a non-empty pattern match.",
- 1) < 0)
- return NULL;
- }
if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
return NULL;
@@ -942,14 +931,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
goto error;
}
- if (state.start == state.ptr) {
- if (last == state.end || state.ptr == state.end)
- break;
- /* skip one character */
- state.start = (void*) ((char*) state.ptr + state.charsize);
- continue;
- }
-
/* get segment before this match */
item = getslice(state.isbytes, state.beginning,
string, STATE_OFFSET(&state, last),
@@ -974,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
}
n = n + 1;
-
+ state.must_advance = 1;
last = state.start = state.ptr;
}
@@ -1101,9 +1082,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
if (status < 0)
goto error;
- } else if (i == b && i == e && n > 0)
- /* ignore empty match on latest position */
- goto next;
+ }
if (filter_is_callable) {
/* pass match object through filter */
@@ -1130,16 +1109,8 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
i = e;
n = n + 1;
-
-next:
- /* move on */
- if (state.ptr == state.end)
- break;
- if (state.ptr == state.start)
- state.start = (void*) ((char*) state.ptr + state.charsize);
- else
- state.start = state.ptr;
-
+ state.must_advance = 1;
+ state.start = state.ptr;
}
/* get segment following last match */
@@ -2450,7 +2421,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)
state->ptr = state->start;
- status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
+ status = sre_match(state, PatternObject_GetCode(self->pattern));
if (PyErr_Occurred())
return NULL;
@@ -2459,12 +2430,10 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)
if (status == 0)
state->start = NULL;
- else if (state->ptr != state->start)
+ else {
+ state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
- else if (state->ptr != state->end)
- state->start = (void*) ((char*) state->ptr + state->charsize);
- else
- state->start = NULL;
+ }
return match;
}
@@ -2499,12 +2468,10 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self)
if (status == 0)
state->start = NULL;
- else if (state->ptr != state->start)
+ else {
+ state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
- else if (state->ptr != state->end)
- state->start = (void*) ((char*) state->ptr + state->charsize);
- else
- state->start = NULL;
+ }
return match;
}
diff --git a/Modules/sre.h b/Modules/sre.h
index 585d284..a728488 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -67,6 +67,7 @@ typedef struct {
void* end; /* end of original string */
/* attributes for the match object */
PyObject* string;
+ Py_buffer buffer;
Py_ssize_t pos, endpos;
int isbytes;
int charsize; /* character size */
@@ -74,11 +75,12 @@ typedef struct {
Py_ssize_t lastindex;
Py_ssize_t lastmark;
void** mark;
+ int match_all;
+ int must_advance;
/* dynamically allocated stuff */
char* data_stack;
size_t data_stack_size;
size_t data_stack_base;
- Py_buffer buffer;
/* current repeat context */
SRE_REPEAT *repeat;
} SRE_STATE;
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index e13b90e..44948e2 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -199,7 +199,7 @@ SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
return up != lo && SRE(charset)(state, set, up);
}
-LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
+LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel);
LOCAL(Py_ssize_t)
SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
@@ -510,12 +510,12 @@ do { \
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
-#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \
+#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
DATA_ALLOC(SRE(match_context), nextctx); \
nextctx->last_ctx_pos = ctx_pos; \
nextctx->jump = jumpvalue; \
nextctx->pattern = nextpattern; \
- nextctx->match_all = matchall; \
+ nextctx->toplevel = toplevel_; \
ctx_pos = alloc_pos; \
ctx = nextctx; \
goto entrance; \
@@ -523,7 +523,7 @@ do { \
while (0) /* gcc doesn't like labels at end of scopes */ \
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
- DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all)
+ DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)
#define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
@@ -540,13 +540,13 @@ typedef struct {
SRE_CODE chr;
SRE_REPEAT* rep;
} u;
- int match_all;
+ int toplevel;
} SRE(match_context);
/* check if string matches the given pattern. returns <0 for
error, 0 for failure, and 1 for success */
LOCAL(Py_ssize_t)
-SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
{
SRE_CHAR* end = (SRE_CHAR *)state->end;
Py_ssize_t alloc_pos, ctx_pos = -1;
@@ -563,7 +563,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
ctx->last_ctx_pos = -1;
ctx->jump = JUMP_NONE;
ctx->pattern = pattern;
- ctx->match_all = match_all;
+ ctx->toplevel = toplevel;
ctx_pos = alloc_pos;
entrance:
@@ -636,11 +636,14 @@ entrance:
case SRE_OP_SUCCESS:
/* end of pattern */
TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
- if (!ctx->match_all || ctx->ptr == state->end) {
- state->ptr = ctx->ptr;
- RETURN_SUCCESS;
+ if (ctx->toplevel &&
+ ((state->match_all && ctx->ptr != state->end) ||
+ (state->must_advance && ctx->ptr == state->start)))
+ {
+ RETURN_FAILURE;
}
- RETURN_FAILURE;
+ state->ptr = ctx->ptr;
+ RETURN_SUCCESS;
case SRE_OP_AT:
/* match at given position */
@@ -856,7 +859,9 @@ entrance:
RETURN_FAILURE;
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
- ctx->ptr == state->end) {
+ ctx->ptr == state->end &&
+ !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
+ {
/* tail is empty. we're finished */
state->ptr = ctx->ptr;
RETURN_SUCCESS;
@@ -941,7 +946,10 @@ entrance:
}
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
- (!match_all || ctx->ptr == state->end)) {
+ !(ctx->toplevel &&
+ ((state->match_all && ctx->ptr != state->end) ||
+ (state->must_advance && ctx->ptr == state->start))))
+ {
/* tail is empty. we're finished */
state->ptr = ctx->ptr;
RETURN_SUCCESS;
@@ -1417,6 +1425,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
return 0; /* literal can't match: doesn't fit in char width */
#endif
end = (SRE_CHAR *)state->end;
+ state->must_advance = 0;
while (ptr < end) {
while (*ptr != c) {
if (++ptr >= end)
@@ -1458,6 +1467,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
return 0;
i = 1;
+ state->must_advance = 0;
do {
if (*ptr == (SRE_CHAR) prefix[i]) {
if (++i != prefix_len) {
@@ -1487,6 +1497,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
if (charset) {
/* pattern starts with a character from a known set */
end = (SRE_CHAR *)state->end;
+ state->must_advance = 0;
for (;;) {
while (ptr < end && !SRE(charset)(state, charset, *ptr))
ptr++;
@@ -1503,13 +1514,15 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
} else {
/* general case */
assert(ptr <= end);
- while (1) {
+ TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
+ state->start = state->ptr = ptr;
+ status = SRE(match)(state, pattern, 1);
+ state->must_advance = 0;
+ while (status == 0 && ptr < end) {
+ ptr++;
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
state->start = state->ptr = ptr;
status = SRE(match)(state, pattern, 0);
- if (status != 0 || ptr >= end)
- break;
- ptr++;
}
}