diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2017-12-04 12:29:05 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-12-04 12:29:05 (GMT) |
commit | 70d56fb52582d9d3f7c00860d6e90570c6259371 (patch) | |
tree | 61e54b78f19535bfcf41d521b98def725de63497 /Modules | |
parent | e69fbb6a560a02d0587b9075afd338a1e9073af0 (diff) | |
download | cpython-70d56fb52582d9d3f7c00860d6e90570c6259371.zip cpython-70d56fb52582d9d3f7c00860d6e90570c6259371.tar.gz cpython-70d56fb52582d9d3f7c00860d6e90570c6259371.tar.bz2 |
bpo-25054, bpo-1647489: Added support for splitting on zero-width patterns. (#4471)
Also fixed searching for patterns that could match an empty string.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_sre.c | 77 | ||||
-rw-r--r-- | Modules/sre.h | 4 | ||||
-rw-r--r-- | Modules/sre_lib.h | 47 |
3 files changed, 55 insertions, 73 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c index a9b6b50..68fc523 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -446,6 +446,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->isbytes = isbytes; state->charsize = charsize; + state->match_all = 0; + state->must_advance = 0; state->beginning = ptr; @@ -559,14 +561,14 @@ pattern_dealloc(PatternObject* self) } LOCAL(Py_ssize_t) -sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all) +sre_match(SRE_STATE* state, SRE_CODE* pattern) { if (state->charsize == 1) - return sre_ucs1_match(state, pattern, match_all); + return sre_ucs1_match(state, pattern, 1); if (state->charsize == 2) - return sre_ucs2_match(state, pattern, match_all); + return sre_ucs2_match(state, pattern, 1); assert(state->charsize == 4); - return sre_ucs4_match(state, pattern, match_all); + return sre_ucs4_match(state, pattern, 1); } LOCAL(Py_ssize_t) @@ -606,7 +608,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string, TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); - status = sre_match(&state, PatternObject_GetCode(self), 0); + status = sre_match(&state, PatternObject_GetCode(self)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) { @@ -645,7 +647,8 @@ _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string, TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr)); - status = sre_match(&state, PatternObject_GetCode(self), 1); + state.match_all = 1; + status = sre_match(&state, PatternObject_GetCode(self)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) { @@ -808,11 +811,8 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string, if (status < 0) goto error; - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - + state.must_advance = (state.ptr == state.start); + state.start = 
state.ptr; } state_fini(&state); @@ -901,17 +901,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, void* last; assert(self->codesize != 0); - if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) { - if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) { - PyErr_SetString(PyExc_ValueError, - "split() requires a non-empty pattern match."); - return NULL; - } - if (PyErr_WarnEx(PyExc_FutureWarning, - "split() requires a non-empty pattern match.", - 1) < 0) - return NULL; - } if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) return NULL; @@ -942,14 +931,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, goto error; } - if (state.start == state.ptr) { - if (last == state.end || state.ptr == state.end) - break; - /* skip one character */ - state.start = (void*) ((char*) state.ptr + state.charsize); - continue; - } - /* get segment before this match */ item = getslice(state.isbytes, state.beginning, string, STATE_OFFSET(&state, last), @@ -974,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, } n = n + 1; - + state.must_advance = 1; last = state.start = state.ptr; } @@ -1101,9 +1082,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, if (status < 0) goto error; - } else if (i == b && i == e && n > 0) - /* ignore empty match on latest position */ - goto next; + } if (filter_is_callable) { /* pass match object through filter */ @@ -1130,16 +1109,8 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, i = e; n = n + 1; - -next: - /* move on */ - if (state.ptr == state.end) - break; - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - + state.must_advance = 1; + state.start = state.ptr; } /* get segment following last match */ @@ -2450,7 +2421,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self) state->ptr = state->start; - status = sre_match(state, 
PatternObject_GetCode(self->pattern), 0); + status = sre_match(state, PatternObject_GetCode(self->pattern)); if (PyErr_Occurred()) return NULL; @@ -2459,12 +2430,10 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self) if (status == 0) state->start = NULL; - else if (state->ptr != state->start) + else { + state->must_advance = (state->ptr == state->start); state->start = state->ptr; - else if (state->ptr != state->end) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = NULL; + } return match; } @@ -2499,12 +2468,10 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self) if (status == 0) state->start = NULL; - else if (state->ptr != state->start) + else { + state->must_advance = (state->ptr == state->start); state->start = state->ptr; - else if (state->ptr != state->end) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = NULL; + } return match; } diff --git a/Modules/sre.h b/Modules/sre.h index 585d284..a728488 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -67,6 +67,7 @@ typedef struct { void* end; /* end of original string */ /* attributes for the match object */ PyObject* string; + Py_buffer buffer; Py_ssize_t pos, endpos; int isbytes; int charsize; /* character size */ @@ -74,11 +75,12 @@ typedef struct { Py_ssize_t lastindex; Py_ssize_t lastmark; void** mark; + int match_all; + int must_advance; /* dynamically allocated stuff */ char* data_stack; size_t data_stack_size; size_t data_stack_base; - Py_buffer buffer; /* current repeat context */ SRE_REPEAT *repeat; } SRE_STATE; diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index e13b90e..44948e2 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -199,7 +199,7 @@ SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) return up != lo && SRE(charset)(state, set, up); } -LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all); +LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, 
int toplevel); LOCAL(Py_ssize_t) SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) @@ -510,12 +510,12 @@ do { \ #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 -#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \ +#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \ DATA_ALLOC(SRE(match_context), nextctx); \ nextctx->last_ctx_pos = ctx_pos; \ nextctx->jump = jumpvalue; \ nextctx->pattern = nextpattern; \ - nextctx->match_all = matchall; \ + nextctx->toplevel = toplevel_; \ ctx_pos = alloc_pos; \ ctx = nextctx; \ goto entrance; \ @@ -523,7 +523,7 @@ do { \ while (0) /* gcc doesn't like labels at end of scopes */ \ #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ - DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all) + DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel) #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \ DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) @@ -540,13 +540,13 @@ typedef struct { SRE_CODE chr; SRE_REPEAT* rep; } u; - int match_all; + int toplevel; } SRE(match_context); /* check if string matches the given pattern. 
returns <0 for error, 0 for failure, and 1 for success */ LOCAL(Py_ssize_t) -SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all) +SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) { SRE_CHAR* end = (SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; @@ -563,7 +563,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all) ctx->last_ctx_pos = -1; ctx->jump = JUMP_NONE; ctx->pattern = pattern; - ctx->match_all = match_all; + ctx->toplevel = toplevel; ctx_pos = alloc_pos; entrance: @@ -636,11 +636,14 @@ entrance: case SRE_OP_SUCCESS: /* end of pattern */ TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); - if (!ctx->match_all || ctx->ptr == state->end) { - state->ptr = ctx->ptr; - RETURN_SUCCESS; + if (ctx->toplevel && + ((state->match_all && ctx->ptr != state->end) || + (state->must_advance && ctx->ptr == state->start))) + { + RETURN_FAILURE; } - RETURN_FAILURE; + state->ptr = ctx->ptr; + RETURN_SUCCESS; case SRE_OP_AT: /* match at given position */ @@ -856,7 +859,9 @@ entrance: RETURN_FAILURE; if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && - ctx->ptr == state->end) { + ctx->ptr == state->end && + !(ctx->toplevel && state->must_advance && ctx->ptr == state->start)) + { /* tail is empty. we're finished */ state->ptr = ctx->ptr; RETURN_SUCCESS; @@ -941,7 +946,10 @@ entrance: } if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && - (!match_all || ctx->ptr == state->end)) { + !(ctx->toplevel && + ((state->match_all && ctx->ptr != state->end) || + (state->must_advance && ctx->ptr == state->start)))) + { /* tail is empty. 
we're finished */ state->ptr = ctx->ptr; RETURN_SUCCESS; @@ -1417,6 +1425,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) return 0; /* literal can't match: doesn't fit in char width */ #endif end = (SRE_CHAR *)state->end; + state->must_advance = 0; while (ptr < end) { while (*ptr != c) { if (++ptr >= end) @@ -1458,6 +1467,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) return 0; i = 1; + state->must_advance = 0; do { if (*ptr == (SRE_CHAR) prefix[i]) { if (++i != prefix_len) { @@ -1487,6 +1497,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) if (charset) { /* pattern starts with a character from a known set */ end = (SRE_CHAR *)state->end; + state->must_advance = 0; for (;;) { while (ptr < end && !SRE(charset)(state, charset, *ptr)) ptr++; @@ -1503,13 +1514,15 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) } else { /* general case */ assert(ptr <= end); - while (1) { + TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); + state->start = state->ptr = ptr; + status = SRE(match)(state, pattern, 1); + state->must_advance = 0; + while (status == 0 && ptr < end) { + ptr++; TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); state->start = state->ptr = ptr; status = SRE(match)(state, pattern, 0); - if (status != 0 || ptr >= end) - break; - ptr++; } } |