diff options
field | value | date |
---|---|---|
author | Serhiy Storchaka <storchaka@gmail.com> | 2017-10-24 20:31:42 (GMT) |
committer | GitHub <noreply@github.com> | 2017-10-24 20:31:42 (GMT) |
commit | 3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132 (patch) | |
tree | aa741f0d09293f6dfe9668a5b328658ce13c8279 /Modules | |
parent | fdd9b217c60b454ac6a82f02c8b0b551caeac88b (diff) | |
download | cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.zip cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.tar.gz cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.tar.bz2 | |
bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)
Diffstat (limited to 'Modules')
mode | file | lines changed |
---|---|---|
-rw-r--r-- | Modules/_sre.c | 37 |
-rw-r--r-- | Modules/sre.h | 4 |
-rw-r--r-- | Modules/sre_constants.h | 51 |
-rw-r--r-- | Modules/sre_lib.h | 136 |
4 files changed, 160 insertions, 68 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c index c42ab26..a9b6b50 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -97,12 +97,12 @@ static const char copyright[] = #define SRE_IS_WORD(ch)\ ((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_')) -static unsigned int sre_lower(unsigned int ch) +static unsigned int sre_lower_ascii(unsigned int ch) { return ((ch) < 128 ? Py_TOLOWER(ch) : ch); } -static unsigned int sre_upper(unsigned int ch) +static unsigned int sre_upper_ascii(unsigned int ch) { return ((ch) < 128 ? Py_TOUPPER(ch) : ch); } @@ -188,6 +188,15 @@ sre_category(SRE_CODE category, unsigned int ch) return 0; } +LOCAL(int) +char_loc_ignore(SRE_CODE pattern, SRE_CODE ch) +{ + return ch == pattern + || (SRE_CODE) sre_lower_locale(ch) == pattern + || (SRE_CODE) sre_upper_locale(ch) == pattern; +} + + /* helpers */ static void @@ -286,7 +295,7 @@ _sre_ascii_iscased_impl(PyObject *module, int character) /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/ { unsigned int ch = (unsigned int)character; - return ch != sre_lower(ch) || ch != sre_upper(ch); + return ch != sre_lower_ascii(ch) || ch != sre_upper_ascii(ch); } /*[clinic input] @@ -317,7 +326,7 @@ static int _sre_ascii_tolower_impl(PyObject *module, int character) /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/ { - return sre_lower(character); + return sre_lower_ascii(character); } /*[clinic input] @@ -448,19 +457,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->pos = start; state->endpos = end; - if (pattern->flags & SRE_FLAG_LOCALE) { - state->lower = sre_lower_locale; - state->upper = sre_upper_locale; - } - else if (pattern->flags & SRE_FLAG_UNICODE) { - state->lower = sre_lower_unicode; - state->upper = sre_upper_unicode; - } - else { - state->lower = sre_lower; - state->upper = sre_upper; - } - return string; err: PyMem_Del(state->mark); @@ -1533,7 +1529,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end) break; 
case SRE_OP_RANGE: - case SRE_OP_RANGE_IGNORE: + case SRE_OP_RANGE_UNI_IGNORE: GET_ARG; GET_ARG; break; @@ -1630,6 +1626,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_NOT_LITERAL: case SRE_OP_LITERAL_IGNORE: case SRE_OP_NOT_LITERAL_IGNORE: + case SRE_OP_LITERAL_UNI_IGNORE: + case SRE_OP_NOT_LITERAL_UNI_IGNORE: case SRE_OP_LITERAL_LOC_IGNORE: case SRE_OP_NOT_LITERAL_LOC_IGNORE: GET_ARG; @@ -1669,6 +1667,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_IN: case SRE_OP_IN_IGNORE: + case SRE_OP_IN_UNI_IGNORE: case SRE_OP_IN_LOC_IGNORE: GET_SKIP; /* Stop 1 before the end; we check the FAILURE below */ @@ -1805,6 +1804,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_GROUPREF: case SRE_OP_GROUPREF_IGNORE: + case SRE_OP_GROUPREF_UNI_IGNORE: + case SRE_OP_GROUPREF_LOC_IGNORE: GET_ARG; if (arg >= (size_t)groups) FAIL; diff --git a/Modules/sre.h b/Modules/sre.h index 9af5e40..585d284 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -52,8 +52,6 @@ typedef struct { Py_ssize_t mark[1]; } MatchObject; -typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); - typedef struct SRE_REPEAT_T { Py_ssize_t count; SRE_CODE* pattern; /* points to REPEAT operator arguments */ @@ -83,8 +81,6 @@ typedef struct { Py_buffer buffer; /* current repeat context */ SRE_REPEAT *repeat; - /* hooks */ - SRE_TOLOWER_HOOK lower, upper; } SRE_STATE; typedef struct { diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 6d6d21e..c8ccb32 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. 
*/ -#define SRE_MAGIC 20170530 +#define SRE_MAGIC 20171005 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -26,28 +26,33 @@ #define SRE_OP_BIGCHARSET 11 #define SRE_OP_GROUPREF 12 #define SRE_OP_GROUPREF_EXISTS 13 -#define SRE_OP_GROUPREF_IGNORE 14 -#define SRE_OP_IN 15 -#define SRE_OP_IN_IGNORE 16 -#define SRE_OP_INFO 17 -#define SRE_OP_JUMP 18 -#define SRE_OP_LITERAL 19 -#define SRE_OP_LITERAL_IGNORE 20 -#define SRE_OP_MARK 21 -#define SRE_OP_MAX_UNTIL 22 -#define SRE_OP_MIN_UNTIL 23 -#define SRE_OP_NOT_LITERAL 24 -#define SRE_OP_NOT_LITERAL_IGNORE 25 -#define SRE_OP_NEGATE 26 -#define SRE_OP_RANGE 27 -#define SRE_OP_REPEAT 28 -#define SRE_OP_REPEAT_ONE 29 -#define SRE_OP_SUBPATTERN 30 -#define SRE_OP_MIN_REPEAT_ONE 31 -#define SRE_OP_RANGE_IGNORE 32 -#define SRE_OP_LITERAL_LOC_IGNORE 33 -#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34 -#define SRE_OP_IN_LOC_IGNORE 35 +#define SRE_OP_IN 14 +#define SRE_OP_INFO 15 +#define SRE_OP_JUMP 16 +#define SRE_OP_LITERAL 17 +#define SRE_OP_MARK 18 +#define SRE_OP_MAX_UNTIL 19 +#define SRE_OP_MIN_UNTIL 20 +#define SRE_OP_NOT_LITERAL 21 +#define SRE_OP_NEGATE 22 +#define SRE_OP_RANGE 23 +#define SRE_OP_REPEAT 24 +#define SRE_OP_REPEAT_ONE 25 +#define SRE_OP_SUBPATTERN 26 +#define SRE_OP_MIN_REPEAT_ONE 27 +#define SRE_OP_GROUPREF_IGNORE 28 +#define SRE_OP_IN_IGNORE 29 +#define SRE_OP_LITERAL_IGNORE 30 +#define SRE_OP_NOT_LITERAL_IGNORE 31 +#define SRE_OP_GROUPREF_LOC_IGNORE 32 +#define SRE_OP_IN_LOC_IGNORE 33 +#define SRE_OP_LITERAL_LOC_IGNORE 34 +#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35 +#define SRE_OP_GROUPREF_UNI_IGNORE 36 +#define SRE_OP_IN_UNI_IGNORE 37 +#define SRE_OP_LITERAL_UNI_IGNORE 38 +#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39 +#define SRE_OP_RANGE_UNI_IGNORE 40 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index b540d21..e13b90e 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -101,14 +101,6 
@@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) } LOCAL(int) -SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch) -{ - return ch == pattern - || (SRE_CODE) state->lower(ch) == pattern - || (SRE_CODE) state->upper(ch) == pattern; -} - -LOCAL(int) SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) { /* check if character is a member of the given set */ @@ -150,14 +142,14 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) set += 2; break; - case SRE_OP_RANGE_IGNORE: - /* <RANGE_IGNORE> <lower> <upper> */ + case SRE_OP_RANGE_UNI_IGNORE: + /* <RANGE_UNI_IGNORE> <lower> <upper> */ { SRE_CODE uch; /* ch is already lower cased */ if (set[0] <= ch && ch <= set[1]) return ok; - uch = state->upper(ch); + uch = sre_upper_unicode(ch); if (set[0] <= uch && uch <= set[1]) return ok; set += 2; @@ -199,11 +191,11 @@ LOCAL(int) SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) { SRE_CODE lo, up; - lo = state->lower(ch); + lo = sre_lower_locale(ch); if (SRE(charset)(state, set, lo)) return 1; - up = state->upper(ch); + up = sre_upper_locale(ch); return up != lo && SRE(charset)(state, set, up); } @@ -263,7 +255,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) + while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr) + ptr++; + break; + + case SRE_OP_LITERAL_UNI_IGNORE: + /* repeated literal */ + chr = pattern[1]; + TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); + while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr) ptr++; break; @@ -271,7 +271,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr)) + while (ptr 
< end && char_loc_ignore(chr, *ptr)) ptr++; break; @@ -293,7 +293,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) + while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr) + ptr++; + break; + + case SRE_OP_NOT_LITERAL_UNI_IGNORE: + /* repeated non-literal */ + chr = pattern[1]; + TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); + while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr) ptr++; break; @@ -301,7 +309,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr)) + while (ptr < end && !char_loc_ignore(chr, *ptr)) ptr++; break; @@ -687,7 +695,17 @@ entrance: TRACE(("|%p|%p|LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end || - state->lower(*ctx->ptr) != *ctx->pattern) + sre_lower_ascii(*ctx->ptr) != *ctx->pattern) + RETURN_FAILURE; + ctx->pattern++; + ctx->ptr++; + break; + + case SRE_OP_LITERAL_UNI_IGNORE: + TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n", + ctx->pattern, ctx->ptr, ctx->pattern[0])); + if (ctx->ptr >= end || + sre_lower_unicode(*ctx->ptr) != *ctx->pattern) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -697,7 +715,7 @@ entrance: TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end - || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) + || !char_loc_ignore(*ctx->pattern, *ctx->ptr)) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -707,7 +725,17 @@ entrance: TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end || - state->lower(*ctx->ptr) == *ctx->pattern) + sre_lower_ascii(*ctx->ptr) == 
*ctx->pattern) + RETURN_FAILURE; + ctx->pattern++; + ctx->ptr++; + break; + + case SRE_OP_NOT_LITERAL_UNI_IGNORE: + TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n", + ctx->pattern, ctx->ptr, *ctx->pattern)); + if (ctx->ptr >= end || + sre_lower_unicode(*ctx->ptr) == *ctx->pattern) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -717,7 +745,7 @@ entrance: TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end - || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) + || char_loc_ignore(*ctx->pattern, *ctx->ptr)) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -727,7 +755,17 @@ entrance: TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end || !SRE(charset)(state, ctx->pattern+1, - (SRE_CODE)state->lower(*ctx->ptr))) + (SRE_CODE)sre_lower_ascii(*ctx->ptr))) + RETURN_FAILURE; + ctx->pattern += ctx->pattern[0]; + ctx->ptr++; + break; + + case SRE_OP_IN_UNI_IGNORE: + TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr)); + if (ctx->ptr >= end + || !SRE(charset)(state, ctx->pattern+1, + (SRE_CODE)sre_lower_unicode(*ctx->ptr))) RETURN_FAILURE; ctx->pattern += ctx->pattern[0]; ctx->ptr++; @@ -1135,7 +1173,59 @@ entrance: RETURN_FAILURE; while (p < e) { if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*p)) + sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p)) + RETURN_FAILURE; + p++; + ctx->ptr++; + } + } + } + ctx->pattern++; + break; + + case SRE_OP_GROUPREF_UNI_IGNORE: + /* match backreference */ + TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[0])); + i = ctx->pattern[0]; + { + Py_ssize_t groupref = i+i; + if (groupref >= state->lastmark) { + RETURN_FAILURE; + } else { + SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + if (!p || !e || e < p) + RETURN_FAILURE; + while (p < e) { + if (ctx->ptr >= end || + sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p)) + RETURN_FAILURE; + p++; + ctx->ptr++; 
+ } + } + } + ctx->pattern++; + break; + + case SRE_OP_GROUPREF_LOC_IGNORE: + /* match backreference */ + TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[0])); + i = ctx->pattern[0]; + { + Py_ssize_t groupref = i+i; + if (groupref >= state->lastmark) { + RETURN_FAILURE; + } else { + SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + if (!p || !e || e < p) + RETURN_FAILURE; + while (p < e) { + if (ctx->ptr >= end || + sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p)) RETURN_FAILURE; p++; ctx->ptr++; |