diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2014-10-31 10:36:56 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2014-10-31 10:36:56 (GMT) |
commit | 4b8f8949b43715f1b0f0ef77e15e19c180ccc195 (patch) | |
tree | bebc1eda94d11692278f03c41c683b5b8ca815dd /Modules | |
parent | 455de40a6e99ad7548e6061733f9c5dae2327e83 (diff) | |
download | cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.zip cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.gz cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.bz2 |
Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_sre.c | 28 | ||||
-rw-r--r-- | Modules/sre.h | 2 | ||||
-rw-r--r-- | Modules/sre_constants.h | 3 | ||||
-rw-r--r-- | Modules/sre_lib.h | 28 |
4 files changed, 50 insertions, 11 deletions
```diff
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 0dc5212..63778f4 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -113,6 +113,11 @@ static unsigned int sre_lower(unsigned int ch)
     return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
 }
 
+static unsigned int sre_upper(unsigned int ch)
+{
+    return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
+}
+
 /* locale-specific character predicates */
 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
  * warnings when c's type supports only numbers < N+1 */
@@ -124,6 +129,11 @@ static unsigned int sre_lower_locale(unsigned int ch)
     return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
 }
 
+static unsigned int sre_upper_locale(unsigned int ch)
+{
+    return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch);
+}
+
 /* unicode-specific character predicates */
 
 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
@@ -137,6 +147,11 @@ static unsigned int sre_lower_unicode(unsigned int ch)
     return (unsigned int) Py_UNICODE_TOLOWER(ch);
 }
 
+static unsigned int sre_upper_unicode(unsigned int ch)
+{
+    return (unsigned int) Py_UNICODE_TOUPPER(ch);
+}
+
 LOCAL(int)
 sre_category(SRE_CODE category, unsigned int ch)
 {
@@ -377,12 +392,18 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
     state->pos = start;
     state->endpos = end;
 
-    if (pattern->flags & SRE_FLAG_LOCALE)
+    if (pattern->flags & SRE_FLAG_LOCALE) {
         state->lower = sre_lower_locale;
-    else if (pattern->flags & SRE_FLAG_UNICODE)
+        state->upper = sre_upper_locale;
+    }
+    else if (pattern->flags & SRE_FLAG_UNICODE) {
         state->lower = sre_lower_unicode;
-    else
+        state->upper = sre_upper_unicode;
+    }
+    else {
         state->lower = sre_lower;
+        state->upper = sre_upper;
+    }
 
     return string;
   err:
@@ -1567,6 +1588,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
             break;
 
         case SRE_OP_RANGE:
+        case SRE_OP_RANGE_IGNORE:
             GET_ARG;
             GET_ARG;
             break;
diff --git a/Modules/sre.h b/Modules/sre.h
index 35d198f..b632165 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -84,7 +84,7 @@ typedef struct {
     /* current repeat context */
     SRE_REPEAT *repeat;
     /* hooks */
-    SRE_TOLOWER_HOOK lower;
+    SRE_TOLOWER_HOOK lower, upper;
 } SRE_STATE;
 
 typedef struct {
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 5940d5a..6632442 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
  * See the _sre.c file for information on usage and redistribution.
  */
 
-#define SRE_MAGIC 20031017
+#define SRE_MAGIC 20140917
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
@@ -44,6 +44,7 @@
 #define SRE_OP_REPEAT_ONE 29
 #define SRE_OP_SUBPATTERN 30
 #define SRE_OP_MIN_REPEAT_ONE 31
+#define SRE_OP_RANGE_IGNORE 32
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
 #define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 5c6c5a5..463a908 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -101,7 +101,7 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
 }
 
 LOCAL(int)
-SRE(charset)(SRE_CODE* set, SRE_CODE ch)
+SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
 {
     /* check if character is a member of the given set */
 
@@ -142,6 +142,20 @@ SRE(charset)(SRE_CODE* set, SRE_CODE ch)
             set += 2;
             break;
 
+        case SRE_OP_RANGE_IGNORE:
+            /* <RANGE_IGNORE> <lower> <upper> */
+        {
+            SRE_CODE uch;
+            /* ch is already lower cased */
+            if (set[0] <= ch && ch <= set[1])
+                return ok;
+            uch = state->upper(ch);
+            if (set[0] <= uch && uch <= set[1])
+                return ok;
+            set += 2;
+            break;
+        }
+
         case SRE_OP_NEGATE:
             ok = !ok;
             break;
@@ -193,7 +207,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
     case SRE_OP_IN:
         /* repeated set */
         TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
-        while (ptr < end && SRE(charset)(pattern + 2, *ptr))
+        while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
             ptr++;
         break;
 
@@ -628,7 +642,8 @@ entrance:
             /* match set member (or non_member) */
             /* <IN> <skip> <set> */
             TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
-            if (ctx->ptr >= end || !SRE(charset)(ctx->pattern + 1, *ctx->ptr))
+            if (ctx->ptr >= end ||
+                !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
                 RETURN_FAILURE;
             ctx->pattern += ctx->pattern[0];
             ctx->ptr++;
@@ -657,7 +672,7 @@ entrance:
         case SRE_OP_IN_IGNORE:
             TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
             if (ctx->ptr >= end
-                || !SRE(charset)(ctx->pattern+1,
+                || !SRE(charset)(state, ctx->pattern+1,
                                  (SRE_CODE)state->lower(*ctx->ptr)))
                 RETURN_FAILURE;
             ctx->pattern += ctx->pattern[0];
@@ -688,7 +703,8 @@ entrance:
                     continue;
                 if (ctx->pattern[1] == SRE_OP_IN &&
                     (ctx->ptr >= end ||
-                     !SRE(charset)(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
+                     !SRE(charset)(state, ctx->pattern + 3,
+                                   (SRE_CODE) *ctx->ptr)))
                     continue;
                 state->ptr = ctx->ptr;
                 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
@@ -1310,7 +1326,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
         /* pattern starts with a character from a known set */
         end = (SRE_CHAR *)state->end;
         for (;;) {
-            while (ptr < end && !SRE(charset)(charset, *ptr))
+            while (ptr < end && !SRE(charset)(state, charset, *ptr))
                 ptr++;
             if (ptr >= end)
                 return 0;
```