summary | refs | log | tree | commit | diff | stats
path: root/Modules
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2014-10-31 10:36:56 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2014-10-31 10:36:56 (GMT)
commit: 4b8f8949b43715f1b0f0ef77e15e19c180ccc195 (patch)
tree: bebc1eda94d11692278f03c41c683b5b8ca815dd /Modules
parent: 455de40a6e99ad7548e6061733f9c5dae2327e83 (diff)
download: cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.zip
cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.gz
cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.bz2
Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_sre.c 28
-rw-r--r-- Modules/sre.h 2
-rw-r--r-- Modules/sre_constants.h 3
-rw-r--r-- Modules/sre_lib.h 28
4 files changed, 50 insertions, 11 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 0dc5212..63778f4 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -113,6 +113,11 @@ static unsigned int sre_lower(unsigned int ch)
return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
}
+static unsigned int sre_upper(unsigned int ch)
+{
+ return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
+}
+
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
* warnings when c's type supports only numbers < N+1 */
@@ -124,6 +129,11 @@ static unsigned int sre_lower_locale(unsigned int ch)
return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
}
+static unsigned int sre_upper_locale(unsigned int ch)
+{
+ return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch);
+}
+
/* unicode-specific character predicates */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
@@ -137,6 +147,11 @@ static unsigned int sre_lower_unicode(unsigned int ch)
return (unsigned int) Py_UNICODE_TOLOWER(ch);
}
+static unsigned int sre_upper_unicode(unsigned int ch)
+{
+ return (unsigned int) Py_UNICODE_TOUPPER(ch);
+}
+
LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch)
{
@@ -377,12 +392,18 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->pos = start;
state->endpos = end;
- if (pattern->flags & SRE_FLAG_LOCALE)
+ if (pattern->flags & SRE_FLAG_LOCALE) {
state->lower = sre_lower_locale;
- else if (pattern->flags & SRE_FLAG_UNICODE)
+ state->upper = sre_upper_locale;
+ }
+ else if (pattern->flags & SRE_FLAG_UNICODE) {
state->lower = sre_lower_unicode;
- else
+ state->upper = sre_upper_unicode;
+ }
+ else {
state->lower = sre_lower;
+ state->upper = sre_upper;
+ }
return string;
err:
@@ -1567,6 +1588,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
break;
case SRE_OP_RANGE:
+ case SRE_OP_RANGE_IGNORE:
GET_ARG;
GET_ARG;
break;
diff --git a/Modules/sre.h b/Modules/sre.h
index 35d198f..b632165 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -84,7 +84,7 @@ typedef struct {
/* current repeat context */
SRE_REPEAT *repeat;
/* hooks */
- SRE_TOLOWER_HOOK lower;
+ SRE_TOLOWER_HOOK lower, upper;
} SRE_STATE;
typedef struct {
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 5940d5a..6632442 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20031017
+#define SRE_MAGIC 20140917
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -44,6 +44,7 @@
#define SRE_OP_REPEAT_ONE 29
#define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
+#define SRE_OP_RANGE_IGNORE 32
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 5c6c5a5..463a908 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -101,7 +101,7 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
}
LOCAL(int)
-SRE(charset)(SRE_CODE* set, SRE_CODE ch)
+SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
/* check if character is a member of the given set */
@@ -142,6 +142,20 @@ SRE(charset)(SRE_CODE* set, SRE_CODE ch)
set += 2;
break;
+ case SRE_OP_RANGE_IGNORE:
+ /* <RANGE_IGNORE> <lower> <upper> */
+ {
+ SRE_CODE uch;
+ /* ch is already lower cased */
+ if (set[0] <= ch && ch <= set[1])
+ return ok;
+ uch = state->upper(ch);
+ if (set[0] <= uch && uch <= set[1])
+ return ok;
+ set += 2;
+ break;
+ }
+
case SRE_OP_NEGATE:
ok = !ok;
break;
@@ -193,7 +207,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
case SRE_OP_IN:
/* repeated set */
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
- while (ptr < end && SRE(charset)(pattern + 2, *ptr))
+ while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
ptr++;
break;
@@ -628,7 +642,8 @@ entrance:
/* match set member (or non_member) */
/* <IN> <skip> <set> */
TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
- if (ctx->ptr >= end || !SRE(charset)(ctx->pattern + 1, *ctx->ptr))
+ if (ctx->ptr >= end ||
+ !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern += ctx->pattern[0];
ctx->ptr++;
@@ -657,7 +672,7 @@ entrance:
case SRE_OP_IN_IGNORE:
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end
- || !SRE(charset)(ctx->pattern+1,
+ || !SRE(charset)(state, ctx->pattern+1,
(SRE_CODE)state->lower(*ctx->ptr)))
RETURN_FAILURE;
ctx->pattern += ctx->pattern[0];
@@ -688,7 +703,8 @@ entrance:
continue;
if (ctx->pattern[1] == SRE_OP_IN &&
(ctx->ptr >= end ||
- !SRE(charset)(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
+ !SRE(charset)(state, ctx->pattern + 3,
+ (SRE_CODE) *ctx->ptr)))
continue;
state->ptr = ctx->ptr;
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
@@ -1310,7 +1326,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
/* pattern starts with a character from a known set */
end = (SRE_CHAR *)state->end;
for (;;) {
- while (ptr < end && !SRE(charset)(charset, *ptr))
+ while (ptr < end && !SRE(charset)(state, charset, *ptr))
ptr++;
if (ptr >= end)
return 0;