summary | refs | log | tree | commit | diff | stats
path: root/Modules
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2017-10-24 20:31:42 (GMT)
committer: GitHub <noreply@github.com>, 2017-10-24 20:31:42 (GMT)
commit: 3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132 (patch)
tree: aa741f0d09293f6dfe9668a5b328658ce13c8279 /Modules
parent: fdd9b217c60b454ac6a82f02c8b0b551caeac88b (diff)
download: cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.zip
cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.tar.gz
cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.tar.bz2
bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_sre.c 37
-rw-r--r-- Modules/sre.h 4
-rw-r--r-- Modules/sre_constants.h 51
-rw-r--r-- Modules/sre_lib.h 136
4 files changed, 160 insertions, 68 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index c42ab26..a9b6b50 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -97,12 +97,12 @@ static const char copyright[] =
#define SRE_IS_WORD(ch)\
((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_'))
-static unsigned int sre_lower(unsigned int ch)
+static unsigned int sre_lower_ascii(unsigned int ch)
{
return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
}
-static unsigned int sre_upper(unsigned int ch)
+static unsigned int sre_upper_ascii(unsigned int ch)
{
return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
}
@@ -188,6 +188,15 @@ sre_category(SRE_CODE category, unsigned int ch)
return 0;
}
+LOCAL(int)
+char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
+{
+ return ch == pattern
+ || (SRE_CODE) sre_lower_locale(ch) == pattern
+ || (SRE_CODE) sre_upper_locale(ch) == pattern;
+}
+
+
/* helpers */
static void
@@ -286,7 +295,7 @@ _sre_ascii_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
{
unsigned int ch = (unsigned int)character;
- return ch != sre_lower(ch) || ch != sre_upper(ch);
+ return ch != sre_lower_ascii(ch) || ch != sre_upper_ascii(ch);
}
/*[clinic input]
@@ -317,7 +326,7 @@ static int
_sre_ascii_tolower_impl(PyObject *module, int character)
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
{
- return sre_lower(character);
+ return sre_lower_ascii(character);
}
/*[clinic input]
@@ -448,19 +457,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->pos = start;
state->endpos = end;
- if (pattern->flags & SRE_FLAG_LOCALE) {
- state->lower = sre_lower_locale;
- state->upper = sre_upper_locale;
- }
- else if (pattern->flags & SRE_FLAG_UNICODE) {
- state->lower = sre_lower_unicode;
- state->upper = sre_upper_unicode;
- }
- else {
- state->lower = sre_lower;
- state->upper = sre_upper;
- }
-
return string;
err:
PyMem_Del(state->mark);
@@ -1533,7 +1529,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
break;
case SRE_OP_RANGE:
- case SRE_OP_RANGE_IGNORE:
+ case SRE_OP_RANGE_UNI_IGNORE:
GET_ARG;
GET_ARG;
break;
@@ -1630,6 +1626,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_NOT_LITERAL:
case SRE_OP_LITERAL_IGNORE:
case SRE_OP_NOT_LITERAL_IGNORE:
+ case SRE_OP_LITERAL_UNI_IGNORE:
+ case SRE_OP_NOT_LITERAL_UNI_IGNORE:
case SRE_OP_LITERAL_LOC_IGNORE:
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
GET_ARG;
@@ -1669,6 +1667,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_IN:
case SRE_OP_IN_IGNORE:
+ case SRE_OP_IN_UNI_IGNORE:
case SRE_OP_IN_LOC_IGNORE:
GET_SKIP;
/* Stop 1 before the end; we check the FAILURE below */
@@ -1805,6 +1804,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_GROUPREF:
case SRE_OP_GROUPREF_IGNORE:
+ case SRE_OP_GROUPREF_UNI_IGNORE:
+ case SRE_OP_GROUPREF_LOC_IGNORE:
GET_ARG;
if (arg >= (size_t)groups)
FAIL;
diff --git a/Modules/sre.h b/Modules/sre.h
index 9af5e40..585d284 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -52,8 +52,6 @@ typedef struct {
Py_ssize_t mark[1];
} MatchObject;
-typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
-
typedef struct SRE_REPEAT_T {
Py_ssize_t count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */
@@ -83,8 +81,6 @@ typedef struct {
Py_buffer buffer;
/* current repeat context */
SRE_REPEAT *repeat;
- /* hooks */
- SRE_TOLOWER_HOOK lower, upper;
} SRE_STATE;
typedef struct {
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 6d6d21e..c8ccb32 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20170530
+#define SRE_MAGIC 20171005
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -26,28 +26,33 @@
#define SRE_OP_BIGCHARSET 11
#define SRE_OP_GROUPREF 12
#define SRE_OP_GROUPREF_EXISTS 13
-#define SRE_OP_GROUPREF_IGNORE 14
-#define SRE_OP_IN 15
-#define SRE_OP_IN_IGNORE 16
-#define SRE_OP_INFO 17
-#define SRE_OP_JUMP 18
-#define SRE_OP_LITERAL 19
-#define SRE_OP_LITERAL_IGNORE 20
-#define SRE_OP_MARK 21
-#define SRE_OP_MAX_UNTIL 22
-#define SRE_OP_MIN_UNTIL 23
-#define SRE_OP_NOT_LITERAL 24
-#define SRE_OP_NOT_LITERAL_IGNORE 25
-#define SRE_OP_NEGATE 26
-#define SRE_OP_RANGE 27
-#define SRE_OP_REPEAT 28
-#define SRE_OP_REPEAT_ONE 29
-#define SRE_OP_SUBPATTERN 30
-#define SRE_OP_MIN_REPEAT_ONE 31
-#define SRE_OP_RANGE_IGNORE 32
-#define SRE_OP_LITERAL_LOC_IGNORE 33
-#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
-#define SRE_OP_IN_LOC_IGNORE 35
+#define SRE_OP_IN 14
+#define SRE_OP_INFO 15
+#define SRE_OP_JUMP 16
+#define SRE_OP_LITERAL 17
+#define SRE_OP_MARK 18
+#define SRE_OP_MAX_UNTIL 19
+#define SRE_OP_MIN_UNTIL 20
+#define SRE_OP_NOT_LITERAL 21
+#define SRE_OP_NEGATE 22
+#define SRE_OP_RANGE 23
+#define SRE_OP_REPEAT 24
+#define SRE_OP_REPEAT_ONE 25
+#define SRE_OP_SUBPATTERN 26
+#define SRE_OP_MIN_REPEAT_ONE 27
+#define SRE_OP_GROUPREF_IGNORE 28
+#define SRE_OP_IN_IGNORE 29
+#define SRE_OP_LITERAL_IGNORE 30
+#define SRE_OP_NOT_LITERAL_IGNORE 31
+#define SRE_OP_GROUPREF_LOC_IGNORE 32
+#define SRE_OP_IN_LOC_IGNORE 33
+#define SRE_OP_LITERAL_LOC_IGNORE 34
+#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35
+#define SRE_OP_GROUPREF_UNI_IGNORE 36
+#define SRE_OP_IN_UNI_IGNORE 37
+#define SRE_OP_LITERAL_UNI_IGNORE 38
+#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39
+#define SRE_OP_RANGE_UNI_IGNORE 40
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index b540d21..e13b90e 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -101,14 +101,6 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
}
LOCAL(int)
-SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
-{
- return ch == pattern
- || (SRE_CODE) state->lower(ch) == pattern
- || (SRE_CODE) state->upper(ch) == pattern;
-}
-
-LOCAL(int)
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
/* check if character is a member of the given set */
@@ -150,14 +142,14 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
set += 2;
break;
- case SRE_OP_RANGE_IGNORE:
- /* <RANGE_IGNORE> <lower> <upper> */
+ case SRE_OP_RANGE_UNI_IGNORE:
+ /* <RANGE_UNI_IGNORE> <lower> <upper> */
{
SRE_CODE uch;
/* ch is already lower cased */
if (set[0] <= ch && ch <= set[1])
return ok;
- uch = state->upper(ch);
+ uch = sre_upper_unicode(ch);
if (set[0] <= uch && uch <= set[1])
return ok;
set += 2;
@@ -199,11 +191,11 @@ LOCAL(int)
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
SRE_CODE lo, up;
- lo = state->lower(ch);
+ lo = sre_lower_locale(ch);
if (SRE(charset)(state, set, lo))
return 1;
- up = state->upper(ch);
+ up = sre_upper_locale(ch);
return up != lo && SRE(charset)(state, set, up);
}
@@ -263,7 +255,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
/* repeated literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
- while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
+ while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr)
+ ptr++;
+ break;
+
+ case SRE_OP_LITERAL_UNI_IGNORE:
+ /* repeated literal */
+ chr = pattern[1];
+ TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
+ while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr)
ptr++;
break;
@@ -271,7 +271,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
/* repeated literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
- while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
+ while (ptr < end && char_loc_ignore(chr, *ptr))
ptr++;
break;
@@ -293,7 +293,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
/* repeated non-literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
- while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
+ while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr)
+ ptr++;
+ break;
+
+ case SRE_OP_NOT_LITERAL_UNI_IGNORE:
+ /* repeated non-literal */
+ chr = pattern[1];
+ TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
+ while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr)
ptr++;
break;
@@ -301,7 +309,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
/* repeated non-literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
- while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
+ while (ptr < end && !char_loc_ignore(chr, *ptr))
ptr++;
break;
@@ -687,7 +695,17 @@ entrance:
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end ||
- state->lower(*ctx->ptr) != *ctx->pattern)
+ sre_lower_ascii(*ctx->ptr) != *ctx->pattern)
+ RETURN_FAILURE;
+ ctx->pattern++;
+ ctx->ptr++;
+ break;
+
+ case SRE_OP_LITERAL_UNI_IGNORE:
+ TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n",
+ ctx->pattern, ctx->ptr, ctx->pattern[0]));
+ if (ctx->ptr >= end ||
+ sre_lower_unicode(*ctx->ptr) != *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@@ -697,7 +715,7 @@ entrance:
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end
- || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
+ || !char_loc_ignore(*ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@@ -707,7 +725,17 @@ entrance:
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end ||
- state->lower(*ctx->ptr) == *ctx->pattern)
+ sre_lower_ascii(*ctx->ptr) == *ctx->pattern)
+ RETURN_FAILURE;
+ ctx->pattern++;
+ ctx->ptr++;
+ break;
+
+ case SRE_OP_NOT_LITERAL_UNI_IGNORE:
+ TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n",
+ ctx->pattern, ctx->ptr, *ctx->pattern));
+ if (ctx->ptr >= end ||
+ sre_lower_unicode(*ctx->ptr) == *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@@ -717,7 +745,7 @@ entrance:
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end
- || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
+ || char_loc_ignore(*ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@@ -727,7 +755,17 @@ entrance:
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end
|| !SRE(charset)(state, ctx->pattern+1,
- (SRE_CODE)state->lower(*ctx->ptr)))
+ (SRE_CODE)sre_lower_ascii(*ctx->ptr)))
+ RETURN_FAILURE;
+ ctx->pattern += ctx->pattern[0];
+ ctx->ptr++;
+ break;
+
+ case SRE_OP_IN_UNI_IGNORE:
+ TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr));
+ if (ctx->ptr >= end
+ || !SRE(charset)(state, ctx->pattern+1,
+ (SRE_CODE)sre_lower_unicode(*ctx->ptr)))
RETURN_FAILURE;
ctx->pattern += ctx->pattern[0];
ctx->ptr++;
@@ -1135,7 +1173,59 @@ entrance:
RETURN_FAILURE;
while (p < e) {
if (ctx->ptr >= end ||
- state->lower(*ctx->ptr) != state->lower(*p))
+ sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p))
+ RETURN_FAILURE;
+ p++;
+ ctx->ptr++;
+ }
+ }
+ }
+ ctx->pattern++;
+ break;
+
+ case SRE_OP_GROUPREF_UNI_IGNORE:
+ /* match backreference */
+ TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[0]));
+ i = ctx->pattern[0];
+ {
+ Py_ssize_t groupref = i+i;
+ if (groupref >= state->lastmark) {
+ RETURN_FAILURE;
+ } else {
+ SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
+ SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
+ if (!p || !e || e < p)
+ RETURN_FAILURE;
+ while (p < e) {
+ if (ctx->ptr >= end ||
+ sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p))
+ RETURN_FAILURE;
+ p++;
+ ctx->ptr++;
+ }
+ }
+ }
+ ctx->pattern++;
+ break;
+
+ case SRE_OP_GROUPREF_LOC_IGNORE:
+ /* match backreference */
+ TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[0]));
+ i = ctx->pattern[0];
+ {
+ Py_ssize_t groupref = i+i;
+ if (groupref >= state->lastmark) {
+ RETURN_FAILURE;
+ } else {
+ SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
+ SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
+ if (!p || !e || e < p)
+ RETURN_FAILURE;
+ while (p < e) {
+ if (ctx->ptr >= end ||
+ sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p))
RETURN_FAILURE;
p++;
ctx->ptr++;