summary | refs | log | tree | commit | diff | stats
path: root/Modules
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2017-05-05 05:53:40 (GMT)
committer: GitHub <noreply@github.com> 2017-05-05 05:53:40 (GMT)
commit: 898ff03e1e7925ecde3da66327d3cdc7e07625ba (patch)
tree: 977fc4b98c0e85816348cebd3b12026407c368b6 /Modules
parent: 647c3d381e67490e82cdbbe6c96e46d5e1628ce2 (diff)
download: cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.zip
cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.gz
cpython-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.bz2
bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer depend on the locale at compile time. Only the locale at matching time affects the result of matching.
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_sre.c 3
-rw-r--r-- Modules/sre_constants.h 5
-rw-r--r-- Modules/sre_lib.h 69
3 files changed, 74 insertions, 3 deletions
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 03a138e..afb2bce 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_NOT_LITERAL:
case SRE_OP_LITERAL_IGNORE:
case SRE_OP_NOT_LITERAL_IGNORE:
+ case SRE_OP_LITERAL_LOC_IGNORE:
+ case SRE_OP_NOT_LITERAL_LOC_IGNORE:
GET_ARG;
/* The arg is just a character, nothing to check */
break;
@@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_IN:
case SRE_OP_IN_IGNORE:
+ case SRE_OP_IN_LOC_IGNORE:
GET_SKIP;
/* Stop 1 before the end; we check the FAILURE below */
if (!_validate_charset(code, code+skip-2))
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 6632442..6d6d21e 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20140917
+#define SRE_MAGIC 20170530
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -45,6 +45,9 @@
#define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_OP_RANGE_IGNORE 32
+#define SRE_OP_LITERAL_LOC_IGNORE 33
+#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
+#define SRE_OP_IN_LOC_IGNORE 35
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 0865fc6..b540d21 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -101,6 +101,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
}
LOCAL(int)
+SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
+{
+ return ch == pattern
+ || (SRE_CODE) state->lower(ch) == pattern
+ || (SRE_CODE) state->upper(ch) == pattern;
+}
+
+LOCAL(int)
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
/* check if character is a member of the given set */
@@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
}
}
+LOCAL(int)
+SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
+{
+ SRE_CODE lo, up;
+ lo = state->lower(ch);
+ if (SRE(charset)(state, set, lo))
+ return 1;
+
+ up = state->upper(ch);
+ return up != lo && SRE(charset)(state, set, up);
+}
+
LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
LOCAL(Py_ssize_t)
@@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;
+ case SRE_OP_LITERAL_LOC_IGNORE:
+ /* repeated literal */
+ chr = pattern[1];
+ TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
+ while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
+ ptr++;
+ break;
+
case SRE_OP_NOT_LITERAL:
/* repeated non-literal */
chr = pattern[1];
@@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;
+ case SRE_OP_NOT_LITERAL_LOC_IGNORE:
+ /* repeated non-literal */
+ chr = pattern[1];
+ TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
+ while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
+ ptr++;
+ break;
+
default:
/* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
@@ -651,7 +687,17 @@ entrance:
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end ||
- state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
+ state->lower(*ctx->ptr) != *ctx->pattern)
+ RETURN_FAILURE;
+ ctx->pattern++;
+ ctx->ptr++;
+ break;
+
+ case SRE_OP_LITERAL_LOC_IGNORE:
+ TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
+ ctx->pattern, ctx->ptr, ctx->pattern[0]));
+ if (ctx->ptr >= end
+ || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@@ -661,7 +707,17 @@ entrance:
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end ||
- state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
+ state->lower(*ctx->ptr) == *ctx->pattern)
+ RETURN_FAILURE;
+ ctx->pattern++;
+ ctx->ptr++;
+ break;
+
+ case SRE_OP_NOT_LITERAL_LOC_IGNORE:
+ TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
+ ctx->pattern, ctx->ptr, *ctx->pattern));
+ if (ctx->ptr >= end
+ || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@@ -677,6 +733,15 @@ entrance:
ctx->ptr++;
break;
+ case SRE_OP_IN_LOC_IGNORE:
+ TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
+ if (ctx->ptr >= end
+ || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
+ RETURN_FAILURE;
+ ctx->pattern += ctx->pattern[0];
+ ctx->ptr++;
+ break;
+
case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */