summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2017-05-09 20:37:14 (GMT)
committerGitHub <noreply@github.com>2017-05-09 20:37:14 (GMT)
commit6d336a027913327fc042b0d758a16724fea27b9c (patch)
treeca511a6c75e340ef3493674b791f05a692e0c9e2
parentf93234bb8a87855f295d441524e519481ce6ab13 (diff)
downloadcpython-6d336a027913327fc042b0d758a16724fea27b9c.zip
cpython-6d336a027913327fc042b0d758a16724fea27b9c.tar.gz
cpython-6d336a027913327fc042b0d758a16724fea27b9c.tar.bz2
bpo-30285: Optimize case-insensitive matching and searching (#1482)
of regular expressions.
-rw-r--r--Doc/whatsnew/3.7.rst4
-rw-r--r--Lib/sre_compile.py171
-rw-r--r--Lib/test/test_re.py9
-rw-r--r--Misc/NEWS3
-rw-r--r--Modules/_sre.c34
-rw-r--r--Modules/clinic/_sre.c.h64
6 files changed, 215 insertions, 70 deletions
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst
index 3de8bc5..57fd4e4 100644
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -208,6 +208,10 @@ Optimizations
using the :func:`os.scandir` function.
(Contributed by Serhiy Storchaka in :issue:`25996`.)
+* Optimized case-insensitive matching and searching of :mod:`regular
+ expressions <re>`. Searching some patterns can now be up to 20 times faster.
+ (Contributed by Serhiy Storchaka in :issue:`30285`.)
+
Build and C API Changes
=======================
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index db8b8a2..cebecb9 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
REPEATING_CODES = _REPEATING_CODES
SUCCESS_CODES = _SUCCESS_CODES
ASSERT_CODES = _ASSERT_CODES
+ iscased = None
tolower = None
fixes = None
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+ iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
else:
+ iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower
for op, av in pattern:
if op in LITERAL_CODES:
@@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOC_IGNORE[op])
emit(av)
+ elif not iscased(av):
+ emit(op)
+ emit(av)
else:
lo = tolower(av)
if fixes and lo in fixes:
@@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
emit(OP_IGNORE[op])
emit(lo)
elif op is IN:
- if not flags & SRE_FLAG_IGNORECASE:
- emit(op)
- elif flags & SRE_FLAG_LOCALE:
+ charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
+ if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
- else:
+ elif hascased:
emit(IN_IGNORE)
+ else:
+ emit(IN)
skip = _len(code); emit(0)
- _compile_charset(av, flags, code, tolower, fixes)
+ _compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
@@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
else:
raise error("internal: unsupported operand type %r" % (op,))
-def _compile_charset(charset, flags, code, fixup=None, fixes=None):
+def _compile_charset(charset, flags, code):
# compile charset subprogram
emit = code.append
- for op, av in _optimize_charset(charset, fixup, fixes):
+ for op, av in charset:
emit(op)
if op is NEGATE:
pass
@@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
raise error("internal: unsupported set operator %r" % (op,))
emit(FAILURE)
-def _optimize_charset(charset, fixup, fixes):
+def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set
out = []
tail = []
charmap = bytearray(256)
+ hascased = False
for op, av in charset:
while True:
try:
@@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
if fixes and lo in fixes:
for k in fixes[lo]:
charmap[k] = 1
+ if not hascased and iscased(av):
+ hascased = True
else:
charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
if fixup:
- r = map(fixup, r)
- if fixup and fixes:
- for i in r:
- charmap[i] = 1
- if i in fixes:
- for k in fixes[i]:
- charmap[k] = 1
+ if fixes:
+ for i in map(fixup, r):
+ charmap[i] = 1
+ if i in fixes:
+ for k in fixes[i]:
+ charmap[k] = 1
+ else:
+ for i in map(fixup, r):
+ charmap[i] = 1
+ if not hascased:
+ hascased = any(map(iscased, r))
else:
for i in r:
charmap[i] = 1
@@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
charmap += b'\0' * 0xff00
continue
# Character set contains non-BMP character codes.
- # There are only two ranges of cased non-BMP characters:
- # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
- # and for both ranges RANGE_IGNORE works.
- if fixup and op is RANGE:
- op = RANGE_IGNORE
+ if fixup:
+ hascased = True
+ # There are only two ranges of cased non-BMP characters:
+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+ # and for both ranges RANGE_IGNORE works.
+ if op is RANGE:
+ op = RANGE_IGNORE
tail.append((op, av))
break
@@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
out.append((RANGE, (p, q - 1)))
out += tail
# if the case was changed or new representation is more compact
- if fixup or len(out) < len(charset):
- return out
+ if hascased or len(out) < len(charset):
+ return out, hascased
# else original character set is good enough
- return charset
+ return charset, hascased
# use bitmap
if len(charmap) == 256:
data = _mk_bitmap(charmap)
out.append((CHARSET, data))
out += tail
- return out
+ return out, hascased
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
@@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data))
out += tail
- return out
+ return out, hascased
_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
@@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
table[i] = idx + 1
return table
-def _get_literal_prefix(pattern):
+def _get_iscased(flags):
+ if not flags & SRE_FLAG_IGNORECASE:
+ return None
+ elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+ return _sre.unicode_iscased
+ else:
+ return _sre.ascii_iscased
+
+def _get_literal_prefix(pattern, flags):
# look for literal prefix
prefix = []
prefixappend = prefix.append
prefix_skip = None
+ iscased = _get_iscased(flags)
for op, av in pattern.data:
if op is LITERAL:
+ if iscased and iscased(av):
+ break
prefixappend(av)
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
- if add_flags & SRE_FLAG_IGNORECASE:
+ flags1 = (flags | add_flags) & ~del_flags
+ if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
break
- prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
+ prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
if prefix_skip is None:
if group is not None:
prefix_skip = len(prefix)
@@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
return prefix, prefix_skip, True
return prefix, prefix_skip, False
-def _get_charset_prefix(pattern):
- charset = [] # not used
- charsetappend = charset.append
- if pattern.data:
+def _get_charset_prefix(pattern, flags):
+ while True:
+ if not pattern.data:
+ return None
op, av = pattern.data[0]
- if op is SUBPATTERN:
- group, add_flags, del_flags, p = av
- if p and not (add_flags & SRE_FLAG_IGNORECASE):
- op, av = p[0]
- if op is LITERAL:
- charsetappend((op, av))
- elif op is BRANCH:
- c = []
- cappend = c.append
- for p in av[1]:
- if not p:
- break
- op, av = p[0]
- if op is LITERAL:
- cappend((op, av))
- else:
- break
- else:
- charset = c
- elif op is BRANCH:
- c = []
- cappend = c.append
- for p in av[1]:
- if not p:
- break
- op, av = p[0]
- if op is LITERAL:
- cappend((op, av))
- else:
- break
+ if op is not SUBPATTERN:
+ break
+ group, add_flags, del_flags, pattern = av
+ flags = (flags | add_flags) & ~del_flags
+ if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+ return None
+
+ iscased = _get_iscased(flags)
+ if op is LITERAL:
+ if iscased and iscased(av):
+ return None
+ return [(op, av)]
+ elif op is BRANCH:
+ charset = []
+ charsetappend = charset.append
+ for p in av[1]:
+ if not p:
+ return None
+ op, av = p[0]
+ if op is LITERAL and not (iscased and iscased(av)):
+ charsetappend((op, av))
else:
- charset = c
- elif op is IN:
- charset = av
- return charset
+ return None
+ return charset
+ elif op is IN:
+ charset = av
+ if iscased:
+ for op, av in charset:
+ if op is LITERAL:
+ if iscased(av):
+ return None
+ elif op is RANGE:
+ if av[1] > 0xffff:
+ return None
+ if any(map(iscased, range(av[0], av[1]+1))):
+ return None
+ return charset
+ return None
def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
@@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
prefix = []
prefix_skip = 0
charset = [] # not used
- if not (flags & SRE_FLAG_IGNORECASE):
+ if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix
- prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+ prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix
if not prefix:
- charset = _get_charset_prefix(pattern)
+ charset = _get_charset_prefix(pattern, flags)
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
@@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
# generate overlap table
code.extend(_generate_overlap_table(prefix))
elif charset:
+ charset, hascased = _optimize_charset(charset)
+ assert not hascased
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index b5b7cff..3129f7e 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -891,15 +891,24 @@ class ReTests(unittest.TestCase):
lo = ord(c.lower())
self.assertEqual(_sre.ascii_tolower(i), lo)
self.assertEqual(_sre.unicode_tolower(i), lo)
+ iscased = c in string.ascii_letters
+ self.assertEqual(_sre.ascii_iscased(i), iscased)
+ self.assertEqual(_sre.unicode_iscased(i), iscased)
for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
c = chr(i)
self.assertEqual(_sre.ascii_tolower(i), i)
if i != 0x0130:
self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
+ iscased = c != c.lower() or c != c.upper()
+ self.assertFalse(_sre.ascii_iscased(i))
+ self.assertEqual(_sre.unicode_iscased(i),
+ c != c.lower() or c != c.upper())
self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
+ self.assertFalse(_sre.ascii_iscased(0x0130))
+ self.assertTrue(_sre.unicode_iscased(0x0130))
def test_not_literal(self):
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
diff --git a/Misc/NEWS b/Misc/NEWS
index 1828b01..7a79521 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -320,6 +320,9 @@ Extension Modules
Library
-------
+- bpo-30285: Optimized case-insensitive matching and searching of regular
+ expressions.
+
- bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin.
- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and
diff --git a/Modules/_sre.c b/Modules/_sre.c
index a86c5f2..6873f1d 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -274,6 +274,38 @@ _sre_getcodesize_impl(PyObject *module)
}
/*[clinic input]
+_sre.ascii_iscased -> bool
+
+ character: int
+ /
+
+[clinic start generated code]*/
+
+static int
+_sre_ascii_iscased_impl(PyObject *module, int character)
+/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
+{
+ unsigned int ch = (unsigned int)character;
+ return ch != sre_lower(ch) || ch != sre_upper(ch);
+}
+
+/*[clinic input]
+_sre.unicode_iscased -> bool
+
+ character: int
+ /
+
+[clinic start generated code]*/
+
+static int
+_sre_unicode_iscased_impl(PyObject *module, int character)
+/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
+{
+ unsigned int ch = (unsigned int)character;
+ return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
+}
+
+/*[clinic input]
_sre.ascii_tolower -> int
character: int
@@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = {
static PyMethodDef _functions[] = {
_SRE_COMPILE_METHODDEF
_SRE_GETCODESIZE_METHODDEF
+ _SRE_ASCII_ISCASED_METHODDEF
+ _SRE_UNICODE_ISCASED_METHODDEF
_SRE_ASCII_TOLOWER_METHODDEF
_SRE_UNICODE_TOLOWER_METHODDEF
{NULL, NULL}
diff --git a/Modules/clinic/_sre.c.h b/Modules/clinic/_sre.c.h
index 8056eda..1e60686 100644
--- a/Modules/clinic/_sre.c.h
+++ b/Modules/clinic/_sre.c.h
@@ -29,6 +29,68 @@ exit:
return return_value;
}
+PyDoc_STRVAR(_sre_ascii_iscased__doc__,
+"ascii_iscased($module, character, /)\n"
+"--\n"
+"\n");
+
+#define _SRE_ASCII_ISCASED_METHODDEF \
+ {"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__},
+
+static int
+_sre_ascii_iscased_impl(PyObject *module, int character);
+
+static PyObject *
+_sre_ascii_iscased(PyObject *module, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ int character;
+ int _return_value;
+
+ if (!PyArg_Parse(arg, "i:ascii_iscased", &character)) {
+ goto exit;
+ }
+ _return_value = _sre_ascii_iscased_impl(module, character);
+ if ((_return_value == -1) && PyErr_Occurred()) {
+ goto exit;
+ }
+ return_value = PyBool_FromLong((long)_return_value);
+
+exit:
+ return return_value;
+}
+
+PyDoc_STRVAR(_sre_unicode_iscased__doc__,
+"unicode_iscased($module, character, /)\n"
+"--\n"
+"\n");
+
+#define _SRE_UNICODE_ISCASED_METHODDEF \
+ {"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__},
+
+static int
+_sre_unicode_iscased_impl(PyObject *module, int character);
+
+static PyObject *
+_sre_unicode_iscased(PyObject *module, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ int character;
+ int _return_value;
+
+ if (!PyArg_Parse(arg, "i:unicode_iscased", &character)) {
+ goto exit;
+ }
+ _return_value = _sre_unicode_iscased_impl(module, character);
+ if ((_return_value == -1) && PyErr_Occurred()) {
+ goto exit;
+ }
+ return_value = PyBool_FromLong((long)_return_value);
+
+exit:
+ return return_value;
+}
+
PyDoc_STRVAR(_sre_ascii_tolower__doc__,
"ascii_tolower($module, character, /)\n"
"--\n"
@@ -715,4 +777,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored))
{
return _sre_SRE_Scanner_search_impl(self);
}
-/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/
+/*[clinic end generated code: output=5fe47c49e475cccb input=a9049054013a1b77]*/