From 6d336a027913327fc042b0d758a16724fea27b9c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 9 May 2017 23:37:14 +0300 Subject: bpo-30285: Optimize case-insensitive matching and searching (#1482) of regular expressions. --- Doc/whatsnew/3.7.rst | 4 ++ Lib/sre_compile.py | 171 +++++++++++++++++++++++++++++------------------- Lib/test/test_re.py | 9 +++ Misc/NEWS | 3 + Modules/_sre.c | 34 ++++++++++ Modules/clinic/_sre.c.h | 64 +++++++++++++++++- 6 files changed, 215 insertions(+), 70 deletions(-) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 3de8bc5..57fd4e4 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -208,6 +208,10 @@ Optimizations using the :func:`os.scandir` function. (Contributed by Serhiy Storchaka in :issue:`25996`.) +* Optimized case-insensitive matching and searching of :mod:`regular + expressions `. Searching some patterns can now be up to 20 times faster. + (Contributed by Serhiy Storchaka in :issue:`30285`.) + Build and C API Changes ======================= diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index db8b8a2..cebecb9 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -69,13 +69,16 @@ def _compile(code, pattern, flags): REPEATING_CODES = _REPEATING_CODES SUCCESS_CODES = _SUCCESS_CODES ASSERT_CODES = _ASSERT_CODES + iscased = None tolower = None fixes = None if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + iscased = _sre.unicode_iscased tolower = _sre.unicode_tolower fixes = _ignorecase_fixes else: + iscased = _sre.ascii_iscased tolower = _sre.ascii_tolower for op, av in pattern: if op in LITERAL_CODES: @@ -85,6 +88,9 @@ def _compile(code, pattern, flags): elif flags & SRE_FLAG_LOCALE: emit(OP_LOC_IGNORE[op]) emit(av) + elif not iscased(av): + emit(op) + emit(av) else: lo = tolower(av) if fixes and lo in fixes: @@ -101,14 +107,15 @@ def _compile(code, pattern, flags): emit(OP_IGNORE[op]) emit(lo) elif op is IN: - if not flags & SRE_FLAG_IGNORECASE: - emit(op) - elif flags & SRE_FLAG_LOCALE: + charset, hascased = _optimize_charset(av, iscased, tolower, fixes) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: emit(IN_LOC_IGNORE) - else: + elif hascased: emit(IN_IGNORE) + else: + emit(IN) skip = _len(code); emit(0) - _compile_charset(av, flags, code, tolower, fixes) + _compile_charset(charset, flags, code) code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: @@ -223,10 +230,10 @@ def _compile(code, pattern, flags): else: raise error("internal: unsupported operand type %r" % (op,)) -def _compile_charset(charset, flags, code, fixup=None, fixes=None): +def _compile_charset(charset, flags, code): # compile charset subprogram emit = code.append - for op, av in _optimize_charset(charset, fixup, fixes): + for op, av in charset: emit(op) if op is NEGATE: pass @@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None): raise error("internal: unsupported set operator %r" % (op,)) emit(FAILURE) -def _optimize_charset(charset, fixup, fixes): +def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): # internal: optimize character set out = [] tail = [] charmap = bytearray(256) + hascased = False for op, av in charset: while True: try: @@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes): if fixes and lo in fixes: for k in fixes[lo]: charmap[k] = 1 + if not hascased and iscased(av): + hascased = True else: charmap[av] = 1 elif op is RANGE: r = range(av[0], av[1]+1) if fixup: - r = map(fixup, r) - if fixup and fixes: - for i in r: - charmap[i] = 1 - if i in fixes: - for k in fixes[i]: - charmap[k] = 1 + if fixes: + for i in map(fixup, r): + charmap[i] = 1 + if i in fixes: + for k in fixes[i]: + charmap[k] = 1 + else: + for i in map(fixup, r): + charmap[i] = 1 + if not hascased: + hascased = any(map(iscased, r)) else: for i in r: charmap[i] = 1 @@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes): charmap += b'\0' * 0xff00 continue # Character set contains non-BMP character codes. - # There are only two ranges of cased non-BMP characters: - # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), - # and for both ranges RANGE_IGNORE works. - if fixup and op is RANGE: - op = RANGE_IGNORE + if fixup: + hascased = True + # There are only two ranges of cased non-BMP characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), + # and for both ranges RANGE_IGNORE works. + if op is RANGE: + op = RANGE_IGNORE tail.append((op, av)) break @@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes): out.append((RANGE, (p, q - 1))) out += tail # if the case was changed or new representation is more compact - if fixup or len(out) < len(charset): - return out + if hascased or len(out) < len(charset): + return out, hascased # else original character set is good enough - return charset + return charset, hascased # use bitmap if len(charmap) == 256: data = _mk_bitmap(charmap) out.append((CHARSET, data)) out += tail - return out + return out, hascased # To represent a big charset, first a bitmap of all characters in the # set is constructed. Then, this bitmap is sliced into chunks of 256 @@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes): data[0:0] = [block] + _bytes_to_codes(mapping) out.append((BIGCHARSET, data)) out += tail - return out + return out, hascased _CODEBITS = _sre.CODESIZE * 8 MAXCODE = (1 << _CODEBITS) - 1 @@ -414,19 +430,31 @@ def _generate_overlap_table(prefix): table[i] = idx + 1 return table -def _get_literal_prefix(pattern): +def _get_iscased(flags): + if not flags & SRE_FLAG_IGNORECASE: + return None + elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + return _sre.unicode_iscased + else: + return _sre.ascii_iscased + +def _get_literal_prefix(pattern, flags): # look for literal prefix prefix = [] prefixappend = prefix.append prefix_skip = None + iscased = _get_iscased(flags) for op, av in pattern.data: if op is LITERAL: + if iscased and iscased(av): + break prefixappend(av) elif op is SUBPATTERN: group, add_flags, del_flags, p = av - if add_flags & SRE_FLAG_IGNORECASE: + flags1 = (flags | add_flags) & ~del_flags + if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: break - prefix1, prefix_skip1, got_all = _get_literal_prefix(p) + prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) if prefix_skip is None: if group is not None: prefix_skip = len(prefix) @@ -441,46 +469,49 @@ def _get_literal_prefix(pattern): return prefix, prefix_skip, True return prefix, prefix_skip, False -def _get_charset_prefix(pattern): - charset = [] # not used - charsetappend = charset.append - if pattern.data: +def _get_charset_prefix(pattern, flags): + while True: + if not pattern.data: + return None op, av = pattern.data[0] - if op is SUBPATTERN: - group, add_flags, del_flags, p = av - if p and not (add_flags & SRE_FLAG_IGNORECASE): - op, av = p[0] - if op is LITERAL: - charsetappend((op, av)) - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break + if op is not SUBPATTERN: + break + group, add_flags, del_flags, pattern = av + flags = (flags | add_flags) & ~del_flags + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + return None + + iscased = _get_iscased(flags) + if op is LITERAL: + if iscased and iscased(av): + return None + return [(op, av)] + elif op is BRANCH: + charset = [] + charsetappend = charset.append + for p in av[1]: + if not p: + return None + op, av = p[0] + if op is LITERAL and not (iscased and iscased(av)): + charsetappend((op, av)) else: - charset = c - elif op is IN: - charset = av - return charset + return None + return charset + elif op is IN: + charset = av + if iscased: + for op, av in charset: + if op is LITERAL: + if iscased(av): + return None + elif op is RANGE: + if av[1] > 0xffff: + return None + if any(map(iscased, range(av[0], av[1]+1))): + return None + return charset + return None def _compile_info(code, pattern, flags): # internal: compile an info block. in the current version, @@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags): prefix = [] prefix_skip = 0 charset = [] # not used - if not (flags & SRE_FLAG_IGNORECASE): + if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): # look for literal prefix - prefix, prefix_skip, got_all = _get_literal_prefix(pattern) + prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) # if no prefix, look for charset prefix if not prefix: - charset = _get_charset_prefix(pattern) + charset = _get_charset_prefix(pattern, flags) ## if prefix: ## print("*** PREFIX", prefix, prefix_skip) ## if charset: @@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags): # generate overlap table code.extend(_generate_overlap_table(prefix)) elif charset: + charset, hascased = _optimize_charset(charset) + assert not hascased _compile_charset(charset, flags, code) code[skip] = len(code) - skip diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index b5b7cff..3129f7e 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -891,15 +891,24 @@ class ReTests(unittest.TestCase): lo = ord(c.lower()) self.assertEqual(_sre.ascii_tolower(i), lo) self.assertEqual(_sre.unicode_tolower(i), lo) + iscased = c in string.ascii_letters + self.assertEqual(_sre.ascii_iscased(i), iscased) + self.assertEqual(_sre.unicode_iscased(i), iscased) for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: c = chr(i) self.assertEqual(_sre.ascii_tolower(i), i) if i != 0x0130: self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) + iscased = c != c.lower() or c != c.upper() + self.assertFalse(_sre.ascii_iscased(i)) + self.assertEqual(_sre.unicode_iscased(i), + c != c.lower() or c != c.upper()) self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) + self.assertFalse(_sre.ascii_iscased(0x0130)) + self.assertTrue(_sre.unicode_iscased(0x0130)) def test_not_literal(self): self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") diff --git a/Misc/NEWS b/Misc/NEWS index 1828b01..7a79521 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -320,6 +320,9 @@ Extension Modules Library ------- +- bpo-30285: Optimized case-insensitive matching and searching of regular + expressions. + - bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin. - bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and diff --git a/Modules/_sre.c b/Modules/_sre.c index a86c5f2..6873f1d 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -274,6 +274,38 @@ _sre_getcodesize_impl(PyObject *module) } /*[clinic input] +_sre.ascii_iscased -> bool + + character: int + / + +[clinic start generated code]*/ + +static int +_sre_ascii_iscased_impl(PyObject *module, int character) +/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/ +{ + unsigned int ch = (unsigned int)character; + return ch != sre_lower(ch) || ch != sre_upper(ch); +} + +/*[clinic input] +_sre.unicode_iscased -> bool + + character: int + / + +[clinic start generated code]*/ + +static int +_sre_unicode_iscased_impl(PyObject *module, int character) +/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/ +{ + unsigned int ch = (unsigned int)character; + return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch); +} + +/*[clinic input] _sre.ascii_tolower -> int character: int @@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = { static PyMethodDef _functions[] = { _SRE_COMPILE_METHODDEF _SRE_GETCODESIZE_METHODDEF + _SRE_ASCII_ISCASED_METHODDEF + _SRE_UNICODE_ISCASED_METHODDEF _SRE_ASCII_TOLOWER_METHODDEF _SRE_UNICODE_TOLOWER_METHODDEF {NULL, NULL} diff --git a/Modules/clinic/_sre.c.h b/Modules/clinic/_sre.c.h index 8056eda..1e60686 100644 --- a/Modules/clinic/_sre.c.h +++ b/Modules/clinic/_sre.c.h @@ -29,6 +29,68 @@ exit: return return_value; } +PyDoc_STRVAR(_sre_ascii_iscased__doc__, +"ascii_iscased($module, character, /)\n" +"--\n" +"\n"); + +#define _SRE_ASCII_ISCASED_METHODDEF \ + {"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__}, + +static int +_sre_ascii_iscased_impl(PyObject *module, int character); + +static PyObject * +_sre_ascii_iscased(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + int character; + int _return_value; + + if (!PyArg_Parse(arg, "i:ascii_iscased", &character)) { + goto exit; + } + _return_value = _sre_ascii_iscased_impl(module, character); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(_sre_unicode_iscased__doc__, +"unicode_iscased($module, character, /)\n" +"--\n" +"\n"); + +#define _SRE_UNICODE_ISCASED_METHODDEF \ + {"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__}, + +static int +_sre_unicode_iscased_impl(PyObject *module, int character); + +static PyObject * +_sre_unicode_iscased(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + int character; + int _return_value; + + if (!PyArg_Parse(arg, "i:unicode_iscased", &character)) { + goto exit; + } + _return_value = _sre_unicode_iscased_impl(module, character); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + PyDoc_STRVAR(_sre_ascii_tolower__doc__, "ascii_tolower($module, character, /)\n" "--\n" @@ -715,4 +777,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored)) { return _sre_SRE_Scanner_search_impl(self); } -/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5fe47c49e475cccb input=a9049054013a1b77]*/ -- cgit v0.12