From c04fcd40bd8fd2c9e427faded617214f8ae18402 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 31 Oct 2014 13:34:06 +0200 Subject: Backported the optimization of compiling charsets in regular expressions (issue #19329). This is needed to apply the patch from issue #17381. --- Lib/sre_compile.py | 231 ++++++++++++++++++++++++----------------------------- Misc/NEWS | 2 + 2 files changed, 105 insertions(+), 128 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 471753e..e121845 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -207,149 +207,124 @@ def _compile_charset(charset, flags, code, fixup=None): def _optimize_charset(charset, fixup): # internal: optimize character set out = [] - outappend = out.append - charmap = [0]*256 - try: - for op, av in charset: - if op is NEGATE: - outappend((op, av)) - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in range(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could append to charmap tail - return charset # cannot compress - except IndexError: - # character set contains unicode characters - return _optimize_unicode(charset, fixup) + tail = [] + charmap = bytearray(256) + for op, av in charset: + while True: + try: + if op is LITERAL: + charmap[fixup(av)] = 1 + elif op is RANGE: + for i in range(fixup(av[0]), fixup(av[1])+1): + charmap[i] = 1 + elif op is NEGATE: + out.append((op, av)) + else: + tail.append((op, av)) + except IndexError: + if len(charmap) == 256: + # character set contains non-UCS1 character codes + charmap += b'\0' * 0xff00 + continue + # character set contains non-BMP character codes + tail.append((op, av)) + break + # compress character map - i = p = n = 0 runs = [] - runsappend = runs.append - for c in charmap: - if c: - if n == 0: - p = i - n = n + 1 - elif n: - runsappend((p, n)) - n = 0 - i = i + 1 - if n: - runsappend((p, n)) - if len(runs) <= 2: + q = 0 + while True: + p = charmap.find(b'\1', q) + if p < 0: + break + if len(runs) >= 2: + runs = None + break + q = charmap.find(b'\0', p) + if q < 0: + runs.append((p, len(charmap))) + break + runs.append((p, q)) + if runs is not None: # use literal/range - for p, n in runs: - if n == 1: - outappend((LITERAL, p)) + for p, q in runs: + if q - p == 1: + out.append((LITERAL, p)) else: - outappend((RANGE, (p, p+n-1))) + out.append((RANGE, (p, q - 1))) + out += tail if len(out) < len(charset): return out - else: - # use bitmap + return charset + + # use bitmap + if len(charmap) == 256: data = _mk_bitmap(charmap) - outappend((CHARSET, data)) + out.append((CHARSET, data)) + out += tail return out - return charset -def _mk_bitmap(bits): - data = [] - dataappend = data.append - if _sre.CODESIZE == 2: - start = (1, 0) - else: - start = (1L, 0L) - m, v = start - for c in bits: - if c: - v = v + m - m = m + m - if m > MAXCODE: - dataappend(v) - m, v = start - return data - -# To represent a big charset, first a bitmap of all characters in the -# set is constructed. Then, this bitmap is sliced into chunks of 256 -# characters, duplicate chunks are eliminated, and each chunk is -# given a number. In the compiled expression, the charset is -# represented by a 32-bit word sequence, consisting of one word for -# the number of different chunks, a sequence of 256 bytes (64 words) -# of chunk numbers indexed by their original chunk position, and a -# sequence of 256-bit chunks (8 words each). - -# Compression is normally good: in a typical charset, large ranges of -# Unicode will be either completely excluded (e.g. if only cyrillic -# letters are to be matched), or completely included (e.g. if large -# subranges of Kanji match). These ranges will be represented by -# chunks of all one-bits or all zero-bits. - -# Matching can be also done efficiently: the more significant byte of -# the Unicode character is an index into the chunk number, and the -# less significant byte is a bit index in the chunk (just like the -# CHARSET matching). - -# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets -# of the basic multilingual plane; an efficient representation -# for all of Unicode has not yet been developed. This means, -# in particular, that negated charsets cannot be represented as -# bigcharsets. - -def _optimize_unicode(charset, fixup): - try: - import array - except ImportError: - return charset - charmap = [0]*65536 - negate = 0 - try: - for op, av in charset: - if op is NEGATE: - negate = 1 - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in xrange(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could expand category - return charset # cannot compress - except IndexError: - # non-BMP characters - return charset - if negate: - if sys.maxunicode != 65535: - # XXX: negation does not work with big charsets - return charset - for i in xrange(65536): - charmap[i] = not charmap[i] + # To represent a big charset, first a bitmap of all characters in the + # set is constructed. Then, this bitmap is sliced into chunks of 256 + # characters, duplicate chunks are eliminated, and each chunk is + # given a number. In the compiled expression, the charset is + # represented by a 32-bit word sequence, consisting of one word for + # the number of different chunks, a sequence of 256 bytes (64 words) + # of chunk numbers indexed by their original chunk position, and a + # sequence of 256-bit chunks (8 words each). + + # Compression is normally good: in a typical charset, large ranges of + # Unicode will be either completely excluded (e.g. if only cyrillic + # letters are to be matched), or completely included (e.g. if large + # subranges of Kanji match). These ranges will be represented by + # chunks of all one-bits or all zero-bits. + + # Matching can be also done efficiently: the more significant byte of + # the Unicode character is an index into the chunk number, and the + # less significant byte is a bit index in the chunk (just like the + # CHARSET matching). + + # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets + # of the basic multilingual plane; an efficient representation + # for all of Unicode has not yet been developed. + + charmap = bytes(charmap) # should be hashable comps = {} - mapping = [0]*256 + mapping = bytearray(256) block = 0 - data = [] - for i in xrange(256): - chunk = tuple(charmap[i*256:(i+1)*256]) - new = comps.setdefault(chunk, block) - mapping[i] = new - if new == block: - block = block + 1 - data = data + _mk_bitmap(chunk) - header = [block] + data = bytearray() + for i in range(0, 65536, 256): + chunk = charmap[i: i + 256] + if chunk in comps: + mapping[i // 256] = comps[chunk] + else: + mapping[i // 256] = comps[chunk] = block + block += 1 + data += chunk + data = _mk_bitmap(data) + data[0:0] = [block] + _bytes_to_codes(mapping) + out.append((BIGCHARSET, data)) + out += tail + return out + +_CODEBITS = _sre.CODESIZE * 8 +_BITS_TRANS = b'0' + b'1' * 255 +def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): + s = bytes(bits).translate(_BITS_TRANS)[::-1] + return [_int(s[i - _CODEBITS: i], 2) + for i in range(len(s), 0, -_CODEBITS)] + +def _bytes_to_codes(b): + # Convert block indices to word array + import array if _sre.CODESIZE == 2: code = 'H' else: code = 'I' - # Convert block indices to byte array of 256 bytes - mapping = array.array('B', mapping).tostring() - # Convert byte array to word array - mapping = array.array(code, mapping) - assert mapping.itemsize == _sre.CODESIZE - header = header + mapping.tolist() - data[0:0] = header - return [(BIGCHARSET, data)] + a = array.array(code, bytes(b)) + assert a.itemsize == _sre.CODESIZE + assert len(a) * a.itemsize == len(b) + return a.tolist() def _simple(av): # check if av is a "simple" operator diff --git a/Misc/NEWS b/Misc/NEWS index 725c0ab..ce0ed51 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -37,6 +37,8 @@ Core and Builtins Library ------- +- Issue #19329: Optimized compiling charsets in regular expressions. + - Issue #22410: Module level functions in the re module now cache compiled locale-dependent regular expressions taking into account the locale. -- cgit v0.12