Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r--  Lib/sre_compile.py | 69
1 file changed, 46 insertions(+), 23 deletions(-)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 3e54819..1d59d7e 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -16,7 +16,10 @@ from sre_constants import *
 
 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
 
-MAXCODE = 65535
+if _sre.CODESIZE == 2:
+    MAXCODE = 65535
+else:
+    MAXCODE = 0xFFFFFFFFL
 
 def _compile(code, pattern, flags):
     # internal: compile a (sub)pattern
@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
                 # XXX: could append to charmap tail
                 return charset # cannot compress
     except IndexError:
-        if sys.maxunicode != 65535:
-            # XXX: big charsets don't work in UCS-4 builds
-            return charset
         # character set contains unicode characters
         return _optimize_unicode(charset, fixup)
     # compress character map
@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
 
 def _mk_bitmap(bits):
     data = []
-    m = 1; v = 0
+    if _sre.CODESIZE == 2:
+        start = (1, 0)
+    else:
+        start = (1L, 0L)
+    m, v = start
     for c in bits:
         if c:
             v = v + m
         m = m << 1
         if m > MAXCODE:
             data.append(v)
-            m = 1; v = 0
+            m, v = start
     return data
 
 # To represent a big charset, first a bitmap of all characters in the
@@ -258,21 +262,38 @@ def _mk_bitmap(bits):
 # less significant byte is a bit index in the chunk (just like the
 # CHARSET matching).
 
+# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
+# of the basic multilingual plane; an efficient representation
+# for all of UTF-16 has not yet been developed. This means,
+# in particular, that negated charsets cannot be represented as
+# bigcharsets.
+
 def _optimize_unicode(charset, fixup):
+    try:
+        import array
+    except ImportError:
+        return charset
     charmap = [0]*65536
     negate = 0
-    for op, av in charset:
-        if op is NEGATE:
-            negate = 1
-        elif op is LITERAL:
-            charmap[fixup(av)] = 1
-        elif op is RANGE:
-            for i in range(fixup(av[0]), fixup(av[1])+1):
-                charmap[i] = 1
-        elif op is CATEGORY:
-            # XXX: could expand category
-            return charset # cannot compress
+    try:
+        for op, av in charset:
+            if op is NEGATE:
+                negate = 1
+            elif op is LITERAL:
+                charmap[fixup(av)] = 1
+            elif op is RANGE:
+                for i in range(fixup(av[0]), fixup(av[1])+1):
+                    charmap[i] = 1
+            elif op is CATEGORY:
+                # XXX: could expand category
+                return charset # cannot compress
+    except IndexError:
+        # non-BMP characters
+        return charset
     if negate:
+        if sys.maxunicode != 65535:
+            # XXX: negation does not work with big charsets
+            return charset
         for i in range(65536):
            charmap[i] = not charmap[i]
     comps = {}
@@ -287,12 +308,14 @@
             block = block + 1
             data = data + _mk_bitmap(chunk)
     header = [block]
-    assert MAXCODE == 65535
-    for i in range(128):
-        if sys.byteorder == 'big':
-            header.append(256*mapping[2*i]+mapping[2*i+1])
-        else:
-            header.append(mapping[2*i]+256*mapping[2*i+1])
+    if MAXCODE == 65535:
+        code = 'H'
+    else:
+        code = 'L'
+    # Convert block indices to byte array of 256 bytes
+    mapping = array.array('b', mapping).tostring()
+    # Convert byte array to word array
+    header = header + array.array(code, mapping).tolist()
     data[0:0] = header
     return [(BIGCHARSET, data)]
 
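The _mk_bitmap change makes the bit-packing width follow the engine's code word size: flags are accumulated least-significant-bit first into 16-bit words when _sre.CODESIZE == 2 and into 32-bit words otherwise. A minimal sketch of the same packing loop, written for modern Python 3 (which has no L long literals), with a hypothetical maxcode parameter standing in for the module-level MAXCODE:

    def mk_bitmap(bits, maxcode):
        # Pack a sequence of 0/1 flags into words no wider than
        # maxcode, least significant bit first -- the scheme
        # _mk_bitmap uses for CHARSET/BIGCHARSET chunks.
        data = []
        m, v = 1, 0
        for c in bits:
            if c:
                v = v + m
            m = m << 1
            if m > maxcode:  # current word is full, start a new one
                data.append(v)
                m, v = 1, 0
        return data

    print(mk_bitmap([1, 0] * 8, 65535))        # [21845]: 16 flags fill one 16-bit word
    print(mk_bitmap([1, 0] * 16, 0xFFFFFFFF))  # [1431655765]: 32 flags fill one 32-bit word

The callers always pass chunks whose length (256 bits) is a multiple of either word size, so the loop never ends with a partially filled word.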
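The last hunk replaces the hand-written, sys.byteorder-dependent packing of the 256 block indices with a reinterpretation through the array module: the indices are flattened to a byte string and re-read as 16-bit ('H') or 32-bit ('L') code words in native byte order, which is the order the C engine will use to read them back. A small Python 3 illustration of the trick (tobytes() is the Python 3 name for tostring(); unsigned 'B' is used here because signed 'b' raises OverflowError for indices above 127):

    import array

    mapping = list(range(256))  # one block index per 256-character page

    # Serialize the indices as raw bytes, then view the same bytes as
    # native-endian words, mirroring the diff's MAXCODE switch.
    raw = array.array('B', mapping).tobytes()
    words = array.array('H', raw).tolist()

    print(len(words))     # 128 -- two byte-sized indices per 16-bit word
    print(hex(words[0]))  # 0x100 on little-endian, 0x1 on big-endian

Because _sre reads the header back through words of the same native width, the result is byte-order correct by construction, with no need for the removed big/little-endian branches. (One caveat for a modern port: array's 'L' is a C unsigned long, which is 8 bytes on most 64-bit Unix platforms; 'I' is the usual 4-byte choice today.)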