From 78e2f06cc66178887ee0d6d243370efa241a675a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Sat, 19 Apr 2003 12:56:08 +0000 Subject: Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds. --- Lib/sre_compile.py | 69 ++++++++++++++++++++++++++++++++----------------- Lib/sre_constants.py | 2 +- Modules/_sre.c | 52 ++++++++++++++++++++++++++++++------- Modules/sre_constants.h | 2 +- 4 files changed, 90 insertions(+), 35 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 3e54819..1d59d7e 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -16,7 +16,10 @@ from sre_constants import * assert _sre.MAGIC == MAGIC, "SRE module mismatch" -MAXCODE = 65535 +if _sre.CODESIZE == 2: + MAXCODE = 65535 +else: + MAXCODE = 0xFFFFFFFFL def _compile(code, pattern, flags): # internal: compile a (sub)pattern @@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup): # XXX: could append to charmap tail return charset # cannot compress except IndexError: - if sys.maxunicode != 65535: - # XXX: big charsets don't work in UCS-4 builds - return charset # character set contains unicode characters return _optimize_unicode(charset, fixup) # compress character map @@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup): def _mk_bitmap(bits): data = [] - m = 1; v = 0 + if _sre.CODESIZE == 2: + start = (1, 0) + else: + start = (1L, 0L) + m, v = start for c in bits: if c: v = v + m m = m << 1 if m > MAXCODE: data.append(v) - m = 1; v = 0 + m, v = start return data # To represent a big charset, first a bitmap of all characters in the @@ -258,21 +262,38 @@ def _mk_bitmap(bits): # less significant byte is a bit index in the chunk (just like the # CHARSET matching). +# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets +# of the basic multilingual plane; an efficient representation +# for all of UTF-16 has not yet been developed. This means, +# in particular, that negated charsets cannot be represented as +# bigcharsets. + def _optimize_unicode(charset, fixup): + try: + import array + except ImportError: + return charset charmap = [0]*65536 negate = 0 - for op, av in charset: - if op is NEGATE: - negate = 1 - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in range(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could expand category - return charset # cannot compress + try: + for op, av in charset: + if op is NEGATE: + negate = 1 + elif op is LITERAL: + charmap[fixup(av)] = 1 + elif op is RANGE: + for i in range(fixup(av[0]), fixup(av[1])+1): + charmap[i] = 1 + elif op is CATEGORY: + # XXX: could expand category + return charset # cannot compress + except IndexError: + # non-BMP characters + return charset if negate: + if sys.maxunicode != 65535: + # XXX: negation does not work with big charsets + return charset for i in range(65536): charmap[i] = not charmap[i] comps = {} @@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup): block = block + 1 data = data + _mk_bitmap(chunk) header = [block] - assert MAXCODE == 65535 - for i in range(128): - if sys.byteorder == 'big': - header.append(256*mapping[2*i]+mapping[2*i+1]) - else: - header.append(mapping[2*i]+256*mapping[2*i+1]) + if MAXCODE == 65535: + code = 'H' + else: + code = 'L' + # Convert block indices to byte array of 256 bytes + mapping = array.array('b', mapping).tostring() + # Convert byte array to word array + header = header + array.array(code, mapping).tolist() data[0:0] = header return [(BIGCHARSET, data)] diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 2cd85a3..07b24dd 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20010701 +MAGIC = 20030419 # max code word in this release diff --git a/Modules/_sre.c b/Modules/_sre.c index dde365b..8cae095 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -20,6 +20,7 @@ * 2001-10-24 fl added finditer primitive (for 2.2 only) * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) * 2002-11-09 fl fixed empty sub/subn return type + * 2003-04-18 mvl fully support 4-byte codes * * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * @@ -510,10 +511,18 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) break; case SRE_OP_CHARSET: - /* (16 bits per code word) */ - if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) - return ok; - set += 16; + if (sizeof(SRE_CODE) == 2) { + /* (16 bits per code word) */ + if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) + return ok; + set += 16; + } + else { + /* (32 bits per code word) */ + if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31)))) + return ok; + set += 8; + } break; case SRE_OP_BIGCHARSET: @@ -521,11 +530,25 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) { int count, block; count = *(set++); - block = ((unsigned char*)set)[ch >> 8]; - set += 128; - if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) - return ok; - set += count*16; + + if (sizeof(SRE_CODE) == 2) { + block = ((unsigned char*)set)[ch >> 8]; + set += 128; + if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) + return ok; + set += count*16; + } + else { + if (ch < 65536) + block = ((unsigned char*)set)[ch >> 8]; + else + block = -1; + set += 64; + if (block >=0 && + (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31)))) + return ok; + set += count*8; + } break; } @@ -1371,7 +1394,10 @@ _compile(PyObject* self_, PyObject* args) for (i = 0; i < n; i++) { PyObject *o = PyList_GET_ITEM(code, i); - self->code[i] = (SRE_CODE) PyInt_AsLong(o); + if (PyInt_Check(o)) + self->code[i] = (SRE_CODE) PyInt_AsLong(o); + else + self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o); } if (PyErr_Occurred()) { @@ -3045,6 +3071,12 @@ PyMODINIT_FUNC init_sre(void) Py_DECREF(x); } + x = PyInt_FromLong(sizeof(SRE_CODE)); + if (x) { + PyDict_SetItemString(d, "CODESIZE", x); + Py_DECREF(x); + } + x = PyString_FromString(copyright); if (x) { PyDict_SetItemString(d, "copyright", x); diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 540008e..619ea00 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20010701 +#define SRE_MAGIC 20030419 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 -- cgit v0.12