summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r--  Lib/sre_compile.py       69
-rw-r--r--  Lib/sre_constants.py      2
-rw-r--r--  Modules/_sre.c           52
-rw-r--r--  Modules/sre_constants.h   2
4 files changed, 90 insertions, 35 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 3e54819..1d59d7e 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -16,7 +16,10 @@ from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
-MAXCODE = 65535
+if _sre.CODESIZE == 2:
+ MAXCODE = 65535
+else:
+ MAXCODE = 0xFFFFFFFFL
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
# XXX: could append to charmap tail
return charset # cannot compress
except IndexError:
- if sys.maxunicode != 65535:
- # XXX: big charsets don't work in UCS-4 builds
- return charset
# character set contains unicode characters
return _optimize_unicode(charset, fixup)
# compress character map
@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
def _mk_bitmap(bits):
data = []
- m = 1; v = 0
+ if _sre.CODESIZE == 2:
+ start = (1, 0)
+ else:
+ start = (1L, 0L)
+ m, v = start
for c in bits:
if c:
v = v + m
m = m << 1
if m > MAXCODE:
data.append(v)
- m = 1; v = 0
+ m, v = start
return data
# To represent a big charset, first a bitmap of all characters in the
@@ -258,21 +262,38 @@ def _mk_bitmap(bits):
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
+# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
+# of the basic multilingual plane; an efficient representation
+# for all of UTF-16 has not yet been developed. This means,
+# in particular, that negated charsets cannot be represented as
+# bigcharsets.
+
def _optimize_unicode(charset, fixup):
+ try:
+ import array
+ except ImportError:
+ return charset
charmap = [0]*65536
negate = 0
- for op, av in charset:
- if op is NEGATE:
- negate = 1
- elif op is LITERAL:
- charmap[fixup(av)] = 1
- elif op is RANGE:
- for i in range(fixup(av[0]), fixup(av[1])+1):
- charmap[i] = 1
- elif op is CATEGORY:
- # XXX: could expand category
- return charset # cannot compress
+ try:
+ for op, av in charset:
+ if op is NEGATE:
+ negate = 1
+ elif op is LITERAL:
+ charmap[fixup(av)] = 1
+ elif op is RANGE:
+ for i in range(fixup(av[0]), fixup(av[1])+1):
+ charmap[i] = 1
+ elif op is CATEGORY:
+ # XXX: could expand category
+ return charset # cannot compress
+ except IndexError:
+ # non-BMP characters
+ return charset
if negate:
+ if sys.maxunicode != 65535:
+ # XXX: negation does not work with big charsets
+ return charset
for i in range(65536):
charmap[i] = not charmap[i]
comps = {}
@@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
block = block + 1
data = data + _mk_bitmap(chunk)
header = [block]
- assert MAXCODE == 65535
- for i in range(128):
- if sys.byteorder == 'big':
- header.append(256*mapping[2*i]+mapping[2*i+1])
- else:
- header.append(mapping[2*i]+256*mapping[2*i+1])
+ if MAXCODE == 65535:
+ code = 'H'
+ else:
+ code = 'L'
+ # Convert block indices to byte array of 256 bytes
+ mapping = array.array('b', mapping).tostring()
+ # Convert byte array to word array
+ header = header + array.array(code, mapping).tolist()
data[0:0] = header
return [(BIGCHARSET, data)]
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 2cd85a3..07b24dd 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20010701
+MAGIC = 20030419
# max code word in this release
diff --git a/Modules/_sre.c b/Modules/_sre.c
index dde365b..8cae095 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -20,6 +20,7 @@
* 2001-10-24 fl added finditer primitive (for 2.2 only)
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
* 2002-11-09 fl fixed empty sub/subn return type
+ * 2003-04-18 mvl fully support 4-byte codes
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
@@ -510,10 +511,18 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
break;
case SRE_OP_CHARSET:
- /* <CHARSET> <bitmap> (16 bits per code word) */
- if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
- return ok;
- set += 16;
+ if (sizeof(SRE_CODE) == 2) {
+ /* <CHARSET> <bitmap> (16 bits per code word) */
+ if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
+ return ok;
+ set += 16;
+ }
+ else {
+ /* <CHARSET> <bitmap> (32 bits per code word) */
+ if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
+ return ok;
+ set += 8;
+ }
break;
case SRE_OP_BIGCHARSET:
@@ -521,11 +530,25 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
{
int count, block;
count = *(set++);
- block = ((unsigned char*)set)[ch >> 8];
- set += 128;
- if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
- return ok;
- set += count*16;
+
+ if (sizeof(SRE_CODE) == 2) {
+ block = ((unsigned char*)set)[ch >> 8];
+ set += 128;
+ if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
+ return ok;
+ set += count*16;
+ }
+ else {
+ if (ch < 65536)
+ block = ((unsigned char*)set)[ch >> 8];
+ else
+ block = -1;
+ set += 64;
+ if (block >=0 &&
+ (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
+ return ok;
+ set += count*8;
+ }
break;
}
@@ -1371,7 +1394,10 @@ _compile(PyObject* self_, PyObject* args)
for (i = 0; i < n; i++) {
PyObject *o = PyList_GET_ITEM(code, i);
- self->code[i] = (SRE_CODE) PyInt_AsLong(o);
+ if (PyInt_Check(o))
+ self->code[i] = (SRE_CODE) PyInt_AsLong(o);
+ else
+ self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
}
if (PyErr_Occurred()) {
@@ -3045,6 +3071,12 @@ PyMODINIT_FUNC init_sre(void)
Py_DECREF(x);
}
+ x = PyInt_FromLong(sizeof(SRE_CODE));
+ if (x) {
+ PyDict_SetItemString(d, "CODESIZE", x);
+ Py_DECREF(x);
+ }
+
x = PyString_FromString(copyright);
if (x) {
PyDict_SetItemString(d, "copyright", x);
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 540008e..619ea00 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20010701
+#define SRE_MAGIC 20030419
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2