author     Martin v. Löwis <martin@v.loewis.de>  2003-04-19 12:56:08 (GMT)
committer  Martin v. Löwis <martin@v.loewis.de>  2003-04-19 12:56:08 (GMT)
commit     78e2f06cc66178887ee0d6d243370efa241a675a (patch)
tree       07910e9afd123f6c5c394d46f6dbdf087ef978ba /Lib
parent     53d93adc46c73e6fdcd6db2d16f0136eadba9839 (diff)
Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.
Diffstat (limited to 'Lib')
-rw-r--r--  Lib/sre_compile.py    | 69
-rw-r--r--  Lib/sre_constants.py  |  2
2 files changed, 47 insertions, 24 deletions
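
Before the diff itself, a minimal standalone sketch (not part of the patch) of the width change the commit message describes. It assumes, as the new code in sre_compile.py below does, that _sre.CODESIZE is 2 on narrow (UCS-2) builds and 4 on wide (UCS-4) builds; MAXCODE is the largest value a single code word can hold.

    def max_code(codesize):
        # codesize plays the role of _sre.CODESIZE: every opcode and
        # operand in a compiled pattern occupies one code word this wide.
        if codesize == 2:
            return 65535        # the old fixed limit: 16-bit code words
        return 2 ** 32 - 1      # 32-bit code words on wide builds

    assert max_code(2) == 0xFFFF
    assert max_code(4) == 4294967295

The patch spells the wide value as 0xFFFFFFFFL because at the time a plain out-of-range hex literal evaluated to a negative int on 32-bit platforms; the L suffix forces a Python long.
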
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 3e54819..1d59d7e 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -16,7 +16,10 @@ from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
-MAXCODE = 65535
+if _sre.CODESIZE == 2:
+ MAXCODE = 65535
+else:
+ MAXCODE = 0xFFFFFFFFL
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
# XXX: could append to charmap tail
return charset # cannot compress
except IndexError:
- if sys.maxunicode != 65535:
- # XXX: big charsets don't work in UCS-4 builds
- return charset
# character set contains unicode characters
return _optimize_unicode(charset, fixup)
# compress character map
@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
def _mk_bitmap(bits):
data = []
- m = 1; v = 0
+ if _sre.CODESIZE == 2:
+ start = (1, 0)
+ else:
+ start = (1L, 0L)
+ m, v = start
for c in bits:
if c:
v = v + m
m = m << 1
if m > MAXCODE:
data.append(v)
- m = 1; v = 0
+ m, v = start
return data
# To represent a big charset, first a bitmap of all characters in the
@@ -258,21 +262,38 @@ def _mk_bitmap(bits):
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
+# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
+# of the basic multilingual plane; an efficient representation
+# for all of UTF-16 has not yet been developed. This means,
+# in particular, that negated charsets cannot be represented as
+# bigcharsets.
+
def _optimize_unicode(charset, fixup):
+ try:
+ import array
+ except ImportError:
+ return charset
charmap = [0]*65536
negate = 0
- for op, av in charset:
- if op is NEGATE:
- negate = 1
- elif op is LITERAL:
- charmap[fixup(av)] = 1
- elif op is RANGE:
- for i in range(fixup(av[0]), fixup(av[1])+1):
- charmap[i] = 1
- elif op is CATEGORY:
- # XXX: could expand category
- return charset # cannot compress
+ try:
+ for op, av in charset:
+ if op is NEGATE:
+ negate = 1
+ elif op is LITERAL:
+ charmap[fixup(av)] = 1
+ elif op is RANGE:
+ for i in range(fixup(av[0]), fixup(av[1])+1):
+ charmap[i] = 1
+ elif op is CATEGORY:
+ # XXX: could expand category
+ return charset # cannot compress
+ except IndexError:
+ # non-BMP characters
+ return charset
if negate:
+ if sys.maxunicode != 65535:
+ # XXX: negation does not work with big charsets
+ return charset
for i in range(65536):
charmap[i] = not charmap[i]
comps = {}
@@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
block = block + 1
data = data + _mk_bitmap(chunk)
header = [block]
- assert MAXCODE == 65535
- for i in range(128):
- if sys.byteorder == 'big':
- header.append(256*mapping[2*i]+mapping[2*i+1])
- else:
- header.append(mapping[2*i]+256*mapping[2*i+1])
+ if MAXCODE == 65535:
+ code = 'H'
+ else:
+ code = 'L'
+ # Convert block indices to byte array of 256 bytes
+ mapping = array.array('b', mapping).tostring()
+ # Convert byte array to word array
+ header = header + array.array(code, mapping).tolist()
data[0:0] = header
return [(BIGCHARSET, data)]
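
To make the BIGCHARSET layout described in the comment block above concrete, here is a simplified standalone sketch (the helper name big_charset_layout is hypothetical; this is not the patch code) of the structure _optimize_unicode builds: a 256-entry table mapping the high byte of a BMP code point to a block index, plus one 256-character chunk per distinct block. The patch then serializes the mapping through array.array and packs each chunk into code words with _mk_bitmap.

    def big_charset_layout(codepoints):
        # Mark the members of the set in a full BMP character map.
        charmap = [0] * 65536
        for cp in codepoints:
            charmap[cp] = 1
        # Deduplicate 256-character chunks, as the comps/mapping loop in
        # the patch does: identical pages share a single block.
        comps = {}
        mapping = [0] * 256
        blocks = []
        for page in range(256):
            chunk = tuple(charmap[page * 256:(page + 1) * 256])
            if chunk not in comps:
                comps[chunk] = len(blocks)
                blocks.append(chunk)
            mapping[page] = comps[chunk]
        return mapping, blocks

    # Two code points on the 0x20 page (punctuation/currency area):
    mapping, blocks = big_charset_layout([0x20AC, 0x2013])
    assert mapping[0x20] == 1          # only the 0x20 page is non-empty
    assert blocks[1][0xAC] == 1 and blocks[1][0x13] == 1
    # A lookup mirrors the matcher: block = mapping[ch >> 8], then test
    # position (ch & 255) inside that block.

Deduplicating identical pages is what keeps the table small: a set confined to a few pages costs a couple of blocks rather than a full 8 KB bitmap of the BMP.
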
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 2cd85a3..07b24dd 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20010701
+MAGIC = 20030419
# max code word in this release
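
To close, a worked example (standalone; it assumes 2-byte code words, so MAXCODE == 65535) of the bit packing performed by the _mk_bitmap hunk in sre_compile.py above: set bits accumulate least-significant-bit first, and a word is flushed whenever the running mask m outgrows MAXCODE.

    MAXCODE = 65535

    def mk_bitmap(bits):
        data = []
        m, v = 1, 0
        for c in bits:
            if c:
                v = v + m
            m = m << 1
            if m > MAXCODE:
                data.append(v)   # flush one full 16-bit code word
                m, v = 1, 0
        return data

    bits = [0] * 256
    bits[0] = bits[17] = 1
    words = mk_bitmap(bits)
    assert len(words) == 16            # 256 bits -> sixteen 16-bit words
    assert words[0] == 1               # bit 0 -> LSB of word 0
    assert words[1] == 1 << 1          # bit 17 -> bit 1 of word 1

On a wide build the same loop, started from long zeroes, flushes every 32 bits instead, which is exactly why the patch parameterizes the starting (m, v) pair on _sre.CODESIZE.
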