unicode database compression, step 1:

- use unidb compression for the unicodedata module. on Windows, the new unidatabase module is 120k, down from nearly 600k.
author: Fredrik Lundh <fredrik@pythonware.com> 2000-09-24 23:18:31 (GMT)
committer: Fredrik Lundh <fredrik@pythonware.com> 2000-09-24 23:18:31 (GMT)
commit: f367cacb98ce3fcd1653546835dff3c3cbf5216a (patch)
tree: edc7af8b94ace6e5d739fe9316920e50bf174ddc
parent: 51dc968b0bc6784c9d461252dfda1e48c8aad42d (diff)
download: cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.zip
cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.tar.gz
cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.tar.bz2
1 files changed, 202 insertions, 0 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
new file mode 100644
index 0000000..c36fadf
--- /dev/null
+++ b/Tools/unicode/makeunicodedata.py
@@ -0,0 +1,202 @@
+#
+# makeunicodedata.py -- generate a compact version of the unicode property
+# database (unicodedata_db.h)
+#
+
+import sys
+
+SCRIPT = sys.argv[0]
+VERSION = "1.0"
+# input file; NOTE: hard-coded absolute path, adjust it to the local setup
+UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"
+# category names; a name's position in the list is its code in the tables
+CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
+    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
+    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
+    "So" ]
+# bidirectional names; a name's position in the list is its table code
+BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
+    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
+    "ON" ]
+
+def maketable():
+    # generate unicodedata_db.h (in the current directory) from the
+    # unicode data file named by UNICODE_DATA
+    unicode = UnicodeData(UNICODE_DATA)
+
+    # extract unicode properties.  table holds every distinct record
+    # once, and index maps each character to the position of its record
+    # in table.  record 0 is a dummy, shared by characters without data
+    dummy = (0, 0, 0, 0, "NULL")
+    table = [dummy]
+    cache = {dummy: 0} # record -> position in table
+    index = [0] * len(unicode.chars)
+
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            # extract database properties
+            category = CATEGORY_NAMES.index(record[2])
+            combining = int(record[3])
+            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
+            mirrored = record[9] == "Y"
+            if record[5]:
+                decomposition = '"%s"' % record[5]
+            else:
+                decomposition = "NULL"
+            item = (
+                category, combining, bidirectional, mirrored, decomposition
+                )
+            # add entry to index and item tables
+            i = cache.get(item)
+            if i is None:
+                cache[item] = i = len(table)
+                table.append(item)
+            index[char] = i
+
+    # FIXME: we really should compress the decomposition stuff
+    # (see the unidb utilities for one way to do this)
+
+    # split index table
+    index1, index2, shift = splitbins(index)
+
+    # write through an explicit file object (instead of rebinding
+    # sys.stdout), and close it even if something fails half-way
+    fp = open("unicodedata_db.h", "w")
+    try:
+        print >>fp, "/* this file was generated by %s %s */" % (
+            SCRIPT, VERSION)
+        print >>fp
+        print >>fp, "/* a list of unique database records */"
+        print >>fp, ("const _PyUnicode_DatabaseRecord "
+                     "_PyUnicode_Database_Records[] = {")
+        for item in table:
+            print >>fp, "    {%d, %d, %d, %d, %s}," % item
+        print >>fp, "};"
+        print >>fp
+        print >>fp, "/* string literals */"
+        print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
+        for name in CATEGORY_NAMES:
+            print >>fp, "    \"%s\"," % name
+        print >>fp, "    NULL"
+        print >>fp, "};"
+        print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
+        for name in BIDIRECTIONAL_NAMES:
+            print >>fp, "    \"%s\"," % name
+        print >>fp, "    NULL"
+        print >>fp, "};"
+        print >>fp, "/* index tables used to find the right database record */"
+        print >>fp, "#define SHIFT", shift
+        Array("index1", index1).dump(fp)
+        Array("index2", index2).dump(fp)
+    finally:
+        fp.close()
+
+# --------------------------------------------------------------------
+# the following support code is taken from the unidb utilities
+# Copyright (c) 1999-2000 by Secret Labs AB
+
+# load a unicode-data file from disk
+
+import string, sys
+
+class UnicodeData:
+    # contents of a unicode-data file: table[code] is the list of
+    # ";"-separated fields for that character, or None if it has no line
+
+    def __init__(self, filename):
+        file = open(filename)
+        try:
+            lines = file.readlines()
+        finally:
+            file.close() # don't keep the file open once it has been read
+        table = [None] * 65536
+        for s in lines:
+            s = string.split(string.strip(s), ";")
+            table[string.atoi(s[0], 16)] = s
+        self.filename = filename
+        self.table = table
+        self.chars = range(65536) # unicode
+
+    def uselatin1(self):
+        # restrict character range to ISO Latin 1
+        self.chars = range(256)
+
+# stuff to deal with arrays of unsigned integers
+
+class Array:
+    # a named array of unsigned integers that can be written as C source
+
+    def __init__(self, name, data):
+        self.name = name # identifier of the generated C array
+        self.data = data # the integers to emit
+
+    def dump(self, file):
+        # write data to file, as a static C array.  the element type is
+        # chosen from the size that getsize() reports for the data
+        ctypes = {1: "unsigned char",
+                  2: "unsigned short"}
+        ctype = ctypes.get(getsize(self.data), "unsigned int")
+        file.write("static " + ctype + " " + self.name + "[] = {\n")
+
+        # emit the values as indented, comma-separated lines, starting
+        # a new line whenever the next value would not fit in 78 columns
+        line = "    "
+        for value in self.data:
+            text = str(value) + ", "
+            if len(line) + len(text) > 78:
+                # line is full; write it out and start a new one
+                file.write(line + "\n")
+                line = "    "
+            line = line + text
+        if string.strip(line):
+            # flush the last, partially filled line (if any)
+            file.write(line + "\n")
+
+        file.write("};\n\n")
+
+def getsize(data):
+    # return smallest possible integer size (in bytes) for the given array
+    if not data:
+        return 1 # empty array; max() would raise ValueError on it
+    maxdata = max(data)
+    for size, limit in ((1, 256), (2, 65536)):
+        if maxdata < limit:
+            return size
+    return 4
+
+def splitbins(bins):
+    # split a sparse integer table into two tables, such that
+    #   value = t2[(t1[char>>shift]<<shift)+(char&mask)]
+    # where mask is (1<<shift)-1, and value == 0 means no data.
+    # returns (t1, t2, shift), using the shift in range(16) that gives
+    # the smallest combined size (the lowest such shift on a tie)
+    smallest = sys.maxint
+    for shift in range(16):
+        # cut the table into chunks of 2**shift entries, and store each
+        # distinct chunk only once in t2
+        size = 2**shift
+        t1 = []
+        t2 = []
+        seen = {} # chunk contents -> offset of that chunk in t2
+        for start in range(0, len(bins), size):
+            chunk = tuple(bins[start:start+size])
+            offset = seen.get(chunk)
+            if offset is None:
+                # new chunk; append it to t2, storing None as 0
+                offset = seen[chunk] = len(t2)
+                for v in chunk:
+                    if v is None:
+                        v = 0
+                    t2.append(v)
+            t1.append(offset>>shift)
+        # determine memory size
+        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
+        if total < smallest:
+            smallest = total
+            best = shift, t1, t2
+    shift, t1, t2 = best
+    return t1, t2, shift
+
+if __name__ == "__main__":
+    maketable() # run as a script: generate the header file
author	Fredrik Lundh <fredrik@pythonware.com>	2000-09-24 23:18:31 (GMT)
committer	Fredrik Lundh <fredrik@pythonware.com>	2000-09-24 23:18:31 (GMT)
commit	f367cacb98ce3fcd1653546835dff3c3cbf5216a (patch)
tree	edc7af8b94ace6e5d739fe9316920e50bf174ddc
parent	51dc968b0bc6784c9d461252dfda1e48c8aad42d (diff)
download	cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.zip cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.tar.gz cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.tar.bz2