diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-09-24 23:18:31 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-09-24 23:18:31 (GMT) |
commit | f367cacb98ce3fcd1653546835dff3c3cbf5216a (patch) | |
tree | edc7af8b94ace6e5d739fe9316920e50bf174ddc /Tools | |
parent | 51dc968b0bc6784c9d461252dfda1e48c8aad42d (diff) | |
download | cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.zip cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.tar.gz cpython-f367cacb98ce3fcd1653546835dff3c3cbf5216a.tar.bz2 |
unicode database compression, step 1:
- use unidb compression for the unicodedata module. On Windows,
the new unicodedata module is 120k, down from nearly 600k.
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 202 |
1 files changed, 202 insertions, 0 deletions
# Reconstructed from the collapsed diff of a newly added file:
#   diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
#   new file mode 100644, index 0000000..c36fadf, @@ -0,0 +1,202 @@

#
# makeunicodedata.py -- generate a compact version of the unicode property
# database (unicodedatabase.h); the result is written to unicodedata_db.h
#

import string  # no longer used below; kept so the file's import set is unchanged
import sys

SCRIPT = sys.argv[0]
VERSION = "1.0"

UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]


def maketable(filename=None, outfile="unicodedata_db.h"):
    """Generate the compact property tables as a C header.

    filename -- unicode-data file to read; defaults to UNICODE_DATA
        (looked up at call time, so rebinding the constant still works).
    outfile -- name of the header to write.

    Everything is computed before the output file is opened, and the
    file is closed even if writing fails.  sys.stdout is left alone.
    """
    if filename is None:
        filename = UNICODE_DATA

    db = UnicodeData(filename)

    # extract unicode properties
    table, index = _makerecords(db)

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table
    index1, index2, shift = splitbins(index)

    fp = open(outfile, "w")
    try:
        _dumpheader(fp, table, index1, index2, shift)
    finally:
        fp.close()


def _makerecords(db):
    # build the list of unique property records, plus a per-character
    # index into that list (0 means "no data")
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    # the cache maps record -> position in table; seeding it with the
    # dummy makes records equal to the dummy reuse slot 0
    cache = {dummy: 0}
    index = [0] * len(db.chars)

    for char in db.chars:
        record = db.table[char]
        if not record:
            continue
        # extract database properties
        category = CATEGORY_NAMES.index(record[2])
        combining = int(record[3])
        bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
        mirrored = int(record[9] == "Y")
        if record[5]:
            decomposition = '"%s"' % record[5]
        else:
            decomposition = "NULL"
        item = (
            category, combining, bidirectional, mirrored, decomposition
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    return table, index


def _dumpheader(fp, table, index1, index2, shift):
    # write the generated C source to the open file object fp
    write = fp.write

    write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
    write("\n")
    write("/* a list of unique database records */\n")
    write("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {\n")
    for item in table:
        write(" {%d, %d, %d, %d, %s},\n" % item)
    write("};\n")
    write("\n")

    write("/* string literals */\n")
    write("const char *_PyUnicode_CategoryNames[] = {\n")
    for name in CATEGORY_NAMES:
        write(" \"%s\",\n" % name)
    write(" NULL\n")
    write("};\n")

    write("const char *_PyUnicode_BidirectionalNames[] = {\n")
    for name in BIDIRECTIONAL_NAMES:
        write(" \"%s\",\n" % name)
    write(" NULL\n")
    write("};\n")

    write("/* index tables used to find the right database record */\n")
    write("#define SHIFT %s\n" % shift)
    Array("index1", index1).dump(fp)
    Array("index2", index2).dump(fp)

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    """In-memory copy of a semicolon-separated unicode-data file.

    Public attributes:
    filename -- the file the data was read from
    table -- 65536-entry list; table[char] is the list of fields for
        that character, or None if the file has no record for it
    chars -- the character codes to process (all 65536 by default)
    """

    def __init__(self, filename):
        table = [None] * 65536
        fp = open(filename)
        try:
            while 1:
                s = fp.readline()
                if not s:
                    break
                s = s.strip()
                if not s:
                    continue # tolerate blank lines
                fields = s.split(";")
                char = int(fields[0], 16)
                if char >= len(table):
                    continue # the table only covers 16-bit character codes
                table[char] = fields
        finally:
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# stuff to deal with arrays of unsigned integers

class Array:
    """A named list of unsigned integers that can be dumped as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file):
        # write data to file, as a C array; the element type is the
        # smallest unsigned type that holds every value
        size = getsize(self.data)
        # print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = " "
            for item in self.data:
                i = str(item) + ", "
                # start a new output line rather than exceed 78 columns
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = " " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size (1, 2 or 4 bytes) for the
    # given array; an empty array needs nothing wider than one byte
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(bins):
    # split a sparse integer table into two tables, such as:
    #   value = t2[(t1[char>>shift]<<shift)+(char&mask)]
    # and value == 0 means no data
    #
    # every shift in range(16) is tried and the one with the smallest
    # combined table size is kept (the first one wins on ties).
    # returns (t1, t2, shift)
    best = None
    for shift in range(16):
        bin1 = []
        bin2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(bins), size):
            chunk = tuple(bins[i:i+size])
            index = bincache.get(chunk)
            if index is None:
                # first time this chunk is seen: append it to the second table
                index = len(bin2)
                bincache[chunk] = index
                for v in chunk:
                    if v is None:
                        bin2.append(0)
                    else:
                        bin2.append(v)
            bin1.append(index>>shift)
        # determine memory size
        nbytes = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
        if best is None or nbytes < best[0]:
            best = nbytes, shift, bin1, bin2
    nbytes, shift, bin1, bin2 = best
##     print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
##         len(bin1), len(bin2), shift, nbytes
##         )
    return bin1, bin2, shift

if __name__ == "__main__":
    maketable()