author     Fredrik Lundh <fredrik@pythonware.com>  2000-09-24 23:18:31 (GMT)
committer  Fredrik Lundh <fredrik@pythonware.com>  2000-09-24 23:18:31 (GMT)
commit     f367cacb98ce3fcd1653546835dff3c3cbf5216a (patch)
tree       edc7af8b94ace6e5d739fe9316920e50bf174ddc /Tools/unicode
parent     51dc968b0bc6784c9d461252dfda1e48c8aad42d (diff)
unicode database compression, step 1:
- use unidb compression for the unicodedata module. on Windows, the new unidatabase module is 120k, down from nearly 600k.
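The saving comes from the two-level index scheme built by splitbins() in the new script: identical blocks of the per-character index table are stored only once, and a record is recovered with one shift and one mask. A minimal sketch of the lookup, in the same Python 2 idiom as the script (getrecord and its parameters are hypothetical stand-ins for the generated SHIFT, index1, index2 and record tables):

    def getrecord(char, index1, index2, shift, records):
        # index1 selects a block; index2 holds the record
        # index for each character within that block
        mask = (1 << shift) - 1
        i = index2[(index1[char >> shift] << shift) + (char & mask)]
        return records[i]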
Diffstat (limited to 'Tools/unicode')
-rw-r--r--  Tools/unicode/makeunicodedata.py  202
1 file changed, 202 insertions(+), 0 deletions(-)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
new file mode 100644
index 0000000..c36fadf
--- /dev/null
+++ b/Tools/unicode/makeunicodedata.py
@@ -0,0 +1,202 @@
+#
+# makeunicodedata.py -- generate a compact version of the unicode property
+# database (unicodedatabase.h)
+#
+
+import sys
+
+SCRIPT = sys.argv[0]
+VERSION = "1.0"
+
+UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"
+
+CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
+ "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
+ "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
+ "So" ]
+
+BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
+ "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
+ "ON" ]
+
+def maketable():
+
+ unicode = UnicodeData(UNICODE_DATA)
+
+ # extract unicode properties
+ dummy = (0, 0, 0, 0, "NULL")
+ table = [dummy]
+    cache = {dummy: 0}  # map record tuple -> index in table
+ index = [0] * len(unicode.chars)
+
+ DECOMPOSITION = [""]
+
+ for char in unicode.chars:
+ record = unicode.table[char]
+ if record:
+ # extract database properties
+ category = CATEGORY_NAMES.index(record[2])
+ combining = int(record[3])
+ bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
+ mirrored = record[9] == "Y"
+ if record[5]:
+ decomposition = '"%s"' % record[5]
+ else:
+ decomposition = "NULL"
+ item = (
+ category, combining, bidirectional, mirrored, decomposition
+ )
+ # add entry to index and item tables
+ i = cache.get(item)
+ if i is None:
+ cache[item] = i = len(table)
+ table.append(item)
+ index[char] = i
+
+ # FIXME: we really should compress the decomposition stuff
+ # (see the unidb utilities for one way to do this)
+
+ FILE = "unicodedata_db.h"
+
+ sys.stdout = open(FILE, "w")
+
+ print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+ print
+ print "/* a list of unique database records */"
+ print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
+ for item in table:
+ print " {%d, %d, %d, %d, %s}," % item
+ print "};"
+ print
+
+ print "/* string literals */"
+ print "const char *_PyUnicode_CategoryNames[] = {"
+ for name in CATEGORY_NAMES:
+ print " \"%s\"," % name
+ print " NULL"
+ print "};"
+
+ print "const char *_PyUnicode_BidirectionalNames[] = {"
+ for name in BIDIRECTIONAL_NAMES:
+ print " \"%s\"," % name
+ print " NULL"
+ print "};"
+
+ # split index table
+ index1, index2, shift = splitbins(index)
+
+ print "/* index tables used to find the right database record */"
+ print "#define SHIFT", shift
+ Array("index1", index1).dump(sys.stdout)
+ Array("index2", index2).dump(sys.stdout)
+
+ sys.stdout = sys.__stdout__
+
+# --------------------------------------------------------------------
+# the following support code is taken from the unidb utilities
+# Copyright (c) 1999-2000 by Secret Labs AB
+
+# load a unicode-data file from disk
+
+import string, sys
+
+class UnicodeData:
+
+ def __init__(self, filename):
+ file = open(filename)
+ table = [None] * 65536
+ while 1:
+ s = file.readline()
+ if not s:
+ break
+ s = string.split(string.strip(s), ";")
+ char = string.atoi(s[0], 16)
+ table[char] = s
+
+ # public attributes
+ self.filename = filename
+ self.table = table
+ self.chars = range(65536) # unicode
+
+ def uselatin1(self):
+ # restrict character range to ISO Latin 1
+ self.chars = range(256)
+
+# stuff to deal with arrays of unsigned integers
+
+class Array:
+
+ def __init__(self, name, data):
+ self.name = name
+ self.data = data
+
+ def dump(self, file):
+ # write data to file, as a C array
+ size = getsize(self.data)
+ # print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
+ file.write("static ")
+ if size == 1:
+ file.write("unsigned char")
+ elif size == 2:
+ file.write("unsigned short")
+ else:
+ file.write("unsigned int")
+ file.write(" " + self.name + "[] = {\n")
+ if self.data:
+ s = " "
+ for item in self.data:
+ i = str(item) + ", "
+ if len(s) + len(i) > 78:
+ file.write(s + "\n")
+ s = " " + i
+ else:
+ s = s + i
+ if string.strip(s):
+ file.write(s + "\n")
+ file.write("};\n\n")
+
+def getsize(data):
+ # return smallest possible integer size for the given array
+ maxdata = max(data)
+ if maxdata < 256:
+ return 1
+ elif maxdata < 65536:
+ return 2
+ else:
+ return 4
+
+def splitbins(bins):
+    # split a sparse integer table into two tables, such that:
+ # value = t2[(t1[char>>shift]<<shift)+(char&mask)]
+ # and value == 0 means no data
+ bytes = sys.maxint
+ for shift in range(16):
+ bin1 = []
+ bin2 = []
+ size = 2**shift
+ bincache = {}
+ for i in range(0, len(bins), size):
+ bin = bins[i:i+size]
+ index = bincache.get(tuple(bin))
+ if index is None:
+ index = len(bin2)
+ bincache[tuple(bin)] = index
+ for v in bin:
+ if v is None:
+ bin2.append(0)
+ else:
+ bin2.append(v)
+ bin1.append(index>>shift)
+ # determine memory size
+ b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
+ if b < bytes:
+ best = shift, bin1, bin2
+ bytes = b
+ shift, bin1, bin2 = best
+## print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
+## len(bin1), len(bin2), shift, bytes
+## )
+ return bin1, bin2, shift
+
+if __name__ == "__main__":
+ maketable()
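A quick way to sanity-check the split tables (not part of this commit; a sketch assuming Python 2 and that the script is importable as makeunicodedata) is to verify that the lookup formula round-trips every entry of a sparse table:

    from makeunicodedata import splitbins

    data = [0] * 65536
    data[0x41] = 1                  # a couple of non-zero entries
    data[0xE5] = 2
    index1, index2, shift = splitbins(data)
    mask = (1 << shift) - 1
    for char in range(len(data)):
        v = index2[(index1[char >> shift] << shift) + (char & mask)]
        assert v == data[char]

The generated C header encodes the same identity via SHIFT, index1 and index2, so any change to splitbins() should preserve this invariant.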