diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-09-25 17:59:57 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-09-25 17:59:57 (GMT) |
commit | e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1 (patch) | |
tree | a38f6efc0328a58cf804f7d9c35cda5aa444bd9b /Tools | |
parent | e53793bf4c2361346c21d4dec0b7a617a850436c (diff) | |
download | cpython-e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1.zip cpython-e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1.tar.gz cpython-e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1.tar.bz2 |
unicode database compression, step 3:
- use unidb compression for the unicodectype module. smaller,
faster, and slightly more portable...
- also mention the unicode directory in Tools/README
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/README | 3 | ||||
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 106 |
2 files changed, 100 insertions, 9 deletions
diff --git a/Tools/README b/Tools/README index a0f5972..b5bdf2e 100644 --- a/Tools/README +++ b/Tools/README @@ -21,6 +21,9 @@ scripts A number of useful single-file programs, e.g. tabnanny.py (by Tim Peters), which checks for inconsistent mixing of tabs and spaces. +unicode Tools used to generate unicode database files for + Python 2.0 (by Fredrik Lundh). + versioncheck A tool to automate checking whether you have the latest version of a package (by Jack Jansen). diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 4781ec4..b8411ad 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -1,9 +1,13 @@ # -# generate a compact version of the unicode property database +# (re)generate unicode property and type databases +# +# this script converts a unicode 3.0 database file to +# Modules/unicodedata_db.h and Objects/unicodetype_db.h # # history: # 2000-09-24 fl created (based on bits and pieces from unidb) # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table +# 2000-09-25 fl added character type table # # written by Fredrik Lundh (fredrik@pythonware.com), September 2000 # @@ -13,7 +17,7 @@ import sys SCRIPT = sys.argv[0] VERSION = "1.1" -UNICODE_DATA = "../UnicodeData-Latest.txt" +UNICODE_DATA = "UnicodeData-Latest.txt" CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", @@ -24,7 +28,16 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", "ON" ] -def maketable(): +ALPHA_MASK = 0x01 +DECIMAL_MASK = 0x02 +DIGIT_MASK = 0x04 +LOWER_MASK = 0x08 +NUMERIC_MASK = 0x10 +SPACE_MASK = 0x20 +TITLE_MASK = 0x40 +UPPER_MASK = 0x80 + +def maketables(): unicode = UnicodeData(UNICODE_DATA) @@ -74,7 +87,7 @@ def maketable(): i = 0 decomp_index[char] = i - FILE = "unicodedata_db.h" + FILE = "Modules/unicodedata_db.h" sys.stdout = open(FILE, "w") @@ -87,6 +100,9 @@ def maketable(): print "};" print + # FIXME: the following tables should be made static, and + # the support code moved into unicodedatabase.c + print "/* string literals */" print "const char *_PyUnicode_CategoryNames[] = {" for name in CATEGORY_NAMES: @@ -106,24 +122,96 @@ def maketable(): print " NULL" print "};" - # split index table + # split record index table index1, index2, shift = splitbins(index) - print "/* index tables used to find the right database record */" + print "/* index tables for the database records */" print "#define SHIFT", shift Array("index1", index1).dump(sys.stdout) Array("index2", index2).dump(sys.stdout) - # split index table + # split decomposition index table index1, index2, shift = splitbins(decomp_index) - print "/* same, for the decomposition data */" + print "/* index tables for the decomposition data */" print "#define DECOMP_SHIFT", shift Array("decomp_index1", index1).dump(sys.stdout) Array("decomp_index2", index2).dump(sys.stdout) sys.stdout = sys.__stdout__ + # + # 3) unicode type data + + # extract unicode types + dummy = (0, 0, 0, 0) + table = [dummy] + cache = {0: dummy} + index = [0] * len(unicode.chars) + + for char in unicode.chars: + record = unicode.table[char] + if record: + # extract database properties + category = record[2] + bidirectional = record[4] + flags = 0 + if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]: + flags |= ALPHA_MASK + if category == "Ll": + flags |= LOWER_MASK + if category == "Zs" or bidirectional in ("WS", "B", "S"): + flags |= SPACE_MASK + if category in ["Lt", "Lu"]: + flags |= TITLE_MASK + if category == "Lu": + flags |= UPPER_MASK + # use delta predictor for upper/lower/title + if record[12]: + upper = (int(record[12], 16) - char) & 0xffff + else: + upper = 0 + if record[13]: + lower = (int(record[13], 16) - char) & 0xffff + else: + lower = 0 + if record[14]: + title = (int(record[14], 16) - char) & 0xffff + else: + title = 0 + item = ( + flags, upper, lower, title + ) + # add entry to index and item tables + i = cache.get(item) + if i is None: + cache[item] = i = len(table) + table.append(item) + index[char] = i + + FILE = "Objects/unicodetype_db.h" + + sys.stdout = open(FILE, "w") + + print "/* this file was generated by %s %s */" % (SCRIPT, VERSION) + print + print "/* a list of unique character type descriptors */" + print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {" + for item in table: + print " {%d, %d, %d, %d}," % item + print "};" + print + + # split decomposition index table + index1, index2, shift = splitbins(index) + + print "/* type indexes */" + print "#define SHIFT", shift + Array("index1", index1).dump(sys.stdout) + Array("index2", index2).dump(sys.stdout) + + sys.stdout = sys.__stdout__ + # -------------------------------------------------------------------- # the following support code is taken from the unidb utilities # Copyright (c) 1999-2000 by Secret Labs AB @@ -259,4 +347,4 @@ def splitbins(t, trace=0): return best if __name__ == "__main__": - maketable() + maketables() |