diff options
Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 50 |
1 files changed, 39 insertions, 11 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 8c0c075..15841d7 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -9,6 +9,7 @@ # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table # 2000-09-25 fl added character type table # 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields +# 2000-11-03 fl expand first/last ranges # # written by Fredrik Lundh (fredrik@pythonware.com), September 2000 # @@ -39,10 +40,13 @@ SPACE_MASK = 0x20 TITLE_MASK = 0x40 UPPER_MASK = 0x80 -def maketables(): +def maketables(trace=0): unicode = UnicodeData(UNICODE_DATA) + print "--- Processing", UNICODE_DATA, "..." + print len(filter(None, unicode.table)), "characters" + # extract unicode properties dummy = (0, 0, 0, 0) table = [dummy] @@ -91,6 +95,11 @@ def maketables(): FILE = "Modules/unicodedata_db.h" + print "--- Writing", FILE, "..." + + print len(table), "unique properties" + print len(decomp_data), "unique decomposition entries" + fp = open(FILE, "w") print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) print >>fp @@ -125,7 +134,7 @@ def maketables(): print >>fp, "};" # split record index table - index1, index2, shift = splitbins(index) + index1, index2, shift = splitbins(index, trace) print >>fp, "/* index tables for the database records */" print >>fp, "#define SHIFT", shift @@ -133,7 +142,7 @@ def maketables(): Array("index2", index2).dump(fp) # split decomposition index table - index1, index2, shift = splitbins(decomp_index) + index1, index2, shift = splitbins(decomp_index, trace) print >>fp, "/* index tables for the decomposition data */" print >>fp, "#define DECOMP_SHIFT", shift @@ -200,12 +209,14 @@ def maketables(): table.append(item) index[char] = i - print len(table), "ctype entries" - FILE = "Objects/unicodetype_db.h" fp = open(FILE, "w") + print "--- Writing", FILE, "..." + + print len(table), "unique character type entries" + print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) print >>fp print >>fp, "/* a list of unique character type descriptors */" @@ -216,7 +227,7 @@ def maketables(): print >>fp # split decomposition index table - index1, index2, shift = splitbins(index) + index1, index2, shift = splitbins(index, trace) print >>fp, "/* type indexes */" print >>fp, "#define SHIFT", shift @@ -233,7 +244,7 @@ import string, sys class UnicodeData: - def __init__(self, filename): + def __init__(self, filename, expand=1): file = open(filename) table = [None] * 65536 while 1: @@ -244,6 +255,22 @@ class UnicodeData: char = string.atoi(s[0], 16) table[char] = s + # expand first-last ranges (ignore surrogates and private use) + if expand: + field = None + for i in range(0, 0xD800): + s = table[i] + if s: + if s[1][-6:] == "First>": + s[1] = "" + field = s[:] + elif s[1][-5:] == "Last>": + s[1] = "" + field = None + elif field: + field[0] = hex(i) + table[i] = field + # public attributes self.filename = filename self.table = table @@ -306,8 +333,9 @@ def splitbins(t, trace=0): t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] where mask is a bitmask isolating the last "shift" bits. - If optional arg trace is true (default false), progress info is - printed to sys.stderr. + If optional arg trace is non-zero (default zero), progress info + is printed to sys.stderr. The higher the value, the more info + you'll get. """ import sys @@ -341,7 +369,7 @@ def splitbins(t, trace=0): t1.append(index >> shift) # determine memory size b = len(t1)*getsize(t1) + len(t2)*getsize(t2) - if trace: + if trace > 1: dump(t1, t2, shift, b) if b < bytes: best = t1, t2, shift @@ -358,4 +386,4 @@ def splitbins(t, trace=0): return best if __name__ == "__main__": - maketables() + maketables(1) |