diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2001-01-21 17:01:31 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2001-01-21 17:01:31 (GMT) |
commit | 9e9bcda547340a47b5f82d0463f4cdc29bd3b143 (patch) | |
tree | 480624c9c3fd9ad95a62efed9195eeaee545f27a /Tools/unicode/makeunicodedata.py | |
parent | d38855c35a9fe8e469ba6474161f4225a1eb07f6 (diff) | |
download | cpython-9e9bcda547340a47b5f82d0463f4cdc29bd3b143.zip cpython-9e9bcda547340a47b5f82d0463f4cdc29bd3b143.tar.gz cpython-9e9bcda547340a47b5f82d0463f4cdc29bd3b143.tar.bz2 |
forgot to check in the new makeunicodedata.py script
Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 288 |
1 files changed, 271 insertions, 17 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 15841d7..3a362ec 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -2,14 +2,16 @@ # (re)generate unicode property and type databases # # this script converts a unicode 3.0 database file to -# Modules/unicodedata_db.h and Objects/unicodetype_db.h +# Modules/unicodedata_db.h, Modules/unicodename_db.h, +# and Objects/unicodetype_db.h # # history: # 2000-09-24 fl created (based on bits and pieces from unidb) # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table # 2000-09-25 fl added character type table -# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields +# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0) # 2000-11-03 fl expand first/last ranges +# 2001-01-19 fl added character name tables (2.1) # # written by Fredrik Lundh (fredrik@pythonware.com), September 2000 # @@ -17,7 +19,7 @@ import sys SCRIPT = sys.argv[0] -VERSION = "1.1" +VERSION = "2.1" UNICODE_DATA = "UnicodeData-Latest.txt" @@ -42,18 +44,32 @@ UPPER_MASK = 0x80 def maketables(trace=0): + print "--- Reading", UNICODE_DATA, "..." + unicode = UnicodeData(UNICODE_DATA) - print "--- Processing", UNICODE_DATA, "..." print len(filter(None, unicode.table)), "characters" - # extract unicode properties + makeunicodedata(unicode, trace) + makeunicodetype(unicode, trace) + makeunicodename(unicode, trace) + +# -------------------------------------------------------------------- +# unicode character properties + +def makeunicodedata(unicode, trace): + dummy = (0, 0, 0, 0) table = [dummy] cache = {0: dummy} index = [0] * len(unicode.chars) + FILE = "Modules/unicodedata_db.h" + + print "--- Preparing", FILE, "..." + # 1) database properties + for char in unicode.chars: record = unicode.table[char] if record: @@ -93,13 +109,11 @@ def maketables(trace=0): i = 0 decomp_index[char] = i - FILE = "Modules/unicodedata_db.h" - - print "--- Writing", FILE, "..." - print len(table), "unique properties" print len(decomp_data), "unique decomposition entries" + print "--- Writing", FILE, "..." + fp = open(FILE, "w") print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) print >>fp @@ -111,7 +125,7 @@ def maketables(trace=0): print >>fp, "};" print >>fp - # FIXME: the following tables should be made static, and + # FIXME: <fl> the following tables could be made static, and # the support code moved into unicodedatabase.c print >>fp, "/* string literals */" @@ -149,8 +163,16 @@ def maketables(trace=0): Array("decomp_index1", index1).dump(fp) Array("decomp_index2", index2).dump(fp) - # - # 3) unicode type data + fp.close() + +# -------------------------------------------------------------------- +# unicode character type tables + +def makeunicodetype(unicode, trace): + + FILE = "Objects/unicodetype_db.h" + + print "--- Preparing", FILE, "..." # extract unicode types dummy = (0, 0, 0, 0, 0, 0) @@ -209,14 +231,11 @@ def maketables(trace=0): table.append(item) index[char] = i - FILE = "Objects/unicodetype_db.h" - - fp = open(FILE, "w") + print len(table), "unique character type entries" print "--- Writing", FILE, "..." - print len(table), "unique character type entries" - + fp = open(FILE, "w") print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) print >>fp print >>fp, "/* a list of unique character type descriptors */" @@ -234,6 +253,155 @@ def maketables(trace=0): Array("index1", index1).dump(fp) Array("index2", index2).dump(fp) + fp.close() + +# -------------------------------------------------------------------- +# unicode name database + +def makeunicodename(unicode, trace): + + FILE = "Modules/unicodename_db.h" + + print "--- Preparing", FILE, "..." + + # collect names + names = [None] * len(unicode.chars) + + for char in unicode.chars: + record = unicode.table[char] + if record: + name = record[1].strip() + if name and name[0] != "<": + names[char] = name + chr(0) + + print len(filter(lambda n: n is not None, names)), "distinct names" + + # collect unique words from names (note that we differ between + # words inside a sentence, and words ending a sentence. the + # latter includes the trailing null byte. + + words = {} + n = b = 0 + for char in unicode.chars: + name = names[char] + if name: + w = name.split() + b = b + len(name) + n = n + len(w) + for w in w: + l = words.get(w) + if l: + l.append(None) + else: + words[w] = [len(words)] + + print n, "words in text;", b, "bytes" + + wordlist = words.items() + + # sort on falling frequency + wordlist.sort(lambda a, b: len(b[1])-len(a[1])) + + # statistics + n = 0 + for i in range(128): + n = n + len(wordlist[i][1]) + print n, "short words (7-bit indices)" + + # pick the 128 most commonly used words, and sort the rest on + # falling length (to maximize overlap) + + wordlist, wordtail = wordlist[:128], wordlist[128:] + wordtail.sort(lambda a, b: len(b[0])-len(a[0])) + wordlist.extend(wordtail) + + # generate lexicon from words + + lexicon_offset = [0] + lexicon = "" + words = {} + + # build a lexicon string + offset = 0 + for w, x in wordlist: + # encoding: bit 7 indicates last character in word (chr(128) + # indicates the last character in an entire string) + ww = w[:-1] + chr(ord(w[-1])+128) + # reuse string tails, when possible + o = string.find(lexicon, ww) + if o < 0: + o = offset + lexicon = lexicon + ww + offset = offset + len(w) + words[w] = len(lexicon_offset) + lexicon_offset.append(offset) + + print len(words), "words in lexicon;", len(lexicon), "bytes" + + assert len(words) < 32768 # 15-bit word indices + + lexicon = map(ord, lexicon) + + # generate phrasebook from names and lexicon + phrasebook = [0] + phrasebook_offset = [0] * len(unicode.chars) + for char in unicode.chars: + name = names[char] + if name: + w = name.split() + phrasebook_offset[char] = len(phrasebook) + for w in w: + i = words[w] + if i < 128: + phrasebook.append(128+i) + else: + phrasebook.append(i>>8) + phrasebook.append(i&255) + + # + # unicode name hash table + + # extract names + data = [] + for char in unicode.chars: + record = unicode.table[char] + if record: + name = record[1].strip() + if name and name[0] != "<": + data.append((name, char)) + + # the magic number 47 was chosen to minimize the number of + # collisions on the current data set. if you like, change it + # and see what happens... + + codehash = Hash("code", data, 47) + + print "--- Writing", FILE, "..." + + fp = open(FILE, "w") + print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) + print >>fp + print >>fp, "#define NAME_MAXLEN", 256 + print >>fp + print >>fp, "/* lexicon */" + Array("lexicon", lexicon).dump(fp) + Array("lexicon_offset", lexicon_offset).dump(fp) + + # split decomposition index table + offset1, offset2, shift = splitbins(phrasebook_offset, trace) + + print >>fp, "/* code->name phrasebook */" + print >>fp, "#define phrasebook_shift", shift + + Array("phrasebook", phrasebook).dump(fp) + Array("phrasebook_offset1", offset1).dump(fp) + Array("phrasebook_offset2", offset2).dump(fp) + + print >>fp, "/* name->code dictionary */" + codehash.dump(fp) + + fp.close() + # -------------------------------------------------------------------- # the following support code is taken from the unidb utilities # Copyright (c) 1999-2000 by Secret Labs AB @@ -280,6 +448,92 @@ class UnicodeData: # restrict character range to ISO Latin 1 self.chars = range(256) +# hash table tools + +# this is a straight-forward reimplementation of Python's built-in +# dictionary type, using a static data structure, and a custom string +# hash algorithm. + +def myhash(s, magic): + h = 0 + for c in map(ord, string.upper(s)): + h = (h * magic) + c + ix = h & 0xff000000 + if ix: + h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff + return h + +SIZES = [ + (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17), + (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3), + (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9), + (2097152,5), (4194304,3), (8388608,33), (16777216,27) +] + +class Hash: + def __init__(self, name, data, magic): + # turn a (key, value) list into a static hash table structure + + # determine table size + for size, poly in SIZES: + if size > len(data): + poly = size + poly + break + else: + raise AssertionError, "ran out of polynominals" + + print size, "slots in hash table" + + table = [None] * size + + mask = size-1 + + n = 0 + + hash = myhash + + # initialize hash table + for key, value in data: + h = hash(key, magic) + i = (~h) & mask + v = table[i] + if v is None: + table[i] = value + continue + incr = (h ^ (h >> 3)) & mask; + if not incr: + incr = mask + while 1: + n = n + 1 + i = (i + incr) & mask + v = table[i] + if v is None: + table[i] = value + break + incr = incr << 1 + if incr > mask: + incr = incr ^ poly + + print n, "collisions" + self.collisions = n + + for i in range(len(table)): + if table[i] is None: + table[i] = 0 + + self.data = Array(name + "_hash", table) + self.magic = magic + self.name = name + self.size = size + self.poly = poly + + def dump(self, file): + # write data to file, as a C array + self.data.dump(file) + file.write("#define %s_magic %d\n" % (self.name, self.magic)) + file.write("#define %s_size %d\n" % (self.name, self.size)) + file.write("#define %s_poly %d\n" % (self.name, self.poly)) + # stuff to deal with arrays of unsigned integers class Array: |