author | Fredrik Lundh <fredrik@pythonware.com> | 2001-01-21 17:01:31 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2001-01-21 17:01:31 (GMT) |
commit | 9e9bcda547340a47b5f82d0463f4cdc29bd3b143 (patch) | |
tree | 480624c9c3fd9ad95a62efed9195eeaee545f27a /Tools/unicode | |
parent | d38855c35a9fe8e469ba6474161f4225a1eb07f6 (diff) | |
forgot to check in the new makeunicodedata.py script
Diffstat (limited to 'Tools/unicode')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 288 |
1 files changed, 271 insertions, 17 deletions
```diff
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 15841d7..3a362ec 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -2,14 +2,16 @@
 # (re)generate unicode property and type databases
 #
 # this script converts a unicode 3.0 database file to
-# Modules/unicodedata_db.h and Objects/unicodetype_db.h
+# Modules/unicodedata_db.h, Modules/unicodename_db.h,
+# and Objects/unicodetype_db.h
 #
 # history:
 # 2000-09-24 fl created (based on bits and pieces from unidb)
 # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
 # 2000-09-25 fl added character type table
-# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields
+# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
 # 2000-11-03 fl expand first/last ranges
+# 2001-01-19 fl added character name tables (2.1)
 #
 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
@@ -17,7 +19,7 @@
 import sys
 
 SCRIPT = sys.argv[0]
-VERSION = "1.1"
+VERSION = "2.1"
 
 UNICODE_DATA = "UnicodeData-Latest.txt"
 
@@ -42,18 +44,32 @@ UPPER_MASK = 0x80
 
 def maketables(trace=0):
 
+    print "--- Reading", UNICODE_DATA, "..."
+
     unicode = UnicodeData(UNICODE_DATA)
 
-    print "--- Processing", UNICODE_DATA, "..."
     print len(filter(None, unicode.table)), "characters"
 
-    # extract unicode properties
+    makeunicodedata(unicode, trace)
+    makeunicodetype(unicode, trace)
+    makeunicodename(unicode, trace)
+
+# --------------------------------------------------------------------
+# unicode character properties
+
+def makeunicodedata(unicode, trace):
+
     dummy = (0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
 
+    FILE = "Modules/unicodedata_db.h"
+
+    print "--- Preparing", FILE, "..."
+
     # 1) database properties
+
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
@@ -93,13 +109,11 @@ def maketables(trace=0):
                 i = 0
             decomp_index[char] = i
 
-    FILE = "Modules/unicodedata_db.h"
-
-    print "--- Writing", FILE, "..."
-
     print len(table), "unique properties"
     print len(decomp_data), "unique decomposition entries"
 
+    print "--- Writing", FILE, "..."
+
     fp = open(FILE, "w")
     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
@@ -111,7 +125,7 @@ def maketables(trace=0):
     print >>fp, "};"
     print >>fp
 
-    # FIXME: the following tables should be made static, and
+    # FIXME: <fl> the following tables could be made static, and
     # the support code moved into unicodedatabase.c
 
     print >>fp, "/* string literals */"
@@ -149,8 +163,16 @@ def maketables(trace=0):
     Array("decomp_index1", index1).dump(fp)
     Array("decomp_index2", index2).dump(fp)
 
-    #
-    # 3) unicode type data
+    fp.close()
+
+# --------------------------------------------------------------------
+# unicode character type tables
+
+def makeunicodetype(unicode, trace):
+
+    FILE = "Objects/unicodetype_db.h"
+
+    print "--- Preparing", FILE, "..."
 
     # extract unicode types
     dummy = (0, 0, 0, 0, 0, 0)
@@ -209,14 +231,11 @@ def maketables(trace=0):
                 table.append(item)
             index[char] = i
 
-    FILE = "Objects/unicodetype_db.h"
-
-    fp = open(FILE, "w")
+    print len(table), "unique character type entries"
 
     print "--- Writing", FILE, "..."
 
-    print len(table), "unique character type entries"
-
+    fp = open(FILE, "w")
     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
     print >>fp, "/* a list of unique character type descriptors */"
@@ -234,6 +253,155 @@ def maketables(trace=0):
     Array("index1", index1).dump(fp)
     Array("index2", index2).dump(fp)
 
+    fp.close()
+
+# --------------------------------------------------------------------
+# unicode name database
+
+def makeunicodename(unicode, trace):
+
+    FILE = "Modules/unicodename_db.h"
+
+    print "--- Preparing", FILE, "..."
+
+    # collect names
+    names = [None] * len(unicode.chars)
+
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                names[char] = name + chr(0)
+
+    print len(filter(lambda n: n is not None, names)), "distinct names"
+
+    # collect unique words from names (note that we differ between
+    # words inside a sentence, and words ending a sentence. the
+    # latter includes the trailing null byte.
+
+    words = {}
+    n = b = 0
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            b = b + len(name)
+            n = n + len(w)
+            for w in w:
+                l = words.get(w)
+                if l:
+                    l.append(None)
+                else:
+                    words[w] = [len(words)]
+
+    print n, "words in text;", b, "bytes"
+
+    wordlist = words.items()
+
+    # sort on falling frequency
+    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
+
+    # statistics
+    n = 0
+    for i in range(128):
+        n = n + len(wordlist[i][1])
+    print n, "short words (7-bit indices)"
+
+    # pick the 128 most commonly used words, and sort the rest on
+    # falling length (to maximize overlap)
+
+    wordlist, wordtail = wordlist[:128], wordlist[128:]
+    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
+    wordlist.extend(wordtail)
+
+    # generate lexicon from words
+
+    lexicon_offset = [0]
+    lexicon = ""
+    words = {}
+
+    # build a lexicon string
+    offset = 0
+    for w, x in wordlist:
+        # encoding: bit 7 indicates last character in word (chr(128)
+        # indicates the last character in an entire string)
+        ww = w[:-1] + chr(ord(w[-1])+128)
+        # reuse string tails, when possible
+        o = string.find(lexicon, ww)
+        if o < 0:
+            o = offset
+            lexicon = lexicon + ww
+            offset = offset + len(w)
+        words[w] = len(lexicon_offset)
+        lexicon_offset.append(offset)
+
+    print len(words), "words in lexicon;", len(lexicon), "bytes"
+
+    assert len(words) < 32768 # 15-bit word indices
+
+    lexicon = map(ord, lexicon)
+
+    # generate phrasebook from names and lexicon
+    phrasebook = [0]
+    phrasebook_offset = [0] * len(unicode.chars)
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            phrasebook_offset[char] = len(phrasebook)
+            for w in w:
+                i = words[w]
+                if i < 128:
+                    phrasebook.append(128+i)
+                else:
+                    phrasebook.append(i>>8)
+                    phrasebook.append(i&255)
+
+    #
+    # unicode name hash table
+
+    # extract names
+    data = []
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                data.append((name, char))
+
+    # the magic number 47 was chosen to minimize the number of
+    # collisions on the current data set. if you like, change it
+    # and see what happens...
+
+    codehash = Hash("code", data, 47)
+
+    print "--- Writing", FILE, "..."
+
+    fp = open(FILE, "w")
+    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+    print >>fp
+    print >>fp, "#define NAME_MAXLEN", 256
+    print >>fp
+    print >>fp, "/* lexicon */"
+    Array("lexicon", lexicon).dump(fp)
+    Array("lexicon_offset", lexicon_offset).dump(fp)
+
+    # split decomposition index table
+    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
+
+    print >>fp, "/* code->name phrasebook */"
+    print >>fp, "#define phrasebook_shift", shift
+
+    Array("phrasebook", phrasebook).dump(fp)
+    Array("phrasebook_offset1", offset1).dump(fp)
+    Array("phrasebook_offset2", offset2).dump(fp)
+
+    print >>fp, "/* name->code dictionary */"
+    codehash.dump(fp)
+
+    fp.close()
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@@ -280,6 +448,92 @@ class UnicodeData:
             # restrict character range to ISO Latin 1
             self.chars = range(256)
 
+# hash table tools
+
+# this is a straight-forward reimplementation of Python's built-in
+# dictionary type, using a static data structure, and a custom string
+# hash algorithm.
+
+def myhash(s, magic):
+    h = 0
+    for c in map(ord, string.upper(s)):
+        h = (h * magic) + c
+        ix = h & 0xff000000
+        if ix:
+            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
+    return h
+
+SIZES = [
+    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
+    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
+    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
+    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
+]
+
+class Hash:
+    def __init__(self, name, data, magic):
+        # turn a (key, value) list into a static hash table structure
+
+        # determine table size
+        for size, poly in SIZES:
+            if size > len(data):
+                poly = size + poly
+                break
+        else:
+            raise AssertionError, "ran out of polynominals"
+
+        print size, "slots in hash table"
+
+        table = [None] * size
+
+        mask = size-1
+
+        n = 0
+
+        hash = myhash
+
+        # initialize hash table
+        for key, value in data:
+            h = hash(key, magic)
+            i = (~h) & mask
+            v = table[i]
+            if v is None:
+                table[i] = value
+                continue
+            incr = (h ^ (h >> 3)) & mask;
+            if not incr:
+                incr = mask
+            while 1:
+                n = n + 1
+                i = (i + incr) & mask
+                v = table[i]
+                if v is None:
+                    table[i] = value
+                    break
+                incr = incr << 1
+                if incr > mask:
+                    incr = incr ^ poly
+
+        print n, "collisions"
+        self.collisions = n
+
+        for i in range(len(table)):
+            if table[i] is None:
+                table[i] = 0
+
+        self.data = Array(name + "_hash", table)
+        self.magic = magic
+        self.name = name
+        self.size = size
+        self.poly = poly
+
+    def dump(self, file):
+        # write data to file, as a C array
+        self.data.dump(file)
+        file.write("#define %s_magic %d\n" % (self.name, self.magic))
+        file.write("#define %s_size %d\n" % (self.name, self.size))
+        file.write("#define %s_poly %d\n" % (self.name, self.poly))
+
 # stuff to deal with arrays of unsigned integers
 
 class Array:
```
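The name compression introduced by `makeunicodename` works in two layers: every distinct word of every character name is stored once in a `lexicon` string, with bit 7 set on a word's last byte, and each character's name becomes a run of word indices in a `phrasebook`, where the 128 most frequent words get a one-byte code (`128+i`) and the rest a two-byte code (`i>>8`, `i&255`). The `chr(0)` appended to each name travels with its last word, so decoding a byte whose low 7 bits are zero marks the end of a name. The sketch below is not part of the commit: it is a modern-Python illustration of that scheme on toy data, `build_tables` and `decode_name` are hypothetical helper names, and it omits the tail-overlap reuse (`string.find`) and the length-sorted word tail.

```python
def build_tables(names):
    # append a NUL to every name so its last word marks end-of-name,
    # mirroring names[char] = name + chr(0) in the script
    names = {cp: n + "\0" for cp, n in names.items()}

    # count word frequencies; the most frequent 128 words get 1-byte codes
    freq = {}
    for n in names.values():
        for w in n.split():
            freq[w] = freq.get(w, 0) + 1
    wordlist = sorted(freq, key=freq.get, reverse=True)

    # lexicon: all words back to back, bit 7 set on each word's last byte
    lexicon, lexicon_offset, word_index = bytearray(), [], {}
    for w in wordlist:
        word_index[w] = len(lexicon_offset)
        lexicon_offset.append(len(lexicon))
        data = w.encode("ascii")
        lexicon += data[:-1] + bytes([data[-1] | 0x80])

    # phrasebook: per character, a run of 1- or 2-byte word indices
    phrasebook, phrasebook_offset = bytearray([0]), {}
    for cp, n in names.items():
        phrasebook_offset[cp] = len(phrasebook)
        for w in n.split():
            i = word_index[w]
            if i < 128:
                phrasebook.append(128 + i)    # short form, one byte
            else:
                phrasebook.append(i >> 8)     # high byte is always < 128
                phrasebook.append(i & 255)
    return lexicon, lexicon_offset, phrasebook, phrasebook_offset

def decode_name(cp, lexicon, lexicon_offset, phrasebook, phrasebook_offset):
    words, pos = [], phrasebook_offset[cp]
    while True:
        b = phrasebook[pos]; pos += 1
        if b >= 128:
            i = b - 128                       # one-byte word index
        else:
            i = (b << 8) + phrasebook[pos]    # two-byte word index
            pos += 1
        # copy one word out of the lexicon; bit 7 marks its last byte
        o, word = lexicon_offset[i], bytearray()
        while not (lexicon[o] & 0x80):
            word.append(lexicon[o]); o += 1
        last = lexicon[o] & 0x7F
        if last:                              # ordinary final character
            word.append(last)
            words.append(word)
        else:                                 # the trailing NUL: name ends here
            words.append(word)
            return b" ".join(words).decode("ascii")

names = {0x41: "LATIN CAPITAL LETTER A", 0x2603: "SNOWMAN"}
tables = build_tables(names)
assert decode_name(0x41, *tables) == "LATIN CAPITAL LETTER A"
assert decode_name(0x2603, *tables) == "SNOWMAN"
```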
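The name-to-code direction uses the static, open-addressed hash table emitted by the new `Hash` class: `myhash` is a case-insensitive multiplicative hash folded to 24 bits, the first probe is `~h` masked to the table size, and collisions are resolved with an increment derived from `h` that is repeatedly doubled and reduced by a size-specific polynomial from `SIZES`. The lookup side lives in the C module, not in this script; the sketch below only mirrors the probe sequence in Python, `probe_sequence`, `build` and `lookup` are illustrative names, and it stores keys next to the values purely so a hit can be verified (the generated C table stores only code points).

```python
def myhash(s, magic):
    # case-insensitive multiplicative hash folded to 24 bits, as in the script
    h = 0
    for c in s.upper().encode("ascii"):
        h = h * magic + c
        ix = h & 0xFF000000
        if ix:
            h = (h ^ ((ix >> 24) & 0xFF)) & 0x00FFFFFF
    return h

def probe_sequence(h, mask, poly):
    # first slot, then increments doubled and reduced by the polynomial
    i = (~h) & mask
    yield i
    incr = (h ^ (h >> 3)) & mask
    if not incr:
        incr = mask
    while True:
        i = (i + incr) & mask
        yield i
        incr <<= 1
        if incr > mask:
            incr ^= poly

def build(data, size, poly, magic):
    # size is a power of two > len(data); poly = size + polynomial constant
    table = [None] * size
    for key, value in data:
        for i in probe_sequence(myhash(key, magic), size - 1, poly):
            if table[i] is None:
                table[i] = (key, value)
                break
    return table

def lookup(table, key, size, poly, magic):
    for i in probe_sequence(myhash(key, magic), size - 1, poly):
        if table[i] is None:
            return None                    # empty slot reached: not present
        if table[i][0] == key:
            return table[i][1]

pairs = [("LATIN SMALL LETTER A", 0x61), ("SNOWMAN", 0x2603)]
table = build(pairs, size=4, poly=4 + 3, magic=47)   # (4, 3) is the SIZES entry
assert lookup(table, "SNOWMAN", 4, 4 + 3, 47) == 0x2603
assert lookup(table, "NO SUCH NAME", 4, 4 + 3, 47) is None
```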
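Finally, `phrasebook_offset` is not written out as one array per code point: `splitbins`, which predates this commit and is not shown in the diff, compresses it into two smaller arrays plus a shift, and the C side then recovers a value with roughly `index2[(index1[code >> shift] << shift) + (code & ((1 << shift) - 1))]`. The toy sketch below illustrates that two-level idea only; `split_table` is a hypothetical stand-in that merely dedupes aligned chunks, whereas the real `splitbins` also searches for the best shift and reuses overlapping chunks.

```python
def split_table(t, shift):
    # toy stand-in for splitbins(): dedupe aligned chunks of 2**shift entries
    size = 1 << shift
    t = list(t) + [0] * (-len(t) % size)        # pad so chunks stay aligned
    index1, index2, seen = [], [], {}
    for i in range(0, len(t), size):
        chunk = tuple(t[i:i + size])
        if chunk not in seen:
            seen[chunk] = len(index2) >> shift  # chunk number inside index2
            index2.extend(chunk)
        index1.append(seen[chunk])
    return index1, index2

def lookup(index1, index2, shift, code):
    # the two-step lookup the generated headers are consumed with
    mask = (1 << shift) - 1
    return index2[(index1[code >> shift] << shift) + (code & mask)]

data = [17, 17, 17, 17, 99, 99, 17, 17, 17, 17, 17, 17]   # lots of repetition
index1, index2 = split_table(data, 2)
assert all(lookup(index1, index2, 2, c) == data[c] for c in range(len(data)))
assert len(index1) + len(index2) < len(data)              # 3 + 8 < 12 entries
```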