diff options
author | Marc-André Lemburg <mal@egenix.com> | 2000-06-28 16:49:29 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2000-06-28 16:49:29 (GMT) |
commit | c5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7 (patch) | |
tree | 05508de2722544c4425689984ed1bffb454b137d /Tools | |
parent | 93c409a590b00ea5b28112fe2b7ca3bc862850b8 (diff) | |
download | cpython-c5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7.zip cpython-c5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7.tar.gz cpython-c5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7.tar.bz2 |
Marc-Andre Lemburg <mal@lemburg.com>:
Generator for the new ucnhash module (ucnhash.h|c). Uses perfect_hash.py
to create the ucnhash module.
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/perfecthash/GenUCNHash.py | 109 |
1 files changed, 109 insertions, 0 deletions
#! /usr/bin/env python
import sys
import string
import perfect_hash

# This is a user of perfect_hash.py
# that takes as input the UnicodeData.txt file available from:
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt

# It generates a hash table from Unicode Character Name ->
# unicode code space value.

# These variables determine which hash function is tried first.
# Yields a multiplier of 1.7875 for UnicodeData.txt on 2000/06/24.
f1Seed = 1694245428
f2Seed = -1917331657

# Maximum allowed multiplier, if this isn't None then instead of continually
# increasing C, it resets it back to initC to keep searching for
# a solution.
# NOTE(review): the variable is called minC although the comment above says
# "maximum" -- confirm the intended meaning against perfect_hash.py.
minC = 1.7875
# Initial multiplier for trying to find a perfect hash function.
initC = 1.7875

moduleName = "ucnhash"
dataArrayName = "aucn"
dataArrayType = "_Py_UnicodeCharacterName"
headerFileName = "ucnhash.h"
cFileName = "ucnhash.c"
structName = "_Py_UCNHashAPI"

# Filled in by main():
#   keys     -- list of (upper-cased character name, hash code) tuples,
#               in input order
#   hashData -- maps each upper-cased character name to its integer code
keys = []
hashData = {}


def generateOutputFiles(perfHash, hashData):
    """Write the generated header file and C source file.

    perfHash -- object returned by perfect_hash.generate_hash(); its
                generate_header(), generate_code() and generate_graph()
                methods produce the generated text.
    hashData -- dictionary mapping every name stored in the module-level
                'keys' list to its integer code value.

    The files are created as headerFileName and cFileName, relative to
    the current working directory.  Raises KeyError if a name in 'keys'
    has no entry in hashData.
    """
    # Build all generated text before touching the file system, so that a
    # failure in the generator does not leave truncated output behind.
    typedef = """
typedef struct
{
    const char *pszUCN;
    unsigned int uiValue;
} %s;

""" % dataArrayType
    header = perfHash.generate_header(structName) + typedef

    code = perfHash.generate_code(moduleName,
                                  dataArrayName,
                                  dataArrayType,
                                  structName)

    # Close each file explicitly instead of relying on garbage collection
    # to flush it.
    out = open(headerFileName, "w")
    try:
        out.write(header)
    finally:
        out.close()

    out = open(cFileName, "w")
    try:
        out.write("#include <%s>\n" % headerFileName)
        out.write(code)
        perfHash.generate_graph(out)
        # Use the configured names rather than repeating the literals, so
        # the table declaration always matches what generate_code() was
        # given above.
        out.write("\n\nstatic const %s %s[] =\n{\n"
                  % (dataArrayType, dataArrayName))
        for entry in keys:
            name = entry[0]
            out.write(' { "' + name + '", ' + hex(hashData[name]) + " },\n")
        out.write("};\n\n")
    finally:
        out.close()

    sys.stderr.write('\nGenerated output files: \n')
    sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))


def _parseLine(line, lineNumber):
    """Parse one line of UnicodeData.txt.

    Returns an (upper-cased name, integer code) tuple, or None when the
    entry must be skipped.  Writes a diagnostic carrying the real input
    line number to stderr and exits with status 1 if the line has fewer
    than two ';'-separated fields or an empty name field.  Raises
    ValueError if the first field is not a hexadecimal number.
    """
    fields = line.split(';')
    name = ""
    if len(fields) >= 2:
        name = fields[1].strip()
    if not name:
        sys.stderr.write('Ill-formated line!\n')
        sys.stderr.write('line #: %d\n' % lineNumber)
        sys.exit(1)
    # Any name starting with '<' is a control, or start/end character,
    # so skip it...
    if name.startswith("<"):
        return None
    # force the name to uppercase
    return name.upper(), int(fields[0], 16)


def main():
    """Read UnicodeData.txt (sys.argv[1]) and write the generated files.

    Appends one (name, hash code) tuple per accepted entry to the
    module-level 'keys' list and records name -> code value in the
    module-level 'hashData' dictionary, then runs the perfect hash
    generator and writes the output files.  Hash codes are assigned
    consecutively from 0 in input order, counting accepted entries only.
    """
    infile = open(sys.argv[1], 'r')
    try:
        hashcode = 0
        lineNumber = 0
        for line in infile.readlines():
            # Count every physical line so diagnostics point at the right
            # place even after skipped '<...>' entries.
            lineNumber = lineNumber + 1
            entry = _parseLine(line, lineNumber)
            if entry is None:
                continue
            name, value = entry
            keys.append((name, hashcode))
            # Key by the same upper-cased name that 'keys' carries;
            # generateOutputFiles() looks the value up by that name.
            hashData[name] = value
            hashcode = hashcode + 1
    finally:
        infile.close()

    sys.stderr.write('%i key/hash pairs read\n' % len(keys))
    # NOTE(review): the second argument (1) is presumably the
    # case-insensitivity flag -- confirm against perfect_hash.generate_hash.
    perfHash = perfect_hash.generate_hash(keys, 1,
                                          minC, initC,
                                          f1Seed, f2Seed,
                                          # increment, tries
                                          0.0025, 50)
    generateOutputFiles(perfHash, hashData)


if __name__ == '__main__':
    if len(sys.argv) == 1:
        # Write straight to stderr instead of rebinding sys.stdout, and
        # exit non-zero so callers can detect the usage error.
        sys.stderr.write('Usage: %s <input filename>\n' % sys.argv[0])
        sys.stderr.write(' The input file needs to be UnicodeData.txt\n')
        sys.exit(2)
    main()