summaryrefslogtreecommitdiffstats
path: root/Tools
diff options
context:
space:
mode:
authorMarc-André Lemburg <mal@egenix.com>2000-06-28 16:49:29 (GMT)
committerMarc-André Lemburg <mal@egenix.com>2000-06-28 16:49:29 (GMT)
commitc5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7 (patch)
tree05508de2722544c4425689984ed1bffb454b137d /Tools
parent93c409a590b00ea5b28112fe2b7ca3bc862850b8 (diff)
downloadcpython-c5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7.zip
cpython-c5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7.tar.gz
cpython-c5bb9c21fe9f9b23fa952eab5e1d2f21fdf966b7.tar.bz2
Marc-Andre Lemburg <mal@lemburg.com>:
Generator for the new ucnhash module (ucnhash.h|c). Uses perfect_hash.py to create the ucnhash module.
Diffstat (limited to 'Tools')
-rw-r--r--Tools/perfecthash/GenUCNHash.py109
1 files changed, 109 insertions, 0 deletions
diff --git a/Tools/perfecthash/GenUCNHash.py b/Tools/perfecthash/GenUCNHash.py
new file mode 100644
index 0000000..b04a9b2
--- /dev/null
+++ b/Tools/perfecthash/GenUCNHash.py
@@ -0,0 +1,109 @@
+#! /usr/bin/env python
+import sys
+import string
+import perfect_hash
+
+# This is a user of perfect_hash.py
+# that takes as input the UnicodeData.txt file available from:
+# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
+
+# It generates a hash table from Unicode Character Name ->
+# unicode code space value.
+
+# These variables determine which hash function is tried first.
+# Yields a multiple of 1.7875 for UnicodeData.txt on 2000/06/24/
+# Both seeds are passed unchanged to perfect_hash.generate_hash() by main().
+f1Seed = 1694245428
+f2Seed = -1917331657
+
+# Maximum allowed multiplier, if this isn't None then instead of continually
+# increasing C, it resets it back to initC to keep searching for
+# a solution.
+# NOTE(review): the text above says "maximum" but the variable is named
+# minC -- confirm the intended meaning against perfect_hash.generate_hash().
+minC = 1.7875
+# Initial multiplier for trying to find a perfect hash function.
+initC = 1.7875
+
+# Names handed to perfHash.generate_code() by generateOutputFiles().
+# generateOutputFiles() also spells "aucn" and "_Py_UnicodeCharacterName"
+# literally in the C text it writes, so keep those in sync with the
+# values below.
+moduleName = "ucnhash"
+dataArrayName = "aucn"
+dataArrayType = "_Py_UnicodeCharacterName"
+# Output files opened for writing by generateOutputFiles().
+headerFileName = "ucnhash.h"
+cFileName = "ucnhash.c"
+# Passed to both perfHash.generate_header() and perfHash.generate_code().
+structName = "_Py_UCNHashAPI"
+
+# List of (upper-cased character name, running index) tuples; filled by main().
+keys = []
+# Maps a character name to the integer parsed from the line's first
+# (hexadecimal) field; filled by main().
+hashData = {}
+
+def generateOutputFiles(perfHash, hashData):
+ header = perfHash.generate_header(structName)
+ header = header + """
+typedef struct
+{
+ const char *pszUCN;
+ unsigned int uiValue;
+} _Py_UnicodeCharacterName;
+
+"""
+
+ code = perfHash.generate_code(moduleName,
+ dataArrayName,
+ dataArrayType,
+ structName)
+ out = open(headerFileName, "w")
+ out.write(header)
+ out = open(cFileName, "w")
+ out.write("#include <%s>\n" % headerFileName)
+ out.write(code)
+ perfHash.generate_graph(out)
+ out.write("""
+
+static const _Py_UnicodeCharacterName aucn[] =
+{
+""")
+ for i in xrange(len(keys)):
+ v = hashData[keys[i][0]]
+ out.write(' { "' + keys[i][0] + '", ' + hex(v) + " }," + "\n")
+ out.write("};\n\n")
+ sys.stderr.write('\nGenerated output files: \n')
+ sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))
+
+def main():
+ # Suck in UnicodeData.txt and spit out the generated files.
+ input = open(sys.argv[1], 'r')
+ i = 0
+ while 1:
+ line = input.readline()
+ if line == "": break
+ fields = string.split(line, ';')
+ if len(fields) < 2:
+ sys.stderr.write('Ill-formated line!\n')
+ sys.stderr.write('line #: %d\n' % (i + 1))
+ sys.exit()
+ data, key = fields[:2]
+ key = string.strip( key )
+ # Any name starting with '<' is a control, or start/end character,
+ # so skip it...
+ if key[0] == "<":
+ continue
+ hashcode = i
+ i = i + 1
+ # force the name to uppercase
+ keys.append( (string.upper(key),hashcode) )
+ data = string.atoi(data, 16)
+ hashData[key] = data
+
+ input.close()
+ sys.stderr.write('%i key/hash pairs read\n' % len(keys) )
+ perfHash = perfect_hash.generate_hash(keys, 1,
+ minC, initC,
+ f1Seed, f2Seed,
+ # increment, tries
+ 0.0025, 50)
+ generateOutputFiles(perfHash, hashData)
+
+if __name__ == '__main__':
+ if len(sys.argv) == 1:
+ sys.stdout = sys.stderr
+ print 'Usage: %s <input filename>' % sys.argv[0]
+ print ' The input file needs to be UnicodeData.txt'
+ sys.exit()
+ main()
+