summaryrefslogtreecommitdiffstats
path: root/Tools/unicode
diff options
context:
space:
mode:
author Fredrik Lundh <fredrik@pythonware.com> 2000-09-25 17:59:57 (GMT)
committer Fredrik Lundh <fredrik@pythonware.com> 2000-09-25 17:59:57 (GMT)
commit e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1 (patch)
tree a38f6efc0328a58cf804f7d9c35cda5aa444bd9b /Tools/unicode
parent e53793bf4c2361346c21d4dec0b7a617a850436c (diff)
download cpython-e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1.zip
cpython-e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1.tar.gz
cpython-e9133f7e2efbe68f89cb35cbc6a8bcc8bf389ad1.tar.bz2
unicode database compression, step 3:
- use unidb compression for the unicodectype module. smaller, faster, and slightly more portable...
- also mention the unicode directory in Tools/README
Diffstat (limited to 'Tools/unicode')
-rw-r--r-- Tools/unicode/makeunicodedata.py 106
1 files changed, 97 insertions, 9 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 4781ec4..b8411ad 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -1,9 +1,13 @@
#
-# generate a compact version of the unicode property database
+# (re)generate unicode property and type databases
+#
+# this script converts a unicode 3.0 database file to
+# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
+# 2000-09-25 fl added character type table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
@@ -13,7 +17,7 @@ import sys
SCRIPT = sys.argv[0]
VERSION = "1.1"
-UNICODE_DATA = "../UnicodeData-Latest.txt"
+UNICODE_DATA = "UnicodeData-Latest.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -24,7 +28,16 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON" ]
-def maketable():
+ALPHA_MASK = 0x01
+DECIMAL_MASK = 0x02
+DIGIT_MASK = 0x04
+LOWER_MASK = 0x08
+NUMERIC_MASK = 0x10
+SPACE_MASK = 0x20
+TITLE_MASK = 0x40
+UPPER_MASK = 0x80
+
+def maketables():
unicode = UnicodeData(UNICODE_DATA)
@@ -74,7 +87,7 @@ def maketable():
i = 0
decomp_index[char] = i
- FILE = "unicodedata_db.h"
+ FILE = "Modules/unicodedata_db.h"
sys.stdout = open(FILE, "w")
@@ -87,6 +100,9 @@ def maketable():
print "};"
print
+ # FIXME: the following tables should be made static, and
+ # the support code moved into unicodedatabase.c
+
print "/* string literals */"
print "const char *_PyUnicode_CategoryNames[] = {"
for name in CATEGORY_NAMES:
@@ -106,24 +122,96 @@ def maketable():
print " NULL"
print "};"
- # split index table
+ # split record index table
index1, index2, shift = splitbins(index)
- print "/* index tables used to find the right database record */"
+ print "/* index tables for the database records */"
print "#define SHIFT", shift
Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout)
- # split index table
+ # split decomposition index table
index1, index2, shift = splitbins(decomp_index)
- print "/* same, for the decomposition data */"
+ print "/* index tables for the decomposition data */"
print "#define DECOMP_SHIFT", shift
Array("decomp_index1", index1).dump(sys.stdout)
Array("decomp_index2", index2).dump(sys.stdout)
sys.stdout = sys.__stdout__
+ #
+ # 3) unicode type data
+
+ # extract unicode types
+ dummy = (0, 0, 0, 0)
+ table = [dummy]
+ cache = {0: dummy}
+ index = [0] * len(unicode.chars)
+
+ for char in unicode.chars:
+ record = unicode.table[char]
+ if record:
+ # extract database properties
+ category = record[2]
+ bidirectional = record[4]
+ flags = 0
+ if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
+ flags |= ALPHA_MASK
+ if category == "Ll":
+ flags |= LOWER_MASK
+ if category == "Zs" or bidirectional in ("WS", "B", "S"):
+ flags |= SPACE_MASK
+ if category in ["Lt", "Lu"]:
+ flags |= TITLE_MASK
+ if category == "Lu":
+ flags |= UPPER_MASK
+ # use delta predictor for upper/lower/title
+ if record[12]:
+ upper = (int(record[12], 16) - char) & 0xffff
+ else:
+ upper = 0
+ if record[13]:
+ lower = (int(record[13], 16) - char) & 0xffff
+ else:
+ lower = 0
+ if record[14]:
+ title = (int(record[14], 16) - char) & 0xffff
+ else:
+ title = 0
+ item = (
+ flags, upper, lower, title
+ )
+ # add entry to index and item tables
+ i = cache.get(item)
+ if i is None:
+ cache[item] = i = len(table)
+ table.append(item)
+ index[char] = i
+
+ FILE = "Objects/unicodetype_db.h"
+
+ sys.stdout = open(FILE, "w")
+
+ print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+ print
+ print "/* a list of unique character type descriptors */"
+ print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
+ for item in table:
+ print " {%d, %d, %d, %d}," % item
+ print "};"
+ print
+
+ # split decomposition index table
+ index1, index2, shift = splitbins(index)
+
+ print "/* type indexes */"
+ print "#define SHIFT", shift
+ Array("index1", index1).dump(sys.stdout)
+ Array("index2", index2).dump(sys.stdout)
+
+ sys.stdout = sys.__stdout__
+
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB
@@ -259,4 +347,4 @@ def splitbins(t, trace=0):
return best
if __name__ == "__main__":
- maketable()
+ maketables()