summaryrefslogtreecommitdiffstats
path: root/Tools/unicode/makeunicodedata.py
diff options
context:
space:
mode:
Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r--Tools/unicode/makeunicodedata.py62
1 files changed, 47 insertions, 15 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index f2e6dc8..4781ec4 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -1,14 +1,19 @@
#
-# makeunidb.py -- generate a compact version of the unicode property
-# database (unicodedatabase.h)
+# generate a compact version of the unicode property database
+#
+# history:
+# 2000-09-24 fl created (based on bits and pieces from unidb)
+# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
+#
+# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
import sys
SCRIPT = sys.argv[0]
-VERSION = "1.0"
+VERSION = "1.1"
-UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"
+UNICODE_DATA = "../UnicodeData-Latest.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -24,13 +29,12 @@ def maketable():
unicode = UnicodeData(UNICODE_DATA)
# extract unicode properties
- dummy = (0, 0, 0, 0, "NULL")
+ dummy = (0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
- DECOMPOSITION = [""]
-
+ # 1) database properties
for char in unicode.chars:
record = unicode.table[char]
if record:
@@ -39,12 +43,8 @@ def maketable():
combining = int(record[3])
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y"
- if record[5]:
- decomposition = '"%s"' % record[5]
- else:
- decomposition = "NULL"
item = (
- category, combining, bidirectional, mirrored, decomposition
+ category, combining, bidirectional, mirrored
)
# add entry to index and item tables
i = cache.get(item)
@@ -53,8 +53,26 @@ def maketable():
table.append(item)
index[char] = i
- # FIXME: we really should compress the decomposition stuff
- # (see the unidb utilities for one way to do this)
+ # 2) decomposition data
+
+ # FIXME: <fl> using the encoding stuff from unidb would save
+ # another 50k or so, but I'll leave that for 2.1...
+
+ decomp_data = [""]
+ decomp_index = [0] * len(unicode.chars)
+
+ for char in unicode.chars:
+ record = unicode.table[char]
+ if record:
+ if record[5]:
+ try:
+ i = decomp_data.index(record[5])
+ except ValueError:
+ i = len(decomp_data)
+ decomp_data.append(record[5])
+ else:
+ i = 0
+ decomp_index[char] = i
FILE = "unicodedata_db.h"
@@ -65,7 +83,7 @@ def maketable():
print "/* a list of unique database records */"
print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
for item in table:
- print " {%d, %d, %d, %d, %s}," % item
+ print " {%d, %d, %d, %d}," % item
print "};"
print
@@ -82,6 +100,12 @@ def maketable():
print " NULL"
print "};"
+ print "static const char *decomp_data[] = {"
+ for name in decomp_data:
+ print " \"%s\"," % name
+ print " NULL"
+ print "};"
+
# split index table
index1, index2, shift = splitbins(index)
@@ -90,6 +114,14 @@ def maketable():
Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout)
+ # split index table
+ index1, index2, shift = splitbins(decomp_index)
+
+ print "/* same, for the decomposition data */"
+ print "#define DECOMP_SHIFT", shift
+ Array("decomp_index1", index1).dump(sys.stdout)
+ Array("decomp_index2", index2).dump(sys.stdout)
+
sys.stdout = sys.__stdout__
# --------------------------------------------------------------------