summary | refs | log | tree | commit | diff | stats
path: root/Tools/unicode
diff options
context:
space:
mode:
author Fredrik Lundh <fredrik@pythonware.com> 2000-11-03 20:24:15 (GMT)
committer Fredrik Lundh <fredrik@pythonware.com> 2000-11-03 20:24:15 (GMT)
commit fad27aee11e8ece649b72c8ad9513f46882d23ba (patch)
tree af66b2dc99233bf774bd2059f0c5f02150c6c0ed /Tools/unicode
parent 063ee7bbe6bc6078addbb9137db8c4ef909f3225 (diff)
download cpython-fad27aee11e8ece649b72c8ad9513f46882d23ba.zip
cpython-fad27aee11e8ece649b72c8ad9513f46882d23ba.tar.gz
cpython-fad27aee11e8ece649b72c8ad9513f46882d23ba.tar.bz2
Added 38,642 missing characters to the Unicode database (first-last ranges) --
but thanks to the 2.0 compression scheme, this doesn't add a single byte to the resulting binaries (!). Closes bug #117524.
Diffstat (limited to 'Tools/unicode')
-rw-r--r-- Tools/unicode/makeunicodedata.py 50
1 files changed, 39 insertions, 11 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 8c0c075..15841d7 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -9,6 +9,7 @@
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields
+# 2000-11-03 fl expand first/last ranges
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
@@ -39,10 +40,13 @@ SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
-def maketables():
+def maketables(trace=0):
unicode = UnicodeData(UNICODE_DATA)
+ print "--- Processing", UNICODE_DATA, "..."
+ print len(filter(None, unicode.table)), "characters"
+
# extract unicode properties
dummy = (0, 0, 0, 0)
table = [dummy]
@@ -91,6 +95,11 @@ def maketables():
FILE = "Modules/unicodedata_db.h"
+ print "--- Writing", FILE, "..."
+
+ print len(table), "unique properties"
+ print len(decomp_data), "unique decomposition entries"
+
fp = open(FILE, "w")
print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
print >>fp
@@ -125,7 +134,7 @@ def maketables():
print >>fp, "};"
# split record index table
- index1, index2, shift = splitbins(index)
+ index1, index2, shift = splitbins(index, trace)
print >>fp, "/* index tables for the database records */"
print >>fp, "#define SHIFT", shift
@@ -133,7 +142,7 @@ def maketables():
Array("index2", index2).dump(fp)
# split decomposition index table
- index1, index2, shift = splitbins(decomp_index)
+ index1, index2, shift = splitbins(decomp_index, trace)
print >>fp, "/* index tables for the decomposition data */"
print >>fp, "#define DECOMP_SHIFT", shift
@@ -200,12 +209,14 @@ def maketables():
table.append(item)
index[char] = i
- print len(table), "ctype entries"
-
FILE = "Objects/unicodetype_db.h"
fp = open(FILE, "w")
+ print "--- Writing", FILE, "..."
+
+ print len(table), "unique character type entries"
+
print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
print >>fp
print >>fp, "/* a list of unique character type descriptors */"
@@ -216,7 +227,7 @@ def maketables():
print >>fp
# split decomposition index table
- index1, index2, shift = splitbins(index)
+ index1, index2, shift = splitbins(index, trace)
print >>fp, "/* type indexes */"
print >>fp, "#define SHIFT", shift
@@ -233,7 +244,7 @@ import string, sys
class UnicodeData:
- def __init__(self, filename):
+ def __init__(self, filename, expand=1):
file = open(filename)
table = [None] * 65536
while 1:
@@ -244,6 +255,22 @@ class UnicodeData:
char = string.atoi(s[0], 16)
table[char] = s
+ # expand first-last ranges (ignore surrogates and private use)
+ if expand:
+ field = None
+ for i in range(0, 0xD800):
+ s = table[i]
+ if s:
+ if s[1][-6:] == "First>":
+ s[1] = ""
+ field = s[:]
+ elif s[1][-5:] == "Last>":
+ s[1] = ""
+ field = None
+ elif field:
+ field[0] = hex(i)
+ table[i] = field
+
# public attributes
self.filename = filename
self.table = table
@@ -306,8 +333,9 @@ def splitbins(t, trace=0):
t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
where mask is a bitmask isolating the last "shift" bits.
- If optional arg trace is true (default false), progress info is
- printed to sys.stderr.
+ If optional arg trace is non-zero (default zero), progress info
+ is printed to sys.stderr. The higher the value, the more info
+ you'll get.
"""
import sys
@@ -341,7 +369,7 @@ def splitbins(t, trace=0):
t1.append(index >> shift)
# determine memory size
b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
- if trace:
+ if trace > 1:
dump(t1, t2, shift, b)
if b < bytes:
best = t1, t2, shift
@@ -358,4 +386,4 @@ def splitbins(t, trace=0):
return best
if __name__ == "__main__":
- maketables()
+ maketables(1)