1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
|
#
# makeunidb.py -- generate a compact version of the unicode property
# database (unicodedatabase.h); the result is written to unicodedata_db.h
#
import sys
# name and version of this generator; both are written into the banner
# comment at the top of the generated file
SCRIPT = sys.argv[0]
VERSION = "1.0"

# location of the UnicodeData text file that maketable() reads
UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

# general category names.  the position of a name in this list is the
# numeric code stored in the generated records, so the order must not
# change.  note that "Cn" is listed twice (slots 0 and 17); a lookup
# with .index("Cn") always yields 0.
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

# bidirectional class names.  as above, the list position is the numeric
# code; slot 0 is the empty string.
BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET",
    "AN", "CS",
    "NSM", "BN",
    "B", "S", "WS", "ON",
]
def maketable():
    """Generate the C header "unicodedata_db.h" from the UnicodeData file.

    Reads UNICODE_DATA, reduces every character to a record of
    (category, combining, bidirectional, mirrored, decomposition),
    stores each distinct record once, and writes to "unicodedata_db.h"
    (a relative path, so it lands in the current directory):

      - the list of unique records,
      - the category and bidirectional name tables,
      - a two-level index (built by splitbins) that maps a character
        code to the position of its record.

    Characters that have no entry in the data file map to record 0,
    an all-zero dummy.

    Raises ValueError if a record uses a category or bidirectional
    name that is missing from CATEGORY_NAMES / BIDIRECTIONAL_NAMES.
    """
    unidata = UnicodeData(UNICODE_DATA)

    # extract unicode properties
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    # maps a record to its slot in `table`.  the dummy is registered at
    # slot 0 so that a real record with the same contents reuses it
    # instead of being stored a second time.
    cache = {dummy: 0}
    index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if not record:
            # no data for this character; it keeps pointing at slot 0
            continue

        # extract database properties; names are stored as their
        # position in the corresponding name list
        category = CATEGORY_NAMES.index(record[2])
        combining = int(record[3])
        bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
        mirrored = record[9] == "Y"
        if record[5]:
            # emitted as a C string literal
            decomposition = '"%s"' % record[5]
        else:
            decomposition = "NULL"
        item = (category, combining, bidirectional, mirrored, decomposition)

        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table (done before the output file is opened, so a
    # failure here does not leave a truncated header behind)
    index1, index2, shift = splitbins(index)

    FILE = "unicodedata_db.h"
    fp = open(FILE, "w")
    try:
        print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
        print >>fp
        print >>fp, "/* a list of unique database records */"
        print >>fp, "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
        for item in table:
            print >>fp, " {%d, %d, %d, %d, %s}," % item
        print >>fp, "};"
        print >>fp

        print >>fp, "/* string literals */"
        print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
        for name in CATEGORY_NAMES:
            print >>fp, " \"%s\"," % name
        print >>fp, " NULL"
        print >>fp, "};"
        print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
        for name in BIDIRECTIONAL_NAMES:
            print >>fp, " \"%s\"," % name
        print >>fp, " NULL"
        print >>fp, "};"

        print >>fp, "/* index tables used to find the right database record */"
        print >>fp, "#define SHIFT", shift
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        # always release the output file, even if writing failed
        fp.close()
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB
# load a unicode-data file from disk
import string, sys
class UnicodeData:
    """Records of a UnicodeData-style text file, indexed by code point.

    Each non-blank line of the file is split on ";" and stored under
    the code point given (in hexadecimal) by its first field.

    Public attributes:
      filename -- the path the data was loaded from
      table    -- list of 65536 entries; table[code] is the list of
                  field strings for that code point, or None if the
                  file has no line for it
      chars    -- the code points to process: range(65536) by default,
                  range(256) after uselatin1()
    """

    def __init__(self, filename):
        """Load `filename` into memory.

        Raises ValueError if the first field of a line is not a
        hexadecimal number, and IndexError if it is 65536 or larger
        (the table only has room for 65536 code points).
        """
        table = [None] * 65536
        data_file = open(filename)
        try:
            for line in data_file.readlines():
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. at the end of the file)
                    continue
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            data_file.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (codes 0-255)."""
        self.chars = range(256)
# stuff to deal with arrays of unsigned integers
class Array:
    """A named sequence of unsigned integers that can be written as a C array."""

    def __init__(self, name, data):
        """Remember the C identifier `name` and the integer sequence `data`."""
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to `file` as a static C array definition.

        The element type is the narrowest of unsigned char / short /
        int that getsize() reports for the data.  Values are written
        comma-separated, starting a new line whenever the current one
        would grow beyond 78 characters.  `file` only needs a write()
        method.

        Empty data produces an array with an empty initializer and
        element type unsigned char.
        """
        # getsize() inspects the largest value, so only ask it when
        # there is at least one value to look at
        if self.data:
            size = getsize(self.data)
        else:
            size = 1
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")

        if self.data:
            line = " "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    # current line is full; flush it and start another
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            # flush the last, partially filled line
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
def getsize(data):
    """Return the smallest integer size, in bytes, that fits every item.

    The result is 1 if the largest value in `data` is below 256, 2 if
    it is below 65536, and 4 otherwise.  Empty data needs no storage
    at all and reports the smallest size, 1.
    """
    if not data:
        # max() cannot handle an empty sequence
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
def splitbins(bins):
    """Split a sparse integer table into a two-level lookup table.

    Returns (t1, t2, shift) such that, for every position `char` in
    `bins`, with mask = 2**shift - 1:

        value = t2[(t1[char>>shift]<<shift)+(char&mask)]

    and value == 0 means no data (None entries in `bins` are stored
    as 0).  The table is cut into chunks of 2**shift items; identical
    chunks are stored only once in t2, and t1 records which stored
    chunk each original chunk corresponds to.

    Every shift from 0 to 15 is tried and the one giving the smallest
    combined size (item count times getsize() for each table) is
    returned; on a tie the smallest shift wins.  An empty `bins`
    yields ([], [], 0).
    """
    if not bins:
        # nothing to split
        return [], [], 0

    best = None
    best_bytes = None
    for shift in range(16):
        size = 2**shift
        bin1 = []
        bin2 = []
        bincache = {}  # chunk contents -> offset of that chunk in bin2
        for start in range(0, len(bins), size):
            chunk = tuple(bins[start:start+size])
            offset = bincache.get(chunk)
            if offset is None:
                # first time this chunk is seen; store it at the end
                offset = len(bin2)
                bincache[chunk] = offset
                for v in chunk:
                    if v is None:
                        bin2.append(0)
                    else:
                        bin2.append(v)
            # offsets are multiples of the chunk size, so only the
            # chunk number needs to be stored
            bin1.append(offset >> shift)
        # determine memory size
        nbytes = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
        if best is None or nbytes < best_bytes:
            best = shift, bin1, bin2
            best_bytes = nbytes
    shift, bin1, bin2 = best
    return bin1, bin2, shift
# generate the header when this file is run as a script
if __name__ == "__main__":
    maketable()
|