diff options
author | Tim Peters <tim.peters@gmail.com> | 2000-09-25 07:13:41 (GMT) |
---|---|---|
committer | Tim Peters <tim.peters@gmail.com> | 2000-09-25 07:13:41 (GMT) |
commit | 2101348830ff0d65cebd4caf886011f45bcc7618 (patch) | |
tree | 5b9d52a52269ed9f16c2f0c738a05ef76e2e19ba /Tools/unicode | |
parent | 68ded6e6f148ef0dc95418be7a9bb61385942362 (diff) | |
download | cpython-2101348830ff0d65cebd4caf886011f45bcc7618.zip cpython-2101348830ff0d65cebd4caf886011f45bcc7618.tar.gz cpython-2101348830ff0d65cebd4caf886011f45bcc7618.tar.bz2 |
Fiddled w/ /F's cool new splitbins function: documented it, generalized it
a bit, sped it a lot primarily by removing the unused assumption that None was
a legit bin entry (the function doesn't really need to assume that there's
anything special about 0), added an optional "trace" argument, and in __debug__
mode added exhaustive verification that the decomposition is both correct and
doesn't overstep any array bounds (which wasn't obvious to me from staring at the
generated C code -- now I feel safe!). Did not commit a new unicodedata_db.h, as
the one produced by this version is identical to the one already checked in.
Diffstat (limited to 'Tools/unicode')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 80 |
1 files changed, 54 insertions, 26 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index c36fadf..f2e6dc8 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -165,38 +165,66 @@ def getsize(data): else: return 4 -def splitbins(bins): - # split a sparse integer table into two tables, such as: - # value = t2[(t1[char>>shift]<<shift)+(char&mask)] - # and value == 0 means no data - bytes = sys.maxint - for shift in range(16): - bin1 = [] - bin2 = [] +def splitbins(t, trace=0): + """t, trace=0 -> (t1, t2, shift). Split a table to save space. + + t is a sequence of ints. This function can be useful to save space if + many of the ints are the same. t1 and t2 are lists of ints, and shift + is an int, chosen to minimize the combined size of t1 and t2 (in C + code), and where for each i in range(len(t)), + t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + where mask is a bitmask isolating the last "shift" bits. + + If optional arg trace is true (default false), progress info is + printed to sys.stderr. + """ + + import sys + if trace: + def dump(t1, t2, shift, bytes): + print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % ( + len(t1), len(t2), shift, bytes) + print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \ + "bytes" + n = len(t)-1 # last valid index + maxshift = 0 # the most we can shift n and still have something left + if n > 0: + while n >> 1: + n >>= 1 + maxshift += 1 + del n + bytes = sys.maxint # smallest total size so far + t = tuple(t) # so slices can be dict keys + for shift in range(maxshift + 1): + t1 = [] + t2 = [] size = 2**shift bincache = {} - for i in range(0, len(bins), size): - bin = bins[i:i+size] - index = bincache.get(tuple(bin)) + for i in range(0, len(t), size): + bin = t[i:i+size] + index = bincache.get(bin) if index is None: - index = len(bin2) - bincache[tuple(bin)] = index - for v in bin: - if v is None: - bin2.append(0) - else: - bin2.append(v) - bin1.append(index>>shift) + index = len(t2) + bincache[bin] = index + t2.extend(bin) + t1.append(index >> shift) # determine memory size - b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2) + b = len(t1)*getsize(t1) + len(t2)*getsize(t2) + if trace: + dump(t1, t2, shift, b) if b < bytes: - best = shift, bin1, bin2 + best = t1, t2, shift bytes = b - shift, bin1, bin2 = best -## print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % ( -## len(bin1), len(bin2), shift, bytes -## ) - return bin1, bin2, shift + t1, t2, shift = best + if trace: + print >>sys.stderr, "Best:", + dump(t1, t2, shift, bytes) + if __debug__: + # exhaustively verify that the decomposition is correct + mask = ~((~0) << shift) # i.e., low-bit mask of shift bits + for i in xrange(len(t)): + assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + return best if __name__ == "__main__": maketable() |