diff options
Diffstat (limited to 'Tools/scripts/gencodec.py')
-rw-r--r-- | Tools/scripts/gencodec.py | 50 |
1 files changed, 39 insertions, 11 deletions
diff --git a/Tools/scripts/gencodec.py b/Tools/scripts/gencodec.py index 45b69b0..39b42ff 100644 --- a/Tools/scripts/gencodec.py +++ b/Tools/scripts/gencodec.py @@ -1,9 +1,9 @@ """ Unicode Mapping Parser and Codec Generator. This script parses Unicode mapping files as available from the Unicode -site (ftp.unicode.org) and creates Python codec modules from them. The -codecs use the standard character mapping codec to actually apply the -mapping. +site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec +modules from them. The codecs use the standard character mapping codec +to actually apply the mapping. Synopsis: gencodec.py dir codec_prefix @@ -18,6 +18,7 @@ same location (with .mapping extension). Written by Marc-Andre Lemburg (mal@lemburg.com). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. +(c) Copyright Guido van Rossum, 2000. """#" @@ -70,6 +71,10 @@ def readmap(filename, lines = f.readlines() f.close() enc2uni = {} + identity = [] + unmapped = range(256) + for i in range(256): + unmapped[i] = i for line in lines: line = strip(line) if not line or line[0] == '#': @@ -85,8 +90,22 @@ def readmap(filename, comment = '' else: comment = comment[1:] - if enc != uni: + if enc < 256: + unmapped.remove(enc) + if enc == uni: + identity.append(enc) + else: + enc2uni[enc] = (uni,comment) + else: enc2uni[enc] = (uni,comment) + # If there are more identity-mapped entries than unmapped entries, + # it pays to generate an identity dictionary first, add add explicit + # mappings to None for the rest + if len(identity)>=len(unmapped): + for enc in unmapped: + enc2uni[enc] = (None, "") + enc2uni['IDENTITY'] = 256 + return enc2uni def hexrepr(t, @@ -143,11 +162,12 @@ def codegen(name,map,comments=1): """ l = [ '''\ -""" Python Character Mapping Codec generated from '%s'. +""" Python Character Mapping Codec generated from '%s' with gencodec.py. Written by Marc-Andre Lemburg (mal@lemburg.com). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. +(c) Copyright 2000 Guido van Rossum. """#" @@ -178,15 +198,23 @@ def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) ### Decoding Map - -decoding_map = { ''' % name, ] + + if map.has_key("IDENTITY"): + l.append("decoding_map = codecs.make_identity_dict(range(%d))" + % map["IDENTITY"]) + l.append("decoding_map.update({") + splits = 1 + del map["IDENTITY"] + else: + l.append("decoding_map = {") + splits = 0 + mappings = map.items() mappings.sort() append = l.append i = 0 - splits = 0 for e,value in mappings: try: (u,c) = value @@ -198,7 +226,7 @@ decoding_map = { append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c)) else: append('\t%s: %s,' % (key,unicoderepr(u))) - i = i + 1 + i += 1 if i == 4096: # Split the definition into parts to that the Python # parser doesn't dump core @@ -206,7 +234,7 @@ decoding_map = { append('}') else: append('})') - append('map.update({') + append('decoding_map.update({') i = 0 splits = splits + 1 if splits == 0: @@ -265,7 +293,7 @@ def rewritepythondir(dir,prefix='',comments=1): mapnames = os.listdir(dir) for mapname in mapnames: - if mapname[-len('.mapping'):] != '.mapping': + if not mapname.endswith('.mapping'): continue codefile = mapname[:-len('.mapping')] + '.py' print 'converting %s to %s' % (mapname, |