diff options
author | Marc-André Lemburg <mal@egenix.com> | 2005-10-21 13:45:17 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2005-10-21 13:45:17 (GMT) |
commit | c5694c8bf4bf2008b42e0107fb245415df4147fd (patch) | |
tree | 8c5ddc2a102cd42329da26805f232f09d3302a2d /Tools | |
parent | 31441302171fe882976bcc05f5ded9645cd690af (diff) | |
download | cpython-c5694c8bf4bf2008b42e0107fb245415df4147fd.zip cpython-c5694c8bf4bf2008b42e0107fb245415df4147fd.tar.gz cpython-c5694c8bf4bf2008b42e0107fb245415df4147fd.tar.bz2 |
Moved gencodec.py to the Tools/unicode/ directory.
Added new support for decoding tables.
Cleaned up the implementation a bit.
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/unicode/gencodec.py (renamed from Tools/scripts/gencodec.py) | 267 |
1 files changed, 179 insertions, 88 deletions
diff --git a/Tools/scripts/gencodec.py b/Tools/unicode/gencodec.py index 75337d6..7bce3d5 100644 --- a/Tools/scripts/gencodec.py +++ b/Tools/unicode/gencodec.py @@ -15,17 +15,22 @@ lowercase with hyphens replaced by underscores. The tool also writes marshalled versions of the mapping tables to the same location (with .mapping extension). -Written by Marc-Andre Lemburg (mal@lemburg.com). +Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate +Unicode table maps for decoding. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. (c) Copyright Guido van Rossum, 2000. +(c) Copyright Marc-Andre Lemburg, 2005. """#" -import re,os,time,marshal +import re, os, time, marshal, codecs -# Create numeric tables or character based ones ? -numeric = 1 +# Maximum allowed size of charmap tables +MAX_TABLE_SIZE = 8192 + +# Standard undefined Unicode code point +UNI_UNDEFINED = unichr(0xFFFE) mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' '\s+' @@ -69,8 +74,15 @@ def readmap(filename): enc2uni = {} identity = [] unmapped = range(256) - for i in range(256): - unmapped[i] = i + + # UTC mapping tables per convention don't include the identity + # mappings for code points 0x00 - 0x1F and 0x7F, unless these are + # explicitly mapped to different characters or undefined + for i in range(32) + [127]: + identity.append(i) + unmapped.remove(i) + enc2uni[i] = (i, 'CONTROL CHARACTER') + for line in lines: line = line.strip() if not line or line[0] == '#': @@ -82,22 +94,23 @@ def readmap(filename): enc,uni,comment = m.groups() enc = parsecodes(enc) uni = parsecodes(uni) - if not comment: + if comment is None: comment = '' else: - comment = comment[1:] + comment = comment[1:].strip() if enc < 256: - unmapped.remove(enc) + if enc in unmapped: + unmapped.remove(enc) if enc == uni: identity.append(enc) - else: - enc2uni[enc] = (uni,comment) + enc2uni[enc] = (uni,comment) else: enc2uni[enc] = (uni,comment) + # If there are more identity-mapped entries than unmapped entries, # it pays to generate an identity dictionary first, and add explicit # mappings to None for the rest - if len(identity)>=len(unmapped): + if len(identity) >= len(unmapped): for enc in unmapped: enc2uni[enc] = (None, "") enc2uni['IDENTITY'] = 256 @@ -112,44 +125,146 @@ def hexrepr(t): len(t) except: return '0x%04x' % t - return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')' + try: + return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')' + except TypeError, why: + print '* failed to convert %r: %s' % (t, why) + raise -def unicoderepr(t): +def python_mapdef_code(varname, map, comments=1): - if t is None: - return 'None' - if numeric: - return hexrepr(t) + l = [] + append = l.append + if map.has_key("IDENTITY"): + append("%s = codecs.make_identity_dict(range(%d))" % + (varname, map["IDENTITY"])) + append("%s.update({" % varname) + splits = 1 + del map["IDENTITY"] + identity = 1 else: - try: - len(t) - except: - return repr(unichr(t)) - return repr(''.join(map(unichr, t))) - -def keyrepr(t): + append("%s = {" % varname) + splits = 0 + identity = 0 - if t is None: - return 'None' - if numeric: - return hexrepr(t) + mappings = map.items() + mappings.sort() + i = 0 + for mapkey, mapvalue in mappings: + mapcomment = '' + if isinstance(mapkey, tuple): + (mapkey, mapcomment) = mapkey + if isinstance(mapvalue, tuple): + (mapvalue, mapcomment) = mapvalue + if mapkey is None: + continue + if (identity and + mapkey == mapvalue and + mapkey < 256): + # No need to include identity mappings, since these + # are already set for the first 256 code points. + continue + key = hexrepr(mapkey) + value = hexrepr(mapvalue) + if mapcomment and comments: + append(' %s: %s,\t# %s' % (key, value, mapcomment)) + else: + append(' %s: %s,' % (key, value)) + i += 1 + if i == 4096: + # Split the definition into parts to that the Python + # parser doesn't dump core + if splits == 0: + append('}') + else: + append('})') + append('%s.update({' % varname) + i = 0 + splits = splits + 1 + if splits == 0: + append('}') else: - try: - len(t) - except: - if t < 256: - return repr(chr(t)) + append('})') + + return l + +def python_tabledef_code(varname, map, comments=1): + + l = [] + append = l.append + append('%s = (' % varname) + + # Analyze map and create table dict + mappings = map.items() + mappings.sort() + table = {} + maxkey = 0 + if map.has_key('IDENTITY'): + for key in range(256): + table[key] = (key, '') + maxkey = 255 + del map['IDENTITY'] + for mapkey, mapvalue in mappings: + mapcomment = '' + if isinstance(mapkey, tuple): + (mapkey, mapcomment) = mapkey + if isinstance(mapvalue, tuple): + (mapvalue, mapcomment) = mapvalue + if mapkey is None: + continue + table[mapkey] = (mapvalue, mapcomment) + if mapkey > maxkey: + maxkey = mapkey + if maxkey > MAX_TABLE_SIZE: + # Table too large + return None + + # Create table code + for key in range(maxkey + 1): + if key not in table: + mapvalue = None + mapcomment = 'UNDEFINED' + else: + mapvalue, mapcomment = table[key] + if mapvalue is None: + mapchar = UNI_UNDEFINED + else: + if isinstance(mapvalue, tuple): + # 1-n mappings not supported + return None else: - return repr(unichr(t)) - return repr(''.join(map(chr, t))) + mapchar = unichr(mapvalue) + if mapcomment and comments: + append(' %r\t# %s -> %s' % (mapchar, + hexrepr(key), + mapcomment)) + else: + append(' %r' % mapchar) -def codegen(name,map,comments=1): + append(')') + return l + +def codegen(name, map, comments=1): """ Returns Python source for the given map. Comments are included in the source, if comments is true (default). """ + # Generate code + decoding_map_code = python_mapdef_code( + 'decoding_map', + map, + comments=comments) + decoding_table_code = python_tabledef_code( + 'decoding_table', + map, + comments=comments) + encoding_map_code = python_mapdef_code( + 'encoding_map', + codecs.make_encoding_map(map), + comments=comments) + l = [ '''\ """ Python Character Mapping Codec generated from '%s' with gencodec.py. @@ -167,9 +282,16 @@ class Codec(codecs.Codec): return codecs.charmap_encode(input,errors,encoding_map) def decode(self,input,errors='strict'): - - return codecs.charmap_decode(input,errors,decoding_map) - +''' % name + ] + if decoding_table_code: + l.append('''\ + return codecs.charmap_decode(input,errors,decoding_table)''') + else: + l.append('''\ + return codecs.charmap_decode(input,errors,decoding_map)''') + + l.append(''' class StreamWriter(Codec,codecs.StreamWriter): pass @@ -183,54 +305,21 @@ def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) ### Decoding Map -''' % name, - ] +''') + l.extend(decoding_map_code) - if map.has_key("IDENTITY"): - l.append("decoding_map = codecs.make_identity_dict(range(%d))" - % map["IDENTITY"]) - l.append("decoding_map.update({") - splits = 1 - del map["IDENTITY"] - else: - l.append("decoding_map = {") - splits = 0 + # Add optional decoding table + if decoding_table_code: + l.append(''' +### Decoding Table +''') + l.extend(decoding_table_code) - mappings = map.items() - mappings.sort() - append = l.append - i = 0 - for e,value in mappings: - try: - (u,c) = value - except TypeError: - u = value - c = '' - key = keyrepr(e) - if c and comments: - append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c)) - else: - append('\t%s: %s,' % (key,unicoderepr(u))) - i += 1 - if i == 4096: - # Split the definition into parts to that the Python - # parser doesn't dump core - if splits == 0: - append('}') - else: - append('})') - append('decoding_map.update({') - i = 0 - splits = splits + 1 - if splits == 0: - append('}') - else: - append('})') - append(''' + l.append(''' ### Encoding Map - -encoding_map = codecs.make_encoding_map(decoding_map) ''') + l.extend(encoding_map_code) + return '\n'.join(l) def pymap(name,map,pyfile,comments=1): @@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1): mapnames = os.listdir(dir) for mapname in mapnames: + mappathname = os.path.join(dir, mapname) name = os.path.split(mapname)[1] name = name.replace('-','_') name = name.split('.')[0] @@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1): if not map: print '* map is empty; skipping' else: - pymap(mapname, map, prefix + codefile,comments) - marshalmap(mapname, map, prefix + marshalfile) - except ValueError: - print '* conversion failed' + pymap(mappathname, map, prefix + codefile,comments) + marshalmap(mappathname, map, prefix + marshalfile) + except ValueError, why: + print '* conversion failed: %s' % why + raise def rewritepythondir(dir,prefix='',comments=1): |