From c5694c8bf4bf2008b42e0107fb245415df4147fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lemburg?= Date: Fri, 21 Oct 2005 13:45:17 +0000 Subject: Moved gencodec.py to the Tools/unicode/ directory. Added new support for decoding tables. Cleaned up the implementation a bit. --- Tools/scripts/gencodec.py | 300 ----------------------------------- Tools/unicode/gencodec.py | 391 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 391 insertions(+), 300 deletions(-) delete mode 100644 Tools/scripts/gencodec.py create mode 100644 Tools/unicode/gencodec.py diff --git a/Tools/scripts/gencodec.py b/Tools/scripts/gencodec.py deleted file mode 100644 index 75337d6..0000000 --- a/Tools/scripts/gencodec.py +++ /dev/null @@ -1,300 +0,0 @@ -""" Unicode Mapping Parser and Codec Generator. - -This script parses Unicode mapping files as available from the Unicode -site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec -modules from them. The codecs use the standard character mapping codec -to actually apply the mapping. - -Synopsis: gencodec.py dir codec_prefix - -All files in dir are scanned and those producing non-empty mappings -will be written to <codec_prefix><mapname>.py with <mapname> being the -first part of the map's filename ('a' in a.b.c.txt) converted to -lowercase with hyphens replaced by underscores. - -The tool also writes marshalled versions of the mapping tables to the -same location (with .mapping extension). - -Written by Marc-Andre Lemburg (mal@lemburg.com). - -(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -(c) Copyright Guido van Rossum, 2000. - -"""#" - -import re,os,time,marshal - -# Create numeric tables or character based ones ? -numeric = 1 - -mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' - '\s+' - '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)' - '\s*' - '(#.+)?') - -def parsecodes(codes, - len=len, filter=filter,range=range): - - """ Converts code combinations to either a single code integer - or a tuple of integers. 
- - meta-codes (in angular brackets, e.g. <LR> and <RL>) are - ignored. - - Empty codes or illegal ones are returned as None. - - """ - if not codes: - return None - l = codes.split('+') - if len(l) == 1: - return int(l[0],16) - for i in range(len(l)): - try: - l[i] = int(l[i],16) - except ValueError: - l[i] = None - l = filter(lambda x: x is not None, l) - if len(l) == 1: - return l[0] - else: - return tuple(l) - -def readmap(filename): - - f = open(filename,'r') - lines = f.readlines() - f.close() - enc2uni = {} - identity = [] - unmapped = range(256) - for i in range(256): - unmapped[i] = i - for line in lines: - line = line.strip() - if not line or line[0] == '#': - continue - m = mapRE.match(line) - if not m: - #print '* not matched: %s' % repr(line) - continue - enc,uni,comment = m.groups() - enc = parsecodes(enc) - uni = parsecodes(uni) - if not comment: - comment = '' - else: - comment = comment[1:] - if enc < 256: - unmapped.remove(enc) - if enc == uni: - identity.append(enc) - else: - enc2uni[enc] = (uni,comment) - else: - enc2uni[enc] = (uni,comment) - # If there are more identity-mapped entries than unmapped entries, - # it pays to generate an identity dictionary first, and add explicit - # mappings to None for the rest - if len(identity)>=len(unmapped): - for enc in unmapped: - enc2uni[enc] = (None, "") - enc2uni['IDENTITY'] = 256 - - return enc2uni - -def hexrepr(t): - - if t is None: - return 'None' - try: - len(t) - except: - return '0x%04x' % t - return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')' - -def unicoderepr(t): - - if t is None: - return 'None' - if numeric: - return hexrepr(t) - else: - try: - len(t) - except: - return repr(unichr(t)) - return repr(''.join(map(unichr, t))) - -def keyrepr(t): - - if t is None: - return 'None' - if numeric: - return hexrepr(t) - else: - try: - len(t) - except: - if t < 256: - return repr(chr(t)) - else: - return repr(unichr(t)) - return repr(''.join(map(chr, t))) - -def codegen(name,map,comments=1): - - """ 
Returns Python source for the given map. - - Comments are included in the source, if comments is true (default). - - """ - l = [ - '''\ -""" Python Character Mapping Codec generated from '%s' with gencodec.py. - -"""#" - -import codecs - -### Codec APIs - -class Codec(codecs.Codec): - - def encode(self,input,errors='strict'): - - return codecs.charmap_encode(input,errors,encoding_map) - - def decode(self,input,errors='strict'): - - return codecs.charmap_decode(input,errors,decoding_map) - -class StreamWriter(Codec,codecs.StreamWriter): - pass - -class StreamReader(Codec,codecs.StreamReader): - pass - -### encodings module API - -def getregentry(): - - return (Codec().encode,Codec().decode,StreamReader,StreamWriter) - -### Decoding Map -''' % name, - ] - - if map.has_key("IDENTITY"): - l.append("decoding_map = codecs.make_identity_dict(range(%d))" - % map["IDENTITY"]) - l.append("decoding_map.update({") - splits = 1 - del map["IDENTITY"] - else: - l.append("decoding_map = {") - splits = 0 - - mappings = map.items() - mappings.sort() - append = l.append - i = 0 - for e,value in mappings: - try: - (u,c) = value - except TypeError: - u = value - c = '' - key = keyrepr(e) - if c and comments: - append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c)) - else: - append('\t%s: %s,' % (key,unicoderepr(u))) - i += 1 - if i == 4096: - # Split the definition into parts to that the Python - # parser doesn't dump core - if splits == 0: - append('}') - else: - append('})') - append('decoding_map.update({') - i = 0 - splits = splits + 1 - if splits == 0: - append('}') - else: - append('})') - append(''' -### Encoding Map - -encoding_map = codecs.make_encoding_map(decoding_map) -''') - return '\n'.join(l) - -def pymap(name,map,pyfile,comments=1): - - code = codegen(name,map,comments) - f = open(pyfile,'w') - f.write(code) - f.close() - -def marshalmap(name,map,marshalfile): - - d = {} - for e,(u,c) in map.items(): - d[e] = (u,c) - f = open(marshalfile,'wb') - marshal.dump(d,f) - 
f.close() - -def convertdir(dir,prefix='',comments=1): - - mapnames = os.listdir(dir) - for mapname in mapnames: - name = os.path.split(mapname)[1] - name = name.replace('-','_') - name = name.split('.')[0] - name = name.lower() - codefile = name + '.py' - marshalfile = name + '.mapping' - print 'converting %s to %s and %s' % (mapname, - prefix + codefile, - prefix + marshalfile) - try: - map = readmap(os.path.join(dir,mapname)) - if not map: - print '* map is empty; skipping' - else: - pymap(mapname, map, prefix + codefile,comments) - marshalmap(mapname, map, prefix + marshalfile) - except ValueError: - print '* conversion failed' - -def rewritepythondir(dir,prefix='',comments=1): - - mapnames = os.listdir(dir) - for mapname in mapnames: - if not mapname.endswith('.mapping'): - continue - codefile = mapname[:-len('.mapping')] + '.py' - print 'converting %s to %s' % (mapname, - prefix + codefile) - try: - map = marshal.load(open(os.path.join(dir,mapname), - 'rb')) - if not map: - print '* map is empty; skipping' - else: - pymap(mapname, map, prefix + codefile,comments) - except ValueError, why: - print '* conversion failed: %s' % why - -if __name__ == '__main__': - - import sys - if 1: - apply(convertdir,tuple(sys.argv[1:])) - else: - apply(rewritepythondir,tuple(sys.argv[1:])) diff --git a/Tools/unicode/gencodec.py b/Tools/unicode/gencodec.py new file mode 100644 index 0000000..7bce3d5 --- /dev/null +++ b/Tools/unicode/gencodec.py @@ -0,0 +1,391 @@ +""" Unicode Mapping Parser and Codec Generator. + +This script parses Unicode mapping files as available from the Unicode +site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec +modules from them. The codecs use the standard character mapping codec +to actually apply the mapping. 
+ +Synopsis: gencodec.py dir codec_prefix + +All files in dir are scanned and those producing non-empty mappings +will be written to <codec_prefix><mapname>.py with <mapname> being the +first part of the map's filename ('a' in a.b.c.txt) converted to +lowercase with hyphens replaced by underscores. + +The tool also writes marshalled versions of the mapping tables to the +same location (with .mapping extension). + +Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate +Unicode table maps for decoding. + +(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. +(c) Copyright Guido van Rossum, 2000. +(c) Copyright Marc-Andre Lemburg, 2005. + +"""#" + +import re, os, time, marshal, codecs + +# Maximum allowed size of charmap tables +MAX_TABLE_SIZE = 8192 + +# Standard undefined Unicode code point +UNI_UNDEFINED = unichr(0xFFFE) + +mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' + '\s+' + '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)' + '\s*' + '(#.+)?') + +def parsecodes(codes, + len=len, filter=filter,range=range): + + """ Converts code combinations to either a single code integer + or a tuple of integers. + + meta-codes (in angular brackets, e.g. <LR> and <RL>) are + ignored. + + Empty codes or illegal ones are returned as None. 
+ + """ + if not codes: + return None + l = codes.split('+') + if len(l) == 1: + return int(l[0],16) + for i in range(len(l)): + try: + l[i] = int(l[i],16) + except ValueError: + l[i] = None + l = filter(lambda x: x is not None, l) + if len(l) == 1: + return l[0] + else: + return tuple(l) + +def readmap(filename): + + f = open(filename,'r') + lines = f.readlines() + f.close() + enc2uni = {} + identity = [] + unmapped = range(256) + + # UTC mapping tables per convention don't include the identity + # mappings for code points 0x00 - 0x1F and 0x7F, unless these are + # explicitly mapped to different characters or undefined + for i in range(32) + [127]: + identity.append(i) + unmapped.remove(i) + enc2uni[i] = (i, 'CONTROL CHARACTER') + + for line in lines: + line = line.strip() + if not line or line[0] == '#': + continue + m = mapRE.match(line) + if not m: + #print '* not matched: %s' % repr(line) + continue + enc,uni,comment = m.groups() + enc = parsecodes(enc) + uni = parsecodes(uni) + if comment is None: + comment = '' + else: + comment = comment[1:].strip() + if enc < 256: + if enc in unmapped: + unmapped.remove(enc) + if enc == uni: + identity.append(enc) + enc2uni[enc] = (uni,comment) + else: + enc2uni[enc] = (uni,comment) + + # If there are more identity-mapped entries than unmapped entries, + # it pays to generate an identity dictionary first, and add explicit + # mappings to None for the rest + if len(identity) >= len(unmapped): + for enc in unmapped: + enc2uni[enc] = (None, "") + enc2uni['IDENTITY'] = 256 + + return enc2uni + +def hexrepr(t): + + if t is None: + return 'None' + try: + len(t) + except: + return '0x%04x' % t + try: + return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')' + except TypeError, why: + print '* failed to convert %r: %s' % (t, why) + raise + +def python_mapdef_code(varname, map, comments=1): + + l = [] + append = l.append + if map.has_key("IDENTITY"): + append("%s = codecs.make_identity_dict(range(%d))" % + (varname, 
map["IDENTITY"])) + append("%s.update({" % varname) + splits = 1 + del map["IDENTITY"] + identity = 1 + else: + append("%s = {" % varname) + splits = 0 + identity = 0 + + mappings = map.items() + mappings.sort() + i = 0 + for mapkey, mapvalue in mappings: + mapcomment = '' + if isinstance(mapkey, tuple): + (mapkey, mapcomment) = mapkey + if isinstance(mapvalue, tuple): + (mapvalue, mapcomment) = mapvalue + if mapkey is None: + continue + if (identity and + mapkey == mapvalue and + mapkey < 256): + # No need to include identity mappings, since these + # are already set for the first 256 code points. + continue + key = hexrepr(mapkey) + value = hexrepr(mapvalue) + if mapcomment and comments: + append(' %s: %s,\t# %s' % (key, value, mapcomment)) + else: + append(' %s: %s,' % (key, value)) + i += 1 + if i == 4096: + # Split the definition into parts to that the Python + # parser doesn't dump core + if splits == 0: + append('}') + else: + append('})') + append('%s.update({' % varname) + i = 0 + splits = splits + 1 + if splits == 0: + append('}') + else: + append('})') + + return l + +def python_tabledef_code(varname, map, comments=1): + + l = [] + append = l.append + append('%s = (' % varname) + + # Analyze map and create table dict + mappings = map.items() + mappings.sort() + table = {} + maxkey = 0 + if map.has_key('IDENTITY'): + for key in range(256): + table[key] = (key, '') + maxkey = 255 + del map['IDENTITY'] + for mapkey, mapvalue in mappings: + mapcomment = '' + if isinstance(mapkey, tuple): + (mapkey, mapcomment) = mapkey + if isinstance(mapvalue, tuple): + (mapvalue, mapcomment) = mapvalue + if mapkey is None: + continue + table[mapkey] = (mapvalue, mapcomment) + if mapkey > maxkey: + maxkey = mapkey + if maxkey > MAX_TABLE_SIZE: + # Table too large + return None + + # Create table code + for key in range(maxkey + 1): + if key not in table: + mapvalue = None + mapcomment = 'UNDEFINED' + else: + mapvalue, mapcomment = table[key] + if mapvalue is None: + mapchar 
= UNI_UNDEFINED + else: + if isinstance(mapvalue, tuple): + # 1-n mappings not supported + return None + else: + mapchar = unichr(mapvalue) + if mapcomment and comments: + append(' %r\t# %s -> %s' % (mapchar, + hexrepr(key), + mapcomment)) + else: + append(' %r' % mapchar) + + append(')') + return l + +def codegen(name, map, comments=1): + + """ Returns Python source for the given map. + + Comments are included in the source, if comments is true (default). + + """ + # Generate code + decoding_map_code = python_mapdef_code( + 'decoding_map', + map, + comments=comments) + decoding_table_code = python_tabledef_code( + 'decoding_table', + map, + comments=comments) + encoding_map_code = python_mapdef_code( + 'encoding_map', + codecs.make_encoding_map(map), + comments=comments) + + l = [ + '''\ +""" Python Character Mapping Codec generated from '%s' with gencodec.py. + +"""#" + +import codecs + +### Codec APIs + +class Codec(codecs.Codec): + + def encode(self,input,errors='strict'): + + return codecs.charmap_encode(input,errors,encoding_map) + + def decode(self,input,errors='strict'): +''' % name + ] + if decoding_table_code: + l.append('''\ + return codecs.charmap_decode(input,errors,decoding_table)''') + else: + l.append('''\ + return codecs.charmap_decode(input,errors,decoding_map)''') + + l.append(''' +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +### Decoding Map +''') + l.extend(decoding_map_code) + + # Add optional decoding table + if decoding_table_code: + l.append(''' +### Decoding Table +''') + l.extend(decoding_table_code) + + l.append(''' +### Encoding Map +''') + l.extend(encoding_map_code) + + return '\n'.join(l) + +def pymap(name,map,pyfile,comments=1): + + code = codegen(name,map,comments) + f = open(pyfile,'w') + f.write(code) + f.close() + +def 
marshalmap(name,map,marshalfile): + + d = {} + for e,(u,c) in map.items(): + d[e] = (u,c) + f = open(marshalfile,'wb') + marshal.dump(d,f) + f.close() + +def convertdir(dir,prefix='',comments=1): + + mapnames = os.listdir(dir) + for mapname in mapnames: + mappathname = os.path.join(dir, mapname) + name = os.path.split(mapname)[1] + name = name.replace('-','_') + name = name.split('.')[0] + name = name.lower() + codefile = name + '.py' + marshalfile = name + '.mapping' + print 'converting %s to %s and %s' % (mapname, + prefix + codefile, + prefix + marshalfile) + try: + map = readmap(os.path.join(dir,mapname)) + if not map: + print '* map is empty; skipping' + else: + pymap(mappathname, map, prefix + codefile,comments) + marshalmap(mappathname, map, prefix + marshalfile) + except ValueError, why: + print '* conversion failed: %s' % why + raise + +def rewritepythondir(dir,prefix='',comments=1): + + mapnames = os.listdir(dir) + for mapname in mapnames: + if not mapname.endswith('.mapping'): + continue + codefile = mapname[:-len('.mapping')] + '.py' + print 'converting %s to %s' % (mapname, + prefix + codefile) + try: + map = marshal.load(open(os.path.join(dir,mapname), + 'rb')) + if not map: + print '* map is empty; skipping' + else: + pymap(mapname, map, prefix + codefile,comments) + except ValueError, why: + print '* conversion failed: %s' % why + +if __name__ == '__main__': + + import sys + if 1: + apply(convertdir,tuple(sys.argv[1:])) + else: + apply(rewritepythondir,tuple(sys.argv[1:])) -- cgit v0.12