diff options
author | Guido van Rossum <guido@python.org> | 2000-03-10 22:36:57 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2000-03-10 22:36:57 (GMT) |
commit | 34a79115c5d5be53581f49ced5a5a17171cabb7d (patch) | |
tree | 7a851429c335c7bac81a04013498c39b73e005a0 /Tools/scripts/gencodec.py | |
parent | fd9eed33aae55e57c84c654493c470e4ad78bc82 (diff) | |
download | cpython-34a79115c5d5be53581f49ced5a5a17171cabb7d.zip cpython-34a79115c5d5be53581f49ced5a5a17171cabb7d.tar.gz cpython-34a79115c5d5be53581f49ced5a5a17171cabb7d.tar.bz2 |
Marc-Andre Lemburg: added
gencodec.py - Create Python codecs from Unicode mapping files
Diffstat (limited to 'Tools/scripts/gencodec.py')
-rw-r--r-- | Tools/scripts/gencodec.py | 289 |
1 files changed, 289 insertions, 0 deletions
# Tools/scripts/gencodec.py -- added as a new file by this commit.
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp.unicode.org) and creates Python codec modules from them. The
codecs use the standard character mapping codec to actually apply the
mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import marshal
import os
import re
import string  # unused; retained so the module's namespace stays as before
import time    # unused; retained so the module's namespace stays as before

try:
    unichr
except NameError:
    # Python 3 has no unichr(); chr() covers the whole Unicode range there.
    unichr = chr

# Create numeric tables or character based ones ?
numeric = 1

# Matches one mapping line: <encoded codes> <whitespace> <unicode codes>
# [<whitespace>] [# comment].  Codes are 0x-prefixed hex values joined
# by '+'; the Unicode column may also hold meta-codes such as <LR>.
#
# NOTE: the 'A-Z' in the second group looks like a typo for 'A-F'.  It is
# kept on purpose: match() is unanchored, so narrowing the class would
# make a malformed code such as 0x4G silently parse as 0x4, whereas now
# it is matched whole and rejected with ValueError by parsecodes().
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')


def _join(words, sep):

    """ Joins the strings in words using sep as separator.

        Takes the sequence first and the separator second, so it can be
        used as the default for the join parameters below.

    """
    return sep.join(words)


def _sortkey(code):

    """ Sort key for mapping keys: None first, then single codes, then
        tuples of codes.

        Keeps int and tuple keys sortable together; Python 3 refuses to
        compare them directly.

    """
    if code is None:
        return (0, 0)
    if isinstance(code, tuple):
        return (2, code)
    return (1, code)


def parsecodes(codes,

               split=str.split, atoi=int, len=len,
               filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hex codes joined by '+', e.g.
        '0x81+0x40'.  In a combination, meta-codes (in angular
        brackets, e.g. <LR> and <RL>) and other parts that are not
        valid hex numbers are ignored.

        Empty input is returned as None.  A combination that reduces
        to exactly one code returns that integer; otherwise a tuple
        (possibly empty) is returned.

        Raises ValueError if codes consists of a single part which is
        not a valid hex number.

        The remaining parameters are the helper callables used
        internally; they only exist as keyword defaults.

    """
    if not codes:
        return None
    l = split(codes, '+')
    if len(l) == 1:
        # A lone code is converted unguarded, so an illegal one
        # propagates as ValueError to the caller.
        return atoi(l[0], 16)
    for i in range(len(l)):
        try:
            l[i] = atoi(l[i], 16)
        except ValueError:
            l[i] = None
    # list() because filter() returns an iterator on Python 3.
    l = list(filter(lambda x: x is not None, l))
    if len(l) == 1:
        return l[0]
    else:
        return tuple(l)


def readmap(filename,

            strip=str.strip):

    """ Reads the mapping file filename and returns a dictionary
        mapping encoded codes to (unicode code, comment) tuples.

        Blank lines, lines starting with '#' and lines not matching
        mapRE are skipped.  Codes are converted with parsecodes();
        entries whose encoded and Unicode codes are equal are left
        out.  The comment is stored without its leading '#', or as ''
        if the line has none.

        Raises ValueError if parsecodes() rejects a code.

    """
    # The with block closes the file even if reading fails.
    with open(filename, 'r') as f:
        lines = f.readlines()
    enc2uni = {}
    for line in lines:
        line = strip(line)
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            comment = comment[1:]
        if enc != uni:
            enc2uni[enc] = (uni, comment)
    return enc2uni


def hexrepr(t,

            join=_join):

    """ Returns t as hex literal source text.

        None gives 'None', a single code gives e.g. '0x0041' and a
        sequence of codes gives e.g. '(0x0081, 0x0040)'.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code rather than a sequence.
        return '0x%04x' % t
    return '(' + join(['0x%04x' % code for code in t], ', ') + ')'


def unicoderepr(t,

                join=_join):

    """ Returns source text for the Unicode side of a mapping entry.

        With the module global numeric set, this is hexrepr(t);
        otherwise it is the repr() of the corresponding character or
        string of characters.  None gives 'None'.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    else:
        try:
            len(t)
        except TypeError:
            return repr(unichr(t))
        return repr(join([unichr(code) for code in t], ''))


def keyrepr(t,

            join=_join):

    """ Returns source text for the encoded side (the dictionary key)
        of a mapping entry.

        With the module global numeric set, this is hexrepr(t);
        otherwise single codes below 256 are rendered via chr(),
        larger ones via unichr(), and sequences as a string built
        with chr().  None gives 'None'.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    else:
        try:
            len(t)
        except TypeError:
            if t < 256:
                return repr(chr(t))
            else:
                return repr(unichr(t))
        return repr(join([chr(code) for code in t], ''))


def codegen(name, map, comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        map maps encoded codes to either (unicode code, comment)
        tuples or plain unicode codes.

        Comments are included in the source, if comments is true (default).

    """
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s'.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map

decoding_map = {
''' % name,
        ]
    # Keys are unique, so ordering by key alone orders the entries.
    mappings = sorted(map.items(), key=lambda item: _sortkey(item[0]))
    append = l.append
    i = 0
    splits = 0
    for e, value in mappings:
        try:
            (u, c) = value
        except TypeError:
            # Plain code without an attached comment.
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key, unicoderepr(u), c))
        else:
            append('\t%s: %s,' % (key, unicoderepr(u)))
        i = i + 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            # Must name the generated dictionary: a bare 'map' would be
            # the builtin function, which has no update() method.
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''
### Encoding Map

encoding_map = {}
for k,v in decoding_map.items():
    encoding_map[v] = k
''')
    return '\n'.join(l)


def pymap(name, map, pyfile, comments=1):

    """ Writes the codec source generated by codegen(name, map,
        comments) to the file pyfile.

    """
    code = codegen(name, map, comments)
    with open(pyfile, 'w') as f:
        f.write(code)


def marshalmap(name, map, marshalfile):

    """ Writes map to the file marshalfile using marshal.dump().

        map must map codes to (unicode code, comment) pairs.  name is
        accepted for symmetry with pymap() and is not used.

    """
    d = {}
    for e, (u, c) in map.items():
        d[e] = (u, c)
    with open(marshalfile, 'wb') as f:
        marshal.dump(d, f)


def convertdir(dir, prefix='', comments=1):

    """ Converts every mapping file found in dir.

        For each file a codec module <prefix><name>.py and a
        marshalled table <prefix><name>.mapping are written, where
        <name> is the part of the filename before the first dot,
        lowercased and with hyphens replaced by underscores.  Files
        yielding an empty map are skipped; files raising ValueError
        during conversion are reported and skipped.

        comments is passed on to pymap().

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        name = os.path.split(mapname)[1]
        name = name.replace('-', '_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(os.path.join(dir, mapname))
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile, comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            print('* conversion failed')


def rewritepythondir(dir, prefix='', comments=1):

    """ Regenerates codec modules from the .mapping files in dir.

        Every file in dir ending in '.mapping' is loaded with
        marshal.load() and written as <prefix><basename>.py via
        pymap().  Empty maps are skipped; files that fail to load or
        convert are reported and skipped.

        comments is passed on to pymap().

    """
    suffix = '.mapping'
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith(suffix):
            continue
        codefile = mapname[:-len(suffix)] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # NOTE(review): marshal data is not safe to load from
            # untrusted sources; only point this at directories whose
            # .mapping files were produced by marshalmap().
            with open(os.path.join(dir, mapname), 'rb') as f:
                mapping = marshal.load(f)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile, comments)
        except (ValueError, EOFError) as why:
            # EOFError: marshal.load() hit a truncated or empty file.
            print('* conversion failed: %s' % why)


if __name__ == '__main__':

    import sys
    # Hand-edited switch: change the 1 to 0 to regenerate the codec
    # modules from existing .mapping files instead.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])