summaryrefslogtreecommitdiffstats
path: root/Tools/scripts/gencodec.py
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>2000-03-10 22:36:57 (GMT)
committerGuido van Rossum <guido@python.org>2000-03-10 22:36:57 (GMT)
commit34a79115c5d5be53581f49ced5a5a17171cabb7d (patch)
tree7a851429c335c7bac81a04013498c39b73e005a0 /Tools/scripts/gencodec.py
parentfd9eed33aae55e57c84c654493c470e4ad78bc82 (diff)
downloadcpython-34a79115c5d5be53581f49ced5a5a17171cabb7d.zip
cpython-34a79115c5d5be53581f49ced5a5a17171cabb7d.tar.gz
cpython-34a79115c5d5be53581f49ced5a5a17171cabb7d.tar.bz2
Marc-Andre Lemburg: added
gencodec.py - Create Python codecs from Unicode mapping files
Diffstat (limited to 'Tools/scripts/gencodec.py')
-rw-r--r--Tools/scripts/gencodec.py289
1 files changed, 289 insertions, 0 deletions
diff --git a/Tools/scripts/gencodec.py b/Tools/scripts/gencodec.py
new file mode 100644
index 0000000..b5680ee
--- /dev/null
+++ b/Tools/scripts/gencodec.py
@@ -0,0 +1,289 @@
+""" Unicode Mapping Parser and Codec Generator.
+
+This script parses Unicode mapping files as available from the Unicode
+site (ftp.unicode.org) and creates Python codec modules from them. The
+codecs use the standard character mapping codec to actually apply the
+mapping.
+
+Synopsis: gencodec.py dir codec_prefix
+
+All files in dir are scanned and those producing non-empty mappings
+will be written to <codec_prefix><mapname>.py with <mapname> being the
+first part of the map's filename ('a' in a.b.c.txt) converted to
+lowercase with hyphens replaced by underscores.
+
+The tool also writes marshalled versions of the mapping tables to the
+same location (with .mapping extension).
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import string,re,os,time,marshal
+
+# Create numeric tables or character based ones ?
+# If true, keys and values are written as hex integer literals;
+# otherwise they are written as character / Unicode string literals.
+numeric = 1
+
+# Matches one entry line of a Unicode mapping file:
+#   group 1: encoded code(s), e.g. "0x80" or "0x81+0x40"
+#   group 2: Unicode code(s) and/or meta-codes such as <LR>, joined
+#            by '+'; empty if the entry has no Unicode mapping
+#   group 3: optional trailing comment, including the leading '#'
+mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
+                   r'\s+'
+                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
+                   r'\s*'
+                   r'(#.+)?')
+
+def parsecodes(codes,
+
+               split=string.split,atoi=string.atoi,len=len,
+               filter=filter,range=range):
+
+    """ Converts code combinations to either a single code integer
+        or a tuple of integers.
+
+        codes is a string of hexadecimal codes joined by '+', e.g.
+        '0x41' or '0x0041+0x0301'.
+
+        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
+        ignored.
+
+        Empty codes or illegal ones are returned as None.
+
+        The remaining parameters default to the corresponding
+        library functions; filter and range are no longer used but
+        are kept so that existing callers continue to work.
+
+    """
+    if not codes:
+        return None
+    l = []
+    for code in split(codes,'+'):
+        try:
+            l.append(atoi(code,16))
+        except ValueError:
+            # Meta-code, empty part (e.g. trailing '+') or malformed
+            # number: skip it
+            pass
+    if not l:
+        # Nothing usable was found
+        return None
+    if len(l) == 1:
+        return l[0]
+    return tuple(l)
+
+def readmap(filename,
+
+            strip=string.strip):
+
+    """ Reads the mapping file filename and returns a dictionary
+        mapping encoded codes to (unicode code, comment) tuples.
+
+        Blank lines, lines starting with '#' and lines which are not
+        matched by mapRE are skipped.  Entries mapping a code to
+        itself are left out of the result.
+
+    """
+    f = open(filename,'r')
+    rawlines = f.readlines()
+    f.close()
+    table = {}
+    for raw in rawlines:
+        text = strip(raw)
+        if text and text[0] != '#':
+            match = mapRE.match(text)
+            if match:
+                enc,uni,comment = match.groups()
+                enc = parsecodes(enc)
+                uni = parsecodes(uni)
+                if comment:
+                    # Drop the leading '#'
+                    comment = comment[1:]
+                else:
+                    comment = ''
+                if enc != uni:
+                    table[enc] = (uni,comment)
+    return table
+
+def hexrepr(t,
+
+            join=string.join):
+
+    """ Returns t formatted as hexadecimal source text.
+
+        None gives 'None', a single code gives '0x....' (at least
+        four hex digits) and a sequence of codes gives a
+        parenthesized, comma separated list of such literals.
+
+    """
+    if t is None:
+        return 'None'
+    try:
+        len(t)
+    except TypeError:
+        # No length: t is a single code, not a sequence of codes
+        return '0x%04x' % t
+    return '(' + join(map(lambda code: '0x%04x' % code, t),', ') + ')'
+
+def unicoderepr(t,
+
+                join=string.join):
+
+    """ Returns the source text used for a Unicode value t.
+
+        None gives 'None'.  If the module global numeric is true,
+        the hex representation from hexrepr() is used; otherwise the
+        repr() of the Unicode string built from the code(s) in t.
+
+    """
+    if t is None:
+        return 'None'
+    if numeric:
+        return hexrepr(t)
+    try:
+        len(t)
+    except TypeError:
+        # No length: t is a single code, not a sequence of codes
+        return repr(unichr(t))
+    return repr(join(map(unichr, t),''))
+
+def keyrepr(t,
+
+            join=string.join):
+
+    """ Returns the source text used for an encoded key t.
+
+        None gives 'None'.  If the module global numeric is true,
+        the hex representation from hexrepr() is used; otherwise the
+        repr() of the string built from the code(s) in t.  Codes
+        below 256 give 8-bit strings, larger ones Unicode strings.
+
+    """
+    if t is None:
+        return 'None'
+    if numeric:
+        return hexrepr(t)
+    try:
+        len(t)
+    except TypeError:
+        # No length: t is a single code, not a sequence of codes
+        if t < 256:
+            return repr(chr(t))
+        else:
+            return repr(unichr(t))
+    for code in t:
+        if code >= 256:
+            # chr() only accepts codes below 256, so build the whole
+            # key as a Unicode string, just like the single code case
+            return repr(join(map(unichr, t),''))
+    return repr(join(map(chr, t),''))
+
+def codegen(name,map,comments=1):
+
+    """ Returns Python source for the given map.
+
+        name is inserted into the docstring of the generated module.
+        map is a dictionary mapping encoded codes to either
+        (unicode code, comment) tuples or plain unicode codes.
+
+        Comments are included in the source, if comments is true (default).
+
+    """
+    l = [
+        '''\
+""" Python Character Mapping Codec generated from '%s'.
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import codecs
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+    def encode(self,input,errors='strict'):
+
+        return codecs.charmap_encode(input,errors,encoding_map)
+
+    def decode(self,input,errors='strict'):
+
+        return codecs.charmap_decode(input,errors,decoding_map)
+
+class StreamWriter(Codec,codecs.StreamWriter):
+    pass
+
+class StreamReader(Codec,codecs.StreamReader):
+    pass
+
+### encodings module API
+
+def getregentry():
+
+    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
+
+### Decoding Map
+
+decoding_map = {
+''' % name,
+        ]
+    mappings = map.items()
+    mappings.sort()
+    append = l.append
+    i = 0
+    splits = 0
+    for e,value in mappings:
+        try:
+            (u,c) = value
+        except TypeError:
+            # Plain value without a comment
+            u = value
+            c = ''
+        key = keyrepr(e)
+        if c and comments:
+            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
+        else:
+            append('\t%s: %s,' % (key,unicoderepr(u)))
+        i = i + 1
+        if i == 4096:
+            # Split the definition into parts so that the Python
+            # parser doesn't dump core
+            if splits == 0:
+                append('}')
+            else:
+                append('})')
+            # The generated dictionary is called decoding_map, so
+            # the continuation chunks have to update that name
+            append('decoding_map.update({')
+            i = 0
+            splits = splits + 1
+    # Close the initial dictionary literal or the last update() call
+    if splits == 0:
+        append('}')
+    else:
+        append('})')
+    append('''
+### Encoding Map
+
+encoding_map = {}
+for k,v in decoding_map.items():
+    encoding_map[v] = k
+''')
+    return string.join(l,'\n')
+
+def pymap(name,map,pyfile,comments=1):
+
+    """ Writes the Python source generated by codegen() for the
+        given map to the file pyfile.
+
+    """
+    # Generate first: no file is created if code generation fails
+    source = codegen(name,map,comments)
+    out = open(pyfile,'w')
+    out.write(source)
+    out.close()
+
+def marshalmap(name,map,marshalfile):
+
+    """ Writes a marshalled copy of map to the file marshalfile.
+
+        Each value of map must be a (unicode code, comment) pair;
+        name is not used.
+
+    """
+    table = {}
+    for code,value in map.items():
+        u,c = value
+        table[code] = (u,c)
+    out = open(marshalfile,'wb')
+    marshal.dump(table,out)
+    out.close()
+
+def convertdir(dir,prefix='',comments=1):
+
+    """ Converts all mapping files found in the directory dir.
+
+        For every file producing a non-empty mapping, a codec module
+        <prefix><name>.py and a marshalled table <prefix><name>.mapping
+        are written, where <name> is the part of the file name before
+        the first dot, lowercased and with hyphens replaced by
+        underscores.  Files raising ValueError are reported and
+        skipped.
+
+    """
+    for mapname in os.listdir(dir):
+        basename = os.path.split(mapname)[1]
+        stem = string.split(string.replace(basename,'-','_'), '.')[0]
+        name = string.lower(stem)
+        codefile = name + '.py'
+        marshalfile = name + '.mapping'
+        print 'converting %s to %s and %s' % (mapname,
+                                              prefix + codefile,
+                                              prefix + marshalfile)
+        try:
+            map = readmap(os.path.join(dir,mapname))
+            if map:
+                pymap(mapname, map, prefix + codefile,comments)
+                marshalmap(mapname, map, prefix + marshalfile)
+            else:
+                print '* map is empty; skipping'
+        except ValueError:
+            print '* conversion failed'
+
+def rewritepythondir(dir,prefix='',comments=1):
+
+    """ Recreates the codec modules from the marshalled mapping
+        tables (files ending in '.mapping') found in the directory dir.
+
+        For every non-empty table <name>.mapping the module
+        <prefix><name>.py is written.  Files raising ValueError are
+        reported and skipped.
+
+    """
+    mapnames = os.listdir(dir)
+    for mapname in mapnames:
+        if mapname[-len('.mapping'):] != '.mapping':
+            continue
+        codefile = mapname[:-len('.mapping')] + '.py'
+        print 'converting %s to %s' % (mapname,
+                                       prefix + codefile)
+        try:
+            # NOTE: marshal is not safe against malicious data; only
+            # use this on .mapping files written by this tool.
+            f = open(os.path.join(dir,mapname),
+                     'rb')
+            try:
+                map = marshal.load(f)
+            finally:
+                # Always release the file, even if loading fails
+                f.close()
+            if not map:
+                print '* map is empty; skipping'
+            else:
+                pymap(mapname, map, prefix + codefile,comments)
+        except ValueError, why:
+            print '* conversion failed: %s' % why
+
+if __name__ == '__main__':
+
+    import sys
+    args = tuple(sys.argv[1:])
+    # Set to 0 to rebuild the codec modules from existing .mapping
+    # files instead of parsing the mapping files again
+    convert = 1
+    if convert:
+        apply(convertdir,args)
+    else:
+        apply(rewritepythondir,args)