summaryrefslogtreecommitdiffstats
path: root/Tools/scripts/gencodec.py
diff options
context:
space:
mode:
Diffstat (limited to 'Tools/scripts/gencodec.py')
-rw-r--r--Tools/scripts/gencodec.py50
1 files changed, 39 insertions, 11 deletions
diff --git a/Tools/scripts/gencodec.py b/Tools/scripts/gencodec.py
index 45b69b0..39b42ff 100644
--- a/Tools/scripts/gencodec.py
+++ b/Tools/scripts/gencodec.py
@@ -1,9 +1,9 @@
""" Unicode Mapping Parser and Codec Generator.
This script parses Unicode mapping files as available from the Unicode
-site (ftp.unicode.org) and creates Python codec modules from them. The
-codecs use the standard character mapping codec to actually apply the
-mapping.
+site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
+modules from them. The codecs use the standard character mapping codec
+to actually apply the mapping.
Synopsis: gencodec.py dir codec_prefix
@@ -18,6 +18,7 @@ same location (with .mapping extension).
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+(c) Copyright Guido van Rossum, 2000.
"""#"
@@ -70,6 +71,10 @@ def readmap(filename,
lines = f.readlines()
f.close()
enc2uni = {}
+ identity = []
+ unmapped = range(256)
+ for i in range(256):
+ unmapped[i] = i
for line in lines:
line = strip(line)
if not line or line[0] == '#':
@@ -85,8 +90,22 @@ def readmap(filename,
comment = ''
else:
comment = comment[1:]
- if enc != uni:
+ if enc < 256:
+ unmapped.remove(enc)
+ if enc == uni:
+ identity.append(enc)
+ else:
+ enc2uni[enc] = (uni,comment)
+ else:
enc2uni[enc] = (uni,comment)
+ # If there are more identity-mapped entries than unmapped entries,
+ # it pays to generate an identity dictionary first, add add explicit
+ # mappings to None for the rest
+ if len(identity)>=len(unmapped):
+ for enc in unmapped:
+ enc2uni[enc] = (None, "")
+ enc2uni['IDENTITY'] = 256
+
return enc2uni
def hexrepr(t,
@@ -143,11 +162,12 @@ def codegen(name,map,comments=1):
"""
l = [
'''\
-""" Python Character Mapping Codec generated from '%s'.
+""" Python Character Mapping Codec generated from '%s' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -178,15 +198,23 @@ def getregentry():
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
### Decoding Map
-
-decoding_map = {
''' % name,
]
+
+ if map.has_key("IDENTITY"):
+ l.append("decoding_map = codecs.make_identity_dict(range(%d))"
+ % map["IDENTITY"])
+ l.append("decoding_map.update({")
+ splits = 1
+ del map["IDENTITY"]
+ else:
+ l.append("decoding_map = {")
+ splits = 0
+
mappings = map.items()
mappings.sort()
append = l.append
i = 0
- splits = 0
for e,value in mappings:
try:
(u,c) = value
@@ -198,7 +226,7 @@ decoding_map = {
append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
else:
append('\t%s: %s,' % (key,unicoderepr(u)))
- i = i + 1
+ i += 1
if i == 4096:
# Split the definition into parts to that the Python
# parser doesn't dump core
@@ -206,7 +234,7 @@ decoding_map = {
append('}')
else:
append('})')
- append('map.update({')
+ append('decoding_map.update({')
i = 0
splits = splits + 1
if splits == 0:
@@ -265,7 +293,7 @@ def rewritepythondir(dir,prefix='',comments=1):
mapnames = os.listdir(dir)
for mapname in mapnames:
- if mapname[-len('.mapping'):] != '.mapping':
+ if not mapname.endswith('.mapping'):
continue
codefile = mapname[:-len('.mapping')] + '.py'
print 'converting %s to %s' % (mapname,