1 files changed, 39 insertions, 11 deletions
diff --git a/Tools/scripts/gencodec.py b/Tools/scripts/gencodec.py
index 45b69b0..39b42ff 100644
--- a/Tools/scripts/gencodec.py
+++ b/Tools/scripts/gencodec.py
@@ -1,9 +1,9 @@
 """ Unicode Mapping Parser and Codec Generator.
 
 This script parses Unicode mapping files as available from the Unicode
-site (ftp.unicode.org) and creates Python codec modules from them. The
-codecs use the standard character mapping codec to actually apply the
-mapping.
+site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
+modules from them. The codecs use the standard character mapping codec
+to actually apply the mapping.
 
 Synopsis: gencodec.py dir codec_prefix
 
@@ -18,6 +18,7 @@ same location (with .mapping extension).
 Written by Marc-Andre Lemburg (mal@lemburg.com).
 
 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+(c) Copyright Guido van Rossum, 2000.
 
 """#"
 
@@ -70,6 +71,10 @@ def readmap(filename,
     lines = f.readlines()
     f.close()
     enc2uni = {}
+    identity = []
+    unmapped = range(256)
+    for i in range(256):
+        unmapped[i] = i
     for line in lines:
         line = strip(line)
         if not line or line[0] == '#':
@@ -85,8 +90,22 @@ def readmap(filename,
             comment = ''
         else:
             comment = comment[1:]
-        if enc != uni:
+        if enc < 256:
+            unmapped.remove(enc)
+            if enc == uni:
+                identity.append(enc)
+            else:
+                enc2uni[enc] = (uni,comment)
+        else:
             enc2uni[enc] = (uni,comment)
+    # If there are more identity-mapped entries than unmapped entries,
+    # it pays to generate an identity dictionary first, add add explicit
+    # mappings to None for the rest
+    if len(identity)>=len(unmapped):
+        for enc in unmapped:
+            enc2uni[enc] = (None, "")
+        enc2uni['IDENTITY'] = 256
+
     return enc2uni
 
 def hexrepr(t,
@@ -143,11 +162,12 @@ def codegen(name,map,comments=1):
     """
     l = [
         '''\
-""" Python Character Mapping Codec generated from '%s'.
+""" Python Character Mapping Codec generated from '%s' with gencodec.py.
 
 Written by Marc-Andre Lemburg (mal@lemburg.com).
 
 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+(c) Copyright 2000 Guido van Rossum.
 
 """#"
 
@@ -178,15 +198,23 @@ def getregentry():
     return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
 
 ### Decoding Map
-
-decoding_map = {
 ''' % name,
         ]
+
+    if map.has_key("IDENTITY"):
+        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
+                 % map["IDENTITY"])
+        l.append("decoding_map.update({")
+        splits = 1
+        del map["IDENTITY"]
+    else:
+        l.append("decoding_map = {")
+        splits = 0
+        
     mappings = map.items()
     mappings.sort()
     append = l.append
     i = 0
-    splits = 0
     for e,value in mappings:
         try:
             (u,c) = value
@@ -198,7 +226,7 @@ decoding_map = {
             append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
         else:
             append('\t%s: %s,' % (key,unicoderepr(u)))
-        i = i + 1
+        i += 1
         if i == 4096:
             # Split the definition into parts to that the Python
             # parser doesn't dump core
@@ -206,7 +234,7 @@ decoding_map = {
                 append('}')
             else:
                 append('})')
-            append('map.update({')
+            append('decoding_map.update({')
             i = 0
             splits = splits + 1
     if splits == 0:
@@ -265,7 +293,7 @@ def rewritepythondir(dir,prefix='',comments=1):
     
     mapnames = os.listdir(dir)
     for mapname in mapnames:
-        if mapname[-len('.mapping'):] != '.mapping':
+        if not mapname.endswith('.mapping'):
             continue
         codefile = mapname[:-len('.mapping')] + '.py'
         print 'converting %s to %s' % (mapname,