diff options
Diffstat (limited to 'Lib/encodings')
-rw-r--r-- | Lib/encodings/__init__.py | 34 | ||||
-rw-r--r-- | Lib/encodings/aliases.py | 427 |
2 files changed, 355 insertions, 106 deletions
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 3830954..65f5c9c 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -4,8 +4,8 @@ directory. Codec modules must have names corresponding to standard lower-case - encoding names with hyphens mapped to underscores, e.g. 'utf-8' is - implemented by the module 'utf_8.py'. + encoding names with hyphens and periods mapped to underscores, + e.g. 'utf-8' is implemented by the module 'utf_8.py'. Each codec module must export the following interface: @@ -28,10 +28,11 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). """#" -import codecs,aliases,exceptions +import codecs,exceptions _cache = {} _unknown = '--unknown--' +_import_tail = ['*'] class CodecRegistryError(exceptions.LookupError, exceptions.SystemError): @@ -40,19 +41,37 @@ class CodecRegistryError(exceptions.LookupError, def search_function(encoding): # Cache lookup - entry = _cache.get(encoding,_unknown) + entry = _cache.get(encoding, _unknown) if entry is not _unknown: return entry - # Import the module + # Import the module: + # + # First look in the encodings package, then try to lookup the + # encoding in the aliases mapping and retry the import using the + # default import module lookup scheme with the alias name. + # modname = encoding.replace('-', '_') - modname = aliases.aliases.get(modname,modname) + modname = modname.replace('.', '_') try: - mod = __import__(modname,globals(),locals(),'*') + mod = __import__('encodings.' + modname, + globals(), locals(), _import_tail) except ImportError,why: + import aliases + modname = aliases.aliases.get(modname, _unknown) + if modname is not _unknown: + try: + mod = __import__(modname, + globals(), locals(), _import_tail) + except ImportError,why: + mod = None + else: + mod = None + if mod is None: # cache misses _cache[encoding] = None return None + # Now ask the module for the registry entry try: @@ -79,6 +98,7 @@ def search_function(encoding): except AttributeError: pass else: + import aliases for alias in codecaliases: if not aliases.aliases.has_key(alias): aliases.aliases[alias] = modname diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index d6101da..b6882ae 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -4,108 +4,337 @@ map encodings names to module names. Note that the search function converts the encoding names to lower - case and replaces hyphens with underscores *before* performing the - lookup. + case and replaces hyphens and periods with underscores *before* + performing the lookup. + + Contents: + + The following aliases dictionary contains mappings of all IANA + character set names for which the Python core library provides + codecs. In addition to these, a few Python specific codec + aliases have also been added. + + About the CJK codec aliases: + + The codecs for these encodings are not distributed with the + Python core, but are included here for reference, since the + locale module relies on having these aliases available. """ aliases = { - # Latin-1 - 'latin': 'latin_1', - 'latin1': 'latin_1', - - # UTF-7 - 'utf7': 'utf_7', - 'u7': 'utf_7', - - # UTF-8 - 'utf': 'utf_8', - 'utf8': 'utf_8', - 'u8': 'utf_8', - 'utf8@ucs2': 'utf_8', - 'utf8@ucs4': 'utf_8', - - # UTF-16 - 'utf16': 'utf_16', - 'u16': 'utf_16', - 'utf_16be': 'utf_16_be', - 'utf_16le': 'utf_16_le', - 'unicodebigunmarked': 'utf_16_be', - 'unicodelittleunmarked': 'utf_16_le', - - # ASCII - 'us_ascii': 'ascii', - 'ansi_x3.4_1968': 'ascii', # used on Linux - '646': 'ascii', # used on Solaris - - # EBCDIC - 'ebcdic_cp_us': 'cp037', - 'ibm039': 'cp037', - 'ibm1140': 'cp1140', - - # ISO - '8859': 'latin_1', - 'iso8859': 'latin_1', - 'iso8859_1': 'latin_1', - 'iso_8859_1': 'latin_1', - 'iso_8859_10': 'iso8859_10', - 'iso_8859_13': 'iso8859_13', - 'iso_8859_14': 'iso8859_14', - 'iso_8859_15': 'iso8859_15', - 'iso_8859_2': 'iso8859_2', - 'iso_8859_3': 'iso8859_3', - 'iso_8859_4': 'iso8859_4', - 'iso_8859_5': 'iso8859_5', - 'iso_8859_6': 'iso8859_6', - 'iso_8859_7': 'iso8859_7', - 'iso_8859_8': 'iso8859_8', - 'iso_8859_9': 'iso8859_9', - - # Mac - 'maclatin2': 'mac_latin2', - 'maccentraleurope': 'mac_latin2', - 'maccyrillic': 'mac_cyrillic', - 'macgreek': 'mac_greek', - 'maciceland': 'mac_iceland', - 'macroman': 'mac_roman', - 'macturkish': 'mac_turkish', - - # Windows - 'windows_1251': 'cp1251', - 'windows_1252': 'cp1252', - 'windows_1254': 'cp1254', - 'windows_1255': 'cp1255', - - # MBCS - 'dbcs': 'mbcs', - - # Code pages - '437': 'cp437', - - # CJK - # - # The codecs for these encodings are not distributed with the - # Python core, but are included here for reference, since the - # locale module relies on having these aliases available. - # - 'jis_7': 'jis_7', - 'iso_2022_jp': 'jis_7', - 'ujis': 'euc_jp', - 'ajec': 'euc_jp', - 'eucjp': 'euc_jp', - 'tis260': 'tactis', - 'sjis': 'shift_jis', - - # Content transfer/compression encodings - 'rot13': 'rot_13', - 'base64': 'base64_codec', - 'base_64': 'base64_codec', - 'zlib': 'zlib_codec', - 'zip': 'zlib_codec', - 'hex': 'hex_codec', - 'uu': 'uu_codec', - 'quopri': 'quopri_codec', - 'quotedprintable': 'quopri_codec', - 'quoted_printable': 'quopri_codec', + # ascii codec + '646' : 'ascii', + 'ansi_x3_4_1968' : 'ascii', + 'ansi_x3_4_1986' : 'ascii', + 'cp367' : 'ascii', + 'csascii' : 'ascii', + 'ibm367' : 'ascii', + 'iso646_us' : 'ascii', + 'iso_646_irv:1991' : 'ascii', + 'iso_ir_6' : 'ascii', + 'us' : 'ascii', + 'us_ascii' : 'ascii', + + # base64_codec codec + 'base64' : 'base64_codec', + 'base_64' : 'base64_codec', + + # cp037 codec + 'csibm037' : 'cp037', + 'ebcdic_cp_ca' : 'cp037', + 'ebcdic_cp_nl' : 'cp037', + 'ebcdic_cp_us' : 'cp037', + 'ebcdic_cp_wt' : 'cp037', + 'ibm037' : 'cp037', + 'ibm039' : 'cp037', + + # cp1026 codec + 'csibm1026' : 'cp1026', + 'ibm1026' : 'cp1026', + + # cp1140 codec + 'ibm1140' : 'cp1140', + + # cp1250 codec + 'windows_1250' : 'cp1250', + + # cp1251 codec + 'windows_1251' : 'cp1251', + + # cp1252 codec + 'windows_1252' : 'cp1252', + + # cp1253 codec + 'windows_1253' : 'cp1253', + + # cp1254 codec + 'windows_1254' : 'cp1254', + + # cp1255 codec + 'windows_1255' : 'cp1255', + + # cp1256 codec + 'windows_1256' : 'cp1256', + + # cp1257 codec + 'windows_1257' : 'cp1257', + + # cp1258 codec + 'windows_1258' : 'cp1258', + + # cp424 codec + 'csibm424' : 'cp424', + 'ebcdic_cp_he' : 'cp424', + 'ibm424' : 'cp424', + + # cp437 codec + '437' : 'cp437', + 'cspc8codepage437' : 'cp437', + 'ibm437' : 'cp437', + + # cp500 codec + 'csibm500' : 'cp500', + 'ebcdic_cp_be' : 'cp500', + 'ebcdic_cp_ch' : 'cp500', + 'ibm500' : 'cp500', + + # cp775 codec + 'cspc775baltic' : 'cp775', + 'ibm775' : 'cp775', + + # cp850 codec + '850' : 'cp850', + 'cspc850multilingual' : 'cp850', + 'ibm850' : 'cp850', + + # cp852 codec + '852' : 'cp852', + 'cspcp852' : 'cp852', + 'ibm852' : 'cp852', + + # cp855 codec + '855' : 'cp855', + 'csibm855' : 'cp855', + 'ibm855' : 'cp855', + + # cp857 codec + '857' : 'cp857', + 'csibm857' : 'cp857', + 'ibm857' : 'cp857', + + # cp860 codec + '860' : 'cp860', + 'csibm860' : 'cp860', + 'ibm860' : 'cp860', + + # cp861 codec + '861' : 'cp861', + 'cp_is' : 'cp861', + 'csibm861' : 'cp861', + 'ibm861' : 'cp861', + + # cp862 codec + '862' : 'cp862', + 'cspc862latinhebrew' : 'cp862', + 'ibm862' : 'cp862', + + # cp863 codec + '863' : 'cp863', + 'csibm863' : 'cp863', + 'ibm863' : 'cp863', + + # cp864 codec + 'csibm864' : 'cp864', + 'ibm864' : 'cp864', + + # cp865 codec + '865' : 'cp865', + 'csibm865' : 'cp865', + 'ibm865' : 'cp865', + + # cp866 codec + '866' : 'cp866', + 'csibm866' : 'cp866', + 'ibm866' : 'cp866', + + # cp869 codec + '869' : 'cp869', + 'cp_gr' : 'cp869', + 'csibm869' : 'cp869', + 'ibm869' : 'cp869', + + # hex_codec codec + 'hex' : 'hex_codec', + + # iso8859_10 codec + 'csisolatin6' : 'iso8859_10', + 'iso_8859_10' : 'iso8859_10', + 'iso_8859_10:1992' : 'iso8859_10', + 'iso_ir_157' : 'iso8859_10', + 'l6' : 'iso8859_10', + 'latin6' : 'iso8859_10', + + # iso8859_13 codec + 'iso_8859_13' : 'iso8859_13', + + # iso8859_14 codec + 'iso_8859_14' : 'iso8859_14', + 'iso_8859_14:1998' : 'iso8859_14', + 'iso_celtic' : 'iso8859_14', + 'iso_ir_199' : 'iso8859_14', + 'l8' : 'iso8859_14', + 'latin8' : 'iso8859_14', + + # iso8859_15 codec + 'iso_8859_15' : 'iso8859_15', + + # iso8859_2 codec + 'csisolatin2' : 'iso8859_2', + 'iso_8859_2' : 'iso8859_2', + 'iso_8859_2:1987' : 'iso8859_2', + 'iso_ir_101' : 'iso8859_2', + 'l2' : 'iso8859_2', + 'latin2' : 'iso8859_2', + + # iso8859_3 codec + 'csisolatin3' : 'iso8859_3', + 'iso_8859_3' : 'iso8859_3', + 'iso_8859_3:1988' : 'iso8859_3', + 'iso_ir_109' : 'iso8859_3', + 'l3' : 'iso8859_3', + 'latin3' : 'iso8859_3', + + # iso8859_4 codec + 'csisolatin4' : 'iso8859_4', + 'iso_8859_4' : 'iso8859_4', + 'iso_8859_4:1988' : 'iso8859_4', + 'iso_ir_110' : 'iso8859_4', + 'l4' : 'iso8859_4', + 'latin4' : 'iso8859_4', + + # iso8859_5 codec + 'csisolatincyrillic' : 'iso8859_5', + 'cyrillic' : 'iso8859_5', + 'iso_8859_5' : 'iso8859_5', + 'iso_8859_5:1988' : 'iso8859_5', + 'iso_ir_144' : 'iso8859_5', + + # iso8859_6 codec + 'arabic' : 'iso8859_6', + 'asmo_708' : 'iso8859_6', + 'csisolatinarabic' : 'iso8859_6', + 'ecma_114' : 'iso8859_6', + 'iso_8859_6' : 'iso8859_6', + 'iso_8859_6:1987' : 'iso8859_6', + 'iso_ir_127' : 'iso8859_6', + + # iso8859_7 codec + 'csisolatingreek' : 'iso8859_7', + 'ecma_118' : 'iso8859_7', + 'elot_928' : 'iso8859_7', + 'greek' : 'iso8859_7', + 'greek8' : 'iso8859_7', + 'iso_8859_7' : 'iso8859_7', + 'iso_8859_7:1987' : 'iso8859_7', + 'iso_ir_126' : 'iso8859_7', + + # iso8859_8 codec + 'csisolatinhebrew' : 'iso8859_8', + 'hebrew' : 'iso8859_8', + 'iso_8859_8' : 'iso8859_8', + 'iso_8859_8:1988' : 'iso8859_8', + 'iso_ir_138' : 'iso8859_8', + + # iso8859_9 codec + 'csisolatin5' : 'iso8859_9', + 'iso_8859_9' : 'iso8859_9', + 'iso_8859_9:1989' : 'iso8859_9', + 'iso_ir_148' : 'iso8859_9', + 'l5' : 'iso8859_9', + 'latin5' : 'iso8859_9', + + # jis_7 codec + 'csiso2022jp' : 'jis_7', + 'iso_2022_jp' : 'jis_7', + + # koi8_r codec + 'cskoi8r' : 'koi8_r', + + # latin_1 codec + '8859' : 'latin_1', + 'cp819' : 'latin_1', + 'csisolatin1' : 'latin_1', + 'ibm819' : 'latin_1', + 'iso8859' : 'latin_1', + 'iso_8859_1' : 'latin_1', + 'iso_8859_1:1987' : 'latin_1', + 'iso_ir_100' : 'latin_1', + 'l1' : 'latin_1', + 'latin' : 'latin_1', + 'latin1' : 'latin_1', + + # mac_cyrillic codec + 'maccyrillic' : 'mac_cyrillic', + + # mac_greek codec + 'macgreek' : 'mac_greek', + + # mac_iceland codec + 'maciceland' : 'mac_iceland', + + # mac_latin2 codec + 'maccentraleurope' : 'mac_latin2', + 'maclatin2' : 'mac_latin2', + + # mac_roman codec + 'macroman' : 'mac_roman', + + # mac_turkish codec + 'macturkish' : 'mac_turkish', + + # mbcs codec + 'dbcs' : 'mbcs', + + # quopri_codec codec + 'quopri' : 'quopri_codec', + 'quoted_printable' : 'quopri_codec', + 'quotedprintable' : 'quopri_codec', + + # rot_13 codec + 'rot13' : 'rot_13', + + # tactis codec + 'tis260' : 'tactis', + + # utf_16 codec + 'u16' : 'utf_16', + 'utf16' : 'utf_16', + + # utf_16_be codec + 'unicodebigunmarked' : 'utf_16_be', + 'utf_16be' : 'utf_16_be', + + # utf_16_le codec + 'unicodelittleunmarked' : 'utf_16_le', + 'utf_16le' : 'utf_16_le', + + # utf_7 codec + 'u7' : 'utf_7', + 'utf7' : 'utf_7', + + # utf_8 codec + 'u8' : 'utf_8', + 'utf' : 'utf_8', + 'utf8' : 'utf_8', + 'utf8@ucs2' : 'utf_8', + 'utf8@ucs4' : 'utf_8', + + # uu_codec codec + 'uu' : 'uu_codec', + + # zlib_codec codec + 'zip' : 'zlib_codec', + 'zlib' : 'zlib_codec', } |