diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2002-12-31 12:39:07 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2002-12-31 12:39:07 (GMT) |
commit | 5c37a7717d7d7190dfa30b33101d6bb4d7aafbde (patch) | |
tree | ba2cfdcc7fd4f481b24b91e5e01251b2b349a573 /Doc | |
parent | a8aed02f1e67be19634cedf6be6d6f469a9e0fb6 (diff) | |
download | cpython-5c37a7717d7d7190dfa30b33101d6bb4d7aafbde.zip cpython-5c37a7717d7d7190dfa30b33101d6bb4d7aafbde.tar.gz cpython-5c37a7717d7d7190dfa30b33101d6bb4d7aafbde.tar.bz2 |
Document standard encodings.
Diffstat (limited to 'Doc')
-rw-r--r-- | Doc/lib/libcodecs.tex | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index 44713f5..355ac5d 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -511,3 +511,346 @@ the \function{lookup()} function to construct the instance. \class{StreamReader} and \class{StreamWriter} classes. They inherit all other methods and attribute from the underlying stream. +\subsection{Standard Encodings} + +Python comes with a number of codecs built in, either implemented as C +functions, or with dictionaries as mapping tables. The following table +lists the codecs by name, together with a few common aliases, and the +languages for which the encoding is likely used. Neither the list of +aliases nor the list of languages is meant to be exhaustive. Notice +that spelling alternatives that only differ in case or use a hyphen +instead of an underscore are also valid aliases. + +Many of the character sets support the same languages. They vary in +individual characters (e.g.\ whether the EURO SIGN is supported or +not), and in the assignment of characters to code positions. 
For the +European languages in particular, the following variants typically +exist: + +\begin{itemize} +\item an ISO 8859 codeset +\item a Microsoft Windows code page, which is typically derived from + an 8859 codeset, but replaces control characters with additional + graphic characters +\item an IBM EBCDIC code page +\item an IBM PC code page, which is ASCII compatible +\end{itemize} + +\begin{longtableiii}{l|l|l}{textrm}{Codec}{Aliases}{Languages} + +\lineiii{ascii} + {646, us-ascii} + {English} + +\lineiii{cp037} + {IBM037, IBM039} + {English} + +\lineiii{cp424} + {EBCDIC-CP-HE, IBM424} + {Hebrew} + +\lineiii{cp437} + {437, IBM437} + {English} + +\lineiii{cp500} + {EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500} + {Western Europe} + +\lineiii{cp737} + {} + {Greek} + +\lineiii{cp775} + {IBM775} + {Baltic languages} + +\lineiii{cp850} + {850, IBM850} + {Western Europe} + +\lineiii{cp852} + {852, IBM852} + {Central and Eastern Europe} + +\lineiii{cp855} + {855, IBM855} + {Bulgarian, Byelorussian, Macedonian, Russian, Serbian} + +\lineiii{cp856} + {} + {Hebrew} + +\lineiii{cp857} + {857, IBM857} + {Turkish} + +\lineiii{cp860} + {860, IBM860} + {Portuguese} + +\lineiii{cp861} + {861, CP-IS, IBM861} + {Icelandic} + +\lineiii{cp862} + {862, IBM862} + {Hebrew} + +\lineiii{cp863} + {863, IBM863} + {Canadian} + +\lineiii{cp864} + {IBM864} + {Arabic} + +\lineiii{cp865} + {865, IBM865} + {Danish, Norwegian} + +\lineiii{cp869} + {869, CP-GR, IBM869} + {Greek} + +\lineiii{cp874} + {} + {Thai} + +\lineiii{cp875} + {} + {Greek} + +\lineiii{cp1006} + {} + {Urdu} + +\lineiii{cp1026} + {ibm1026} + {Turkish} + +\lineiii{cp1140} + {ibm1140} + {Western Europe} + +\lineiii{cp1250} + {windows-1250} + {Central and Eastern Europe} + +\lineiii{cp1251} + {windows-1251} + {Bulgarian, Byelorussian, Macedonian, Russian, Serbian} + +\lineiii{cp1252} + {windows-1252} + {Western Europe} + +\lineiii{cp1253} + {windows-1253} + {Greek} + +\lineiii{cp1254} + {windows-1254} + {Turkish} + +\lineiii{cp1255} + 
{windows-1255} + {Hebrew} + +\lineiii{cp1256} + {windows-1256} + {Arabic} + +\lineiii{cp1257} + {windows-1257} + {Baltic languages} + +\lineiii{cp1258} + {windows-1258} + {Vietnamese} + +\lineiii{latin_1} + {iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1} + {Western Europe} + +\lineiii{iso8859_2} + {iso-8859-2, latin2, L2} + {Central and Eastern Europe} + +\lineiii{iso8859_3} + {iso-8859-3, latin3, L3} + {Esperanto, Maltese} + +\lineiii{iso8859_4} + {iso-8859-4, latin4, L4} + {Baltic languages} + +\lineiii{iso8859_5} + {iso-8859-5, cyrillic} + {Bulgarian, Byelorussian, Macedonian, Russian, Serbian} + +\lineiii{iso8859_6} + {iso-8859-6, arabic} + {Arabic} + +\lineiii{iso8859_7} + {iso-8859-7, greek, greek8} + {Greek} + +\lineiii{iso8859_8} + {iso-8859-8, hebrew} + {Hebrew} + +\lineiii{iso8859_9} + {iso-8859-9, latin5, L5} + {Turkish} + +\lineiii{iso8859_10} + {iso-8859-10, latin6, L6} + {Nordic languages} + +\lineiii{iso8859_13} + {iso-8859-13} + {Baltic languages} + +\lineiii{iso8859_14} + {iso-8859-14, latin8, L8} + {Celtic languages} + +\lineiii{iso8859_15} + {iso-8859-15} + {Western Europe} + +\lineiii{koi8_r} + {} + {Russian} + +\lineiii{koi8_u} + {} + {Ukrainian} + +\lineiii{mac_cyrillic} + {maccyrillic} + {Bulgarian, Byelorussian, Macedonian, Russian, Serbian} + +\lineiii{mac_greek} + {macgreek} + {Greek} + +\lineiii{mac_iceland} + {maciceland} + {Icelandic} + +\lineiii{mac_latin2} + {maclatin2, maccentraleurope} + {Central and Eastern Europe} + +\lineiii{mac_roman} + {macroman} + {Western Europe} + +\lineiii{mac_turkish} + {macturkish} + {Turkish} + +\lineiii{utf_16} + {U16, utf16} + {all languages} + +\lineiii{utf_16_be} + {UTF-16BE} + {all languages (BMP only)} + +\lineiii{utf_16_le} + {UTF-16LE} + {all languages (BMP only)} + +\lineiii{utf_7} + {U7} + {all languages} + +\lineiii{utf_8} + {U8, UTF, utf8} + {all languages} + +\end{longtableiii} + +A number of codecs are specific to Python, so their codec names have +no meaning outside Python. 
Some of them don't convert from Unicode +strings to byte strings, but instead use the property of the Python +codecs machinery that any bijective function with one argument can be +considered as an encoding. + +For the codecs listed below, the result in the ``encoding'' direction +is always a byte string. The result of the ``decoding'' direction is +listed as operand type in the table. + +\begin{tableiv}{l|l|l|l}{textrm}{Codec}{Aliases}{Operand type}{Purpose} + +\lineiv{base64_codec} + {base64, base-64} + {byte string} + {Convert operand to MIME base64} + +\lineiv{hex_codec} + {hex} + {byte string} + {Convert operand to hexadecimal representation, with two digits per byte} + +\lineiv{mbcs} + {dbcs} + {Unicode string} + {Windows only: Encode operand according to the ANSI codepage (CP_ACP)} + +\lineiv{palmos} + {} + {Unicode string} + {Encoding of PalmOS 3.5} + +\lineiv{quopri_codec} + {quopri, quoted-printable, quotedprintable} + {byte string} + {Convert operand to MIME quoted printable} + +\lineiv{raw_unicode_escape} + {} + {Unicode string} + {Produce a string that is suitable as raw Unicode literal in Python source code} + +\lineiv{rot_13} + {rot13} + {byte string} + {Return the Caesar-cypher encryption of the operand} + +\lineiv{string_escape} + {} + {byte string} + {Produce a string that is suitable as string literal in Python source code} + +\lineiv{undefined} + {} + {any} + {Raise an exception for all conversions. Can be used as the system encoding if no automatic coercion between byte and Unicode strings is desired.} + +\lineiv{unicode_escape} + {} + {Unicode string} + {Produce a string that is suitable as Unicode literal in Python source code} + +\lineiv{unicode_internal} + {} + {Unicode string} + {Return the internal representation of the operand} + +\lineiv{uu_codec} + {uu} + {byte string} + {Convert the operand using uuencode} + +\lineiv{zlib_codec} + {zip, zlib} + {byte string} + {Compress the operand using gzip} + +\end{tableiv} |