summaryrefslogtreecommitdiffstats
path: root/Doc/lib/libcodecs.tex
diff options
context:
space:
mode:
authorMartin v. Löwis <martin@v.loewis.de>2002-12-31 12:39:07 (GMT)
committerMartin v. Löwis <martin@v.loewis.de>2002-12-31 12:39:07 (GMT)
commit5c37a7717d7d7190dfa30b33101d6bb4d7aafbde (patch)
treeba2cfdcc7fd4f481b24b91e5e01251b2b349a573 /Doc/lib/libcodecs.tex
parenta8aed02f1e67be19634cedf6be6d6f469a9e0fb6 (diff)
downloadcpython-5c37a7717d7d7190dfa30b33101d6bb4d7aafbde.zip
cpython-5c37a7717d7d7190dfa30b33101d6bb4d7aafbde.tar.gz
cpython-5c37a7717d7d7190dfa30b33101d6bb4d7aafbde.tar.bz2
Document standard encodings.
Diffstat (limited to 'Doc/lib/libcodecs.tex')
-rw-r--r--Doc/lib/libcodecs.tex343
1 files changed, 343 insertions, 0 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex
index 44713f5..355ac5d 100644
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -511,3 +511,346 @@ the \function{lookup()} function to construct the instance.
\class{StreamReader} and \class{StreamWriter} classes. They inherit
all other methods and attribute from the underlying stream.
+\subsection{Standard Encodings}
+
+Python comes with a number of built-in codecs, either implemented as C
+functions, or with dictionaries as mapping tables. The following table
+lists the codecs by name, together with a few common aliases, and the
+languages for which the encoding is likely used. Neither the list of
+aliases nor the list of languages is meant to be exhaustive. Notice
+that spelling alternatives that only differ in case or use a hyphen
+instead of an underscore are also valid aliases.
+
+Many of the character sets support the same languages. They vary in
+individual characters (e.g.\ whether the EURO SIGN is supported or
+not), and in the assignment of characters to code positions. For the
+European languages in particular, the following variants typically
+exist:
+
+\begin{itemize}
+\item an ISO 8859 codeset
+\item a Microsoft Windows code page, which is typically derived from
+ an 8859 codeset, but replaces control characters with additional
+ graphic characters
+\item an IBM EBCDIC code page
+\item an IBM PC code page, which is ASCII compatible
+\end{itemize}
+
+\begin{longtableiii}{l|l|l}{textrm}{Codec}{Aliases}{Languages}
+
+\lineiii{ascii}
+ {646, us-ascii}
+ {English}
+
+\lineiii{cp037}
+ {IBM037, IBM039}
+ {English}
+
+\lineiii{cp424}
+ {EBCDIC-CP-HE, IBM424}
+ {Hebrew}
+
+\lineiii{cp437}
+ {437, IBM437}
+ {English}
+
+\lineiii{cp500}
+ {EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500}
+ {Western Europe}
+
+\lineiii{cp737}
+ {}
+ {Greek}
+
+\lineiii{cp775}
+ {IBM775}
+ {Baltic languages}
+
+\lineiii{cp850}
+ {850, IBM850}
+ {Western Europe}
+
+\lineiii{cp852}
+ {852, IBM852}
+ {Central and Eastern Europe}
+
+\lineiii{cp855}
+ {855, IBM855}
+ {Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
+
+\lineiii{cp856}
+ {}
+ {Hebrew}
+
+\lineiii{cp857}
+ {857, IBM857}
+ {Turkish}
+
+\lineiii{cp860}
+ {860, IBM860}
+ {Portuguese}
+
+\lineiii{cp861}
+ {861, CP-IS, IBM861}
+ {Icelandic}
+
+\lineiii{cp862}
+ {862, IBM862}
+ {Hebrew}
+
+\lineiii{cp863}
+ {863, IBM863}
+ {Canadian}
+
+\lineiii{cp864}
+ {IBM864}
+ {Arabic}
+
+\lineiii{cp865}
+ {865, IBM865}
+ {Danish, Norwegian}
+
+\lineiii{cp869}
+ {869, CP-GR, IBM869}
+ {Greek}
+
+\lineiii{cp874}
+ {}
+ {Thai}
+
+\lineiii{cp875}
+ {}
+ {Greek}
+
+\lineiii{cp1006}
+ {}
+ {Urdu}
+
+\lineiii{cp1026}
+ {ibm1026}
+ {Turkish}
+
+\lineiii{cp1140}
+ {ibm1140}
+ {Western Europe}
+
+\lineiii{cp1250}
+ {windows-1250}
+ {Central and Eastern Europe}
+
+\lineiii{cp1251}
+ {windows-1251}
+ {Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
+
+\lineiii{cp1252}
+ {windows-1252}
+ {Western Europe}
+
+\lineiii{cp1253}
+ {windows-1253}
+ {Greek}
+
+\lineiii{cp1254}
+ {windows-1254}
+ {Turkish}
+
+\lineiii{cp1255}
+ {windows-1255}
+ {Hebrew}
+
+\lineiii{cp1256}
+ {windows-1256}
+ {Arabic}
+
+\lineiii{cp1257}
+ {windows-1257}
+ {Baltic languages}
+
+\lineiii{cp1258}
+ {windows-1258}
+ {Vietnamese}
+
+\lineiii{latin_1}
+ {iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1}
+ {Western Europe}
+
+\lineiii{iso8859_2}
+ {iso-8859-2, latin2, L2}
+ {Central and Eastern Europe}
+
+\lineiii{iso8859_3}
+ {iso-8859-3, latin3, L3}
+ {Esperanto, Maltese}
+
+\lineiii{iso8859_4}
+ {iso-8859-4, latin4, L4}
+ {Baltic languages}
+
+\lineiii{iso8859_5}
+ {iso-8859-5, cyrillic}
+ {Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
+
+\lineiii{iso8859_6}
+ {iso-8859-6, arabic}
+ {Arabic}
+
+\lineiii{iso8859_7}
+ {iso-8859-7, greek, greek8}
+ {Greek}
+
+\lineiii{iso8859_8}
+ {iso-8859-8, hebrew}
+ {Hebrew}
+
+\lineiii{iso8859_9}
+ {iso-8859-9, latin5, L5}
+ {Turkish}
+
+\lineiii{iso8859_10}
+ {iso-8859-10, latin6, L6}
+ {Nordic languages}
+
+\lineiii{iso8859_13}
+ {iso-8859-13}
+ {Baltic languages}
+
+\lineiii{iso8859_14}
+ {iso-8859-14, latin8, L8}
+ {Celtic languages}
+
+\lineiii{iso8859_15}
+ {iso-8859-15}
+ {Western Europe}
+
+\lineiii{koi8_r}
+ {}
+ {Russian}
+
+\lineiii{koi8_u}
+ {}
+ {Ukrainian}
+
+\lineiii{mac_cyrillic}
+ {maccyrillic}
+ {Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
+
+\lineiii{mac_greek}
+ {macgreek}
+ {Greek}
+
+\lineiii{mac_iceland}
+ {maciceland}
+ {Icelandic}
+
+\lineiii{mac_latin2}
+ {maclatin2, maccentraleurope}
+ {Central and Eastern Europe}
+
+\lineiii{mac_roman}
+ {macroman}
+ {Western Europe}
+
+\lineiii{mac_turkish}
+ {macturkish}
+ {Turkish}
+
+\lineiii{utf_16}
+ {U16, utf16}
+ {all languages}
+
+\lineiii{utf_16_be}
+ {UTF-16BE}
+ {all languages (BMP only)}
+
+\lineiii{utf_16_le}
+ {UTF-16LE}
+ {all languages (BMP only)}
+
+\lineiii{utf_7}
+ {U7}
+ {all languages}
+
+\lineiii{utf_8}
+ {U8, UTF, utf8}
+ {all languages}
+
+\end{longtableiii}
+
+A number of codecs are specific to Python, so their codec names have
+no meaning outside Python. Some of them don't convert from Unicode
+strings to byte strings, but instead use the property of the Python
+codecs machinery that any bijective function with one argument can be
+considered as an encoding.
+
+For the codecs listed below, the result in the ``encoding'' direction
+is always a byte string. The result of the ``decoding'' direction is
+listed as operand type in the table.
+
+\begin{tableiv}{l|l|l|l}{textrm}{Codec}{Aliases}{Operand type}{Purpose}
+
+\lineiv{base64_codec}
+ {base64, base-64}
+ {byte string}
+ {Convert operand to MIME base64}
+
+\lineiv{hex_codec}
+ {hex}
+ {byte string}
+ {Convert operand to hexadecimal representation, with two digits per byte}
+
+\lineiv{mbcs}
+ {dbcs}
+ {Unicode string}
+ {Windows only: Encode operand according to the ANSI codepage (CP_ACP)}
+
+\lineiv{palmos}
+ {}
+ {Unicode string}
+ {Encoding of PalmOS 3.5}
+
+\lineiv{quopri_codec}
+ {quopri, quoted-printable, quotedprintable}
+ {byte string}
+ {Convert operand to MIME quoted printable}
+
+\lineiv{raw_unicode_escape}
+ {}
+ {Unicode string}
+ {Produce a string that is suitable as raw Unicode literal in Python source code}
+
+\lineiv{rot_13}
+ {rot13}
+ {byte string}
+ {Return the Caesar-cypher encryption of the operand}
+
+\lineiv{string_escape}
+ {}
+ {byte string}
+ {Produce a string that is suitable as string literal in Python source code}
+
+\lineiv{undefined}
+ {}
+ {any}
+ {Raise an exception for all conversions. Can be used as the system encoding if no automatic coercion between byte and Unicode strings is desired.}
+
+\lineiv{unicode_escape}
+ {}
+ {Unicode string}
+ {Produce a string that is suitable as Unicode literal in Python source code}
+
+\lineiv{unicode_internal}
+ {}
+ {Unicode string}
+ {Return the internal representation of the operand}
+
+\lineiv{uu_codec}
+ {uu}
+ {byte string}
+ {Convert the operand using uuencode}
+
+\lineiv{zlib_codec}
+ {zip, zlib}
+ {byte string}
+ {Compress the operand using zlib}
+
+\end{tableiv}