summaryrefslogtreecommitdiffstats
path: root/Doc/lib/emailcharsets.tex
diff options
context:
space:
mode:
Diffstat (limited to 'Doc/lib/emailcharsets.tex')
-rw-r--r--Doc/lib/emailcharsets.tex240
1 files changed, 240 insertions, 0 deletions
diff --git a/Doc/lib/emailcharsets.tex b/Doc/lib/emailcharsets.tex
new file mode 100644
index 0000000..d1ae728
--- /dev/null
+++ b/Doc/lib/emailcharsets.tex
@@ -0,0 +1,240 @@
+\declaremodule{standard}{email.Charset}
+\modulesynopsis{Character Sets}
+
+This module provides a class \class{Charset} for representing
+character sets and character set conversions in email messages, as
+well as a character set registry and several convenience functions for
+manipulating this registry. Instances of \class{Charset} are used in
+several other modules within the \module{email} package.
+
+\versionadded{2.2.2}
+
+\begin{classdesc}{Charset}{\optional{input_charset}}
+Map character sets to their email properties.
+
+This class provides information about the requirements imposed on
+email for a specific character set. It also provides convenience
+routines for converting between character sets, given the availability
+of the applicable codecs. Given a character set, it will do its best
+to provide information on how to use that character set in an email
+message in an RFC-compliant way.
+
+Certain character sets must be encoded with quoted-printable or base64
+when used in email headers or bodies. Certain character sets must be
+converted outright, and are not allowed in email.
+
+Optional \var{input_charset} is as described below. After being alias
+normalized it is also used as a lookup into the registry of character
+sets to find out the header encoding, body encoding, and output
+conversion codec to be used for the character set. For example, if
+\var{input_charset} is \code{iso-8859-1}, then headers and bodies will
+be encoded using quoted-printable and no output conversion codec is
+necessary. If \var{input_charset} is \code{euc-jp}, then headers will
+be encoded with base64, bodies will not be encoded, but output text
+will be converted from the \code{euc-jp} character set to the
+\code{iso-2022-jp} character set.
+\end{classdesc}
+
+\class{Charset} instances have the following data attributes:
+
+\begin{datadesc}{input_charset}
+The initial character set specified. Common aliases are converted to
+their \emph{official} email names (e.g.\ \code{latin_1} is converted to
+\code{iso-8859-1}). Defaults to 7-bit \code{us-ascii}.
+\end{datadesc}
+
+\begin{datadesc}{header_encoding}
+If the character set must be encoded before it can be used in an
+email header, this attribute will be set to \code{Charset.QP} (for
+quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
+\code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
+Otherwise, it will be \code{None}.
+\end{datadesc}
+
+\begin{datadesc}{body_encoding}
+Same as \var{header_encoding}, but describes the encoding for the
+mail message's body, which indeed may be different than the header
+encoding. \code{Charset.SHORTEST} is not allowed for
+\var{body_encoding}.
+\end{datadesc}
+
+\begin{datadesc}{output_charset}
+Some character sets must be converted before they can be used in
+email headers or bodies. If the \var{input_charset} is one of
+them, this attribute will contain the name of the character set
+output will be converted to. Otherwise, it will be \code{None}.
+\end{datadesc}
+
+\begin{datadesc}{input_codec}
+The name of the Python codec used to convert the \var{input_charset} to
+Unicode. If no conversion codec is necessary, this attribute will be
+\code{None}.
+\end{datadesc}
+
+\begin{datadesc}{output_codec}
+The name of the Python codec used to convert Unicode to the
+\var{output_charset}. If no conversion codec is necessary, this
+attribute will have the same value as the \var{input_codec}.
+\end{datadesc}
+
+\class{Charset} instances also have the following methods:
+
+\begin{methoddesc}[Charset]{get_body_encoding}{}
+Return the content transfer encoding used for body encoding.
+
+This is either the string \samp{quoted-printable} or \samp{base64}
+depending on the encoding used, or it is a function, in which case you
+should call the function with a single argument, the Message object
+being encoded. The function should then set the
+\mailheader{Content-Transfer-Encoding} header itself to whatever is
+appropriate.
+
+Returns the string \samp{quoted-printable} if
+\var{body_encoding} is \code{QP}, returns the string
+\samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
+string \samp{7bit} otherwise.
+\end{methoddesc}
+
+\begin{methoddesc}{convert}{s}
+Convert the string \var{s} from the \var{input_codec} to the
+\var{output_codec}.
+\end{methoddesc}
+
+\begin{methoddesc}{to_splittable}{s}
+Convert a possibly multibyte string to a safely splittable format.
+\var{s} is the string to split.
+
+Uses the \var{input_codec} to try and convert the string to Unicode,
+so it can be safely split on character boundaries (even for multibyte
+characters).
+
+Returns the string as-is if it isn't known how to convert \var{s} to
+Unicode with the \var{input_charset}.
+
+Characters that could not be converted to Unicode will be replaced
+with the Unicode replacement character \character{U+FFFD}.
+\end{methoddesc}
+
+\begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
+Convert a splittable string back into an encoded string. \var{ustr}
+is a Unicode string to ``unsplit''.
+
+This method uses the proper codec to try and convert the string from
+Unicode back into an encoded format. Return the string as-is if it is
+not Unicode, or if it could not be converted from Unicode.
+
+Characters that could not be converted from Unicode will be replaced
+with an appropriate character (usually \character{?}).
+
+If \var{to_output} is \code{True} (the default), uses
+\var{output_codec} to convert to an
+encoded format. If \var{to_output} is \code{False}, it uses
+\var{input_codec}.
+\end{methoddesc}
+
+\begin{methoddesc}{get_output_charset}{}
+Return the output character set.
+
+This is the \var{output_charset} attribute if that is not \code{None},
+otherwise it is \var{input_charset}.
+\end{methoddesc}
+
+\begin{methoddesc}{encoded_header_len}{s}
+Return the length of the string \var{s} after header encoding, properly
+calculating for quoted-printable or base64 encoding.
+\end{methoddesc}
+
+\begin{methoddesc}{header_encode}{s\optional{, convert}}
+Header-encode the string \var{s}.
+
+If \var{convert} is \code{True}, the string will be converted from the
+input charset to the output charset automatically. This is not useful
+for multibyte character sets, which have line length issues (multibyte
+characters must be split on a character, not a byte boundary); use the
+higher-level \class{Header} class to deal with these issues (see
+\refmodule{email.Header}). \var{convert} defaults to \code{False}.
+
+The type of encoding (base64 or quoted-printable) will be based on
+the \var{header_encoding} attribute.
+\end{methoddesc}
+
+\begin{methoddesc}{body_encode}{s\optional{, convert}}
+Body-encode the string \var{s}.
+
+If \var{convert} is \code{True} (the default), the string will be
+converted from the input charset to output charset automatically.
+Unlike \method{header_encode()}, there are no issues with byte
+boundaries and multibyte charsets in email bodies, so this is usually
+pretty safe.
+
+The type of encoding (base64 or quoted-printable) will be based on
+the \var{body_encoding} attribute.
+\end{methoddesc}
+
+The \class{Charset} class also provides a number of methods to support
+standard operations and built-in functions.
+
+\begin{methoddesc}[Charset]{__str__}{}
+Returns \var{input_charset} as a string coerced to lower case.
+\end{methoddesc}
+
+\begin{methoddesc}[Charset]{__eq__}{other}
+This method allows you to compare two \class{Charset} instances for equality.
+\end{methoddesc}
+
+\begin{methoddesc}[Charset]{__ne__}{other}
+This method allows you to compare two \class{Charset} instances for inequality.
+\end{methoddesc}
+
+The \module{email.Charset} module also provides the following
+functions for adding new entries to the global character set, alias,
+and codec registries:
+
+\begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
+ body_enc\optional{, output_charset}}}}
+Add character set properties to the global registry.
+
+\var{charset} is the input character set, and must be the canonical
+name of a character set.
+
+Optional \var{header_enc} and \var{body_enc} are either
+\code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
+base64 encoding, \code{Charset.SHORTEST} for the shortest of
+quoted-printable or base64 encoding, or \code{None} for no encoding.
+\code{SHORTEST} is only valid for \var{header_enc}. The default is
+\code{None} for no encoding.
+
+Optional \var{output_charset} is the character set that the output
+should be in. Conversions will proceed from input charset, to
+Unicode, to the output charset when the method
+\method{Charset.convert()} is called. The default is to output in the
+same character set as the input.
+
+Both \var{charset} and \var{output_charset} must have Unicode
+codec entries in the module's character set-to-codec mapping; use
+\function{add_codec()} to add codecs the module does
+not know about. See the \refmodule{codecs} module's documentation for
+more information.
+
+The global character set registry is kept in the module global
+dictionary \code{CHARSETS}.
+\end{funcdesc}
+
+\begin{funcdesc}{add_alias}{alias, canonical}
+Add a character set alias. \var{alias} is the alias name,
+e.g.\ \code{latin-1}. \var{canonical} is the character set's canonical
+name, e.g.\ \code{iso-8859-1}.
+
+The global charset alias registry is kept in the module global
+dictionary \code{ALIASES}.
+\end{funcdesc}
+
+\begin{funcdesc}{add_codec}{charset, codecname}
+Add a codec that maps characters in the given character set to and from
+Unicode.
+
+\var{charset} is the canonical name of a character set.
+\var{codecname} is the name of a Python codec, as appropriate for the
+second argument to the \function{unicode()} built-in, or to the
+\method{encode()} method of a Unicode string.
+\end{funcdesc}