diff options
Diffstat (limited to 'Doc/lib')
-rw-r--r-- | Doc/lib/lib.tex | 1 | ||||
-rw-r--r-- | Doc/lib/libcodecs.tex | 72 | ||||
-rw-r--r-- | Doc/lib/libstringprep.tex | 134 |
3 files changed, 206 insertions, 1 deletions
diff --git a/Doc/lib/lib.tex b/Doc/lib/lib.tex index 755023d..5faeedc 100644 --- a/Doc/lib/lib.tex +++ b/Doc/lib/lib.tex @@ -112,6 +112,7 @@ and how to embed it in other applications. \input{libtextwrap} \input{libcodecs} \input{libunicodedata} +\input{libstringprep} \input{libmisc} % Miscellaneous Services \input{libpydoc} diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index caaaaf4..38586ef 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -5,7 +5,7 @@ \modulesynopsis{Encode and decode data and streams.} \moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com} \sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com} - +\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de} \index{Unicode} \index{Codecs} @@ -809,6 +809,11 @@ listed as operand type in the table. {byte string} {Convert operand to hexadecimal representation, with two digits per byte} +\lineiv{idna} + {} + {Unicode string} + {Implements \rfc{3490}. \versionadded{2.3}. See also \module{encodings.idna}} + \lineiv{mbcs} {dbcs} {Unicode string} @@ -819,6 +824,11 @@ listed as operand type in the table. {Unicode string} {Encoding of PalmOS 3.5} +\lineiv{punycode} + {} + {Unicode string} + {Implements \rfc{3492}. \versionadded{2.3}} + \lineiv{quopri_codec} {quopri, quoted-printable, quotedprintable} {byte string} @@ -865,3 +875,63 @@ listed as operand type in the table. {Compress the operand using gzip} \end{tableiv} + +\subsection{\module{encodings.idna} --- + Internationalized Domain Names in Applications} + +\declaremodule{standard}{encodings.idna} +\modulesynopsis{Internationalized Domain Names implementation} +\moduleauthor{Martin v. L\"owis}{martin@v.loewis.de} + +This module implements \rfc{3490} (Internationalized Domain Names in +Applications) and \rfc{3491} (Nameprep: A Stringprep Profile for +Internationalized Domain Names (IDN)). It builds upon the +\code{punycode} encoding and \module{stringprep}. 
\versionadded{2.3} + +These RFCs together define a protocol to support non-ASCII characters +in domain names. A domain name containing non-ASCII characters (such +as ``www.Alliancefran\c{c}aise.nu'') is converted into an +ASCII-compatible encoding (ACE, such as +``www.xn--alliancefranaise-npb.nu''). The ACE form of the domain name +is then used in all places where arbitrary characters are not allowed +by the protocol, such as DNS queries, HTTP \code{Host:} fields, and so +on. This conversion is carried out in the application; if possible +invisible to the user: The application should transparently convert +Unicode domain labels to IDNA on the wire, and convert ACE labels back +to Unicode before presenting them to the user. + +Python supports this conversion in several ways: The \code{idna} codec +allows converting between Unicode and the ACE. Furthermore, the +\module{socket} module transparently converts Unicode host names to +ACE, so that applications need not be concerned about converting host +names themselves when they pass them to the socket module. On top of +that, modules that have host names as function parameters, such as +\module{httplib} and \module{ftplib}, accept Unicode host names +(\module{httplib} then also transparently sends an IDNA hostname in +the \code{Host:} field if it sends that field at all). + +When receiving host names from the wire (such as in reverse name +lookup), no automatic conversion to Unicode is performed: Applications +wishing to present such host names to the user should decode them to +Unicode. + +The module \module{encodings.idna} also implements the nameprep +procedure, which performs certain normalizations on host names, to +achieve case-insensitivity of international domain names, and to unify +similar characters. The nameprep functions can be used directly if +desired. + +\begin{funcdesc}{nameprep}{label} +Return the nameprepped version of \var{label}. 
The implementation +currently assumes query strings, so \code{AllowUnassigned} is +true. +\end{funcdesc} + +\begin{funcdesc}{ToASCII}{label} +Convert a label to ASCII, as specified in \rfc{3490}. +\code{UseSTD3ASCIIRules} is assumed to be false. +\end{funcdesc} + +\begin{funcdesc}{ToUnicode}{label} +Convert a label to Unicode, as specified in \rfc{3490}. +\end{funcdesc} diff --git a/Doc/lib/libstringprep.tex b/Doc/lib/libstringprep.tex new file mode 100644 index 0000000..3492d02 --- /dev/null +++ b/Doc/lib/libstringprep.tex @@ -0,0 +1,134 @@ +\section{\module{stringprep} --- + Internet String Preparation} + +\declaremodule{standard}{stringprep} +\modulesynopsis{String preparation, as per RFC 3454} +\moduleauthor{Martin v. L\"owis}{martin@v.loewis.de} +\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de} + +When identifying things (such as host names) in the internet, it is +often necessary to compare such identifications for +``equality''. Exactly how this comparison is executed may depend on +the application domain, e.g. whether it should be case-insensitive or +not. It may also be necessary to restrict the possible +identifications, to allow only identifications consisting of +``printable'' characters. + +\rfc{3454} defines a procedure for ``preparing'' Unicode strings in +internet protocols. Before passing strings onto the wire, they are +processed with the preparation procedure, after which they have a +certain normalized form. The RFC defines a set of tables, which can be +combined into profiles. Each profile must define which tables it uses, +and what other optional parts of the \code{stringprep} procedure are +part of the profile. One example of a \code{stringprep} profile is +\code{nameprep}, which is used for internationalized domain names. + +The module \module{stringprep} only exposes the tables from RFC +3454. As these tables would be very large to represent as +dictionaries or lists, the module uses the Unicode character database +internally. 
The module source code itself was generated using the +\code{mkstringprep.py} utility. + +As a result, these tables are exposed as functions, not as data +structures. There are two kinds of tables in the RFC: sets and +mappings. For a set, \module{stringprep} provides the ``characteristic +function'', i.e. a function that returns true if the parameter is part +of the set. For mappings, it provides the mapping function: given the +key, it returns the associated value. Below is a list of all functions +available in the module. + +\begin{funcdesc}{in_table_a1}{code} +Determine whether \var{code} is in table~A.1 (Unassigned code points +in Unicode 3.2). +\end{funcdesc} + +\begin{funcdesc}{in_table_b1}{code} +Determine whether \var{code} is in table~B.1 (Commonly mapped to +nothing). +\end{funcdesc} + +\begin{funcdesc}{map_table_b2}{code} +Return the mapped value for \var{code} according to table~B.2 +(Mapping for case-folding used with NFKC). +\end{funcdesc} + +\begin{funcdesc}{map_table_b3}{code} +Return the mapped value for \var{code} according to table~B.3 +(Mapping for case-folding used with no normalization). +\end{funcdesc} + +\begin{funcdesc}{in_table_c11}{code} +Determine whether \var{code} is in table~C.1.1 +(ASCII space characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c12}{code} +Determine whether \var{code} is in table~C.1.2 +(Non-ASCII space characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c11_c12}{code} +Determine whether \var{code} is in table~C.1 +(Space characters, union of C.1.1 and C.1.2). +\end{funcdesc} + +\begin{funcdesc}{in_table_c21}{code} +Determine whether \var{code} is in table~C.2.1 +(ASCII control characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c22}{code} +Determine whether \var{code} is in table~C.2.2 +(Non-ASCII control characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c21_c22}{code} +Determine whether \var{code} is in table~C.2 +(Control characters, union of C.2.1 and C.2.2). 
+\end{funcdesc} + +\begin{funcdesc}{in_table_c3}{code} +Determine whether \var{code} is in table~C.3 +(Private use). +\end{funcdesc} + +\begin{funcdesc}{in_table_c4}{code} +Determine whether \var{code} is in table~C.4 +(Non-character code points). +\end{funcdesc} + +\begin{funcdesc}{in_table_c5}{code} +Determine whether \var{code} is in table~C.5 +(Surrogate codes). +\end{funcdesc} + +\begin{funcdesc}{in_table_c6}{code} +Determine whether \var{code} is in table~C.6 +(Inappropriate for plain text). +\end{funcdesc} + +\begin{funcdesc}{in_table_c7}{code} +Determine whether \var{code} is in table~C.7 +(Inappropriate for canonical representation). +\end{funcdesc} + +\begin{funcdesc}{in_table_c8}{code} +Determine whether \var{code} is in table~C.8 +(Change display properties or are deprecated). +\end{funcdesc} + +\begin{funcdesc}{in_table_c9}{code} +Determine whether \var{code} is in table~C.9 +(Tagging characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_d1}{code} +Determine whether \var{code} is in table~D.1 +(Characters with bidirectional property ``R'' or ``AL''). +\end{funcdesc} + +\begin{funcdesc}{in_table_d2}{code} +Determine whether \var{code} is in table~D.2 +(Characters with bidirectional property ``L''). +\end{funcdesc} + |