summaryrefslogtreecommitdiffstats
path: root/Doc/lib/libpyexpat.tex
diff options
context:
space:
mode:
authorFred Drake <fdrake@acm.org>2001-02-08 15:40:33 (GMT)
committerFred Drake <fdrake@acm.org>2001-02-08 15:40:33 (GMT)
commit5ed1dac4c0d9329accde4a90e5cc34c3085570d7 (patch)
treeb4938b912f247274cd4083b97f58f2cd801986a3 /Doc/lib/libpyexpat.tex
parent85d835f0abb78584c150a8d205264d60f2be58fe (diff)
downloadcpython-5ed1dac4c0d9329accde4a90e5cc34c3085570d7.zip
cpython-5ed1dac4c0d9329accde4a90e5cc34c3085570d7.tar.gz
cpython-5ed1dac4c0d9329accde4a90e5cc34c3085570d7.tar.bz2
Update documentation for pyexpat (xml.parsers.expat), to reflect the new
support for Expat 1.95.*.
Diffstat (limited to 'Doc/lib/libpyexpat.tex')
-rw-r--r--Doc/lib/libpyexpat.tex336
1 files changed, 274 insertions, 62 deletions
diff --git a/Doc/lib/libpyexpat.tex b/Doc/lib/libpyexpat.tex
index b50c52d..fafdfcb 100644
--- a/Doc/lib/libpyexpat.tex
+++ b/Doc/lib/libpyexpat.tex
@@ -1,6 +1,15 @@
\section{\module{xml.parsers.expat} ---
Fast XML parsing using Expat}
+% Markup notes:
+%
+% Many of the attributes of the XMLParser objects are callbacks.
+% Since signature information must be presented, these are described
+% using the methoddesc environment. Since they are attributes which
+% are set by client code, in-text references to these attributes
+% should be marked using the \member macro and should not include the
+% parentheses used when marking functions and methods.
+
\declaremodule{standard}{xml.parsers.expat}
\modulesynopsis{An interface to the Expat non-validating XML parser.}
\moduleauthor{Paul Prescod}{paul@prescod.net}
@@ -45,7 +54,9 @@ Creates and returns a new \class{xmlparser} object.
\var{encoding}, if specified, must be a string naming the encoding
used by the XML data. Expat doesn't support as many encodings as
Python does, and its repertoire of encodings can't be extended; it
-supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
+supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII. If
+\var{encoding} is given it will override the implicit or explicit
+encoding of the document.
Expat can optionally do XML namespace processing for you, enabled by
providing a value for \var{namespace_separator}. The value must be a
@@ -54,11 +65,11 @@ string has an illegal length (\code{None} is considered the same as
omission). When namespace processing is enabled, element type names
and attribute names that belong to a namespace will be expanded. The
element name passed to the element handlers
-\function{StartElementHandler()} and \function{EndElementHandler()}
+\member{StartElementHandler} and \member{EndElementHandler}
will be the concatenation of the namespace URI, the namespace
separator character, and the local part of the name. If the namespace
separator is a zero byte (\code{chr(0)}) then the namespace URI and
-the local part will be concatenated without any separator.
+the local part will be concatenated without any separator.
For example, if \var{namespace_separator} is set to a space character
(\character{ }) and the following document is parsed:
@@ -72,7 +83,7 @@ For example, if \var{namespace_separator} is set to a space character
</root>
\end{verbatim}
-\function{StartElementHandler()} will receive the following strings
+\member{StartElementHandler} will receive the following strings
for each element:
\begin{verbatim}
@@ -101,11 +112,12 @@ empty string when there's no more data.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{SetBase}{base}
-Sets the base to be used for resolving relative URIs in system identifiers in
-declarations. Resolving relative identifiers is left to the application:
-this value will be passed through as the base argument to the
-\function{ExternalEntityRefHandler}, \function{NotationDeclHandler},
-and \function{UnparsedEntityDeclHandler} functions.
+Sets the base to be used for resolving relative URIs in system
+identifiers in declarations. Resolving relative identifiers is left
+to the application: this value will be passed through as the
+\var{base} argument to the \function{ExternalEntityRefHandler},
+\function{NotationDeclHandler}, and
+\function{UnparsedEntityDeclHandler} functions.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{GetBase}{}
@@ -120,19 +132,45 @@ Create a ``child'' parser which can be used to parse an external
parsed entity referred to by content parsed by the parent parser. The
\var{context} parameter should be the string passed to the
\method{ExternalEntityRefHandler()} handler function, described below.
+The child parser is created with the \member{ordered_attributes},
+\member{returns_unicode} and \member{specified_attributes} set to the
+values of this parser.
\end{methoddesc}
\class{xmlparser} objects have the following attributes:
+\begin{memberdesc}[xmlparser]{ordered_attributes}
+Setting this attribute to a non-zero integer causes the attributes to
+be reported as a list rather than a dictionary. The attributes are
+presented in the order found in the document text. For each
+attribute, two list entries are presented: the attribute name and the
+attribute value. (Older versions of this module also used this
+format.) By default, this attribute is false; it may be changed at
+any time.
+\versionadded{2.1}
+\end{memberdesc}
+
\begin{memberdesc}[xmlparser]{returns_unicode}
-If this attribute is set to 1, the handler functions will be passed
-Unicode strings. If \member{returns_unicode} is 0, 8-bit strings
-containing UTF-8 encoded data will be passed to the handlers.
+If this attribute is set to a non-zero integer, the handler functions
+will be passed Unicode strings. If \member{returns_unicode} is 0,
+8-bit strings containing UTF-8 encoded data will be passed to the
+handlers.
\versionchanged[Can be changed at any time to affect the result
type.]{1.6}
\end{memberdesc}
+\begin{memberdesc}[xmlparser]{specified_attributes}
+If set to a non-zero integer, the parser will report only those
+attributes which were specified in the document instance and not those
+which were derived from attribute declarations. Applications which
+set this need to be especially careful to use what additional
+information is available from the declarations as needed to comply
+with the standards for the behavior of XML processors. By default,
+this attribute is false; it may be changed at any time.
+\versionadded{2.1}
+\end{memberdesc}
+
The following attributes contain values relating to the most recent
error encountered by an \class{xmlparser} object, and will only have
correct values once a call to \method{Parse()} or \method{ParseFile()}
@@ -163,6 +201,59 @@ be taken from the following list, and \var{func} must be a callable
object accepting the correct number of arguments. The arguments are
all strings, unless otherwise stated.
+\begin{methoddesc}[xmlparser]{XmlDeclHandler}{version, encoding, standalone}
+Called when the XML declaration is parsed. The XML declaration is the
+(optional) declaration of the applicable version of the XML
+recommendation, the encoding of the document text, and an optional
+``standalone'' declaration. \var{version} and \var{encoding} will be
+strings of the type dictated by the \member{returns_unicode}
+attribute, and \var{standalone} will be \code{1} if the document is
+declared standalone, \code{0} if it is declared not to be standalone,
+or \code{-1} if the standalone clause was omitted.
+This is only available with Expat version 1.95.0 or newer.
+\versionadded{2.1}
+\end{methoddesc}
+
+\begin{methoddesc}[xmlparser]{StartDoctypeDeclHandler}{doctypeName,
+ systemId, publicId,
+ has_internal_subset}
+Called when Expat begins parsing the document type declaration
+(\code{<!DOCTYPE \ldots}). The \var{doctypeName} is provided exactly
+as presented. The \var{systemId} and \var{publicId} parameters give
+the system and public identifiers if specified, or \code{None} if
+omitted. \var{has_internal_subset} will be true if the document
+contains and internal document declaration subset.
+This requires Expat version 1.2 or newer.
+\end{methoddesc}
+
+\begin{methoddesc}[xmlparser]{EndDoctypeDeclHandler}{}
+Called when Expat is done parsing the document type delaration.
+This requires Expat version 1.2 or newer.
+\end{methoddesc}
+
+\begin{methoddesc}[xmlparser]{ElementDeclHandler}{name, model}
+Called once for each element type declaration. \var{name} is the name
+of the element type, and \var{model} is a representation of the
+content model.
+\end{methoddesc}
+
+\begin{methoddesc}[xmlparser]{AttlistDeclHandler}{elname, attname,
+ type, default, required}
+Called for each declared attribute for an element type. If an
+attribute list declaration declares three attributes, this handler is
+called three times, once for each attribute. \var{elname} is the name
+of the element to which the declaration applies and \var{attname} is
+the name of the attribute declared. The attribute type is a string
+passed as \var{type}; the possible values are \code{'CDATA'},
+\code{'ID'}, \code{'IDREF'}, ...
+\var{default} gives the default value for the attribute used when the
+attribute is not specified by the document instance, or \code{None} if
+there is no default value (\code{\#IMPLIED} values). If the attribute
+is required to be given in the document instance, \var{required} will
+be true.
+This requires Expat version 1.95.0 or newer.
+\end{methoddesc}
+
\begin{methoddesc}[xmlparser]{StartElementHandler}{name, attributes}
Called for the start of every element. \var{name} is a string
containing the element name, and \var{attributes} is a dictionary
@@ -174,39 +265,77 @@ Called for the end of every element.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{ProcessingInstructionHandler}{target, data}
-Called for every processing instruction.
+Called for every processing instruction.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{CharacterDataHandler}{data}
-Called for character data.
+Called for character data. This will be called for normal character
+data, CDATA marked content, and ignorable whitespace. Applications
+which must distinguish these cases can use the
+\member{StartCdataSectionHandler}, \member{EndCdataSectionHandler},
+and \member{ElementDeclHandler} callbacks to collect the required
+information.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{UnparsedEntityDeclHandler}{entityName, base,
systemId, publicId,
notationName}
-Called for unparsed (NDATA) entity declarations.
+Called for unparsed (NDATA) entity declarations. This is only present
+for version 1.2 of the Expat library; for more recent versions, use
+\member{EntityDeclHandler} instead. (The underlying function in the
+Expat library has been declared obsolete.)
+\end{methoddesc}
+
+\begin{methoddesc}[xmlparser]{EntityDeclHandler}{entityName,
+ is_parameter_entity, value,
+ base, systemId,
+ publicId,
+ notationName}
+Called for all entity declarations. For parameter and internal
+entities, \var{value} will be a string giving the declared contents
+of the entity; this will be \code{None} for external entities. The
+\var{notationName} parameter will be \code{None} for parsed entities,
+and the name of the notation for unparsed entities.
+\var{is_parameter_entity} will be true if the entity is a paremeter
+entity or false for general entities (most applications only need to
+be concerned with general entities).
+This is only available starting with version 1.95.0 of the Expat
+library.
+\versionadded{2.1}
\end{methoddesc}
\begin{methoddesc}[xmlparser]{NotationDeclHandler}{notationName, base,
systemId, publicId}
-Called for notation declarations.
+Called for notation declarations. \var{notationName}, \var{base}, and
+\var{systemId}, and \var{publicId} are strings if given. If the
+public identifier is omitted, \var{publicId} will be \code{None}.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{StartNamespaceDeclHandler}{prefix, uri}
-Called when an element contains a namespace declaration.
+Called when an element contains a namespace declaration. Namespace
+declarations are processed before the \member{StartElementHandler} is
+called for the element on which declarations are placed.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{EndNamespaceDeclHandler}{prefix}
Called when the closing tag is reached for an element
-that contained a namespace declaration.
+that contained a namespace declaration. This is called once for each
+namespace declaration on the element in the reverse of the order for
+which the \member{StartNamespaceDeclHandler} was called to indicate
+the start of each namespace declaration's scope. Calls to this
+handler are made after the corresponding \member{EndElementHandler}
+for the end of the element.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{CommentHandler}{data}
-Called for comments.
+Called for comments. \var{data} is the text of the comment, excluding
+the leading `\code{<!--}' and trailing `\code{-->}'.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{StartCdataSectionHandler}{}
-Called at the start of a CDATA section.
+Called at the start of a CDATA section. This and
+\member{StartCdataSectionHandler} are needed to be able to identify
+the syntactical start and end for CDATA sections.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{EndCdataSectionHandler}{}
@@ -226,14 +355,33 @@ but doesn't inhibit expansion of internal entities.
The entity reference will not be passed to the default handler.
\end{methoddesc}
-\begin{methoddesc}[xmlparser]{NotStandaloneHandler}{}
-Called if the XML document hasn't been declared as being a standalone
-document.
+\begin{methoddesc}[xmlparser]{NotStandaloneHandler}{} Called if the
+XML document hasn't been declared as being a standalone document.
+This happens when there is an external subset or a reference to a
+parameter entity, but the XML declaration does not set standalone to
+\code{yes} in an XML declaration. If this handler returns \code{0},
+then the parser will throw an \constant{XML_ERROR_NOT_STANDALONE}
+error. If this handler is not set, no exception is raised by the
+parser for this condition.
\end{methoddesc}
\begin{methoddesc}[xmlparser]{ExternalEntityRefHandler}{context, base,
systemId, publicId}
-Called for references to external entities.
+Called for references to external entities. \var{base} is the current
+base, as set by a previous call to \method{SetBase()}. The public and
+system identifiers, \var{systemId} and \var{publicId}, are strings if
+given; if the public identifier is not given, \var{publicId} will be
+\code{None}.
+
+For external entities to be parsed, this handler must be implemented.
+It is responsible for creating the sub-parser using
+\code{ExternalEntityRefHandler(\var{context})}, initializing it with
+the appropriate callbacks, and parsing the entity. If this handler
+returns \code{0}, the parser will throw an
+\constant{XML_ERROR_EXTERNAL_ENTITY_HANDLING} error.
+
+If this handler is not provided, external entities are reported by the
+\member{DefaultHandler} callback, if provided.
\end{methoddesc}
@@ -281,6 +429,67 @@ End element: parent
\end{verbatim}
+\subsection{Content Model Descriptions \label{expat-content-models}}
+\sectionauthor{Fred L. Drake, Jr.}{fdrake@acm.org}
+
+Content modules are described using nested tuples. Each tuple
+contains four values: the type, the quantifier, the name, and a tuple
+of children. Children are simply additional content module
+descriptions.
+
+The values of the first two fields are constants defined in the
+\code{model} object of the \module{xml.parsers.expat} module. These
+constants can be collected in two groups: the model type group and the
+quantifier group.
+
+The constants in the model type group are:
+
+\begin{datadescni}{XML_CTYPE_ANY}
+The element named by the model name was declared to have a content
+model of \code{ANY}.
+\end{datadescni}
+
+\begin{datadescni}{XML_CTYPE_CHOICE}
+The named element allows a choice from a number of options; this is
+used for content models such as \code{(A | B | C)}.
+\end{datadescni}
+
+\begin{datadescni}{XML_CTYPE_EMPTY}
+Elements which are declared to be \code{EMPTY} have this model type.
+\end{datadescni}
+
+\begin{datadescni}{XML_CTYPE_MIXED}
+\end{datadescni}
+
+\begin{datadescni}{XML_CTYPE_NAME}
+\end{datadescni}
+
+\begin{datadescni}{XML_CTYPE_SEQ}
+Models which represent a series of models which follow one after the
+other are indicated with this model type. This is used for models
+such as \code{(A, B, C)}.
+\end{datadescni}
+
+
+The constants in the quantifier group are:
+
+\begin{datadescni}{XML_CQUANT_NONE}
+\end{datadescni}
+
+\begin{datadescni}{XML_CQUANT_OPT}
+The model is option: it can appear once or not at all, as for
+\code{A?}.
+\end{datadescni}
+
+\begin{datadescni}{XML_CQUANT_PLUS}
+The model must occur one or more times (\code{A+}).
+\end{datadescni}
+
+\begin{datadescni}{XML_CQUANT_REP}
+The model must occur zero or more times, as for \code{A*}.
+\end{datadescni}
+
+
\subsection{Expat error constants \label{expat-errors}}
\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
@@ -291,66 +500,69 @@ parser object after an error has occurred.
The \code{errors} object has the following attributes:
-\begin{datadesc}{XML_ERROR_ASYNC_ENTITY}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_ASYNC_ENTITY}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
+An entity reference in an attribute value referred to an external
+entity instead of an internal entity.
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_BAD_CHAR_REF}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_BAD_CHAR_REF}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_BINARY_ENTITY_REF}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE}
+\begin{datadescni}{XML_ERROR_DUPLICATE_ATTRIBUTE}
An attribute was used more than once in a start tag.
-\end{datadesc}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_INCORRECT_ENCODING}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_INVALID_TOKEN}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_INVALID_TOKEN}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
+\begin{datadescni}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
Something other than whitespace occurred after the document element.
-\end{datadesc}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_MISPLACED_XML_PI}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_NO_ELEMENTS}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_NO_ELEMENTS}
+The document contains no elements.
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_NO_MEMORY}
+\begin{datadescni}{XML_ERROR_NO_MEMORY}
Expat was not able to allocate memory internally.
-\end{datadesc}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_PARAM_ENTITY_REF}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_PARTIAL_CHAR}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_PARTIAL_CHAR}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_RECURSIVE_ENTITY_REF}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_SYNTAX}
+\begin{datadescni}{XML_ERROR_SYNTAX}
Some unspecified syntax error was encountered.
-\end{datadesc}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_TAG_MISMATCH}
+\begin{datadescni}{XML_ERROR_TAG_MISMATCH}
An end tag did not match the innermost open start tag.
-\end{datadesc}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN}
-\end{datadesc}
+\begin{datadescni}{XML_ERROR_UNCLOSED_TOKEN}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY}
+\begin{datadescni}{XML_ERROR_UNDEFINED_ENTITY}
A reference was made to a entity which was not defined.
-\end{datadesc}
+\end{datadescni}
-\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING}
+\begin{datadescni}{XML_ERROR_UNKNOWN_ENCODING}
The document encoding is not supported by Expat.
-\end{datadesc}
+\end{datadescni}