diff options
author | Fred Drake <fdrake@acm.org> | 2001-02-08 15:40:33 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2001-02-08 15:40:33 (GMT) |
commit | 5ed1dac4c0d9329accde4a90e5cc34c3085570d7 (patch) | |
tree | b4938b912f247274cd4083b97f58f2cd801986a3 /Doc/lib/libpyexpat.tex | |
parent | 85d835f0abb78584c150a8d205264d60f2be58fe (diff) | |
download | cpython-5ed1dac4c0d9329accde4a90e5cc34c3085570d7.zip cpython-5ed1dac4c0d9329accde4a90e5cc34c3085570d7.tar.gz cpython-5ed1dac4c0d9329accde4a90e5cc34c3085570d7.tar.bz2 |
Update documentation for pyexpat (xml.parsers.expat), to reflect the new
support for Expat 1.95.*.
Diffstat (limited to 'Doc/lib/libpyexpat.tex')
-rw-r--r-- | Doc/lib/libpyexpat.tex | 336 |
1 files changed, 274 insertions, 62 deletions
diff --git a/Doc/lib/libpyexpat.tex b/Doc/lib/libpyexpat.tex index b50c52d..fafdfcb 100644 --- a/Doc/lib/libpyexpat.tex +++ b/Doc/lib/libpyexpat.tex @@ -1,6 +1,15 @@ \section{\module{xml.parsers.expat} --- Fast XML parsing using Expat} +% Markup notes: +% +% Many of the attributes of the XMLParser objects are callbacks. +% Since signature information must be presented, these are described +% using the methoddesc environment. Since they are attributes which +% are set by client code, in-text references to these attributes +% should be marked using the \member macro and should not include the +% parentheses used when marking functions and methods. + \declaremodule{standard}{xml.parsers.expat} \modulesynopsis{An interface to the Expat non-validating XML parser.} \moduleauthor{Paul Prescod}{paul@prescod.net} @@ -45,7 +54,9 @@ Creates and returns a new \class{xmlparser} object. \var{encoding}, if specified, must be a string naming the encoding used by the XML data. Expat doesn't support as many encodings as Python does, and its repertoire of encodings can't be extended; it -supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII. +supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII. If +\var{encoding} is given it will override the implicit or explicit +encoding of the document. Expat can optionally do XML namespace processing for you, enabled by providing a value for \var{namespace_separator}. The value must be a @@ -54,11 +65,11 @@ string has an illegal length (\code{None} is considered the same as omission). When namespace processing is enabled, element type names and attribute names that belong to a namespace will be expanded. The element name passed to the element handlers -\function{StartElementHandler()} and \function{EndElementHandler()} +\member{StartElementHandler} and \member{EndElementHandler} will be the concatenation of the namespace URI, the namespace separator character, and the local part of the name. 
If the namespace separator is a zero byte (\code{chr(0)}) then the namespace URI and -the local part will be concatenated without any separator. +the local part will be concatenated without any separator. For example, if \var{namespace_separator} is set to a space character (\character{ }) and the following document is parsed: @@ -72,7 +83,7 @@ For example, if \var{namespace_separator} is set to a space character </root> \end{verbatim} -\function{StartElementHandler()} will receive the following strings +\member{StartElementHandler} will receive the following strings for each element: \begin{verbatim} @@ -101,11 +112,12 @@ empty string when there's no more data. \end{methoddesc} \begin{methoddesc}[xmlparser]{SetBase}{base} -Sets the base to be used for resolving relative URIs in system identifiers in -declarations. Resolving relative identifiers is left to the application: -this value will be passed through as the base argument to the -\function{ExternalEntityRefHandler}, \function{NotationDeclHandler}, -and \function{UnparsedEntityDeclHandler} functions. +Sets the base to be used for resolving relative URIs in system +identifiers in declarations. Resolving relative identifiers is left +to the application: this value will be passed through as the +\var{base} argument to the \function{ExternalEntityRefHandler}, +\function{NotationDeclHandler}, and +\function{UnparsedEntityDeclHandler} functions. \end{methoddesc} \begin{methoddesc}[xmlparser]{GetBase}{} @@ -120,19 +132,45 @@ Create a ``child'' parser which can be used to parse an external parsed entity referred to by content parsed by the parent parser. The \var{context} parameter should be the string passed to the \method{ExternalEntityRefHandler()} handler function, described below. +The child parser is created with the \member{ordered_attributes}, +\member{returns_unicode} and \member{specified_attributes} set to the +values of this parser. 
\end{methoddesc} \class{xmlparser} objects have the following attributes: +\begin{memberdesc}[xmlparser]{ordered_attributes} +Setting this attribute to a non-zero integer causes the attributes to +be reported as a list rather than a dictionary. The attributes are +presented in the order found in the document text. For each +attribute, two list entries are presented: the attribute name and the +attribute value. (Older versions of this module also used this +format.) By default, this attribute is false; it may be changed at +any time. +\versionadded{2.1} +\end{memberdesc} + \begin{memberdesc}[xmlparser]{returns_unicode} -If this attribute is set to 1, the handler functions will be passed -Unicode strings. If \member{returns_unicode} is 0, 8-bit strings -containing UTF-8 encoded data will be passed to the handlers. +If this attribute is set to a non-zero integer, the handler functions +will be passed Unicode strings. If \member{returns_unicode} is 0, +8-bit strings containing UTF-8 encoded data will be passed to the +handlers. \versionchanged[Can be changed at any time to affect the result type.]{1.6} \end{memberdesc} +\begin{memberdesc}[xmlparser]{specified_attributes} +If set to a non-zero integer, the parser will report only those +attributes which were specified in the document instance and not those +which were derived from attribute declarations. Applications which +set this need to be especially careful to use what additional +information is available from the declarations as needed to comply +with the standards for the behavior of XML processors. By default, +this attribute is false; it may be changed at any time. 
+\versionadded{2.1} +\end{memberdesc} + The following attributes contain values relating to the most recent error encountered by an \class{xmlparser} object, and will only have correct values once a call to \method{Parse()} or \method{ParseFile()} @@ -163,6 +201,59 @@ be taken from the following list, and \var{func} must be a callable object accepting the correct number of arguments. The arguments are all strings, unless otherwise stated. +\begin{methoddesc}[xmlparser]{XmlDeclHandler}{version, encoding, standalone} +Called when the XML declaration is parsed. The XML declaration is the +(optional) declaration of the applicable version of the XML +recommendation, the encoding of the document text, and an optional +``standalone'' declaration. \var{version} and \var{encoding} will be +strings of the type dictated by the \member{returns_unicode} +attribute, and \var{standalone} will be \code{1} if the document is +declared standalone, \code{0} if it is declared not to be standalone, +or \code{-1} if the standalone clause was omitted. +This is only available with Expat version 1.95.0 or newer. +\versionadded{2.1} +\end{methoddesc} + +\begin{methoddesc}[xmlparser]{StartDoctypeDeclHandler}{doctypeName, + systemId, publicId, + has_internal_subset} +Called when Expat begins parsing the document type declaration +(\code{<!DOCTYPE \ldots}). The \var{doctypeName} is provided exactly +as presented. The \var{systemId} and \var{publicId} parameters give +the system and public identifiers if specified, or \code{None} if +omitted. \var{has_internal_subset} will be true if the document +contains an internal document declaration subset. +This requires Expat version 1.2 or newer. +\end{methoddesc} + +\begin{methoddesc}[xmlparser]{EndDoctypeDeclHandler}{} +Called when Expat is done parsing the document type declaration. +This requires Expat version 1.2 or newer. 
+\end{methoddesc} + +\begin{methoddesc}[xmlparser]{ElementDeclHandler}{name, model} +Called once for each element type declaration. \var{name} is the name +of the element type, and \var{model} is a representation of the +content model. +\end{methoddesc} + +\begin{methoddesc}[xmlparser]{AttlistDeclHandler}{elname, attname, + type, default, required} +Called for each declared attribute for an element type. If an +attribute list declaration declares three attributes, this handler is +called three times, once for each attribute. \var{elname} is the name +of the element to which the declaration applies and \var{attname} is +the name of the attribute declared. The attribute type is a string +passed as \var{type}; the possible values are \code{'CDATA'}, +\code{'ID'}, \code{'IDREF'}, ... +\var{default} gives the default value for the attribute used when the +attribute is not specified by the document instance, or \code{None} if +there is no default value (\code{\#IMPLIED} values). If the attribute +is required to be given in the document instance, \var{required} will +be true. +This requires Expat version 1.95.0 or newer. +\end{methoddesc} + \begin{methoddesc}[xmlparser]{StartElementHandler}{name, attributes} Called for the start of every element. \var{name} is a string containing the element name, and \var{attributes} is a dictionary @@ -174,39 +265,77 @@ Called for the end of every element. \end{methoddesc} \begin{methoddesc}[xmlparser]{ProcessingInstructionHandler}{target, data} -Called for every processing instruction. +Called for every processing instruction. \end{methoddesc} \begin{methoddesc}[xmlparser]{CharacterDataHandler}{data} -Called for character data. +Called for character data. This will be called for normal character +data, CDATA marked content, and ignorable whitespace. 
Applications +which must distinguish these cases can use the +\member{StartCdataSectionHandler}, \member{EndCdataSectionHandler}, +and \member{ElementDeclHandler} callbacks to collect the required +information. \end{methoddesc} \begin{methoddesc}[xmlparser]{UnparsedEntityDeclHandler}{entityName, base, systemId, publicId, notationName} -Called for unparsed (NDATA) entity declarations. +Called for unparsed (NDATA) entity declarations. This is only present +for version 1.2 of the Expat library; for more recent versions, use +\member{EntityDeclHandler} instead. (The underlying function in the +Expat library has been declared obsolete.) +\end{methoddesc} + +\begin{methoddesc}[xmlparser]{EntityDeclHandler}{entityName, + is_parameter_entity, value, + base, systemId, + publicId, + notationName} +Called for all entity declarations. For parameter and internal +entities, \var{value} will be a string giving the declared contents +of the entity; this will be \code{None} for external entities. The +\var{notationName} parameter will be \code{None} for parsed entities, +and the name of the notation for unparsed entities. +\var{is_parameter_entity} will be true if the entity is a parameter +entity or false for general entities (most applications only need to +be concerned with general entities). +This is only available starting with version 1.95.0 of the Expat +library. +\versionadded{2.1} \end{methoddesc} \begin{methoddesc}[xmlparser]{NotationDeclHandler}{notationName, base, systemId, publicId} -Called for notation declarations. +Called for notation declarations. \var{notationName}, \var{base}, +\var{systemId}, and \var{publicId} are strings if given. If the +public identifier is omitted, \var{publicId} will be \code{None}. \end{methoddesc} \begin{methoddesc}[xmlparser]{StartNamespaceDeclHandler}{prefix, uri} -Called when an element contains a namespace declaration. +Called when an element contains a namespace declaration. 
Namespace +declarations are processed before the \member{StartElementHandler} is +called for the element on which declarations are placed. \end{methoddesc} \begin{methoddesc}[xmlparser]{EndNamespaceDeclHandler}{prefix} Called when the closing tag is reached for an element -that contained a namespace declaration. +that contained a namespace declaration. This is called once for each +namespace declaration on the element in the reverse of the order for +which the \member{StartNamespaceDeclHandler} was called to indicate +the start of each namespace declaration's scope. Calls to this +handler are made after the corresponding \member{EndElementHandler} +for the end of the element. \end{methoddesc} \begin{methoddesc}[xmlparser]{CommentHandler}{data} -Called for comments. +Called for comments. \var{data} is the text of the comment, excluding +the leading `\code{<!--}' and trailing `\code{-->}'. \end{methoddesc} \begin{methoddesc}[xmlparser]{StartCdataSectionHandler}{} -Called at the start of a CDATA section. +Called at the start of a CDATA section. This and +\member{EndCdataSectionHandler} are needed to be able to identify +the syntactical start and end for CDATA sections. \end{methoddesc} \begin{methoddesc}[xmlparser]{EndCdataSectionHandler}{} @@ -226,14 +355,33 @@ but doesn't inhibit expansion of internal entities. The entity reference will not be passed to the default handler. \end{methoddesc} -\begin{methoddesc}[xmlparser]{NotStandaloneHandler}{} -Called if the XML document hasn't been declared as being a standalone -document. +\begin{methoddesc}[xmlparser]{NotStandaloneHandler}{} Called if the +XML document hasn't been declared as being a standalone document. +This happens when there is an external subset or a reference to a +parameter entity, but the XML declaration does not set standalone to +\code{yes} in an XML declaration. If this handler returns \code{0}, +then the parser will throw an \constant{XML_ERROR_NOT_STANDALONE} +error. 
If this handler is not set, no exception is raised by the +parser for this condition. \end{methoddesc} \begin{methoddesc}[xmlparser]{ExternalEntityRefHandler}{context, base, systemId, publicId} -Called for references to external entities. +Called for references to external entities. \var{base} is the current +base, as set by a previous call to \method{SetBase()}. The public and +system identifiers, \var{systemId} and \var{publicId}, are strings if +given; if the public identifier is not given, \var{publicId} will be +\code{None}. + +For external entities to be parsed, this handler must be implemented. +It is responsible for creating the sub-parser using +\code{ExternalEntityParserCreate(\var{context})}, initializing it with +the appropriate callbacks, and parsing the entity. If this handler +returns \code{0}, the parser will throw an +\constant{XML_ERROR_EXTERNAL_ENTITY_HANDLING} error. + +If this handler is not provided, external entities are reported by the +\member{DefaultHandler} callback, if provided. \end{methoddesc} @@ -281,6 +429,67 @@ End element: parent \end{verbatim} +\subsection{Content Model Descriptions \label{expat-content-models}} +\sectionauthor{Fred L. Drake, Jr.}{fdrake@acm.org} + +Content models are described using nested tuples. Each tuple +contains four values: the type, the quantifier, the name, and a tuple +of children. Children are simply additional content model +descriptions. + +The values of the first two fields are constants defined in the +\code{model} object of the \module{xml.parsers.expat} module. These +constants can be collected in two groups: the model type group and the +quantifier group. + +The constants in the model type group are: + +\begin{datadescni}{XML_CTYPE_ANY} +The element named by the model name was declared to have a content +model of \code{ANY}. 
+\end{datadescni} + +\begin{datadescni}{XML_CTYPE_CHOICE} +The named element allows a choice from a number of options; this is +used for content models such as \code{(A | B | C)}. +\end{datadescni} + +\begin{datadescni}{XML_CTYPE_EMPTY} +Elements which are declared to be \code{EMPTY} have this model type. +\end{datadescni} + +\begin{datadescni}{XML_CTYPE_MIXED} +\end{datadescni} + +\begin{datadescni}{XML_CTYPE_NAME} +\end{datadescni} + +\begin{datadescni}{XML_CTYPE_SEQ} +Models which represent a series of models which follow one after the +other are indicated with this model type. This is used for models +such as \code{(A, B, C)}. +\end{datadescni} + + +The constants in the quantifier group are: + +\begin{datadescni}{XML_CQUANT_NONE} +\end{datadescni} + +\begin{datadescni}{XML_CQUANT_OPT} +The model is optional: it can appear once or not at all, as for +\code{A?}. +\end{datadescni} + +\begin{datadescni}{XML_CQUANT_PLUS} +The model must occur one or more times (\code{A+}). +\end{datadescni} + +\begin{datadescni}{XML_CQUANT_REP} +The model must occur zero or more times, as for \code{A*}. +\end{datadescni} + + \subsection{Expat error constants \label{expat-errors}} \sectionauthor{A.M. Kuchling}{amk1@bigfoot.com} @@ -291,66 +500,69 @@ parser object after an error has occurred. The \code{errors} object has the following attributes: -\begin{datadesc}{XML_ERROR_ASYNC_ENTITY} -\end{datadesc} +\begin{datadescni}{XML_ERROR_ASYNC_ENTITY} +\end{datadescni} -\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF} -\end{datadesc} +\begin{datadescni}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF} +An entity reference in an attribute value referred to an external +entity instead of an internal entity. 
+\end{datadescni} -\begin{datadesc}{XML_ERROR_BAD_CHAR_REF} -\end{datadesc} +\begin{datadescni}{XML_ERROR_BAD_CHAR_REF} +\end{datadescni} -\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF} -\end{datadesc} +\begin{datadescni}{XML_ERROR_BINARY_ENTITY_REF} +\end{datadescni} -\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE} +\begin{datadescni}{XML_ERROR_DUPLICATE_ATTRIBUTE} An attribute was used more than once in a start tag. -\end{datadesc} +\end{datadescni} -\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING} -\end{datadesc} +\begin{datadescni}{XML_ERROR_INCORRECT_ENCODING} +\end{datadescni} -\begin{datadesc}{XML_ERROR_INVALID_TOKEN} -\end{datadesc} +\begin{datadescni}{XML_ERROR_INVALID_TOKEN} +\end{datadescni} -\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT} +\begin{datadescni}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT} Something other than whitespace occurred after the document element. -\end{datadesc} +\end{datadescni} -\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI} -\end{datadesc} +\begin{datadescni}{XML_ERROR_MISPLACED_XML_PI} +\end{datadescni} -\begin{datadesc}{XML_ERROR_NO_ELEMENTS} -\end{datadesc} +\begin{datadescni}{XML_ERROR_NO_ELEMENTS} +The document contains no elements. +\end{datadescni} -\begin{datadesc}{XML_ERROR_NO_MEMORY} +\begin{datadescni}{XML_ERROR_NO_MEMORY} Expat was not able to allocate memory internally. -\end{datadesc} +\end{datadescni} -\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF} -\end{datadesc} +\begin{datadescni}{XML_ERROR_PARAM_ENTITY_REF} +\end{datadescni} -\begin{datadesc}{XML_ERROR_PARTIAL_CHAR} -\end{datadesc} +\begin{datadescni}{XML_ERROR_PARTIAL_CHAR} +\end{datadescni} -\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF} -\end{datadesc} +\begin{datadescni}{XML_ERROR_RECURSIVE_ENTITY_REF} +\end{datadescni} -\begin{datadesc}{XML_ERROR_SYNTAX} +\begin{datadescni}{XML_ERROR_SYNTAX} Some unspecified syntax error was encountered. 
-\end{datadesc} +\end{datadescni} -\begin{datadesc}{XML_ERROR_TAG_MISMATCH} +\begin{datadescni}{XML_ERROR_TAG_MISMATCH} An end tag did not match the innermost open start tag. -\end{datadesc} +\end{datadescni} -\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN} -\end{datadesc} +\begin{datadescni}{XML_ERROR_UNCLOSED_TOKEN} +\end{datadescni} -\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY} +\begin{datadescni}{XML_ERROR_UNDEFINED_ENTITY} A reference was made to an entity which was not defined. -\end{datadesc} +\end{datadescni} -\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING} +\begin{datadescni}{XML_ERROR_UNKNOWN_ENCODING} The document encoding is not supported by Expat. -\end{datadesc} +\end{datadescni} |