summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/lib/libxmllib.tex108
-rw-r--r--Lib/xmllib.py183
2 files changed, 180 insertions, 111 deletions
diff --git a/Doc/lib/libxmllib.tex b/Doc/lib/libxmllib.tex
index 7a7c85d..a785a73 100644
--- a/Doc/lib/libxmllib.tex
+++ b/Doc/lib/libxmllib.tex
@@ -14,7 +14,28 @@ for parsing text files formatted in XML (eXtended Markup Language).
The \class{XMLParser} class must be instantiated without arguments.
\end{classdesc}
-This class provides the following interface methods:
+This class provides the following interface methods and instance variables:
+
+\begin{memberdesc}{attributes}
+A mapping of element names to mappings. The latter mapping maps
+attribute names that are valid for the element to the default value of
+the attribute, or if there is no default to \code{None}. The default
+value is the empty dictionary.
+\end{memberdesc}
+
+\begin{memberdesc}{elements}
+A mapping of element names to tuples. The tuples contain a function
+for handling the start and end tag respectively of the element, or
+\code{None} if the method \method{unknown_starttag()} or
+\method{unknown_endtag()} is to be called. The default value is the
+empty dictionary.
+\end{memberdesc}
+
+\begin{memberdesc}{entitydefs}
+A mapping of entity names to their values. The default value contains
+definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'},
+and \code{'apos'}.
+\end{memberdesc}
\begin{methoddesc}{reset}{}
Reset the instance. Loses all unprocessed data. This is called
@@ -33,7 +54,7 @@ when the close tag matching the last unclosed open tag is encountered.
\begin{methoddesc}{feed}{data}
Feed some text to the parser. It is processed insofar as it consists
-of complete elements; incomplete data is buffered until more data is
+of complete tags; incomplete data is buffered until more data is
fed or \method{close()} is called.
\end{methoddesc}
@@ -65,29 +86,29 @@ the root element.
\end{methoddesc}
\begin{methoddesc}{handle_starttag}{tag, method, attributes}
-This method is called to handle start tags for which a
-\method{start_\var{tag}()} method has been defined. The \var{tag}
-argument is the name of the tag, and the \var{method} argument is the
-bound method which should be used to support semantic interpretation
-of the start tag. The \var{attributes} argument is a dictionary of
-attributes, the key being the \var{name} and the value being the
-\var{value} of the attribute found inside the tag's \code{<>} brackets.
-Character and entity references in the \var{value} have
-been interpreted. For instance, for the tag
+This method is called to handle start tags for which a start tag
+handler is defined in the instance variable \member{elements}. The
+\var{tag} argument is the name of the tag, and the \var{method}
+argument is the function (method) which should be used to support semantic
+interpretation of the start tag. The \var{attributes} argument is a
+dictionary of attributes, the key being the \var{name} and the value
+being the \var{value} of the attribute found inside the tag's
+\code{<>} brackets. Character and entity references in the
+\var{value} have been interpreted. For instance, for the start tag
\code{<A HREF="http://www.cwi.nl/">}, this method would be called as
-\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}.
+\code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}.
The base implementation simply calls \var{method} with \var{attributes}
as the only argument.
\end{methoddesc}
\begin{methoddesc}{handle_endtag}{tag, method}
-This method is called to handle endtags for which an
-\method{end_\var{tag}()} method has been defined. The \var{tag}
-argument is the name of the tag, and the
-\var{method} argument is the bound method which should be used to
-support semantic interpretation of the end tag. If no
-\method{end_\var{tag}()} method is defined for the closing element, this
-handler is not called. The base implementation simply calls
+This method is called to handle end tags for which an end tag handler
+is defined in the instance variable \member{elements}. The \var{tag}
+argument is the name of the tag, and the \var{method} argument is the
+function (method) which should be used to support semantic
+interpretation of the end tag. For instance, for the end tag
+\code{</A>}, this method would be called as \code{handle_endtag('A',
+self.elements['A'][1])}. The base implementation simply calls
\var{method}.
\end{methoddesc}
@@ -149,7 +170,7 @@ closing delimiter, but not the delimiter itself. For example, the
instruction \samp{<?XML text?>} will cause this method to be called
with the arguments \code{'XML'} and \code{'text'}. The default method
does nothing. Note that if a document starts with \samp{<?xml
-...?>}, \method{handle_xml()} is called to handle it.
+..?>}, \method{handle_xml()} is called to handle it.
\end{methoddesc}
\begin{methoddesc}{handle_special}{data}
@@ -196,32 +217,21 @@ intended to be overridden by a derived class; the base class
implementation does nothing.
\end{methoddesc}
-Apart from overriding or extending the methods listed above, derived
-classes may also define methods and variables of the following form to
-define processing of specific tags. Tag names in the input stream are
-case dependent; the \var{tag} occurring in method names must be in the
-correct case:
-
-\begin{methoddescni}{start_\var{tag}}{attributes}
-This method is called to process an opening tag \var{tag}. The
-\var{attributes} argument has the same meaning as described for
-\method{handle_starttag()} above. In fact, the base implementation of
-\method{handle_starttag()} calls this method.
-\end{methoddescni}
-
-\begin{methoddescni}{end_\var{tag}}{}
-This method is called to process a closing tag \var{tag}.
-\end{methoddescni}
-
-\begin{memberdescni}{\var{tag}_attributes}
-If a class or instance variable \member{\var{tag}_attributes} exists, it
-should be a list or a dictionary. If a list, the elements of the list
-are the valid attributes for the element \var{tag}; if a dictionary,
-the keys are the valid attributes for the element \var{tag}, and the
-values the default values of the attributes, or \code{None} if there
-is no default.
-In addition to the attributes that were present in the tag, the
-attribute dictionary that is passed to \method{handle_starttag()} and
-\method{unknown_starttag()} contains values for all attributes that
-have a default value.
-\end{memberdescni}
+\subsection{XML Namespaces}
+
+This module has support for XML namespaces as defined in the XML
+Namespaces proposed recommendation.
+
+Tag and attribute names that are defined in an XML namespace are
+handled as if the name of the tag or attribute consisted of the
+namespace (i.e. the URL that defines the namespace) followed by a
+space and the name of the tag or attribute. For instance, the tag
+\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if
+the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and
+the tag \code{<html:a href='http://frob.com'>} inside the above
+mentioned element is treated as if the tag name were
+\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as
+if it were \code{'http://www.w3.org/TR/REC-html40 href'}.
+
+An older draft of the XML Namespaces proposal is also recognized, but
+triggers a warning.
diff --git a/Lib/xmllib.py b/Lib/xmllib.py
index bea210b..c551deb 100644
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@@ -5,7 +5,7 @@ import re
import string
-version = '0.1'
+version = '0.2'
# Regular expressions used for parsing
@@ -64,6 +64,13 @@ commentclose = re.compile('-->')
doubledash = re.compile('--')
attrtrans = string.maketrans(' \r\n\t', ' ')
+# definitions for XML namespaces
+_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":"
+ncname = re.compile(_NCName + '$')
+qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
+ '(?P<local>' + _NCName + ')$')
+
+xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
@@ -76,10 +83,11 @@ attrtrans = string.maketrans(' \r\n\t', ' ')
# as argument.
class XMLParser:
+ attributes = {} # default, to be overridden
+ elements = {} # default, to be overridden
# Interface -- initialize and reset this instance
- def __init__(self, verbose=0):
- self.verbose = verbose
+ def __init__(self):
self.reset()
# Interface -- reset this instance. Loses all unprocessed data
@@ -92,6 +100,7 @@ class XMLParser:
self.__at_start = 1
self.__seen_doctype = None
self.__seen_starttag = 0
+ self.__namespaces = {'xml':None} # xml is implicitly declared
# For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self):
@@ -333,7 +342,7 @@ class XMLParser:
if self.stack:
self.syntax_error('missing end tags')
while self.stack:
- self.finish_endtag(self.stack[-1])
+ self.finish_endtag(self.stack[-1][0])
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i):
@@ -413,7 +422,7 @@ class XMLParser:
self.handle_cdata(rawdata[i+9:res.start(0)])
return res.end(0)
- __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
+ __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
# Internal -- handle a processing instruction tag
def parse_proc(self, i):
rawdata = self.rawdata
@@ -428,29 +437,45 @@ class XMLParser:
raise RuntimeError, 'unexpected call to parse_proc'
k = res.end(0)
name = res.group(0)
- if string.find(string.lower(name), 'xml') >= 0:
- self.syntax_error('illegal processing instruction target name')
- self.handle_proc(name, rawdata[k:j])
+ if name == 'xml:namespace':
+ self.syntax_error('old-fashioned namespace declaration')
+ # namespace declaration
+ # this must come after the <?xml?> declaration (if any)
+ # and before the <!DOCTYPE> (if any).
+ if self.__seen_doctype or self.__seen_starttag:
+ self.syntax_error('xml:namespace declaration too late in document')
+ attrdict, namespace, k = self.parse_attributes(name, k, j)
+ if namespace:
+ self.syntax_error('namespace declaration inside namespace declaration')
+ for attrname in attrdict.keys():
+ if not self.__xml_namespace_attributes.has_key(attrname):
+ self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
+ if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
+ self.syntax_error('xml:namespace without required attributes')
+ prefix = attrdict.get('prefix')
+ if ncname.match(prefix) is None:
+ self.syntax_error('xml:namespace illegal prefix value')
+ return end.end(0)
+ if self.__namespaces.has_key(prefix):
+ self.syntax_error('xml:namespace prefix not unique')
+ self.__namespaces[prefix] = attrdict['ns']
+ else:
+ if string.find(string.lower(name), 'xml') >= 0:
+ self.syntax_error('illegal processing instruction target name')
+ self.handle_proc(name, rawdata[k:j])
return end.end(0)
# Internal -- parse attributes between i and j
- def parse_attributes(self, tag, i, j, attributes = None):
+ def parse_attributes(self, tag, i, j):
rawdata = self.rawdata
- # Now parse the data between i and j into a tag and attrs
attrdict = {}
- try:
- # convert attributes list to dictionary
- d = {}
- for a in attributes:
- d[a] = None
- attributes = d
- except TypeError:
- pass
+ namespace = {}
while i < j:
res = attrfind.match(rawdata, i)
if res is None:
break
attrname, attrvalue = res.group('name', 'value')
+ i = res.end(0)
if attrvalue is None:
self.syntax_error("no value specified for attribute `%s'" % attrname)
attrvalue = attrname
@@ -459,22 +484,19 @@ class XMLParser:
attrvalue = attrvalue[1:-1]
else:
self.syntax_error("attribute `%s' value not quoted" % attrname)
+ res = xmlns.match(attrname)
+ if res is not None:
+ # namespace declaration
+ ncname = res.group('ncname')
+ namespace[ncname or ''] = attrvalue or None
+ continue
if '<' in attrvalue:
self.syntax_error("`<' illegal in attribute value")
- if attributes is not None and not attributes.has_key(attrname):
- self.syntax_error("unknown attribute `%s' of element `%s'" %
- (attrname, tag))
if attrdict.has_key(attrname):
self.syntax_error("attribute `%s' specified twice" % attrname)
attrvalue = string.translate(attrvalue, attrtrans)
attrdict[attrname] = self.translate_references(attrvalue)
- i = res.end(0)
- if attributes is not None:
- # fill in with default attributes
- for key, val in attributes.items():
- if val is not None and not attrdict.has_key(key):
- attrdict[key] = val
- return attrdict, i
+ return attrdict, namespace, i
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i):
@@ -487,19 +509,63 @@ class XMLParser:
if tag is None or tag.end(0) != end.end(0):
self.syntax_error('garbage in starttag')
return end.end(0)
- tagname = tag.group('tagname')
+ nstag = tagname = tag.group('tagname')
if not self.__seen_starttag and self.__seen_doctype and \
tagname != self.__seen_doctype:
self.syntax_error('starttag does not match DOCTYPE')
if self.__seen_starttag and not self.stack:
self.syntax_error('multiple elements on top level')
- if hasattr(self, tagname + '_attributes'):
- attributes = getattr(self, tagname + '_attributes')
- else:
- attributes = None
k, j = tag.span('attrs')
- attrdict, k = self.parse_attributes(tagname, k, j, attributes)
- self.finish_starttag(tagname, attrdict)
+ attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
+ self.stack.append((tagname, nsdict, nstag))
+ res = qname.match(tagname)
+ if res is not None:
+ prefix, nstag = res.group('prefix', 'local')
+ if prefix is None:
+ prefix = ''
+ ns = None
+ for t, d, nst in self.stack:
+ if d.has_key(prefix):
+ ns = d[prefix]
+ if ns is None and prefix != '':
+ ns = self.__namespaces.get(prefix)
+ if ns is not None:
+ nstag = ns + ' ' + nstag
+ elif prefix != '':
+ nstag = prefix + ':' + nstag # undo split
+ self.stack[-1] = tagname, nsdict, nstag
+ # translate namespace of attributes
+ nattrdict = {}
+ for key, val in attrdict.items():
+ res = qname.match(key)
+ if res is not None:
+ aprefix, key = res.group('prefix', 'local')
+ if aprefix is None:
+ aprefix = ''
+ ans = None
+ for t, d, nst in self.stack:
+ if d.has_key(aprefix):
+ ans = d[aprefix]
+ if ans is None and aprefix != '':
+ ans = self.__namespaces.get(aprefix)
+ if ans is not None:
+ key = ans + ' ' + key
+ elif aprefix != '':
+ key = aprefix + ':' + key
+ elif ns is not None:
+ key = ns + ' ' + key
+ nattrdict[key] = val
+ attrdict = nattrdict
+ attributes = self.attributes.get(nstag)
+ if attributes is not None:
+ for key in attrdict.keys():
+ if not attributes.has_key(key):
+ self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname))
+ for key, val in attributes.items():
+ if val is not None and not attrdict.has_key(key):
+ attrdict[key] = val
+ method = self.elements.get(nstag, (None, None))[0]
+ self.finish_starttag(nstag, attrdict, method)
if tag.group('slash') == '/':
self.finish_endtag(tagname)
return tag.end(0)
@@ -521,7 +587,7 @@ class XMLParser:
else:
tag = res.group(0)
if self.literal:
- if not self.stack or tag != self.stack[-1]:
+ if not self.stack or tag != self.stack[-1][0]:
self.handle_data(rawdata[i])
return i+1
self.literal = 0
@@ -532,21 +598,14 @@ class XMLParser:
return end.end(0)
# Internal -- finish processing of start tag
- # Return -1 for unknown tag, 1 for balanced tag
- def finish_starttag(self, tag, attrs):
- self.stack.append(tag)
- methodname = 'start_' + tag
- if hasattr(self, methodname):
- method = getattr(self, methodname)
- self.handle_starttag(tag, method, attrs)
- return 1
+ def finish_starttag(self, tagname, attrdict, method):
+ if method is not None:
+ self.handle_starttag(tagname, method, attrdict)
else:
- self.unknown_starttag(tag, attrs)
- return -1
+ self.unknown_starttag(tagname, attrdict)
# Internal -- finish processing of end tag
def finish_endtag(self, tag):
- methodname = 'end_' + tag
if not tag:
self.syntax_error('name-less end tag')
found = len(self.stack) - 1
@@ -554,27 +613,27 @@ class XMLParser:
self.unknown_endtag(tag)
return
else:
- if tag not in self.stack:
+ found = -1
+ for i in range(len(self.stack)):
+ if tag == self.stack[i][0]:
+ found = i
+ if found == -1:
self.syntax_error('unopened end tag')
- if hasattr(self, methodname):
- method = getattr(self, methodname)
+ method = self.elements.get(tag, (None, None))[1]
+ if method is not None:
self.handle_endtag(tag, method)
else:
self.unknown_endtag(tag)
return
- found = len(self.stack)
- for i in range(found):
- if self.stack[i] == tag:
- found = i
while len(self.stack) > found:
if found < len(self.stack) - 1:
- self.syntax_error('missing close tag for %s' % self.stack[-1])
- tag = self.stack[-1]
- if hasattr(self, methodname):
- method = getattr(self, methodname)
- self.handle_endtag(tag, method)
+ self.syntax_error('missing close tag for %s' % self.stack[-1][2])
+ nstag = self.stack[-1][2]
+ method = self.elements.get(nstag, (None, None))[1]
+ if method is not None:
+ self.handle_endtag(nstag, method)
else:
- self.unknown_endtag(tag)
+ self.unknown_endtag(nstag)
del self.stack[-1]
# Overridable -- handle xml processing instruction
@@ -654,9 +713,9 @@ class XMLParser:
class TestXMLParser(XMLParser):
- def __init__(self, verbose=0):
+ def __init__(self):
self.testdata = ""
- XMLParser.__init__(self, verbose)
+ XMLParser.__init__(self)
def handle_xml(self, encoding, standalone):
self.flush()