diff options
-rw-r--r-- | Doc/lib/libxmllib.tex | 108 | ||||
-rw-r--r-- | Lib/xmllib.py | 183 |
2 files changed, 180 insertions, 111 deletions
diff --git a/Doc/lib/libxmllib.tex b/Doc/lib/libxmllib.tex index 7a7c85d..a785a73 100644 --- a/Doc/lib/libxmllib.tex +++ b/Doc/lib/libxmllib.tex @@ -14,7 +14,28 @@ for parsing text files formatted in XML (eXtended Markup Language). The \class{XMLParser} class must be instantiated without arguments. \end{classdesc} -This class provides the following interface methods: +This class provides the following interface methods and instance variables: + +\begin{memberdesc}{attributes} +A mapping of element names to mappings. The latter mapping maps +attribute names that are valid for the element to the default value of +the attribute, or to \code{None} if there is no default. The default +value is the empty dictionary. +\end{memberdesc} + +\begin{memberdesc}{elements} +A mapping of element names to tuples. The tuples contain a function +for handling the start and end tag respectively of the element, or +\code{None} if the method \method{unknown_starttag()} or +\method{unknown_endtag()} is to be called. The default value is the +empty dictionary. +\end{memberdesc} + +\begin{memberdesc}{entitydefs} +A mapping of entity names to their values. The default value contains +definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'}, +and \code{'apos'}. +\end{memberdesc} \begin{methoddesc}{reset}{} Reset the instance. Loses all unprocessed data. This is called @@ -33,7 +54,7 @@ when the close tag matching the last unclosed open tag is encountered. \begin{methoddesc}{feed}{data} Feed some text to the parser. It is processed insofar as it consists -of complete elements; incomplete data is buffered until more data is +of complete tags; incomplete data is buffered until more data is fed or \method{close()} is called. \end{methoddesc} @@ -65,29 +86,29 @@ the root element. \end{methoddesc} \begin{methoddesc}{handle_starttag}{tag, method, attributes} -This method is called to handle start tags for which a -\method{start_\var{tag}()} method has been defined. 
The \var{tag} -argument is the name of the tag, and the \var{method} argument is the -bound method which should be used to support semantic interpretation -of the start tag. The \var{attributes} argument is a dictionary of -attributes, the key being the \var{name} and the value being the -\var{value} of the attribute found inside the tag's \code{<>} brackets. -Character and entity references in the \var{value} have -been interpreted. For instance, for the tag +This method is called to handle start tags for which a start tag +handler is defined in the instance variable \member{elements}. The +\var{tag} argument is the name of the tag, and the \var{method} +argument is the function (method) which should be used to support semantic +interpretation of the start tag. The \var{attributes} argument is a +dictionary of attributes, the key being the \var{name} and the value +being the \var{value} of the attribute found inside the tag's +\code{<>} brackets. Character and entity references in the +\var{value} have been interpreted. For instance, for the start tag \code{<A HREF="http://www.cwi.nl/">}, this method would be called as -\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}. +\code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}. The base implementation simply calls \var{method} with \var{attributes} as the only argument. \end{methoddesc} \begin{methoddesc}{handle_endtag}{tag, method} -This method is called to handle endtags for which an -\method{end_\var{tag}()} method has been defined. The \var{tag} -argument is the name of the tag, and the -\var{method} argument is the bound method which should be used to -support semantic interpretation of the end tag. If no -\method{end_\var{tag}()} method is defined for the closing element, this -handler is not called. 
The base implementation simply calls +This method is called to handle endtags for which an end tag handler +is defined in the instance variable \member{elements}. The \var{tag} +argument is the name of the tag, and the \var{method} argument is the +function (method) which should be used to support semantic +interpretation of the end tag. For instance, for the endtag +\code{</A>}, this method would be called as \code{handle_endtag('A', +self.elements['A'][1])}. The base implementation simply calls \var{method}. \end{methoddesc} @@ -149,7 +170,7 @@ closing delimiter, but not the delimiter itself. For example, the instruction \samp{<?XML text?>} will cause this method to be called with the arguments \code{'XML'} and \code{'text'}. The default method does nothing. Note that if a document starts with \samp{<?xml -...?>}, \method{handle_xml()} is called to handle it. +..?>}, \method{handle_xml()} is called to handle it. \end{methoddesc} \begin{methoddesc}{handle_special}{data} @@ -196,32 +217,21 @@ intended to be overridden by a derived class; the base class implementation does nothing. \end{methoddesc} -Apart from overriding or extending the methods listed above, derived -classes may also define methods and variables of the following form to -define processing of specific tags. Tag names in the input stream are -case dependent; the \var{tag} occurring in method names must be in the -correct case: - -\begin{methoddescni}{start_\var{tag}}{attributes} -This method is called to process an opening tag \var{tag}. The -\var{attributes} argument has the same meaning as described for -\method{handle_starttag()} above. In fact, the base implementation of -\method{handle_starttag()} calls this method. -\end{methoddescni} - -\begin{methoddescni}{end_\var{tag}}{} -This method is called to process a closing tag \var{tag}. 
-\end{methoddescni} - -\begin{memberdescni}{\var{tag}_attributes} -If a class or instance variable \member{\var{tag}_attributes} exists, it -should be a list or a dictionary. If a list, the elements of the list -are the valid attributes for the element \var{tag}; if a dictionary, -the keys are the valid attributes for the element \var{tag}, and the -values the default values of the attributes, or \code{None} if there -is no default. -In addition to the attributes that were present in the tag, the -attribute dictionary that is passed to \method{handle_starttag()} and -\method{unknown_starttag()} contains values for all attributes that -have a default value. -\end{memberdescni} +\subsection{XML Namespaces} + +This module has support for XML namespaces as defined in the XML +Namespaces proposed recommendation. + +Tag and attribute names that are defined in an XML namespace are +handled as if the name of the tag or attribute consisted of the +namespace (i.e. the URL that defines the namespace) followed by a +space and the name of the tag or attribute. For instance, the tag +\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if +the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and +the tag \code{<html:a href='http://frob.com'>} inside the above-mentioned +element is treated as if the tag name were +\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as +if it were \code{'http://www.w3.org/TR/REC-html40 href'}. + +An older draft of the XML Namespaces proposal is also recognized, but +triggers a warning. 
diff --git a/Lib/xmllib.py b/Lib/xmllib.py index bea210b..c551deb 100644 --- a/Lib/xmllib.py +++ b/Lib/xmllib.py @@ -5,7 +5,7 @@ import re import string -version = '0.1' +version = '0.2' # Regular expressions used for parsing @@ -64,6 +64,13 @@ commentclose = re.compile('-->') doubledash = re.compile('--') attrtrans = string.maketrans(' \r\n\t', ' ') +# definitions for XML namespaces +_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":" +ncname = re.compile(_NCName + '$') +qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix + '(?P<local>' + _NCName + ')$') + +xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$') # XML parser base class -- find tags and call handler functions. # Usage: p = XMLParser(); p.feed(data); ...; p.close(). @@ -76,10 +83,11 @@ attrtrans = string.maketrans(' \r\n\t', ' ') # as argument. class XMLParser: + attributes = {} # default, to be overridden + elements = {} # default, to be overridden # Interface -- initialize and reset this instance - def __init__(self, verbose=0): - self.verbose = verbose + def __init__(self): self.reset() # Interface -- reset this instance. 
Loses all unprocessed data @@ -92,6 +100,7 @@ class XMLParser: self.__at_start = 1 self.__seen_doctype = None self.__seen_starttag = 0 + self.__namespaces = {'xml':None} # xml is implicitly declared # For derived classes only -- enter literal mode (CDATA) till EOF def setnomoretags(self): @@ -333,7 +342,7 @@ class XMLParser: if self.stack: self.syntax_error('missing end tags') while self.stack: - self.finish_endtag(self.stack[-1]) + self.finish_endtag(self.stack[-1][0]) # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i): @@ -413,7 +422,7 @@ class XMLParser: self.handle_cdata(rawdata[i+9:res.start(0)]) return res.end(0) - __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None} + __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None} # Internal -- handle a processing instruction tag def parse_proc(self, i): rawdata = self.rawdata @@ -428,29 +437,45 @@ class XMLParser: raise RuntimeError, 'unexpected call to parse_proc' k = res.end(0) name = res.group(0) - if string.find(string.lower(name), 'xml') >= 0: - self.syntax_error('illegal processing instruction target name') - self.handle_proc(name, rawdata[k:j]) + if name == 'xml:namespace': + self.syntax_error('old-fashioned namespace declaration') + # namespace declaration + # this must come after the <?xml?> declaration (if any) + # and before the <!DOCTYPE> (if any). 
+ if self.__seen_doctype or self.__seen_starttag: + self.syntax_error('xml:namespace declaration too late in document') + attrdict, namespace, k = self.parse_attributes(name, k, j) + if namespace: + self.syntax_error('namespace declaration inside namespace declaration') + for attrname in attrdict.keys(): + if not self.__xml_namespace_attributes.has_key(attrname): + self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname) + if not attrdict.has_key('ns') or not attrdict.has_key('prefix'): + self.syntax_error('xml:namespace without required attributes') + prefix = attrdict.get('prefix') + if ncname.match(prefix) is None: + self.syntax_error('xml:namespace illegal prefix value') + return end.end(0) + if self.__namespaces.has_key(prefix): + self.syntax_error('xml:namespace prefix not unique') + self.__namespaces[prefix] = attrdict['ns'] + else: + if string.find(string.lower(name), 'xml') >= 0: + self.syntax_error('illegal processing instruction target name') + self.handle_proc(name, rawdata[k:j]) return end.end(0) # Internal -- parse attributes between i and j - def parse_attributes(self, tag, i, j, attributes = None): + def parse_attributes(self, tag, i, j): rawdata = self.rawdata - # Now parse the data between i and j into a tag and attrs attrdict = {} - try: - # convert attributes list to dictionary - d = {} - for a in attributes: - d[a] = None - attributes = d - except TypeError: - pass + namespace = {} while i < j: res = attrfind.match(rawdata, i) if res is None: break attrname, attrvalue = res.group('name', 'value') + i = res.end(0) if attrvalue is None: self.syntax_error("no value specified for attribute `%s'" % attrname) attrvalue = attrname @@ -459,22 +484,19 @@ class XMLParser: attrvalue = attrvalue[1:-1] else: self.syntax_error("attribute `%s' value not quoted" % attrname) + res = xmlns.match(attrname) + if res is not None: + # namespace declaration + ncname = res.group('ncname') + namespace[ncname or ''] = attrvalue or None + continue if 
'<' in attrvalue: self.syntax_error("`<' illegal in attribute value") - if attributes is not None and not attributes.has_key(attrname): - self.syntax_error("unknown attribute `%s' of element `%s'" % - (attrname, tag)) if attrdict.has_key(attrname): self.syntax_error("attribute `%s' specified twice" % attrname) attrvalue = string.translate(attrvalue, attrtrans) attrdict[attrname] = self.translate_references(attrvalue) - i = res.end(0) - if attributes is not None: - # fill in with default attributes - for key, val in attributes.items(): - if val is not None and not attrdict.has_key(key): - attrdict[key] = val - return attrdict, i + return attrdict, namespace, i # Internal -- handle starttag, return length or -1 if not terminated def parse_starttag(self, i): @@ -487,19 +509,63 @@ class XMLParser: if tag is None or tag.end(0) != end.end(0): self.syntax_error('garbage in starttag') return end.end(0) - tagname = tag.group('tagname') + nstag = tagname = tag.group('tagname') if not self.__seen_starttag and self.__seen_doctype and \ tagname != self.__seen_doctype: self.syntax_error('starttag does not match DOCTYPE') if self.__seen_starttag and not self.stack: self.syntax_error('multiple elements on top level') - if hasattr(self, tagname + '_attributes'): - attributes = getattr(self, tagname + '_attributes') - else: - attributes = None k, j = tag.span('attrs') - attrdict, k = self.parse_attributes(tagname, k, j, attributes) - self.finish_starttag(tagname, attrdict) + attrdict, nsdict, k = self.parse_attributes(tagname, k, j) + self.stack.append((tagname, nsdict, nstag)) + res = qname.match(tagname) + if res is not None: + prefix, nstag = res.group('prefix', 'local') + if prefix is None: + prefix = '' + ns = None + for t, d, nst in self.stack: + if d.has_key(prefix): + ns = d[prefix] + if ns is None and prefix != '': + ns = self.__namespaces.get(prefix) + if ns is not None: + nstag = ns + ' ' + nstag + elif prefix != '': + nstag = prefix + ':' + nstag # undo split + 
self.stack[-1] = tagname, nsdict, nstag + # translate namespace of attributes + nattrdict = {} + for key, val in attrdict.items(): + res = qname.match(key) + if res is not None: + aprefix, key = res.group('prefix', 'local') + if aprefix is None: + aprefix = '' + ans = None + for t, d, nst in self.stack: + if d.has_key(aprefix): + ans = d[aprefix] + if ans is None and aprefix != '': + ans = self.__namespaces.get(aprefix) + if ans is not None: + key = ans + ' ' + key + elif aprefix != '': + key = aprefix + ':' + key + elif ns is not None: + key = ns + ' ' + key + nattrdict[key] = val + attrdict = nattrdict + attributes = self.attributes.get(nstag) + if attributes is not None: + for key in attrdict.keys(): + if not attributes.has_key(key): + self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname)) + for key, val in attributes.items(): + if val is not None and not attrdict.has_key(key): + attrdict[key] = val + method = self.elements.get(nstag, (None, None))[0] + self.finish_starttag(nstag, attrdict, method) if tag.group('slash') == '/': self.finish_endtag(tagname) return tag.end(0) @@ -521,7 +587,7 @@ class XMLParser: else: tag = res.group(0) if self.literal: - if not self.stack or tag != self.stack[-1]: + if not self.stack or tag != self.stack[-1][0]: self.handle_data(rawdata[i]) return i+1 self.literal = 0 @@ -532,21 +598,14 @@ class XMLParser: return end.end(0) # Internal -- finish processing of start tag - # Return -1 for unknown tag, 1 for balanced tag - def finish_starttag(self, tag, attrs): - self.stack.append(tag) - methodname = 'start_' + tag - if hasattr(self, methodname): - method = getattr(self, methodname) - self.handle_starttag(tag, method, attrs) - return 1 + def finish_starttag(self, tagname, attrdict, method): + if method is not None: + self.handle_starttag(tagname, method, attrdict) else: - self.unknown_starttag(tag, attrs) - return -1 + self.unknown_starttag(tagname, attrdict) # Internal -- finish processing of end tag def 
finish_endtag(self, tag): - methodname = 'end_' + tag if not tag: self.syntax_error('name-less end tag') found = len(self.stack) - 1 @@ -554,27 +613,27 @@ class XMLParser: self.unknown_endtag(tag) return else: - if tag not in self.stack: + found = -1 + for i in range(len(self.stack)): + if tag == self.stack[i][0]: + found = i + if found == -1: self.syntax_error('unopened end tag') - if hasattr(self, methodname): - method = getattr(self, methodname) + method = self.elements.get(tag, (None, None))[1] + if method is not None: self.handle_endtag(tag, method) else: self.unknown_endtag(tag) return - found = len(self.stack) - for i in range(found): - if self.stack[i] == tag: - found = i while len(self.stack) > found: if found < len(self.stack) - 1: - self.syntax_error('missing close tag for %s' % self.stack[-1]) - tag = self.stack[-1] - if hasattr(self, methodname): - method = getattr(self, methodname) - self.handle_endtag(tag, method) + self.syntax_error('missing close tag for %s' % self.stack[-1][2]) + nstag = self.stack[-1][2] + method = self.elements.get(nstag, (None, None))[1] + if method is not None: + self.handle_endtag(nstag, method) else: - self.unknown_endtag(tag) + self.unknown_endtag(nstag) del self.stack[-1] # Overridable -- handle xml processing instruction @@ -654,9 +713,9 @@ class XMLParser: class TestXMLParser(XMLParser): - def __init__(self, verbose=0): + def __init__(self): self.testdata = "" - XMLParser.__init__(self, verbose) + XMLParser.__init__(self) def handle_xml(self, encoding, standalone): self.flush() |