diff options
author | Guido van Rossum <guido@python.org> | 1998-12-18 20:17:13 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1998-12-18 20:17:13 (GMT) |
commit | b083a9fb540b002fe8e386dd4168deacfb0bf574 (patch) | |
tree | b7ba2f60473f70bd645df514e2463cdb08d90c52 | |
parent | 6de7d0c3388f93288449449cc2a711358ffc4529 (diff) | |
download | cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.zip cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.tar.gz cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.tar.bz2 |
Sjoerd Mullender writes:
Here is my current version of xmllib.py and the documentation. This
version has some API changes with respect to the version currently in
Python (also the one in 1.5.2a).
This version supports XML namespaces.
-rw-r--r-- | Doc/lib/libxmllib.tex | 108 | ||||
-rw-r--r-- | Lib/xmllib.py | 183 |
2 files changed, 180 insertions, 111 deletions
diff --git a/Doc/lib/libxmllib.tex b/Doc/lib/libxmllib.tex index 7a7c85d..a785a73 100644 --- a/Doc/lib/libxmllib.tex +++ b/Doc/lib/libxmllib.tex @@ -14,7 +14,28 @@ for parsing text files formatted in XML (eXtended Markup Language). The \class{XMLParser} class must be instantiated without arguments. \end{classdesc} -This class provides the following interface methods: +This class provides the following interface methods and instance variables: + +\begin{memberdesc}{attributes} +A mapping of element names to mappings. The latter mapping maps +attribute names that are valid for the element to the default value of +the attribute, or if there is no default to \code{None}. The default +value is the empty dictionary. +\end{memberdesc} + +\begin{memberdesc}{elements} +A mapping of element names to tuples. The tuples contain a function +for handling the start and end tag respectively of the element, or +\code{None} if the method \method{unknown_starttag()} or +\method{unknown_endtag()} is to be called. The default value is the +empty dictionary. +\end{memberdesc} + +\begin{memberdesc}{entitydefs} +A mapping of entitynames to their values. The default value contains +definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'}, +and \code{'apos'}. +\end{memberdesc} \begin{methoddesc}{reset}{} Reset the instance. Loses all unprocessed data. This is called @@ -33,7 +54,7 @@ when the close tag matching the last unclosed open tag is encountered. \begin{methoddesc}{feed}{data} Feed some text to the parser. It is processed insofar as it consists -of complete elements; incomplete data is buffered until more data is +of complete tags; incomplete data is buffered until more data is fed or \method{close()} is called. \end{methoddesc} @@ -65,29 +86,29 @@ the root element. \end{methoddesc} \begin{methoddesc}{handle_starttag}{tag, method, attributes} -This method is called to handle start tags for which a -\method{start_\var{tag}()} method has been defined. 
The \var{tag} -argument is the name of the tag, and the \var{method} argument is the -bound method which should be used to support semantic interpretation -of the start tag. The \var{attributes} argument is a dictionary of -attributes, the key being the \var{name} and the value being the -\var{value} of the attribute found inside the tag's \code{<>} brackets. -Character and entity references in the \var{value} have -been interpreted. For instance, for the tag +This method is called to handle start tags for which a start tag +handler is defined in the instance variable \member{elements}. The +\var{tag} argument is the name of the tag, and the \var{method} +argument is the function (method) which should be used to support semantic +interpretation of the start tag. The \var{attributes} argument is a +dictionary of attributes, the key being the \var{name} and the value +being the \var{value} of the attribute found inside the tag's +\code{<>} brackets. Character and entity references in the +\var{value} have been interpreted. For instance, for the start tag \code{<A HREF="http://www.cwi.nl/">}, this method would be called as -\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}. +\code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}. The base implementation simply calls \var{method} with \var{attributes} as the only argument. \end{methoddesc} \begin{methoddesc}{handle_endtag}{tag, method} -This method is called to handle endtags for which an -\method{end_\var{tag}()} method has been defined. The \var{tag} -argument is the name of the tag, and the -\var{method} argument is the bound method which should be used to -support semantic interpretation of the end tag. If no -\method{end_\var{tag}()} method is defined for the closing element, this -handler is not called. 
The base implementation simply calls +This method is called to handle endtags for which an end tag handler +is defined in the instance variable \member{elements}. The \var{tag} +argument is the name of the tag, and the \var{method} argument is the +function (method) which should be used to support semantic +interpretation of the end tag. For instance, for the endtag +\code{</A>}, this method would be called as \code{handle_endtag('A', +self.elements['A'][1])}. The base implementation simply calls \var{method}. \end{methoddesc} @@ -149,7 +170,7 @@ closing delimiter, but not the delimiter itself. For example, the instruction \samp{<?XML text?>} will cause this method to be called with the arguments \code{'XML'} and \code{'text'}. The default method does nothing. Note that if a document starts with \samp{<?xml -...?>}, \method{handle_xml()} is called to handle it. +..?>}, \method{handle_xml()} is called to handle it. \end{methoddesc} \begin{methoddesc}{handle_special}{data} @@ -196,32 +217,21 @@ intended to be overridden by a derived class; the base class implementation does nothing. \end{methoddesc} -Apart from overriding or extending the methods listed above, derived -classes may also define methods and variables of the following form to -define processing of specific tags. Tag names in the input stream are -case dependent; the \var{tag} occurring in method names must be in the -correct case: - -\begin{methoddescni}{start_\var{tag}}{attributes} -This method is called to process an opening tag \var{tag}. The -\var{attributes} argument has the same meaning as described for -\method{handle_starttag()} above. In fact, the base implementation of -\method{handle_starttag()} calls this method. -\end{methoddescni} - -\begin{methoddescni}{end_\var{tag}}{} -This method is called to process a closing tag \var{tag}. 
-\end{methoddescni} - -\begin{memberdescni}{\var{tag}_attributes} -If a class or instance variable \member{\var{tag}_attributes} exists, it -should be a list or a dictionary. If a list, the elements of the list -are the valid attributes for the element \var{tag}; if a dictionary, -the keys are the valid attributes for the element \var{tag}, and the -values the default values of the attributes, or \code{None} if there -is no default. -In addition to the attributes that were present in the tag, the -attribute dictionary that is passed to \method{handle_starttag()} and -\method{unknown_starttag()} contains values for all attributes that -have a default value. -\end{memberdescni} +\subsection{XML Namespaces} + +This module has support for XML namespaces as defined in the XML +Namespaces proposed recommendation. + +Tag and attribute names that are defined in an XML namespace are +handled as if the name of the tag or attribute consisted of the +namespace (i.e. the URL that defines the namespace) followed by a +space and the name of the tag or attribute. For instance, the tag +\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if +the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and +the tag \code{<html:a href='http://frob.com'>} inside the above +mentioned element is treated as if the tag name were +\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as +if it were \code{'http://www.w3.org/TR/REC-html40 href'}. + +An older draft of the XML Namespaces proposal is also recognized, but +triggers a warning. 
diff --git a/Lib/xmllib.py b/Lib/xmllib.py index bea210b..c551deb 100644 --- a/Lib/xmllib.py +++ b/Lib/xmllib.py @@ -5,7 +5,7 @@ import re import string -version = '0.1' +version = '0.2' # Regular expressions used for parsing @@ -64,6 +64,13 @@ commentclose = re.compile('-->') doubledash = re.compile('--') attrtrans = string.maketrans(' \r\n\t', ' ') +# definitions for XML namespaces +_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":" +ncname = re.compile(_NCName + '$') +qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix + '(?P<local>' + _NCName + ')$') + +xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$') # XML parser base class -- find tags and call handler functions. # Usage: p = XMLParser(); p.feed(data); ...; p.close(). @@ -76,10 +83,11 @@ attrtrans = string.maketrans(' \r\n\t', ' ') # as argument. class XMLParser: + attributes = {} # default, to be overridden + elements = {} # default, to be overridden # Interface -- initialize and reset this instance - def __init__(self, verbose=0): - self.verbose = verbose + def __init__(self): self.reset() # Interface -- reset this instance. 
Loses all unprocessed data @@ -92,6 +100,7 @@ class XMLParser: self.__at_start = 1 self.__seen_doctype = None self.__seen_starttag = 0 + self.__namespaces = {'xml':None} # xml is implicitly declared # For derived classes only -- enter literal mode (CDATA) till EOF def setnomoretags(self): @@ -333,7 +342,7 @@ class XMLParser: if self.stack: self.syntax_error('missing end tags') while self.stack: - self.finish_endtag(self.stack[-1]) + self.finish_endtag(self.stack[-1][0]) # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i): @@ -413,7 +422,7 @@ class XMLParser: self.handle_cdata(rawdata[i+9:res.start(0)]) return res.end(0) - __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None} + __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None} # Internal -- handle a processing instruction tag def parse_proc(self, i): rawdata = self.rawdata @@ -428,29 +437,45 @@ class XMLParser: raise RuntimeError, 'unexpected call to parse_proc' k = res.end(0) name = res.group(0) - if string.find(string.lower(name), 'xml') >= 0: - self.syntax_error('illegal processing instruction target name') - self.handle_proc(name, rawdata[k:j]) + if name == 'xml:namespace': + self.syntax_error('old-fashioned namespace declaration') + # namespace declaration + # this must come after the <?xml?> declaration (if any) + # and before the <!DOCTYPE> (if any). 
+ if self.__seen_doctype or self.__seen_starttag: + self.syntax_error('xml:namespace declaration too late in document') + attrdict, namespace, k = self.parse_attributes(name, k, j) + if namespace: + self.syntax_error('namespace declaration inside namespace declaration') + for attrname in attrdict.keys(): + if not self.__xml_namespace_attributes.has_key(attrname): + self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname) + if not attrdict.has_key('ns') or not attrdict.has_key('prefix'): + self.syntax_error('xml:namespace without required attributes') + prefix = attrdict.get('prefix') + if ncname.match(prefix) is None: + self.syntax_error('xml:namespace illegal prefix value') + return end.end(0) + if self.__namespaces.has_key(prefix): + self.syntax_error('xml:namespace prefix not unique') + self.__namespaces[prefix] = attrdict['ns'] + else: + if string.find(string.lower(name), 'xml') >= 0: + self.syntax_error('illegal processing instruction target name') + self.handle_proc(name, rawdata[k:j]) return end.end(0) # Internal -- parse attributes between i and j - def parse_attributes(self, tag, i, j, attributes = None): + def parse_attributes(self, tag, i, j): rawdata = self.rawdata - # Now parse the data between i and j into a tag and attrs attrdict = {} - try: - # convert attributes list to dictionary - d = {} - for a in attributes: - d[a] = None - attributes = d - except TypeError: - pass + namespace = {} while i < j: res = attrfind.match(rawdata, i) if res is None: break attrname, attrvalue = res.group('name', 'value') + i = res.end(0) if attrvalue is None: self.syntax_error("no value specified for attribute `%s'" % attrname) attrvalue = attrname @@ -459,22 +484,19 @@ class XMLParser: attrvalue = attrvalue[1:-1] else: self.syntax_error("attribute `%s' value not quoted" % attrname) + res = xmlns.match(attrname) + if res is not None: + # namespace declaration + ncname = res.group('ncname') + namespace[ncname or ''] = attrvalue or None + continue if 
'<' in attrvalue: self.syntax_error("`<' illegal in attribute value") - if attributes is not None and not attributes.has_key(attrname): - self.syntax_error("unknown attribute `%s' of element `%s'" % - (attrname, tag)) if attrdict.has_key(attrname): self.syntax_error("attribute `%s' specified twice" % attrname) attrvalue = string.translate(attrvalue, attrtrans) attrdict[attrname] = self.translate_references(attrvalue) - i = res.end(0) - if attributes is not None: - # fill in with default attributes - for key, val in attributes.items(): - if val is not None and not attrdict.has_key(key): - attrdict[key] = val - return attrdict, i + return attrdict, namespace, i # Internal -- handle starttag, return length or -1 if not terminated def parse_starttag(self, i): @@ -487,19 +509,63 @@ class XMLParser: if tag is None or tag.end(0) != end.end(0): self.syntax_error('garbage in starttag') return end.end(0) - tagname = tag.group('tagname') + nstag = tagname = tag.group('tagname') if not self.__seen_starttag and self.__seen_doctype and \ tagname != self.__seen_doctype: self.syntax_error('starttag does not match DOCTYPE') if self.__seen_starttag and not self.stack: self.syntax_error('multiple elements on top level') - if hasattr(self, tagname + '_attributes'): - attributes = getattr(self, tagname + '_attributes') - else: - attributes = None k, j = tag.span('attrs') - attrdict, k = self.parse_attributes(tagname, k, j, attributes) - self.finish_starttag(tagname, attrdict) + attrdict, nsdict, k = self.parse_attributes(tagname, k, j) + self.stack.append((tagname, nsdict, nstag)) + res = qname.match(tagname) + if res is not None: + prefix, nstag = res.group('prefix', 'local') + if prefix is None: + prefix = '' + ns = None + for t, d, nst in self.stack: + if d.has_key(prefix): + ns = d[prefix] + if ns is None and prefix != '': + ns = self.__namespaces.get(prefix) + if ns is not None: + nstag = ns + ' ' + nstag + elif prefix != '': + nstag = prefix + ':' + nstag # undo split + 
self.stack[-1] = tagname, nsdict, nstag + # translate namespace of attributes + nattrdict = {} + for key, val in attrdict.items(): + res = qname.match(key) + if res is not None: + aprefix, key = res.group('prefix', 'local') + if aprefix is None: + aprefix = '' + ans = None + for t, d, nst in self.stack: + if d.has_key(aprefix): + ans = d[aprefix] + if ans is None and aprefix != '': + ans = self.__namespaces.get(aprefix) + if ans is not None: + key = ans + ' ' + key + elif aprefix != '': + key = aprefix + ':' + key + elif ns is not None: + key = ns + ' ' + key + nattrdict[key] = val + attrdict = nattrdict + attributes = self.attributes.get(nstag) + if attributes is not None: + for key in attrdict.keys(): + if not attributes.has_key(key): + self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname)) + for key, val in attributes.items(): + if val is not None and not attrdict.has_key(key): + attrdict[key] = val + method = self.elements.get(nstag, (None, None))[0] + self.finish_starttag(nstag, attrdict, method) if tag.group('slash') == '/': self.finish_endtag(tagname) return tag.end(0) @@ -521,7 +587,7 @@ class XMLParser: else: tag = res.group(0) if self.literal: - if not self.stack or tag != self.stack[-1]: + if not self.stack or tag != self.stack[-1][0]: self.handle_data(rawdata[i]) return i+1 self.literal = 0 @@ -532,21 +598,14 @@ class XMLParser: return end.end(0) # Internal -- finish processing of start tag - # Return -1 for unknown tag, 1 for balanced tag - def finish_starttag(self, tag, attrs): - self.stack.append(tag) - methodname = 'start_' + tag - if hasattr(self, methodname): - method = getattr(self, methodname) - self.handle_starttag(tag, method, attrs) - return 1 + def finish_starttag(self, tagname, attrdict, method): + if method is not None: + self.handle_starttag(tagname, method, attrdict) else: - self.unknown_starttag(tag, attrs) - return -1 + self.unknown_starttag(tagname, attrdict) # Internal -- finish processing of end tag def 
finish_endtag(self, tag): - methodname = 'end_' + tag if not tag: self.syntax_error('name-less end tag') found = len(self.stack) - 1 @@ -554,27 +613,27 @@ class XMLParser: self.unknown_endtag(tag) return else: - if tag not in self.stack: + found = -1 + for i in range(len(self.stack)): + if tag == self.stack[i][0]: + found = i + if found == -1: self.syntax_error('unopened end tag') - if hasattr(self, methodname): - method = getattr(self, methodname) + method = self.elements.get(tag, (None, None))[1] + if method is not None: self.handle_endtag(tag, method) else: self.unknown_endtag(tag) return - found = len(self.stack) - for i in range(found): - if self.stack[i] == tag: - found = i while len(self.stack) > found: if found < len(self.stack) - 1: - self.syntax_error('missing close tag for %s' % self.stack[-1]) - tag = self.stack[-1] - if hasattr(self, methodname): - method = getattr(self, methodname) - self.handle_endtag(tag, method) + self.syntax_error('missing close tag for %s' % self.stack[-1][2]) + nstag = self.stack[-1][2] + method = self.elements.get(nstag, (None, None))[1] + if method is not None: + self.handle_endtag(nstag, method) else: - self.unknown_endtag(tag) + self.unknown_endtag(nstag) del self.stack[-1] # Overridable -- handle xml processing instruction @@ -654,9 +713,9 @@ class XMLParser: class TestXMLParser(XMLParser): - def __init__(self, verbose=0): + def __init__(self): self.testdata = "" - XMLParser.__init__(self, verbose) + XMLParser.__init__(self) def handle_xml(self, encoding, standalone): self.flush() |