2 files changed, 180 insertions, 111 deletions
diff --git a/Doc/lib/libxmllib.tex b/Doc/lib/libxmllib.tex
index 7a7c85d..a785a73 100644
--- a/Doc/lib/libxmllib.tex
+++ b/Doc/lib/libxmllib.tex
@@ -14,7 +14,28 @@ for parsing text files formatted in XML (eXtended Markup Language).
 The \class{XMLParser} class must be instantiated without arguments.
 \end{classdesc}
 
-This class provides the following interface methods:
+This class provides the following interface methods and instance variables:
+
+\begin{memberdesc}{attributes}
+A mapping of element names to mappings.  The latter mapping maps
+attribute names that are valid for the element to the default value of
+the attribute, or to \code{None} if there is no default.  The default
+value of \member{attributes} is the empty dictionary.
+\end{memberdesc}
+
+\begin{memberdesc}{elements} 
+A mapping of element names to tuples.  Each tuple contains two items:
+the function for handling the start tag and the function for handling
+the end tag of the element, in that order.  An item is \code{None} if
+\method{unknown_starttag()} or \method{unknown_endtag()}, respectively,
+is to be called instead.  The default value is the empty dictionary.
+\end{memberdesc}
+
+\begin{memberdesc}{entitydefs}
+A mapping of entity names to their values.  The default value contains
+definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'}, 
+and \code{'apos'}.
+\end{memberdesc}
 
 \begin{methoddesc}{reset}{}
 Reset the instance.  Loses all unprocessed data.  This is called
@@ -33,7 +54,7 @@ when the close tag matching the last unclosed open tag is encountered.
 
 \begin{methoddesc}{feed}{data}
 Feed some text to the parser.  It is processed insofar as it consists
-of complete elements; incomplete data is buffered until more data is
+of complete tags; incomplete data is buffered until more data is
 fed or \method{close()} is called.
 \end{methoddesc}
 
@@ -65,29 +86,29 @@ the root element.
 \end{methoddesc}
 
 \begin{methoddesc}{handle_starttag}{tag, method, attributes}
-This method is called to handle start tags for which a
-\method{start_\var{tag}()} method has been defined.  The \var{tag}
-argument is the name of the tag, and the \var{method} argument is the
-bound method which should be used to support semantic interpretation
-of the start tag.  The \var{attributes} argument is a dictionary of
-attributes, the key being the \var{name} and the value being the
-\var{value} of the attribute found inside the tag's \code{<>} brackets.
-Character and entity references in the \var{value} have
-been interpreted.  For instance, for the tag
+This method is called to handle start tags for which a start tag
+handler is defined in the instance variable \member{elements}.  The
+\var{tag} argument is the name of the tag, and the \var{method}
+argument is the function (method) which should be used to support semantic
+interpretation of the start tag.  The \var{attributes} argument is a
+dictionary of attributes, the key being the \var{name} and the value
+being the \var{value} of the attribute found inside the tag's
+\code{<>} brackets.  Character and entity references in the
+\var{value} have been interpreted.  For instance, for the start tag
 \code{<A HREF="http://www.cwi.nl/">}, this method would be called as
-\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}.
+\code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}.
 The base implementation simply calls \var{method} with \var{attributes}
 as the only argument.
 \end{methoddesc}
 
 \begin{methoddesc}{handle_endtag}{tag, method}
-This method is called to handle endtags for which an
-\method{end_\var{tag}()} method has been defined.  The \var{tag}
-argument is the name of the tag, and the
-\var{method} argument is the bound method which should be used to
-support semantic interpretation of the end tag.  If no
-\method{end_\var{tag}()} method is defined for the closing element, this
-handler is not called.  The base implementation simply calls
+This method is called to handle end tags for which an end tag handler
+is defined in the instance variable \member{elements}.  The \var{tag}
+argument is the name of the tag, and the \var{method} argument is the
+function (method) which should be used to support semantic
+interpretation of the end tag.  For instance, for the end tag
+\code{</A>}, this method would be called as \code{handle_endtag('A',
+self.elements['A'][1])}.  The base implementation simply calls
 \var{method}.
 \end{methoddesc}
 
@@ -149,7 +170,7 @@ closing delimiter, but not the delimiter itself.  For example, the
 instruction \samp{<?XML text?>} will cause this method to be called
 with the arguments \code{'XML'} and \code{'text'}.  The default method
 does nothing.  Note that if a document starts with \samp{<?xml
-...?>}, \method{handle_xml()} is called to handle it.
+...?>}, \method{handle_xml()} is called to handle it.
 \end{methoddesc}
 
 \begin{methoddesc}{handle_special}{data}
@@ -196,32 +217,21 @@ intended to be overridden by a derived class; the base class
 implementation does nothing.
 \end{methoddesc}
 
-Apart from overriding or extending the methods listed above, derived
-classes may also define methods and variables of the following form to
-define processing of specific tags.  Tag names in the input stream are
-case dependent; the \var{tag} occurring in method names must be in the
-correct case:
-
-\begin{methoddescni}{start_\var{tag}}{attributes}
-This method is called to process an opening tag \var{tag}.  The
-\var{attributes} argument has the same meaning as described for
-\method{handle_starttag()} above.  In fact, the base implementation of
-\method{handle_starttag()} calls this method.
-\end{methoddescni}
-
-\begin{methoddescni}{end_\var{tag}}{}
-This method is called to process a closing tag \var{tag}.
-\end{methoddescni}
-
-\begin{memberdescni}{\var{tag}_attributes}
-If a class or instance variable \member{\var{tag}_attributes} exists, it 
-should be a list or a dictionary.  If a list, the elements of the list 
-are the valid attributes for the element \var{tag}; if a dictionary,
-the keys are the valid attributes for the element \var{tag}, and the
-values the default values of the attributes, or \code{None} if there
-is no default.
-In addition to the attributes that were present in the tag, the
-attribute dictionary that is passed to \method{handle_starttag()} and
-\method{unknown_starttag()} contains values for all attributes that
-have a default value.
-\end{memberdescni}
+\subsection{XML Namespaces}
+
+This module has support for XML namespaces as defined in the XML
+Namespaces proposed recommendation.
+
+Tag and attribute names that are defined in an XML namespace are
+handled as if the name of the tag or attribute consisted of the
+namespace (i.e.\ the URL that defines the namespace) followed by a
+space and the name of the tag or attribute.  For instance, the tag
+\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if
+the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and
+the tag \code{<html:a href='http://frob.com'>} inside the above
+mentioned element is treated as if the tag name were
+\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as
+if it were \code{'http://www.w3.org/TR/REC-html40 href'}.
+
+An older draft of the XML Namespaces proposal is also recognized, but
+triggers a warning.
diff --git a/Lib/xmllib.py b/Lib/xmllib.py
index bea210b..c551deb 100644
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@@ -5,7 +5,7 @@ import re
 import string
 
 
-version = '0.1'
+version = '0.2'
 
 # Regular expressions used for parsing
 
@@ -64,6 +64,13 @@ commentclose = re.compile('-->')
 doubledash = re.compile('--')
 attrtrans = string.maketrans(' \r\n\t', '    ')
 
+# definitions for XML namespaces
+_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
+ncname = re.compile(_NCName + '$')
+qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
+                   '(?P<local>' + _NCName + ')$')
+
+xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
 
 # XML parser base class -- find tags and call handler functions.
 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
@@ -76,10 +83,11 @@ attrtrans = string.maketrans(' \r\n\t', '    ')
 # as argument.
 
 class XMLParser:
+    attributes = {}                     # default, to be overridden
+    elements = {}                       # default, to be overridden
 
     # Interface -- initialize and reset this instance
-    def __init__(self, verbose=0):
-        self.verbose = verbose
+    def __init__(self):
         self.reset()
 
     # Interface -- reset this instance.  Loses all unprocessed data
@@ -92,6 +100,7 @@ class XMLParser:
         self.__at_start = 1
         self.__seen_doctype = None
         self.__seen_starttag = 0
+        self.__namespaces = {'xml':None}   # xml is implicitly declared
 
     # For derived classes only -- enter literal mode (CDATA) till EOF
     def setnomoretags(self):
@@ -333,7 +342,7 @@ class XMLParser:
             if self.stack:
                 self.syntax_error('missing end tags')
                 while self.stack:
-                    self.finish_endtag(self.stack[-1])
+                    self.finish_endtag(self.stack[-1][0])
 
     # Internal -- parse comment, return length or -1 if not terminated
     def parse_comment(self, i):
@@ -413,7 +422,7 @@ class XMLParser:
         self.handle_cdata(rawdata[i+9:res.start(0)])
         return res.end(0)
 
-    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
+    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
     # Internal -- handle a processing instruction tag
     def parse_proc(self, i):
         rawdata = self.rawdata
@@ -428,29 +437,45 @@ class XMLParser:
             raise RuntimeError, 'unexpected call to parse_proc'
         k = res.end(0)
         name = res.group(0)
-        if string.find(string.lower(name), 'xml') >= 0:
-            self.syntax_error('illegal processing instruction target name')
-        self.handle_proc(name, rawdata[k:j])
+        if name == 'xml:namespace':
+            self.syntax_error('old-fashioned namespace declaration')
+            # namespace declaration
+            # this must come after the <?xml?> declaration (if any)
+            # and before the <!DOCTYPE> (if any).
+            if self.__seen_doctype or self.__seen_starttag:
+                self.syntax_error('xml:namespace declaration too late in document')
+            attrdict, namespace, k = self.parse_attributes(name, k, j)
+            if namespace:
+                self.syntax_error('namespace declaration inside namespace declaration')
+            for attrname in attrdict.keys():
+                if not self.__xml_namespace_attributes.has_key(attrname):
+                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
+            if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
+                self.syntax_error('xml:namespace without required attributes')
+            prefix = attrdict.get('prefix')
+            if ncname.match(prefix) is None:
+                self.syntax_error('xml:namespace illegal prefix value')
+                return end.end(0)
+            if self.__namespaces.has_key(prefix):
+                self.syntax_error('xml:namespace prefix not unique')
+            self.__namespaces[prefix] = attrdict['ns']
+        else:
+            if string.find(string.lower(name), 'xml') >= 0:
+                self.syntax_error('illegal processing instruction target name')
+            self.handle_proc(name, rawdata[k:j])
         return end.end(0)
 
     # Internal -- parse attributes between i and j
-    def parse_attributes(self, tag, i, j, attributes = None):
+    def parse_attributes(self, tag, i, j):
         rawdata = self.rawdata
-        # Now parse the data between i and j into a tag and attrs
         attrdict = {}
-        try:
-            # convert attributes list to dictionary
-            d = {}
-            for a in attributes:
-                d[a] = None
-            attributes = d
-        except TypeError:
-            pass
+        namespace = {}
         while i < j:
             res = attrfind.match(rawdata, i)
             if res is None:
                 break
             attrname, attrvalue = res.group('name', 'value')
+            i = res.end(0)
             if attrvalue is None:
                 self.syntax_error("no value specified for attribute `%s'" % attrname)
                 attrvalue = attrname
@@ -459,22 +484,19 @@ class XMLParser:
                 attrvalue = attrvalue[1:-1]
             else:
                 self.syntax_error("attribute `%s' value not quoted" % attrname)
+            res = xmlns.match(attrname)
+            if res is not None:
+                # namespace declaration
+                ncname = res.group('ncname')
+                namespace[ncname or ''] = attrvalue or None
+                continue
             if '<' in attrvalue:
                 self.syntax_error("`<' illegal in attribute value")
-            if attributes is not None and not attributes.has_key(attrname):
-                self.syntax_error("unknown attribute `%s' of element `%s'" %
-                                  (attrname, tag))
             if attrdict.has_key(attrname):
                 self.syntax_error("attribute `%s' specified twice" % attrname)
             attrvalue = string.translate(attrvalue, attrtrans)
             attrdict[attrname] = self.translate_references(attrvalue)
-            i = res.end(0)
-        if attributes is not None:
-            # fill in with default attributes
-            for key, val in attributes.items():
-                if val is not None and not attrdict.has_key(key):
-                    attrdict[key] = val
-        return attrdict, i
+        return attrdict, namespace, i
 
     # Internal -- handle starttag, return length or -1 if not terminated
     def parse_starttag(self, i):
@@ -487,19 +509,63 @@ class XMLParser:
         if tag is None or tag.end(0) != end.end(0):
             self.syntax_error('garbage in starttag')
             return end.end(0)
-        tagname = tag.group('tagname')
+        nstag = tagname = tag.group('tagname')
         if not self.__seen_starttag and self.__seen_doctype and \
            tagname != self.__seen_doctype:
             self.syntax_error('starttag does not match DOCTYPE')
         if self.__seen_starttag and not self.stack:
             self.syntax_error('multiple elements on top level')
-        if hasattr(self, tagname + '_attributes'):
-            attributes = getattr(self, tagname + '_attributes')
-        else:
-            attributes = None
         k, j = tag.span('attrs')
-        attrdict, k = self.parse_attributes(tagname, k, j, attributes)
-        self.finish_starttag(tagname, attrdict)
+        attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
+        self.stack.append((tagname, nsdict, nstag))
+        res = qname.match(tagname)
+        if res is not None:
+            prefix, nstag = res.group('prefix', 'local')
+            if prefix is None:
+                prefix = ''
+            ns = None
+            for t, d, nst in self.stack:
+                if d.has_key(prefix):
+                    ns = d[prefix]
+            if ns is None and prefix != '':
+                ns = self.__namespaces.get(prefix)
+            if ns is not None:
+                nstag = ns + ' ' + nstag
+            elif prefix != '':
+                nstag = prefix + ':' + nstag # undo split
+            self.stack[-1] = tagname, nsdict, nstag
+        # translate namespace of attributes
+        nattrdict = {}
+        for key, val in attrdict.items():
+            res = qname.match(key)
+            if res is not None:
+                aprefix, key = res.group('prefix', 'local')
+                if aprefix is None:
+                    aprefix = ''
+                ans = None
+                for t, d, nst in self.stack:
+                    if d.has_key(aprefix):
+                        ans = d[aprefix]
+                if ans is None and aprefix != '':
+                    ans = self.__namespaces.get(aprefix)
+                if ans is not None:
+                    key = ans + ' ' + key
+                elif aprefix != '':
+                    key = aprefix + ':' + key
+                elif ns is not None:
+                    key = ns + ' ' + key
+            nattrdict[key] = val
+        attrdict = nattrdict
+        attributes = self.attributes.get(nstag)
+        if attributes is not None:
+            for key in attrdict.keys():
+                if not attributes.has_key(key):
+                    self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname))
+            for key, val in attributes.items():
+                if val is not None and not attrdict.has_key(key):
+                    attrdict[key] = val
+        method = self.elements.get(nstag, (None, None))[0]
+        self.finish_starttag(nstag, attrdict, method)
         if tag.group('slash') == '/':
             self.finish_endtag(tagname)
         return tag.end(0)
@@ -521,7 +587,7 @@ class XMLParser:
         else:
             tag = res.group(0)
             if self.literal:
-                if not self.stack or tag != self.stack[-1]:
+                if not self.stack or tag != self.stack[-1][0]:
                     self.handle_data(rawdata[i])
                     return i+1
                 self.literal = 0
@@ -532,21 +598,14 @@ class XMLParser:
         return end.end(0)
 
     # Internal -- finish processing of start tag
-    # Return -1 for unknown tag, 1 for balanced tag
-    def finish_starttag(self, tag, attrs):
-        self.stack.append(tag)
-        methodname = 'start_' + tag
-        if hasattr(self, methodname):
-            method = getattr(self, methodname)
-            self.handle_starttag(tag, method, attrs)
-            return 1
+    def finish_starttag(self, tagname, attrdict, method):
+        if method is not None:
+            self.handle_starttag(tagname, method, attrdict)
         else:
-            self.unknown_starttag(tag, attrs)
-            return -1
+            self.unknown_starttag(tagname, attrdict)
 
     # Internal -- finish processing of end tag
     def finish_endtag(self, tag):
-        methodname = 'end_' + tag
         if not tag:
             self.syntax_error('name-less end tag')
             found = len(self.stack) - 1
@@ -554,27 +613,27 @@ class XMLParser:
                 self.unknown_endtag(tag)
                 return
         else:
-            if tag not in self.stack:
+            found = -1
+            for i in range(len(self.stack)):
+                if tag == self.stack[i][0]:
+                    found = i
+            if found == -1:
                 self.syntax_error('unopened end tag')
-                if hasattr(self, methodname):
-                    method = getattr(self, methodname)
+                method = self.elements.get(tag, (None, None))[1]
+                if method is not None:
                     self.handle_endtag(tag, method)
                 else:
                     self.unknown_endtag(tag)
                 return
-            found = len(self.stack)
-            for i in range(found):
-                if self.stack[i] == tag:
-                    found = i
         while len(self.stack) > found:
             if found < len(self.stack) - 1:
-                self.syntax_error('missing close tag for %s' % self.stack[-1])
-            tag = self.stack[-1]
-            if hasattr(self, methodname):
-                method = getattr(self, methodname)
-                self.handle_endtag(tag, method)
+                self.syntax_error('missing close tag for %s' % self.stack[-1][2])
+            nstag = self.stack[-1][2]
+            method = self.elements.get(nstag, (None, None))[1]
+            if method is not None:
+                self.handle_endtag(nstag, method)
             else:
-                self.unknown_endtag(tag)
+                self.unknown_endtag(nstag)
             del self.stack[-1]
 
     # Overridable -- handle xml processing instruction
@@ -654,9 +713,9 @@ class XMLParser:
 
 class TestXMLParser(XMLParser):
 
-    def __init__(self, verbose=0):
+    def __init__(self):
         self.testdata = ""
-        XMLParser.__init__(self, verbose)
+        XMLParser.__init__(self)
 
     def handle_xml(self, encoding, standalone):
         self.flush()