Sjoerd Mullender writes:

Here is my current version of xmllib.py and the documentation. This version has some API changes with respect to the version currently in Python (also the one in 1.5.2a). This version supports XML namespaces.
author: Guido van Rossum <guido@python.org> 1998-12-18 20:17:13 (GMT)
committer: Guido van Rossum <guido@python.org> 1998-12-18 20:17:13 (GMT)
commit: b083a9fb540b002fe8e386dd4168deacfb0bf574 (patch)
tree: b7ba2f60473f70bd645df514e2463cdb08d90c52 /Lib/xmllib.py
parent: 6de7d0c3388f93288449449cc2a711358ffc4529 (diff)
download: cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.zip
cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.tar.gz
cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.tar.bz2
1 files changed, 121 insertions, 62 deletions
diff --git a/Lib/xmllib.py b/Lib/xmllib.py
index bea210b..c551deb 100644
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@@ -5,7 +5,7 @@ import re
 import string
 
 
-version = '0.1'
+version = '0.2'
 
 # Regular expressions used for parsing
 
@@ -64,6 +64,13 @@ commentclose = re.compile('-->')
 doubledash = re.compile('--')
 attrtrans = string.maketrans(' \r\n\t', '    ')
 
+# definitions for XML namespaces
+_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
+ncname = re.compile(_NCName + '$')
+qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
+                   '(?P<local>' + _NCName + ')$')
+
+xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
 
 # XML parser base class -- find tags and call handler functions.
 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
@@ -76,10 +83,11 @@ attrtrans = string.maketrans(' \r\n\t', '    ')
 # as argument.
 
 class XMLParser:
+    attributes = {}                     # default, to be overridden
+    elements = {}                       # default, to be overridden
 
     # Interface -- initialize and reset this instance
-    def __init__(self, verbose=0):
-        self.verbose = verbose
+    def __init__(self):
         self.reset()
 
     # Interface -- reset this instance.  Loses all unprocessed data
@@ -92,6 +100,7 @@ class XMLParser:
         self.__at_start = 1
         self.__seen_doctype = None
         self.__seen_starttag = 0
+        self.__namespaces = {'xml':None}   # xml is implicitly declared
 
     # For derived classes only -- enter literal mode (CDATA) till EOF
     def setnomoretags(self):
@@ -333,7 +342,7 @@ class XMLParser:
             if self.stack:
                 self.syntax_error('missing end tags')
                 while self.stack:
-                    self.finish_endtag(self.stack[-1])
+                    self.finish_endtag(self.stack[-1][0])
 
     # Internal -- parse comment, return length or -1 if not terminated
     def parse_comment(self, i):
@@ -413,7 +422,7 @@ class XMLParser:
         self.handle_cdata(rawdata[i+9:res.start(0)])
         return res.end(0)
 
-    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
+    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
     # Internal -- handle a processing instruction tag
     def parse_proc(self, i):
         rawdata = self.rawdata
@@ -428,29 +437,45 @@ class XMLParser:
             raise RuntimeError, 'unexpected call to parse_proc'
         k = res.end(0)
         name = res.group(0)
-        if string.find(string.lower(name), 'xml') >= 0:
-            self.syntax_error('illegal processing instruction target name')
-        self.handle_proc(name, rawdata[k:j])
+        if name == 'xml:namespace':
+            self.syntax_error('old-fashioned namespace declaration')
+            # namespace declaration
+            # this must come after the <?xml?> declaration (if any)
+            # and before the <!DOCTYPE> (if any).
+            if self.__seen_doctype or self.__seen_starttag:
+                self.syntax_error('xml:namespace declaration too late in document')
+            attrdict, namespace, k = self.parse_attributes(name, k, j)
+            if namespace:
+                self.syntax_error('namespace declaration inside namespace declaration')
+            for attrname in attrdict.keys():
+                if not self.__xml_namespace_attributes.has_key(attrname):
+                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
+            if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
+                self.syntax_error('xml:namespace without required attributes')
+            prefix = attrdict.get('prefix')
+            if ncname.match(prefix) is None:
+                self.syntax_error('xml:namespace illegal prefix value')
+                return end.end(0)
+            if self.__namespaces.has_key(prefix):
+                self.syntax_error('xml:namespace prefix not unique')
+            self.__namespaces[prefix] = attrdict['ns']
+        else:
+            if string.find(string.lower(name), 'xml') >= 0:
+                self.syntax_error('illegal processing instruction target name')
+            self.handle_proc(name, rawdata[k:j])
         return end.end(0)
 
     # Internal -- parse attributes between i and j
-    def parse_attributes(self, tag, i, j, attributes = None):
+    def parse_attributes(self, tag, i, j):
         rawdata = self.rawdata
-        # Now parse the data between i and j into a tag and attrs
         attrdict = {}
-        try:
-            # convert attributes list to dictionary
-            d = {}
-            for a in attributes:
-                d[a] = None
-            attributes = d
-        except TypeError:
-            pass
+        namespace = {}
         while i < j:
             res = attrfind.match(rawdata, i)
             if res is None:
                 break
             attrname, attrvalue = res.group('name', 'value')
+            i = res.end(0)
             if attrvalue is None:
                 self.syntax_error("no value specified for attribute `%s'" % attrname)
                 attrvalue = attrname
@@ -459,22 +484,19 @@ class XMLParser:
                 attrvalue = attrvalue[1:-1]
             else:
                 self.syntax_error("attribute `%s' value not quoted" % attrname)
+            res = xmlns.match(attrname)
+            if res is not None:
+                # namespace declaration
+                ncname = res.group('ncname')
+                namespace[ncname or ''] = attrvalue or None
+                continue
             if '<' in attrvalue:
                 self.syntax_error("`<' illegal in attribute value")
-            if attributes is not None and not attributes.has_key(attrname):
-                self.syntax_error("unknown attribute `%s' of element `%s'" %
-                                  (attrname, tag))
             if attrdict.has_key(attrname):
                 self.syntax_error("attribute `%s' specified twice" % attrname)
             attrvalue = string.translate(attrvalue, attrtrans)
             attrdict[attrname] = self.translate_references(attrvalue)
-            i = res.end(0)
-        if attributes is not None:
-            # fill in with default attributes
-            for key, val in attributes.items():
-                if val is not None and not attrdict.has_key(key):
-                    attrdict[key] = val
-        return attrdict, i
+        return attrdict, namespace, i
 
     # Internal -- handle starttag, return length or -1 if not terminated
     def parse_starttag(self, i):
@@ -487,19 +509,63 @@ class XMLParser:
         if tag is None or tag.end(0) != end.end(0):
             self.syntax_error('garbage in starttag')
             return end.end(0)
-        tagname = tag.group('tagname')
+        nstag = tagname = tag.group('tagname')
         if not self.__seen_starttag and self.__seen_doctype and \
            tagname != self.__seen_doctype:
             self.syntax_error('starttag does not match DOCTYPE')
         if self.__seen_starttag and not self.stack:
             self.syntax_error('multiple elements on top level')
-        if hasattr(self, tagname + '_attributes'):
-            attributes = getattr(self, tagname + '_attributes')
-        else:
-            attributes = None
         k, j = tag.span('attrs')
-        attrdict, k = self.parse_attributes(tagname, k, j, attributes)
-        self.finish_starttag(tagname, attrdict)
+        attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
+        self.stack.append((tagname, nsdict, nstag))
+        res = qname.match(tagname)
+        if res is not None:
+            prefix, nstag = res.group('prefix', 'local')
+            if prefix is None:
+                prefix = ''
+            ns = None
+            for t, d, nst in self.stack:
+                if d.has_key(prefix):
+                    ns = d[prefix]
+            if ns is None and prefix != '':
+                ns = self.__namespaces.get(prefix)
+            if ns is not None:
+                nstag = ns + ' ' + nstag
+            elif prefix != '':
+                nstag = prefix + ':' + nstag # undo split
+            self.stack[-1] = tagname, nsdict, nstag
+        # translate namespace of attributes
+        nattrdict = {}
+        for key, val in attrdict.items():
+            res = qname.match(key)
+            if res is not None:
+                aprefix, key = res.group('prefix', 'local')
+                if aprefix is None:
+                    aprefix = ''
+                ans = None
+                for t, d, nst in self.stack:
+                    if d.has_key(aprefix):
+                        ans = d[aprefix]
+                if ans is None and aprefix != '':
+                    ans = self.__namespaces.get(aprefix)
+                if ans is not None:
+                    key = ans + ' ' + key
+                elif aprefix != '':
+                    key = aprefix + ':' + key
+                elif ns is not None:
+                    key = ns + ' ' + key
+            nattrdict[key] = val
+        attrdict = nattrdict
+        attributes = self.attributes.get(nstag)
+        if attributes is not None:
+            for key in attrdict.keys():
+                if not attributes.has_key(key):
+                    self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname))
+            for key, val in attributes.items():
+                if val is not None and not attrdict.has_key(key):
+                    attrdict[key] = val
+        method = self.elements.get(nstag, (None, None))[0]
+        self.finish_starttag(nstag, attrdict, method)
         if tag.group('slash') == '/':
             self.finish_endtag(tagname)
         return tag.end(0)
@@ -521,7 +587,7 @@ class XMLParser:
         else:
             tag = res.group(0)
             if self.literal:
-                if not self.stack or tag != self.stack[-1]:
+                if not self.stack or tag != self.stack[-1][0]:
                     self.handle_data(rawdata[i])
                     return i+1
                 self.literal = 0
@@ -532,21 +598,14 @@ class XMLParser:
         return end.end(0)
 
     # Internal -- finish processing of start tag
-    # Return -1 for unknown tag, 1 for balanced tag
-    def finish_starttag(self, tag, attrs):
-        self.stack.append(tag)
-        methodname = 'start_' + tag
-        if hasattr(self, methodname):
-            method = getattr(self, methodname)
-            self.handle_starttag(tag, method, attrs)
-            return 1
+    def finish_starttag(self, tagname, attrdict, method):
+        if method is not None:
+            self.handle_starttag(tagname, method, attrdict)
         else:
-            self.unknown_starttag(tag, attrs)
-            return -1
+            self.unknown_starttag(tagname, attrdict)
 
     # Internal -- finish processing of end tag
     def finish_endtag(self, tag):
-        methodname = 'end_' + tag
         if not tag:
             self.syntax_error('name-less end tag')
             found = len(self.stack) - 1
@@ -554,27 +613,27 @@ class XMLParser:
                 self.unknown_endtag(tag)
                 return
         else:
-            if tag not in self.stack:
+            found = -1
+            for i in range(len(self.stack)):
+                if tag == self.stack[i][0]:
+                    found = i
+            if found == -1:
                 self.syntax_error('unopened end tag')
-                if hasattr(self, methodname):
-                    method = getattr(self, methodname)
+                method = self.elements.get(tag, (None, None))[1]
+                if method is not None:
                     self.handle_endtag(tag, method)
                 else:
                     self.unknown_endtag(tag)
                 return
-            found = len(self.stack)
-            for i in range(found):
-                if self.stack[i] == tag:
-                    found = i
         while len(self.stack) > found:
             if found < len(self.stack) - 1:
-                self.syntax_error('missing close tag for %s' % self.stack[-1])
-            tag = self.stack[-1]
-            if hasattr(self, methodname):
-                method = getattr(self, methodname)
-                self.handle_endtag(tag, method)
+                self.syntax_error('missing close tag for %s' % self.stack[-1][2])
+            nstag = self.stack[-1][2]
+            method = self.elements.get(nstag, (None, None))[1]
+            if method is not None:
+                self.handle_endtag(nstag, method)
             else:
-                self.unknown_endtag(tag)
+                self.unknown_endtag(nstag)
             del self.stack[-1]
 
     # Overridable -- handle xml processing instruction
@@ -654,9 +713,9 @@ class XMLParser:
 
 class TestXMLParser(XMLParser):
 
-    def __init__(self, verbose=0):
+    def __init__(self):
         self.testdata = ""
-        XMLParser.__init__(self, verbose)
+        XMLParser.__init__(self)
 
     def handle_xml(self, encoding, standalone):
         self.flush()
author	Guido van Rossum <guido@python.org>	1998-12-18 20:17:13 (GMT)
committer	Guido van Rossum <guido@python.org>	1998-12-18 20:17:13 (GMT)
commit	b083a9fb540b002fe8e386dd4168deacfb0bf574 (patch)
tree	b7ba2f60473f70bd645df514e2463cdb08d90c52 /Lib/xmllib.py
parent	6de7d0c3388f93288449449cc2a711358ffc4529 (diff)
download	cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.zip cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.tar.gz cpython-b083a9fb540b002fe8e386dd4168deacfb0bf574.tar.bz2