diff options
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/xmllib.py | 183 |
1 files changed, 121 insertions, 62 deletions
diff --git a/Lib/xmllib.py b/Lib/xmllib.py index bea210b..c551deb 100644 --- a/Lib/xmllib.py +++ b/Lib/xmllib.py @@ -5,7 +5,7 @@ import re import string -version = '0.1' +version = '0.2' # Regular expressions used for parsing @@ -64,6 +64,13 @@ commentclose = re.compile('-->') doubledash = re.compile('--') attrtrans = string.maketrans(' \r\n\t', ' ') +# definitions for XML namespaces +_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":" +ncname = re.compile(_NCName + '$') +qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix + '(?P<local>' + _NCName + ')$') + +xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$') # XML parser base class -- find tags and call handler functions. # Usage: p = XMLParser(); p.feed(data); ...; p.close(). @@ -76,10 +83,11 @@ attrtrans = string.maketrans(' \r\n\t', ' ') # as argument. class XMLParser: + attributes = {} # default, to be overridden + elements = {} # default, to be overridden # Interface -- initialize and reset this instance - def __init__(self, verbose=0): - self.verbose = verbose + def __init__(self): self.reset() # Interface -- reset this instance. Loses all unprocessed data @@ -92,6 +100,7 @@ class XMLParser: self.__at_start = 1 self.__seen_doctype = None self.__seen_starttag = 0 + self.__namespaces = {'xml':None} # xml is implicitly declared # For derived classes only -- enter literal mode (CDATA) till EOF def setnomoretags(self): @@ -333,7 +342,7 @@ class XMLParser: if self.stack: self.syntax_error('missing end tags') while self.stack: - self.finish_endtag(self.stack[-1]) + self.finish_endtag(self.stack[-1][0]) # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i): @@ -413,7 +422,7 @@ class XMLParser: self.handle_cdata(rawdata[i+9:res.start(0)]) return res.end(0) - __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None} + __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None} # Internal -- handle a processing instruction tag def parse_proc(self, i): rawdata = self.rawdata @@ -428,29 +437,45 @@ class XMLParser: raise RuntimeError, 'unexpected call to parse_proc' k = res.end(0) name = res.group(0) - if string.find(string.lower(name), 'xml') >= 0: - self.syntax_error('illegal processing instruction target name') - self.handle_proc(name, rawdata[k:j]) + if name == 'xml:namespace': + self.syntax_error('old-fashioned namespace declaration') + # namespace declaration + # this must come after the <?xml?> declaration (if any) + # and before the <!DOCTYPE> (if any). + if self.__seen_doctype or self.__seen_starttag: + self.syntax_error('xml:namespace declaration too late in document') + attrdict, namespace, k = self.parse_attributes(name, k, j) + if namespace: + self.syntax_error('namespace declaration inside namespace declaration') + for attrname in attrdict.keys(): + if not self.__xml_namespace_attributes.has_key(attrname): + self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname) + if not attrdict.has_key('ns') or not attrdict.has_key('prefix'): + self.syntax_error('xml:namespace without required attributes') + prefix = attrdict.get('prefix') + if ncname.match(prefix) is None: + self.syntax_error('xml:namespace illegal prefix value') + return end.end(0) + if self.__namespaces.has_key(prefix): + self.syntax_error('xml:namespace prefix not unique') + self.__namespaces[prefix] = attrdict['ns'] + else: + if string.find(string.lower(name), 'xml') >= 0: + self.syntax_error('illegal processing instruction target name') + self.handle_proc(name, rawdata[k:j]) return end.end(0) # Internal -- parse attributes between i and j - def parse_attributes(self, tag, i, j, attributes = None): + def parse_attributes(self, tag, i, j): rawdata = self.rawdata - # Now parse the data between i and j into a tag and attrs attrdict = {} - try: - # convert attributes list to dictionary - d = {} - for a in attributes: - d[a] = None - attributes = d - except TypeError: - pass + namespace = {} while i < j: res = attrfind.match(rawdata, i) if res is None: break attrname, attrvalue = res.group('name', 'value') + i = res.end(0) if attrvalue is None: self.syntax_error("no value specified for attribute `%s'" % attrname) attrvalue = attrname @@ -459,22 +484,19 @@ class XMLParser: attrvalue = attrvalue[1:-1] else: self.syntax_error("attribute `%s' value not quoted" % attrname) + res = xmlns.match(attrname) + if res is not None: + # namespace declaration + ncname = res.group('ncname') + namespace[ncname or ''] = attrvalue or None + continue if '<' in attrvalue: self.syntax_error("`<' illegal in attribute value") - if attributes is not None and not attributes.has_key(attrname): - self.syntax_error("unknown attribute `%s' of element `%s'" % - (attrname, tag)) if attrdict.has_key(attrname): self.syntax_error("attribute `%s' specified twice" % attrname) attrvalue = string.translate(attrvalue, attrtrans) attrdict[attrname] = self.translate_references(attrvalue) - i = res.end(0) - if attributes is not None: - # fill in with default attributes - for key, val in attributes.items(): - if val is not None and not attrdict.has_key(key): - attrdict[key] = val - return attrdict, i + return attrdict, namespace, i # Internal -- handle starttag, return length or -1 if not terminated def parse_starttag(self, i): @@ -487,19 +509,63 @@ class XMLParser: if tag is None or tag.end(0) != end.end(0): self.syntax_error('garbage in starttag') return end.end(0) - tagname = tag.group('tagname') + nstag = tagname = tag.group('tagname') if not self.__seen_starttag and self.__seen_doctype and \ tagname != self.__seen_doctype: self.syntax_error('starttag does not match DOCTYPE') if self.__seen_starttag and not self.stack: self.syntax_error('multiple elements on top level') - if hasattr(self, tagname + '_attributes'): - attributes = getattr(self, tagname + '_attributes') - else: - attributes = None k, j = tag.span('attrs') - attrdict, k = self.parse_attributes(tagname, k, j, attributes) - self.finish_starttag(tagname, attrdict) + attrdict, nsdict, k = self.parse_attributes(tagname, k, j) + self.stack.append((tagname, nsdict, nstag)) + res = qname.match(tagname) + if res is not None: + prefix, nstag = res.group('prefix', 'local') + if prefix is None: + prefix = '' + ns = None + for t, d, nst in self.stack: + if d.has_key(prefix): + ns = d[prefix] + if ns is None and prefix != '': + ns = self.__namespaces.get(prefix) + if ns is not None: + nstag = ns + ' ' + nstag + elif prefix != '': + nstag = prefix + ':' + nstag # undo split + self.stack[-1] = tagname, nsdict, nstag + # translate namespace of attributes + nattrdict = {} + for key, val in attrdict.items(): + res = qname.match(key) + if res is not None: + aprefix, key = res.group('prefix', 'local') + if aprefix is None: + aprefix = '' + ans = None + for t, d, nst in self.stack: + if d.has_key(aprefix): + ans = d[aprefix] + if ans is None and aprefix != '': + ans = self.__namespaces.get(aprefix) + if ans is not None: + key = ans + ' ' + key + elif aprefix != '': + key = aprefix + ':' + key + elif ns is not None: + key = ns + ' ' + key + nattrdict[key] = val + attrdict = nattrdict + attributes = self.attributes.get(nstag) + if attributes is not None: + for key in attrdict.keys(): + if not attributes.has_key(key): + self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname)) + for key, val in attributes.items(): + if val is not None and not attrdict.has_key(key): + attrdict[key] = val + method = self.elements.get(nstag, (None, None))[0] + self.finish_starttag(nstag, attrdict, method) if tag.group('slash') == '/': self.finish_endtag(tagname) return tag.end(0) @@ -521,7 +587,7 @@ class XMLParser: else: tag = res.group(0) if self.literal: - if not self.stack or tag != self.stack[-1]: + if not self.stack or tag != self.stack[-1][0]: self.handle_data(rawdata[i]) return i+1 self.literal = 0 @@ -532,21 +598,14 @@ class XMLParser: return end.end(0) # Internal -- finish processing of start tag - # Return -1 for unknown tag, 1 for balanced tag - def finish_starttag(self, tag, attrs): - self.stack.append(tag) - methodname = 'start_' + tag - if hasattr(self, methodname): - method = getattr(self, methodname) - self.handle_starttag(tag, method, attrs) - return 1 + def finish_starttag(self, tagname, attrdict, method): + if method is not None: + self.handle_starttag(tagname, method, attrdict) else: - self.unknown_starttag(tag, attrs) - return -1 + self.unknown_starttag(tagname, attrdict) # Internal -- finish processing of end tag def finish_endtag(self, tag): - methodname = 'end_' + tag if not tag: self.syntax_error('name-less end tag') found = len(self.stack) - 1 @@ -554,27 +613,27 @@ class XMLParser: self.unknown_endtag(tag) return else: - if tag not in self.stack: + found = -1 + for i in range(len(self.stack)): + if tag == self.stack[i][0]: + found = i + if found == -1: self.syntax_error('unopened end tag') - if hasattr(self, methodname): - method = getattr(self, methodname) + method = self.elements.get(tag, (None, None))[1] + if method is not None: self.handle_endtag(tag, method) else: self.unknown_endtag(tag) return - found = len(self.stack) - for i in range(found): - if self.stack[i] == tag: - found = i while len(self.stack) > found: if found < len(self.stack) - 1: - self.syntax_error('missing close tag for %s' % self.stack[-1]) - tag = self.stack[-1] - if hasattr(self, methodname): - method = getattr(self, methodname) - self.handle_endtag(tag, method) + self.syntax_error('missing close tag for %s' % self.stack[-1][2]) + nstag = self.stack[-1][2] + method = self.elements.get(nstag, (None, None))[1] + if method is not None: + self.handle_endtag(nstag, method) else: - self.unknown_endtag(tag) + self.unknown_endtag(nstag) del self.stack[-1] # Overridable -- handle xml processing instruction @@ -654,9 +713,9 @@ class XMLParser: class TestXMLParser(XMLParser): - def __init__(self, verbose=0): + def __init__(self): self.testdata = "" - XMLParser.__init__(self, verbose) + XMLParser.__init__(self) def handle_xml(self, encoding, standalone): self.flush() |