"""
minidom.py -- a lightweight DOM implementation based on SAX.

parse( "foo.xml" )

parseString( "<foo><bar/></foo>" )

Todo:
=====
 * convenience methods for getting elements and text.
 * more testing
 * bring some of the writer and linearizer code into conformance with this
   interface
 * SAX 2 namespaces
"""

import string
import types

# Fall back to the packaged locations when the historical top-level module
# names are not importable (absolute-import semantics, newer interpreters).
try:
    import pulldom
except ImportError:
    from xml.dom import pulldom

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

# (str, unicode) where the interpreter distinguishes them, else just (str,).
_StringTypes = getattr(types, "StringTypes", (str,))


class Node:
    """Base class of every DOM node.

    Attribute access is virtualised by ``__getattr__``: reading ``node.foo``
    calls ``node._get_foo()`` when such a method exists, and reading
    ``node._get_foo`` synthesises a getter for a plain attribute ``foo``.
    """

    ELEMENT_NODE = 1
    ATTRIBUTE_NODE = 2
    TEXT_NODE = 3
    CDATA_SECTION_NODE = 4
    ENTITY_REFERENCE_NODE = 5
    ENTITY_NODE = 6
    PROCESSING_INSTRUCTION_NODE = 7
    COMMENT_NODE = 8
    DOCUMENT_NODE = 9
    DOCUMENT_TYPE_NODE = 10
    DOCUMENT_FRAGMENT_NODE = 11
    NOTATION_NODE = 12

    allnodes = {}           # debug registry: index string -> repr of __dict__
    _debug = 0              # set true to record node creation/deletion
    _makeParentNodes = 1    # set false to skip parentNode back-references
    debug = None            # debug log stream, created lazily

    def __init__(self):
        self.childNodes = []
        if Node._debug:
            index = repr(id(self)) + repr(self.__class__)
            Node.allnodes[index] = repr(self.__dict__)
            if Node.debug is None:
                Node.debug = StringIO()
                #open( "debug4.out", "w" )
            Node.debug.write("create %s\n" % index)

    def __getattr__(self, key):
        """Resolve ``foo`` via ``_get_foo()`` and ``_get_foo`` via ``foo``.

        Raises AttributeError when neither form exists.  The ``inGetAttr``
        marker in the instance dictionary stops the two lookups from
        recursing into each other.
        """
        if key[0:2] == "__":
            # Never synthesise special methods (copy/pickle/truth hooks).
            raise AttributeError(key)
        state = self.__dict__
        # getattr should never call getattr!
        if "inGetAttr" in state:
            del state["inGetAttr"]
            raise AttributeError(key)

        prefix, attrname = key[:5], key[5:]
        if prefix == "_get_":
            state["inGetAttr"] = 1
            try:
                found = hasattr(self, attrname)
            finally:
                # The nested lookup may already have cleared the marker.
                state.pop("inGetAttr", None)
            if not found:
                raise AttributeError(key)
            return (lambda self=self, attrname=attrname:
                    getattr(self, attrname))

        state["inGetAttr"] = 1
        try:
            try:
                func = getattr(self, "_get_" + key)
            except AttributeError:
                raise AttributeError(key)
        finally:
            state.pop("inGetAttr", None)
        return func()

    def __nonzero__(self):
        # A node is always true, even with no children.
        return 1

    __bool__ = __nonzero__

    def toxml(self):
        """Return the node serialised by ``writexml`` as a string."""
        writer = StringIO()
        self.writexml(writer)
        return writer.getvalue()

    def hasChildNodes(self):
        """Return 1 if the node has at least one child, else 0."""
        if self.childNodes:
            return 1
        else:
            return 0

    def _get_firstChild(self):
        """Return the first child, or None when there are no children."""
        if self.childNodes:
            return self.childNodes[0]
        return None

    def _get_lastChild(self):
        """Return the last child, or None when there are no children."""
        if self.childNodes:
            return self.childNodes[-1]
        return None

    def insertBefore(self, newChild, refChild):
        """Insert newChild before refChild (append if refChild is None).

        Raises ValueError if refChild is not a child of this node.
        Returns newChild.
        """
        if refChild is None:
            return self.appendChild(newChild)
        index = self.childNodes.index(refChild)
        self.childNodes.insert(index, newChild)
        if self._makeParentNodes:
            newChild.parentNode = self
        return newChild

    def appendChild(self, node):
        """Append node to the children and return it."""
        self.childNodes.append(node)
        if self._makeParentNodes:
            node.parentNode = self
        return node

    def replaceChild(self, newChild, oldChild):
        """Put newChild in oldChild's position and return oldChild.

        Raises ValueError if oldChild is not a child of this node.
        """
        index = self.childNodes.index(oldChild)
        self.childNodes[index] = newChild
        if self._makeParentNodes:
            newChild.parentNode = self
            oldChild.parentNode = None
        return oldChild

    def removeChild(self, oldChild):
        """Remove oldChild from the children and return it.

        Raises ValueError if oldChild is not a child of this node.
        """
        index = self.childNodes.index(oldChild)
        del self.childNodes[index]
        if self._makeParentNodes:
            oldChild.parentNode = None
        return oldChild

    def cloneNode(self, deep):
        """Return a detached copy of this node.

        The copy gets its own instance dictionary and child list.  When
        deep is true the children are cloned recursively, otherwise the
        copy has no children.
        """
        import copy
        clone = copy.copy(self)
        if self._makeParentNodes:
            clone.parentNode = None
        clone.childNodes = []
        if deep:
            # childNodes is None on a node that has been unlinked.
            for child in self.childNodes or []:
                twin = child.cloneNode(deep)
                clone.childNodes.append(twin)
                if self._makeParentNodes:
                    twin.parentNode = clone
        return clone

    def unlink(self):
        """Break the references held by this subtree so it can be freed."""
        self.parentNode = None
        while self.childNodes:
            self.childNodes[-1].unlink()
            del self.childNodes[-1] # probably not most efficient!
        self.childNodes = None
        if self.attributes:
            # Snapshot: removeAttributeNode mutates the dictionary.
            for attr in list(self._attrs.values()):
                self.removeAttributeNode(attr)
            assert not len(self._attrs)
            assert not len(self._attrsNS)
        if Node._debug:
            index = repr(id(self)) + repr(self.__class__)
            if Node.debug is not None:
                Node.debug.write("Deleting: %s\n" % index)
            # Clones and doubly-unlinked nodes are not in the registry.
            Node.allnodes.pop(index, None)


def _write_data(writer, data):
    "Writes datachars to writer."
    # "&" must be escaped first so the other entities are not re-escaped.
    data = data.replace("&", "&amp;")
    data = data.replace("<", "&lt;")
    data = data.replace("\"", "&quot;")
    data = data.replace(">", "&gt;")
    writer.write(data)


def _getElementsByTagNameHelper(parent, name, rc):
    """Append to rc, in document order, every descendant element of parent
    whose tagName is name ("*" matches all); return rc."""
    for node in parent.childNodes:
        if node.nodeType == Node.ELEMENT_NODE and \
           (name == "*" or node.tagName == name):
            rc.append(node)
        _getElementsByTagNameHelper(node, name, rc)
    return rc


def _getElementsByTagNameNSHelper(parent, nsURI, localName, rc):
    """Namespace-aware variant of _getElementsByTagNameHelper.

    Matches on (namespaceURI, localName); "*" is a wildcard for either.
    Returns rc.
    """
    for node in parent.childNodes:
        if node.nodeType == Node.ELEMENT_NODE:
            if ((localName == "*" or node.localName == localName) and
                    (nsURI == "*" or node.namespaceURI == nsURI)):
                rc.append(node)
            _getElementsByTagNameNSHelper(node, nsURI, localName, rc)
    return rc


def _storeAttr(attrs, attrsNS, node):
    """Index node in both attribute dictionaries.

    Any different attribute currently stored under the same name or the
    same (namespaceURI, localName) key is removed from both dictionaries
    and unlinked, so the two indexes never disagree.
    """
    nskey = (node.namespaceURI, node.localName)
    for old in (attrs.get(node.name), attrsNS.get(nskey)):
        if old is None or old is node:
            continue
        if attrs.get(old.name) is old:
            del attrs[old.name]
        oldkey = (old.namespaceURI, old.localName)
        if attrsNS.get(oldkey) is old:
            del attrsNS[oldkey]
        old.unlink()
    attrs[node.name] = node
    attrsNS[nskey] = node


class Attr(Node):
    """An attribute node; ``value`` and ``nodeValue`` are kept in sync."""

    nodeType = Node.ATTRIBUTE_NODE

    def __init__(self, qName, namespaceURI="", localName=None, prefix=None):
        # skip setattr for performance
        self.__dict__["localName"] = localName or qName
        self.__dict__["nodeName"] = self.__dict__["name"] = qName
        self.__dict__["namespaceURI"] = namespaceURI
        self.__dict__["prefix"] = prefix
        self.attributes = None
        Node.__init__(self)
        # nodeValue and value are set elsewhere

    def __setattr__(self, name, value):
        if name in ("value", "nodeValue"):
            # The two names are aliases: writing one updates both.
            self.__dict__["value"] = self.__dict__["nodeValue"] = value
        else:
            self.__dict__[name] = value


class AttributeList:
    """the attribute list is a transient interface to the underlying
    dictionaries.  mutations here will change the underlying element's
    dictionary"""

    def __init__(self, attrs, attrsNS):
        self._attrs = attrs
        self._attrsNS = attrsNS
        self.length = len(self._attrs)

    def item(self, index):
        """Return the index-th attribute node, or None if out of range."""
        try:
            return self[self.keys()[index]]
        except IndexError:
            return None

    def items(self):
        """Return a list of (qualified name, value) pairs."""
        return [(node.nodeName, node.value)
                for node in self._attrs.values()]

    def itemsNS(self):
        """Return a list of ((namespaceURI, localName), value) pairs."""
        return [((node.namespaceURI, node.localName), node.value)
                for node in self._attrs.values()]

    def keys(self):
        return list(self._attrs.keys())

    def keysNS(self):
        return list(self._attrsNS.keys())

    def values(self):
        return list(self._attrs.values())

    def __len__(self):
        # Read the live dictionary; self.length is only refreshed by
        # mutations made through this object.
        return len(self._attrs)

    def __cmp__(self, other):
        if self._attrs is getattr(other, "_attrs", None):
            return 0
        else:
            return cmp(id(self), id(other))

    def __eq__(self, other):
        # Two lists are equal when they wrap the same dictionary.
        return self._attrs is getattr(other, "_attrs", None)

    def __ne__(self, other):
        return not self.__eq__(other)

    #FIXME: is it appropriate to return .value?
    def __getitem__(self, attname_or_tuple):
        """Look up a node by name, or by (namespaceURI, localName) tuple.

        Raises KeyError if there is no such attribute.
        """
        if isinstance(attname_or_tuple, tuple):
            return self._attrsNS[attname_or_tuple]
        else:
            return self._attrs[attname_or_tuple]

    # same as set
    def __setitem__(self, attname, value):
        """Store an attribute given either its string value or an Attr node.

        An Attr node is indexed under its own name.  Raises TypeError for
        any other kind of value.
        """
        if isinstance(value, _StringTypes):
            node = Attr(attname)
            node.value = value
        elif isinstance(value, Attr):
            node = value
        else:
            raise TypeError("attribute value must be a string or an Attr")
        _storeAttr(self._attrs, self._attrsNS, node)
        self.length = len(self._attrs)

    def __delitem__(self, attname_or_tuple):
        node = self[attname_or_tuple]
        node.unlink()
        del self._attrs[node.name]
        del self._attrsNS[(node.namespaceURI, node.localName)]
        self.length = len(self._attrs)


class Element(Node):
    """An element node with child nodes and doubly-indexed attributes."""

    nodeType = Node.ELEMENT_NODE

    def __init__(self, tagName, namespaceURI="", prefix="",
                 localName=None):
        Node.__init__(self)
        self.tagName = self.nodeName = tagName
        self.localName = localName or tagName
        self.prefix = prefix
        self.namespaceURI = namespaceURI
        self.nodeValue = None

        self._attrs = {}    # attributes are double-indexed:
        self._attrsNS = {}  #    tagName -> Attribute
                            #    URI,localName -> Attribute
                            # in the future: consider lazy generation of
                            # attribute objects; this is too tricky for now
                            # because of headaches with namespaces.

    def cloneNode(self, deep):
        """Return a copy of the element with its own attribute nodes."""
        clone = Node.cloneNode(self, deep)
        clone._attrs = {}
        clone._attrsNS = {}
        for attr in self._attrs.values():
            clone.setAttributeNode(attr.cloneNode(1))
        return clone

    def getAttribute(self, attname):
        """Return the attribute's value; raises KeyError if it is absent."""
        return self._attrs[attname].value

    def getAttributeNS(self, namespaceURI, localName):
        """Return the attribute's value; raises KeyError if it is absent."""
        return self._attrsNS[(namespaceURI, localName)].value

    def setAttribute(self, attname, value):
        attr = Attr(attname)
        # for performance
        attr.__dict__["value"] = attr.__dict__["nodeValue"] = value
        self.setAttributeNode(attr)

    def setAttributeNS(self, namespaceURI, qualifiedName, value):
        prefix, localname = _nssplit(qualifiedName)
        # for performance
        attr = Attr(qualifiedName, namespaceURI, localname, prefix)
        attr.__dict__["value"] = attr.__dict__["nodeValue"] = value
        self.setAttributeNode(attr)

    def getAttributeNode(self, attrname):
        """Return the Attr node called attrname, or None."""
        return self._attrs.get(attrname)

    def getAttributeNodeNS(self, namespaceURI, localName):
        """Return the Attr node for (namespaceURI, localName), or None."""
        return self._attrsNS.get((namespaceURI, localName))

    def setAttributeNode(self, attr):
        """Attach attr, unlinking whatever attribute it replaces."""
        _storeAttr(self._attrs, self._attrsNS, attr)

    def removeAttribute(self, name):
        attr = self._attrs[name]
        self.removeAttributeNode(attr)

    def removeAttributeNS(self, namespaceURI, localName):
        attr = self._attrsNS[(namespaceURI, localName)]
        self.removeAttributeNode(attr)

    def removeAttributeNode(self, node):
        node.unlink()
        del self._attrs[node.name]
        del self._attrsNS[(node.namespaceURI, node.localName)]

    def getElementsByTagName(self, name):
        """Return the descendant elements named name ("*" matches all)."""
        return _getElementsByTagNameHelper(self, name, [])

    def getElementsByTagNameNS(self, namespaceURI, localName):
        """Return the descendant elements matching (namespaceURI,
        localName); "*" is a wildcard for either."""
        return _getElementsByTagNameNSHelper(self, namespaceURI,
                                             localName, [])

    def __repr__(self):
        return "<DOM Element:" + self.tagName + " at " + repr(id(self)) + " >"

    def writexml(self, writer):
        """Write the element to writer, attributes sorted by name."""
        writer.write("<" + self.tagName)

        attrs = self._get_attributes()
        for a_name in sorted(attrs.keys()):
            writer.write(" " + a_name + "=\"")
            _write_data(writer, attrs[a_name].value)
            writer.write("\"")
        if self.childNodes:
            writer.write(">")
            for node in self.childNodes:
                node.writexml(writer)
            writer.write("</" + self.tagName + ">")
        else:
            writer.write("/>")

    def _get_attributes(self):
        return AttributeList(self._attrs, self._attrsNS)


class Comment(Node):
    nodeType = Node.COMMENT_NODE

    def __init__(self, data):
        Node.__init__(self)
        self.data = self.nodeValue = data
        self.nodeName = "#comment"
        self.attributes = None

    def writexml(self, writer):
        writer.write("<!--" + self.data + "-->")


class ProcessingInstruction(Node):
    nodeType = Node.PROCESSING_INSTRUCTION_NODE

    def __init__(self, target, data):
        Node.__init__(self)
        self.target = self.nodeName = target
        self.data = self.nodeValue = data
        self.attributes = None

    def writexml(self, writer):
        writer.write("<?" + self.target + " " + self.data + "?>")


class Text(Node):
    nodeType = Node.TEXT_NODE
    nodeName = "#text"

    def __init__(self, data):
        Node.__init__(self)
        self.data = self.nodeValue = data
        self.attributes = None

    def __repr__(self):
        if len(self.data) > 10:
            dotdotdot = "..."
        else:
            dotdotdot = ""
        return "<DOM Text node \"" + self.data[0:10] + dotdotdot + "\">"

    def writexml(self, writer):
        _write_data(writer, self.data)


def _nssplit(qualifiedName):
    """Split "prefix:local" into (prefix, local).

    A name without a colon yields ('', name).  Only the first colon
    separates the prefix, so a pair is always returned.
    """
    fields = qualifiedName.split(':', 1)
    if len(fields) == 2:
        return (fields[0], fields[1])
    return ('', fields[0])


class Document(Node):
    """The document node; holds at most one element child."""

    nodeType = Node.DOCUMENT_NODE
    documentElement = None

    def __init__(self):
        Node.__init__(self)
        self.attributes = None
        self.nodeName = "#document"
        self.nodeValue = None

    def appendChild(self, node):
        """Append node; raises TypeError on a second element child."""
        if node.nodeType == Node.ELEMENT_NODE:
            if self.documentElement:
                raise TypeError("Two document elements disallowed")
            else:
                self.documentElement = node
        Node.appendChild(self, node)
        return node

    def cloneNode(self, deep):
        """Return a copy whose documentElement is the cloned element."""
        clone = Node.cloneNode(self, deep)
        # Do not let the copy keep pointing at the original's element.
        clone.documentElement = None
        for child in clone.childNodes:
            if child.nodeType == Node.ELEMENT_NODE:
                clone.documentElement = child
        return clone

    createElement = Element

    createTextNode = Text

    createComment = Comment

    createProcessingInstruction = ProcessingInstruction

    createAttribute = Attr

    def createElementNS(self, namespaceURI, qualifiedName):
        prefix, localName = _nssplit(qualifiedName)
        return Element(qualifiedName, namespaceURI, prefix, localName)

    def createAttributeNS(self, namespaceURI, qualifiedName):
        prefix, localName = _nssplit(qualifiedName)
        return Attr(qualifiedName, namespaceURI, localName, prefix)

    def getElementsByTagNameNS(self, namespaceURI, localName):
        """Return the elements matching (namespaceURI, localName);
        "*" is a wildcard for either."""
        return _getElementsByTagNameNSHelper(self, namespaceURI,
                                             localName, [])

    def unlink(self):
        self.documentElement = None
        Node.unlink(self)

    def getElementsByTagName(self, name):
        """Return the elements named name ("*" matches all)."""
        rc = []
        _getElementsByTagNameHelper(self, name, rc)
        return rc

    def writexml(self, writer):
        for node in self.childNodes:
            node.writexml(writer)


def _doparse(func, args, kwargs):
    """Run a pulldom parse function and expand its first event's node
    into a complete tree; return that node."""
    events = func(*args, **kwargs)
    (toktype, rootNode) = events.getEvent()
    events.expandNode(rootNode)
    return rootNode


def parse(*args, **kwargs):
    "Parse a file into a DOM by filename or file object"
    return _doparse(pulldom.parse, args, kwargs)


def parseString(*args, **kwargs):
    "Parse a file into a DOM from a string"
    return _doparse(pulldom.parseString, args, kwargs)