diff options
Diffstat (limited to 'Lib/xml/sax')
-rw-r--r-- | Lib/xml/sax/__init__.py | 10 | ||||
-rw-r--r-- | Lib/xml/sax/expatreader.py | 22 | ||||
-rw-r--r-- | Lib/xml/sax/saxutils.py | 22 | ||||
-rw-r--r-- | Lib/xml/sax/xmlreader.py | 115 |
4 files changed, 138 insertions, 31 deletions
diff --git a/Lib/xml/sax/__init__.py b/Lib/xml/sax/__init__.py index 1077d5d..24b7d24 100644 --- a/Lib/xml/sax/__init__.py +++ b/Lib/xml/sax/__init__.py @@ -21,16 +21,17 @@ expatreader -- Driver that allows use of the Expat parser with the """ +from xmlreader import InputSource from handler import ContentHandler, ErrorHandler from _exceptions import SAXException, SAXNotRecognizedException, \ SAXParseException, SAXNotSupportedException -def parse(filename_or_stream, handler, errorHandler=ErrorHandler()): +def parse(source, handler, errorHandler=ErrorHandler()): parser = ExpatParser() parser.setContentHandler(handler) parser.setErrorHandler(errorHandler) - parser.parse(filename_or_stream) + parser.parse(source) def parseString(string, handler, errorHandler=ErrorHandler()): try: @@ -43,7 +44,10 @@ def parseString(string, handler, errorHandler=ErrorHandler()): parser = ExpatParser() parser.setContentHandler(handler) parser.setErrorHandler(errorHandler) - parser.parse(StringIO(string)) + + inpsrc = InputSource() + inpsrc.setByteStream(StringIO(string)) + parser.parse(inpsrc) # this is the parser list used by the make_parser function if no # alternatives are given as parameters to the function diff --git a/Lib/xml/sax/expatreader.py b/Lib/xml/sax/expatreader.py index 2f1ff1c..341efd3 100644 --- a/Lib/xml/sax/expatreader.py +++ b/Lib/xml/sax/expatreader.py @@ -18,7 +18,7 @@ version = "0.20" from xml.sax._exceptions import * from xml.parsers import expat -from xml.sax import xmlreader +from xml.sax import xmlreader, saxutils AttributesImpl = xmlreader.AttributesImpl AttributesNSImpl = xmlreader.AttributesNSImpl @@ -37,28 +37,24 @@ class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator): # XMLReader methods - def parse(self, stream_or_string): + def parse(self, source): "Parse an XML document from a URL." - if type(stream_or_string) is type(""): - stream = open(stream_or_string) - else: - stream = stream_or_string - + source = saxutils.prepare_input_source(source) + + self._source = source self.reset() self._cont_handler.setDocumentLocator(self) try: - xmlreader.IncrementalParser.parse(self, stream) + xmlreader.IncrementalParser.parse(self, source) except expat.error: error_code = self._parser.ErrorCode raise SAXParseException(expat.ErrorString(error_code), None, self) self._cont_handler.endDocument() - def prepareParser(self, filename=None): - self._source = filename - - if self._source != None: - self._parser.SetBase(self._source) + def prepareParser(self, source): + if source.getSystemId() != None: + self._parser.SetBase(source.getSystemId()) def getFeature(self, name): if name == feature_namespaces: diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py index fe13bde..8f8f42e 100644 --- a/Lib/xml/sax/saxutils.py +++ b/Lib/xml/sax/saxutils.py @@ -3,6 +3,7 @@ A library of useful helper classes to the SAX classes, for the convenience of application and driver writers. """ +import os, urlparse, urllib import handler import xmlreader @@ -181,3 +182,24 @@ class XMLFilterBase(xmlreader.XMLReader): def setProperty(self, name, value): self._parent.setProperty(name, value) + +# --- Utility functions + +def prepare_input_source(source, base = ""): + """This function takes an InputSource and an optional base URL and + returns a fully resolved InputSource object ready for reading.""" + + if type(source) == type(""): + source = xmlreader.InputSource(source) + + if source.getByteStream() == None: + sysid = source.getSystemId() + if urlparse.urlparse(sysid)[0] == '': + basehead = os.path.split(os.path.normpath(base))[0] + source.setSystemId(os.path.join(basehead, sysid)) + else: + source.setSystemId(urlparse.urljoin(base, sysid)) + + source.setByteStream(urllib.urlopen(source.getSystemId())) + + return source diff --git a/Lib/xml/sax/xmlreader.py b/Lib/xml/sax/xmlreader.py index 6dae0b5..e5133f6 100644 --- a/Lib/xml/sax/xmlreader.py +++ b/Lib/xml/sax/xmlreader.py @@ -6,6 +6,7 @@ import handler # ===== XMLREADER ===== class XMLReader: + def __init__(self): self._cont_handler = handler.ContentHandler() #self._dtd_handler = handler.DTDHandler() @@ -73,7 +74,8 @@ class XMLReader: "Sets the value of a SAX2 property." raise SAXNotRecognizedException("Property '%s' not recognized" % name) - +import saxutils + class IncrementalParser(XMLReader): """This interface adds three extra methods to the XMLReader interface that allow XML parsers to support incremental @@ -98,24 +100,18 @@ class IncrementalParser(XMLReader): self._bufsize = bufsize XMLReader.__init__(self) - def _parseOpenFile(self, source): - buffer = source.read(self._bufsize) + def parse(self, source): + source = saxutils.prepare_input_source(source) + + self.prepareParser(source) + file = source.getByteStream() + buffer = file.read(self._bufsize) while buffer != "": self.feed(buffer) - buffer = source.read(self._bufsize) - self.close() + buffer = file.read(self._bufsize) + self.reset() - def parse(self, source): - if hasattr(source, "read"): - self._parseOpenFile(source) - else: - #FIXME: how to recognize if it is a URL instead of filename? - self.prepareParser(source) - file = open(source) - self._parseOpenFile(file) - file.close() - def feed(self, data): """This method gives the raw XML data in the data parameter to the parser and makes it parse the data, emitting the @@ -174,6 +170,95 @@ class Locator: "Return the system identifier for the current event." return None +# ===== INPUTSOURCE ===== + +class InputSource: + """Encapsulation of the information needed by the XMLReader to + read entities. + + This class may include information about the public identifier, + system identifier, byte stream (possibly with character encoding + information) and/or the character stream of an entity. + + Applications will create objects of this class for use in the + XMLReader.parse method and for returning from + EntityResolver.resolveEntity. + + An InputSource belongs to the application, the XMLReader is not + allowed to modify InputSource objects passed to it from the + application, although it may make copies and modify those.""" + + def __init__(self, system_id = None): + self.__system_id = system_id + self.__public_id = None + self.__encoding = None + self.__bytefile = None + self.__charfile = None + + def setPublicId(self, public_id): + "Sets the public identifier of this InputSource." + self.__public_id = public_id + + def getPublicId(self): + "Returns the public identifier of this InputSource." + return self.__public_id + + def setSystemId(self, system_id): + "Sets the system identifier of this InputSource." + self.__system_id = system_id + + def getSystemId(self): + "Returns the system identifier of this InputSource." + return self.__system_id + + def setEncoding(self, encoding): + """Sets the character encoding of this InputSource. + + The encoding must be a string acceptable for an XML encoding + declaration (see section 4.3.3 of the XML recommendation). + + The encoding attribute of the InputSource is ignored if the + InputSource also contains a character stream.""" + self.__encoding = encoding + + def getEncoding(self): + "Get the character encoding of this InputSource." + return self.__encoding + + def setByteStream(self, bytefile): + """Set the byte stream (a Python file-like object which does + not perform byte-to-character conversion) for this input + source. + + The SAX parser will ignore this if there is also a character + stream specified, but it will use a byte stream in preference + to opening a URI connection itself. + + If the application knows the character encoding of the byte + stream, it should set it with the setEncoding method.""" + self.__bytefile = bytefile + + def getByteStream(self): + """Get the byte stream for this input source. + + The getEncoding method will return the character encoding for + this byte stream, or None if unknown.""" + return self.__bytefile + + def setCharacterStream(self, charfile): + """Set the character stream for this input source. (The stream + must be a Python 1.6 Unicode-wrapped file-like that performs + conversion to Unicode strings.) + + If there is a character stream specified, the SAX parser will + ignore any byte stream and will not attempt to open a URI + connection to the system identifier.""" + self.__charfile = charfile + + def getCharacterStream(self): + "Get the character stream for this input source." + return self.__charfile + # ===== ATTRIBUTESIMPL ===== class AttributesImpl: |