summary | refs | log | tree | commit | diff | stats
path: root/Lib/xml/sax
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/xml/sax')
-rw-r--r-- Lib/xml/sax/__init__.py 10
-rw-r--r-- Lib/xml/sax/expatreader.py 22
-rw-r--r-- Lib/xml/sax/saxutils.py 22
-rw-r--r-- Lib/xml/sax/xmlreader.py 115
4 files changed, 138 insertions, 31 deletions
diff --git a/Lib/xml/sax/__init__.py b/Lib/xml/sax/__init__.py
index 1077d5d..24b7d24 100644
--- a/Lib/xml/sax/__init__.py
+++ b/Lib/xml/sax/__init__.py
@@ -21,16 +21,17 @@ expatreader -- Driver that allows use of the Expat parser with the
"""
+from xmlreader import InputSource
from handler import ContentHandler, ErrorHandler
from _exceptions import SAXException, SAXNotRecognizedException, \
SAXParseException, SAXNotSupportedException
-def parse(filename_or_stream, handler, errorHandler=ErrorHandler()):
+def parse(source, handler, errorHandler=ErrorHandler()):
parser = ExpatParser()
parser.setContentHandler(handler)
parser.setErrorHandler(errorHandler)
- parser.parse(filename_or_stream)
+ parser.parse(source)
def parseString(string, handler, errorHandler=ErrorHandler()):
try:
@@ -43,7 +44,10 @@ def parseString(string, handler, errorHandler=ErrorHandler()):
parser = ExpatParser()
parser.setContentHandler(handler)
parser.setErrorHandler(errorHandler)
- parser.parse(StringIO(string))
+
+ inpsrc = InputSource()
+ inpsrc.setByteStream(StringIO(string))
+ parser.parse(inpsrc)
# this is the parser list used by the make_parser function if no
# alternatives are given as parameters to the function
diff --git a/Lib/xml/sax/expatreader.py b/Lib/xml/sax/expatreader.py
index 2f1ff1c..341efd3 100644
--- a/Lib/xml/sax/expatreader.py
+++ b/Lib/xml/sax/expatreader.py
@@ -18,7 +18,7 @@ version = "0.20"
from xml.sax._exceptions import *
from xml.parsers import expat
-from xml.sax import xmlreader
+from xml.sax import xmlreader, saxutils
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl
@@ -37,28 +37,24 @@ class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
# XMLReader methods
- def parse(self, stream_or_string):
+ def parse(self, source):
"Parse an XML document from a URL."
- if type(stream_or_string) is type(""):
- stream = open(stream_or_string)
- else:
- stream = stream_or_string
-
+ source = saxutils.prepare_input_source(source)
+
+ self._source = source
self.reset()
self._cont_handler.setDocumentLocator(self)
try:
- xmlreader.IncrementalParser.parse(self, stream)
+ xmlreader.IncrementalParser.parse(self, source)
except expat.error:
error_code = self._parser.ErrorCode
raise SAXParseException(expat.ErrorString(error_code), None, self)
self._cont_handler.endDocument()
- def prepareParser(self, filename=None):
- self._source = filename
-
- if self._source != None:
- self._parser.SetBase(self._source)
+ def prepareParser(self, source):
+ if source.getSystemId() != None:
+ self._parser.SetBase(source.getSystemId())
def getFeature(self, name):
if name == feature_namespaces:
diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py
index fe13bde..8f8f42e 100644
--- a/Lib/xml/sax/saxutils.py
+++ b/Lib/xml/sax/saxutils.py
@@ -3,6 +3,7 @@ A library of useful helper classes to the SAX classes, for the
convenience of application and driver writers.
"""
+import os, urlparse, urllib
import handler
import xmlreader
@@ -181,3 +182,24 @@ class XMLFilterBase(xmlreader.XMLReader):
def setProperty(self, name, value):
self._parent.setProperty(name, value)
+
+# --- Utility functions
+
+def prepare_input_source(source, base = ""):
+ """This function takes an InputSource and an optional base URL and
+ returns a fully resolved InputSource object ready for reading."""
+
+ if type(source) == type(""):
+ source = xmlreader.InputSource(source)
+
+ if source.getByteStream() == None:
+ sysid = source.getSystemId()
+ if urlparse.urlparse(sysid)[0] == '':
+ basehead = os.path.split(os.path.normpath(base))[0]
+ source.setSystemId(os.path.join(basehead, sysid))
+ else:
+ source.setSystemId(urlparse.urljoin(base, sysid))
+
+ source.setByteStream(urllib.urlopen(source.getSystemId()))
+
+ return source
diff --git a/Lib/xml/sax/xmlreader.py b/Lib/xml/sax/xmlreader.py
index 6dae0b5..e5133f6 100644
--- a/Lib/xml/sax/xmlreader.py
+++ b/Lib/xml/sax/xmlreader.py
@@ -6,6 +6,7 @@ import handler
# ===== XMLREADER =====
class XMLReader:
+
def __init__(self):
self._cont_handler = handler.ContentHandler()
#self._dtd_handler = handler.DTDHandler()
@@ -73,7 +74,8 @@ class XMLReader:
"Sets the value of a SAX2 property."
raise SAXNotRecognizedException("Property '%s' not recognized" % name)
-
+import saxutils
+
class IncrementalParser(XMLReader):
"""This interface adds three extra methods to the XMLReader
interface that allow XML parsers to support incremental
@@ -98,24 +100,18 @@ class IncrementalParser(XMLReader):
self._bufsize = bufsize
XMLReader.__init__(self)
- def _parseOpenFile(self, source):
- buffer = source.read(self._bufsize)
+ def parse(self, source):
+ source = saxutils.prepare_input_source(source)
+
+ self.prepareParser(source)
+ file = source.getByteStream()
+ buffer = file.read(self._bufsize)
while buffer != "":
self.feed(buffer)
- buffer = source.read(self._bufsize)
- self.close()
+ buffer = file.read(self._bufsize)
+
self.reset()
- def parse(self, source):
- if hasattr(source, "read"):
- self._parseOpenFile(source)
- else:
- #FIXME: how to recognize if it is a URL instead of filename?
- self.prepareParser(source)
- file = open(source)
- self._parseOpenFile(file)
- file.close()
-
def feed(self, data):
"""This method gives the raw XML data in the data parameter to
the parser and makes it parse the data, emitting the
@@ -174,6 +170,95 @@ class Locator:
"Return the system identifier for the current event."
return None
+# ===== INPUTSOURCE =====
+
+class InputSource:
+ """Encapsulation of the information needed by the XMLReader to
+ read entities.
+
+ This class may include information about the public identifier,
+ system identifier, byte stream (possibly with character encoding
+ information) and/or the character stream of an entity.
+
+ Applications will create objects of this class for use in the
+ XMLReader.parse method and for returning from
+ EntityResolver.resolveEntity.
+
+ An InputSource belongs to the application, the XMLReader is not
+ allowed to modify InputSource objects passed to it from the
+ application, although it may make copies and modify those."""
+
+ def __init__(self, system_id = None):
+ self.__system_id = system_id
+ self.__public_id = None
+ self.__encoding = None
+ self.__bytefile = None
+ self.__charfile = None
+
+ def setPublicId(self, public_id):
+ "Sets the public identifier of this InputSource."
+ self.__public_id = public_id
+
+ def getPublicId(self):
+ "Returns the public identifier of this InputSource."
+ return self.__public_id
+
+ def setSystemId(self, system_id):
+ "Sets the system identifier of this InputSource."
+ self.__system_id = system_id
+
+ def getSystemId(self):
+ "Returns the system identifier of this InputSource."
+ return self.__system_id
+
+ def setEncoding(self, encoding):
+ """Sets the character encoding of this InputSource.
+
+ The encoding must be a string acceptable for an XML encoding
+ declaration (see section 4.3.3 of the XML recommendation).
+
+ The encoding attribute of the InputSource is ignored if the
+ InputSource also contains a character stream."""
+ self.__encoding = encoding
+
+ def getEncoding(self):
+ "Get the character encoding of this InputSource."
+ return self.__encoding
+
+ def setByteStream(self, bytefile):
+ """Set the byte stream (a Python file-like object which does
+ not perform byte-to-character conversion) for this input
+ source.
+
+ The SAX parser will ignore this if there is also a character
+ stream specified, but it will use a byte stream in preference
+ to opening a URI connection itself.
+
+ If the application knows the character encoding of the byte
+ stream, it should set it with the setEncoding method."""
+ self.__bytefile = bytefile
+
+ def getByteStream(self):
+ """Get the byte stream for this input source.
+
+ The getEncoding method will return the character encoding for
+ this byte stream, or None if unknown."""
+ return self.__bytefile
+
+ def setCharacterStream(self, charfile):
+ """Set the character stream for this input source. (The stream
+ must be a Python 1.6 Unicode-wrapped file-like that performs
+ conversion to Unicode strings.)
+
+ If there is a character stream specified, the SAX parser will
+ ignore any byte stream and will not attempt to open a URI
+ connection to the system identifier."""
+ self.__charfile = charfile
+
+ def getCharacterStream(self):
+ "Get the character stream for this input source."
+ return self.__charfile
+
# ===== ATTRIBUTESIMPL =====
class AttributesImpl: