From 61de087f0f838f5b69592827d3d592c06aa9b655 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 2 Apr 2015 21:00:13 +0300 Subject: Issue #2175: SAX parsers now support a character stream of InputSource object. --- Doc/library/xml.sax.reader.rst | 12 ++++++------ Doc/whatsnew/3.5.rst | 7 +++++++ Lib/test/test_sax.py | 33 +++++++++++++++++++++++++++++++++ Lib/xml/sax/expatreader.py | 11 ++++++++--- Lib/xml/sax/saxutils.py | 7 +++++-- Lib/xml/sax/xmlreader.py | 4 +++- Misc/NEWS | 2 ++ 7 files changed, 64 insertions(+), 12 deletions(-) diff --git a/Doc/library/xml.sax.reader.rst b/Doc/library/xml.sax.reader.rst index 3ab6063..31ca260 100644 --- a/Doc/library/xml.sax.reader.rst +++ b/Doc/library/xml.sax.reader.rst @@ -100,8 +100,10 @@ The :class:`XMLReader` interface supports the following methods: system identifier (a string identifying the input source -- typically a file name or an URL), a file-like object, or an :class:`InputSource` object. When :meth:`parse` returns, the input is completely processed, and the parser object - can be discarded or reset. As a limitation, the current implementation only - accepts byte streams; processing of character streams is for further study. + can be discarded or reset. + + .. versionchanged:: 3.5 + Added support of character streams. .. method:: XMLReader.getContentHandler() @@ -288,8 +290,7 @@ InputSource Objects .. method:: InputSource.setByteStream(bytefile) - Set the byte stream (a Python file-like object which does not perform - byte-to-character conversion) for this input source. + Set the byte stream (a :term:`binary file`) for this input source. The SAX parser will ignore this if there is also a character stream specified, but it will use a byte stream in preference to opening a URI connection itself. @@ -308,8 +309,7 @@ InputSource Objects .. method:: InputSource.setCharacterStream(charfile) - Set the character stream for this input source. (The stream must be a Python 1.6 - Unicode-wrapped file-like that performs conversion to strings.) + Set the character stream (a :term:`text file`) for this input source. If there is a character stream specified, the SAX parser will ignore any byte stream and will not attempt to open a URI connection to the system identifier. diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst index 4fea8af..e3381ae 100644 --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -499,6 +499,13 @@ xmlrpc * :class:`xmlrpc.client.ServerProxy` is now a :term:`context manager`. (Contributed by Claudiu Popa in :issue:`20627`.) +xml.sax +------- + +* SAX parsers now support a character stream of + :class:`~xml.sax.xmlreader.InputSource` object. + (Contributed by Serhiy Storchaka in :issue:`2175`.) + faulthandler ------------ diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py index c8d5b21..813dc2e 100644 --- a/Lib/test/test_sax.py +++ b/Lib/test/test_sax.py @@ -185,12 +185,24 @@ class PrepareInputSourceTest(unittest.TestCase): def make_byte_stream(self): return BytesIO(b"This is a byte stream.") + def make_character_stream(self): + return StringIO("This is a character stream.") + def checkContent(self, stream, content): self.assertIsNotNone(stream) self.assertEqual(stream.read(), content) stream.close() + def test_character_stream(self): + # If the source is an InputSource with a character stream, use it. + src = InputSource(self.file) + src.setCharacterStream(self.make_character_stream()) + prep = prepare_input_source(src) + self.assertIsNone(prep.getByteStream()) + self.checkContent(prep.getCharacterStream(), + "This is a character stream.") + def test_byte_stream(self): # If the source is an InputSource that does not have a character # stream but does have a byte stream, use the byte stream. @@ -225,6 +237,14 @@ class PrepareInputSourceTest(unittest.TestCase): self.checkContent(prep.getByteStream(), b"This is a byte stream.") + def test_text_file(self): + # If the source is a text file-like object, use it as a character + # stream. + prep = prepare_input_source(self.make_character_stream()) + self.assertIsNone(prep.getByteStream()) + self.checkContent(prep.getCharacterStream(), + "This is a character stream.") + # ===== XMLGenerator @@ -904,6 +924,19 @@ class ExpatReaderTest(XmlTestBase): self.assertEqual(result.getvalue(), xml_test_out) + def test_expat_inpsource_character_stream(self): + parser = create_parser() + result = BytesIO() + xmlgen = XMLGenerator(result) + + parser.setContentHandler(xmlgen) + inpsrc = InputSource() + with open(TEST_XMLFILE, 'rt', encoding='iso-8859-1') as f: + inpsrc.setCharacterStream(f) + parser.parse(inpsrc) + + self.assertEqual(result.getvalue(), xml_test_out) + # ===== IncrementalParser support def test_expat_incremental(self): diff --git a/Lib/xml/sax/expatreader.py b/Lib/xml/sax/expatreader.py index a227cda..65ac7e3 100644 --- a/Lib/xml/sax/expatreader.py +++ b/Lib/xml/sax/expatreader.py @@ -219,9 +219,14 @@ class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator): self._parsing = 0 # break cycle created by expat handlers pointing to our methods self._parser = None - bs = self._source.getByteStream() - if bs is not None: - bs.close() + try: + file = self._source.getCharacterStream() + if file is not None: + file.close() + finally: + file = self._source.getByteStream() + if file is not None: + file.close() def _reset_cont_handler(self): self._parser.ProcessingInstructionHandler = \ diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py index 1d3d0ec..a69c7f7 100644 --- a/Lib/xml/sax/saxutils.py +++ b/Lib/xml/sax/saxutils.py @@ -345,11 +345,14 @@ def prepare_input_source(source, base=""): elif hasattr(source, "read"): f = source source = xmlreader.InputSource() - source.setByteStream(f) + if isinstance(f.read(0), str): + source.setCharacterStream(f) + else: + source.setByteStream(f) if hasattr(f, "name") and isinstance(f.name, str): source.setSystemId(f.name) - if source.getByteStream() is None: + if source.getCharacterStream() is None and source.getByteStream() is None: sysid = source.getSystemId() basehead = os.path.dirname(os.path.normpath(base)) sysidfilename = os.path.join(basehead, sysid) diff --git a/Lib/xml/sax/xmlreader.py b/Lib/xml/sax/xmlreader.py index 7ef497f..716f228 100644 --- a/Lib/xml/sax/xmlreader.py +++ b/Lib/xml/sax/xmlreader.py @@ -117,7 +117,9 @@ class IncrementalParser(XMLReader): source = saxutils.prepare_input_source(source) self.prepareParser(source) - file = source.getByteStream() + file = source.getCharacterStream() + if file is None: + file = source.getByteStream() buffer = file.read(self._bufsize) while buffer: self.feed(buffer) diff --git a/Misc/NEWS b/Misc/NEWS index 9563a5b..b230674 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -16,6 +16,8 @@ Core and Builtins Library ------- +- Issue #2175: SAX parsers now support a character stream of InputSource object. + - Issue #16840: Tkinter now supports 64-bit integers added in Tcl 8.4 and arbitrary precision integers added in Tcl 8.5. -- cgit v0.12