bpo-35018: Sax parser should provide user access to lexical handlers (GH-20958)

Co-Authored-By: Jonathan Gossage <jgossage@gmail.com>
author: Zackery Spytz <zspytz@gmail.com> 2020-08-09 10:50:53 (GMT)
committer: GitHub <noreply@github.com> 2020-08-09 10:50:53 (GMT)
commit: e28b8c93878072dc02b116108ef5443084290d47 (patch)
tree: 5a5a398cdc0dbb9f8c78fe9d37ed15b7e3ce89ce /Lib
parent: 67acf74c4eaf64a860cc1bcda6efe6e9cb01f89b (diff)
download: cpython-e28b8c93878072dc02b116108ef5443084290d47.zip
cpython-e28b8c93878072dc02b116108ef5443084290d47.tar.gz
cpython-e28b8c93878072dc02b116108ef5443084290d47.tar.bz2
2 files changed, 200 insertions, 2 deletions
diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py
index cfc674b..801143f 100644
--- a/Lib/test/test_sax.py
+++ b/Lib/test/test_sax.py
@@ -13,7 +13,8 @@ except SAXReaderNotAvailable:
 from xml.sax.saxutils import XMLGenerator, escape, unescape, quoteattr, \
                              XMLFilterBase, prepare_input_source
 from xml.sax.expatreader import create_parser
-from xml.sax.handler import feature_namespaces, feature_external_ges
+from xml.sax.handler import (feature_namespaces, feature_external_ges,
+                             LexicalHandler)
 from xml.sax.xmlreader import InputSource, AttributesImpl, AttributesNSImpl
 from io import BytesIO, StringIO
 import codecs
@@ -1356,6 +1357,155 @@ class XmlReaderTest(XmlTestBase):
         self.assertEqual(attrs.getQNameByName((ns_uri, "attr")), "ns:attr")
 
 
+class LexicalHandlerTest(unittest.TestCase):
+    def setUp(self):
+        self.parser = None
+
+        self.specified_version = '1.0'
+        self.specified_encoding = 'UTF-8'
+        self.specified_doctype = 'wish'
+        self.specified_entity_names = ('nbsp', 'source', 'target')
+        self.specified_comment = ('Comment in a DTD',
+                                  'Really! You think so?')
+        self.test_data = StringIO()
+        self.test_data.write('<?xml version="{}" encoding="{}"?>\n'.
+                             format(self.specified_version,
+                                    self.specified_encoding))
+        self.test_data.write('<!DOCTYPE {} [\n'.
+                             format(self.specified_doctype))
+        self.test_data.write('<!-- {} -->\n'.
+                             format(self.specified_comment[0]))
+        self.test_data.write('<!ELEMENT {} (to,from,heading,body,footer)>\n'.
+                             format(self.specified_doctype))
+        self.test_data.write('<!ELEMENT to (#PCDATA)>\n')
+        self.test_data.write('<!ELEMENT from (#PCDATA)>\n')
+        self.test_data.write('<!ELEMENT heading (#PCDATA)>\n')
+        self.test_data.write('<!ELEMENT body (#PCDATA)>\n')
+        self.test_data.write('<!ELEMENT footer (#PCDATA)>\n')
+        self.test_data.write('<!ENTITY {} "&#xA0;">\n'.
+                             format(self.specified_entity_names[0]))
+        self.test_data.write('<!ENTITY {} "Written by: Alexander.">\n'.
+                             format(self.specified_entity_names[1]))
+        self.test_data.write('<!ENTITY {} "Hope it gets to: Aristotle.">\n'.
+                             format(self.specified_entity_names[2]))
+        self.test_data.write(']>\n')
+        self.test_data.write('<{}>'.format(self.specified_doctype))
+        self.test_data.write('<to>Aristotle</to>\n')
+        self.test_data.write('<from>Alexander</from>\n')
+        self.test_data.write('<heading>Supplication</heading>\n')
+        self.test_data.write('<body>Teach me patience!</body>\n')
+        self.test_data.write('<footer>&{};&{};&{};</footer>\n'.
+                             format(self.specified_entity_names[1],
+                                    self.specified_entity_names[0],
+                                    self.specified_entity_names[2]))
+        self.test_data.write('<!-- {} -->\n'.format(self.specified_comment[1]))
+        self.test_data.write('</{}>\n'.format(self.specified_doctype))
+        self.test_data.seek(0)
+
+        # Data received from handlers - to be validated
+        self.version = None
+        self.encoding = None
+        self.standalone = None
+        self.doctype = None
+        self.publicID = None
+        self.systemID = None
+        self.end_of_dtd = False
+        self.comments = []
+
+    def test_handlers(self):
+        class TestLexicalHandler(LexicalHandler):
+            def __init__(self, test_harness, *args, **kwargs):
+                super().__init__(*args, **kwargs)
+                self.test_harness = test_harness
+
+            def startDTD(self, doctype, publicID, systemID):
+                self.test_harness.doctype = doctype
+                self.test_harness.publicID = publicID
+                self.test_harness.systemID = systemID
+
+            def endDTD(self):
+                self.test_harness.end_of_dtd = True
+
+            def comment(self, text):
+                self.test_harness.comments.append(text)
+
+        self.parser = create_parser()
+        self.parser.setContentHandler(ContentHandler())
+        self.parser.setProperty(
+            'http://xml.org/sax/properties/lexical-handler',
+            TestLexicalHandler(self))
+        source = InputSource()
+        source.setCharacterStream(self.test_data)
+        self.parser.parse(source)
+        self.assertEqual(self.doctype, self.specified_doctype)
+        self.assertIsNone(self.publicID)
+        self.assertIsNone(self.systemID)
+        self.assertTrue(self.end_of_dtd)
+        self.assertEqual(len(self.comments),
+                         len(self.specified_comment))
+        self.assertEqual(f' {self.specified_comment[0]} ', self.comments[0])
+
+
+class CDATAHandlerTest(unittest.TestCase):
+    def setUp(self):
+        self.parser = None
+        self.specified_chars = []
+        self.specified_chars.append(('Parseable character data', False))
+        self.specified_chars.append(('<> &% - assorted other XML junk.', True))
+        self.char_index = 0  # Used to index specified results within handlers
+        self.test_data = StringIO()
+        self.test_data.write('<root_doc>\n')
+        self.test_data.write('<some_pcdata>\n')
+        self.test_data.write(f'{self.specified_chars[0][0]}\n')
+        self.test_data.write('</some_pcdata>\n')
+        self.test_data.write('<some_cdata>\n')
+        self.test_data.write(f'<![CDATA[{self.specified_chars[1][0]}]]>\n')
+        self.test_data.write('</some_cdata>\n')
+        self.test_data.write('</root_doc>\n')
+        self.test_data.seek(0)
+
+        # Data received from handlers - to be validated
+        self.chardata = []
+        self.in_cdata = False
+
+    def test_handlers(self):
+        class TestLexicalHandler(LexicalHandler):
+            def __init__(self, test_harness, *args, **kwargs):
+                super().__init__(*args, **kwargs)
+                self.test_harness = test_harness
+
+            def startCDATA(self):
+                self.test_harness.in_cdata = True
+
+            def endCDATA(self):
+                self.test_harness.in_cdata = False
+
+        class TestCharHandler(ContentHandler):
+            def __init__(self, test_harness, *args, **kwargs):
+                super().__init__(*args, **kwargs)
+                self.test_harness = test_harness
+
+            def characters(self, content):
+                if content != '\n':
+                    h = self.test_harness
+                    t = h.specified_chars[h.char_index]
+                    h.assertEqual(t[0], content)
+                    h.assertEqual(t[1], h.in_cdata)
+                    h.char_index += 1
+
+        self.parser = create_parser()
+        self.parser.setContentHandler(TestCharHandler(self))
+        self.parser.setProperty(
+            'http://xml.org/sax/properties/lexical-handler',
+            TestLexicalHandler(self))
+        source = InputSource()
+        source.setCharacterStream(self.test_data)
+        self.parser.parse(source)
+
+        self.assertFalse(self.in_cdata)
+        self.assertEqual(self.char_index, 2)
+
+
 def test_main():
     run_unittest(MakeParserTest,
                  ParseTest,
@@ -1368,7 +1518,10 @@ def test_main():
                  StreamReaderWriterXmlgenTest,
                  ExpatReaderTest,
                  ErrorReportingTest,
-                 XmlReaderTest)
+                 XmlReaderTest,
+                 LexicalHandlerTest,
+                 CDATAHandlerTest)
+
 
 if __name__ == "__main__":
     test_main()
diff --git a/Lib/xml/sax/handler.py b/Lib/xml/sax/handler.py
index 481733d..e8d417e 100644
--- a/Lib/xml/sax/handler.py
+++ b/Lib/xml/sax/handler.py
@@ -340,3 +340,48 @@ all_properties = [property_lexical_handler,
                   property_xml_string,
                   property_encoding,
                   property_interning_dict]
+
+
+class LexicalHandler:
+    """Optional SAX2 handler for lexical events.
+
+    This handler is used to obtain lexical information about an XML
+    document, that is, information about how the document was encoded
+    (as opposed to what it contains, which is reported to the
+    ContentHandler), such as comments and CDATA marked section
+    boundaries.
+
+    To set the LexicalHandler of an XMLReader, use the setProperty
+    method with the property identifier
+    'http://xml.org/sax/properties/lexical-handler'."""
+
+    def comment(self, content):
+        """Reports a comment anywhere in the document (including the
+        DTD and outside the document element).
+
+        content is a string that holds the contents of the comment."""
+
+    def startDTD(self, name, public_id, system_id):
+        """Report the start of the DTD declarations, if the document
+        has an associated DTD.
+
+        A startEntity event will be reported before declaration events
+        from the external DTD subset are reported, and this can be
+        used to infer from which subset DTD declarations derive.
+
+        name is the name of the document element type, public_id the
+        public identifier of the DTD (or None if none were supplied)
+        and system_id the system identfier of the external subset (or
+        None if none were supplied)."""
+
+    def endDTD(self):
+        """Signals the end of DTD declarations."""
+
+    def startCDATA(self):
+        """Reports the beginning of a CDATA marked section.
+
+        The contents of the CDATA marked section will be reported
+        through the characters event."""
+
+    def endCDATA(self):
+        """Reports the end of a CDATA marked section."""
author	Zackery Spytz <zspytz@gmail.com>	2020-08-09 10:50:53 (GMT)
committer	GitHub <noreply@github.com>	2020-08-09 10:50:53 (GMT)
commit	e28b8c93878072dc02b116108ef5443084290d47 (patch)
tree	5a5a398cdc0dbb9f8c78fe9d37ed15b7e3ce89ce /Lib
parent	67acf74c4eaf64a860cc1bcda6efe6e9cb01f89b (diff)
download	cpython-e28b8c93878072dc02b116108ef5443084290d47.zip cpython-e28b8c93878072dc02b116108ef5443084290d47.tar.gz cpython-e28b8c93878072dc02b116108ef5443084290d47.tar.bz2