diff options
Diffstat (limited to 'Lib/test/test_htmlparser.py')
| -rw-r--r-- | Lib/test/test_htmlparser.py | 107 | 
1 files changed, 88 insertions, 19 deletions
| diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 11d9c9c..144f820 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -70,6 +70,15 @@ class EventCollectorExtra(EventCollector):          self.append(("starttag_text", self.get_starttag_text())) +class EventCollectorCharrefs(EventCollector): + +    def handle_charref(self, data): +        self.fail('This should never be called with convert_charrefs=True') + +    def handle_entityref(self, data): +        self.fail('This should never be called with convert_charrefs=True') + +  class TestCaseBase(unittest.TestCase):      def get_collector(self): @@ -84,26 +93,30 @@ class TestCaseBase(unittest.TestCase):          parser.close()          events = parser.get_events()          if events != expected_events: -            self.fail("received events did not match expected events\n" -                      "Expected:\n" + pprint.pformat(expected_events) + +            self.fail("received events did not match expected events" + +                      "\nSource:\n" + repr(source) + +                      "\nExpected:\n" + pprint.pformat(expected_events) +                        "\nReceived:\n" + pprint.pformat(events))      def _run_check_extra(self, source, events): -        self._run_check(source, events, EventCollectorExtra()) +        self._run_check(source, events, +                        EventCollectorExtra(convert_charrefs=False))      def _parse_error(self, source):          def parse(source=source):              parser = self.get_collector()              parser.feed(source)              parser.close() -        self.assertRaises(html.parser.HTMLParseError, parse) +        with self.assertRaises(html.parser.HTMLParseError): +            with self.assertWarns(DeprecationWarning): +                parse()  class HTMLParserStrictTestCase(TestCaseBase):      def get_collector(self):          with support.check_warnings(("", DeprecationWarning), quite=False): -            return EventCollector(strict=True) +            return EventCollector(strict=True, convert_charrefs=False)      def test_processing_instruction_only(self):          self._run_check("<?processing instruction>", [ @@ -339,7 +352,7 @@ text              self._run_check(s, [("starttag", element_lower, []),                                  ("data", content),                                  ("endtag", element_lower)], -                            collector=Collector()) +                            collector=Collector(convert_charrefs=False))      def test_comments(self):          html = ("<!-- I'm a valid comment -->" @@ -367,11 +380,60 @@ text                      ('comment', '[if lte IE 7]>pretty?<![endif]')]          self._run_check(html, expected) +    def test_convert_charrefs(self): +        collector = lambda: EventCollectorCharrefs(convert_charrefs=True) +        self.assertTrue(collector().convert_charrefs) +        charrefs = ['"', '"', '"', '"', '"', '"'] +        # check charrefs in the middle of the text/attributes +        expected = [('starttag', 'a', [('href', 'foo"zar')]), +                    ('data', 'a"z'), ('endtag', 'a')] +        for charref in charrefs: +            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref), +                            expected, collector=collector()) +        # check charrefs at the beginning/end of the text/attributes +        expected = [('data', '"'), +                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]), +                    ('data', '"'), ('endtag', 'a'), ('data', '"')] +        for charref in charrefs: +            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">' +                            '{0}</a>{0}'.format(charref), +                            expected, collector=collector()) +        # check charrefs in <script>/<style> elements +        for charref in charrefs: +            text = 'X'.join([charref]*3) +            expected = [('data', '"'), +                        ('starttag', 'script', []), ('data', text), +                        ('endtag', 'script'), ('data', '"'), +                        ('starttag', 'style', []), ('data', text), +                        ('endtag', 'style'), ('data', '"')] +            self._run_check('{1}<script>{0}</script>{1}' +                            '<style>{0}</style>{1}'.format(text, charref), +                            expected, collector=collector()) +        # check truncated charrefs at the end of the file +        html = '&quo &# &#x' +        for x in range(1, len(html)): +            self._run_check(html[:x], [('data', html[:x])], +                            collector=collector()) +        # check a string with no charrefs +        self._run_check('no charrefs here', [('data', 'no charrefs here')], +                        collector=collector()) +  class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):      def get_collector(self): -        return EventCollector(strict=False) +        return EventCollector(convert_charrefs=False) + +    def test_deprecation_warnings(self): +        with self.assertWarns(DeprecationWarning): +            EventCollector()  # convert_charrefs not passed explicitly +        with self.assertWarns(DeprecationWarning): +            EventCollector(strict=True) +        with self.assertWarns(DeprecationWarning): +            EventCollector(strict=False) +        with self.assertRaises(html.parser.HTMLParseError): +            with self.assertWarns(DeprecationWarning): +                EventCollector().error('test')      def test_tolerant_parsing(self):          self._run_check('<html <html>te>>xt&a<<bc</a></html>\n' @@ -564,17 +626,12 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):          for html, expected in data:              self._run_check(html, expected) -    def test_unescape_function(self): +    def test_unescape_method(self): +        from html import unescape          p = self.get_collector() -        self.assertEqual(p.unescape('&#bad;'),'&#bad;') -        self.assertEqual(p.unescape('&'),'&') -        # see #12888 -        self.assertEqual(p.unescape('{ ' * 1050), '{ ' * 1050) -        # see #15156 -        self.assertEqual(p.unescape('ÉricÉric' -                                    '&alphacentauriαcentauri'), -                                    'ÉricÉric&alphacentauriαcentauri') -        self.assertEqual(p.unescape('&co;'), '&co;') +        with self.assertWarns(DeprecationWarning): +            s = '""""""&#bad;' +            self.assertEqual(p.unescape(s), unescape(s))      def test_broken_comments(self):          html = ('<! not really a comment >' @@ -625,12 +682,24 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):          ]          self._run_check(html, expected) +    def test_convert_charrefs_dropped_text(self): +        # #23144: make sure that all the events are triggered when +        # convert_charrefs is True, even if we don't call .close() +        parser = EventCollector(convert_charrefs=True) +        # before the fix, bar & baz was missing +        parser.feed("foo <a>link</a> bar & baz") +        self.assertEqual( +            parser.get_events(), +            [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'), +             ('endtag', 'a'), ('data', ' bar & baz')] +        ) +  class AttributesStrictTestCase(TestCaseBase):      def get_collector(self):          with support.check_warnings(("", DeprecationWarning), quite=False): -            return EventCollector(strict=True) +            return EventCollector(strict=True, convert_charrefs=False)      def test_attr_syntax(self):          output = [ @@ -691,7 +760,7 @@ class AttributesStrictTestCase(TestCaseBase):  class AttributesTolerantTestCase(AttributesStrictTestCase):      def get_collector(self): -        return EventCollector(strict=False) +        return EventCollector(convert_charrefs=False)      def test_attr_funky_names2(self):          self._run_check( | 
