diff options
Diffstat (limited to 'Lib/test/test_htmlparser.py')
-rw-r--r-- | Lib/test/test_htmlparser.py | 98 |
1 files changed, 79 insertions, 19 deletions
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index c977a9d..1a480c8 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector): self.append(("starttag_text", self.get_starttag_text())) +class EventCollectorCharrefs(EventCollector): + + def get_events(self): + return self.events + + def handle_charref(self, data): + self.fail('This should never be called with convert_charrefs=True') + + def handle_entityref(self, data): + self.fail('This should never be called with convert_charrefs=True') + + class TestCaseBase(unittest.TestCase): def get_collector(self): @@ -84,26 +96,30 @@ class TestCaseBase(unittest.TestCase): parser.close() events = parser.get_events() if events != expected_events: - self.fail("received events did not match expected events\n" - "Expected:\n" + pprint.pformat(expected_events) + + self.fail("received events did not match expected events" + + "\nSource:\n" + repr(source) + + "\nExpected:\n" + pprint.pformat(expected_events) + "\nReceived:\n" + pprint.pformat(events)) def _run_check_extra(self, source, events): - self._run_check(source, events, EventCollectorExtra()) + self._run_check(source, events, + EventCollectorExtra(convert_charrefs=False)) def _parse_error(self, source): def parse(source=source): parser = self.get_collector() parser.feed(source) parser.close() - self.assertRaises(html.parser.HTMLParseError, parse) + with self.assertRaises(html.parser.HTMLParseError): + with self.assertWarns(DeprecationWarning): + parse() class HTMLParserStrictTestCase(TestCaseBase): def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True) + return EventCollector(strict=True, convert_charrefs=False) def test_processing_instruction_only(self): self._run_check("<?processing instruction>", [ @@ -333,7 +349,7 @@ text self._run_check(s, [("starttag", element_lower, []), ("data", content), 
("endtag", element_lower)], - collector=Collector()) + collector=Collector(convert_charrefs=False)) def test_comments(self): html = ("<!-- I'm a valid comment -->" @@ -361,11 +377,60 @@ text ('comment', '[if lte IE 7]>pretty?<![endif]')] self._run_check(html, expected) + def test_convert_charrefs(self): + collector = lambda: EventCollectorCharrefs(convert_charrefs=True) + self.assertTrue(collector().convert_charrefs) + charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22'] + # check charrefs in the middle of the text/attributes + expected = [('starttag', 'a', [('href', 'foo"zar')]), + ('data', 'a"z'), ('endtag', 'a')] + for charref in charrefs: + self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref), + expected, collector=collector()) + # check charrefs at the beginning/end of the text/attributes + expected = [('data', '"'), + ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]), + ('data', '"'), ('endtag', 'a'), ('data', '"')] + for charref in charrefs: + self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">' + '{0}</a>{0}'.format(charref), + expected, collector=collector()) + # check charrefs in <script>/<style> elements + for charref in charrefs: + text = 'X'.join([charref]*3) + expected = [('data', '"'), + ('starttag', 'script', []), ('data', text), + ('endtag', 'script'), ('data', '"'), + ('starttag', 'style', []), ('data', text), + ('endtag', 'style'), ('data', '"')] + self._run_check('{1}<script>{0}</script>{1}' + '<style>{0}</style>{1}'.format(text, charref), + expected, collector=collector()) + # check truncated charrefs at the end of the file + html = '&quo &# &#x' + for x in range(1, len(html)): + self._run_check(html[:x], [('data', html[:x])], + collector=collector()) + # check a string with no charrefs + self._run_check('no charrefs here', [('data', 'no charrefs here')], + collector=collector()) + class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): def get_collector(self): - return EventCollector(strict=False) + return 
EventCollector(convert_charrefs=False) + + def test_deprecation_warnings(self): + with self.assertWarns(DeprecationWarning): + EventCollector() # convert_charrefs not passed explicitly + with self.assertWarns(DeprecationWarning): + EventCollector(strict=True) + with self.assertWarns(DeprecationWarning): + EventCollector(strict=False) + with self.assertRaises(html.parser.HTMLParseError): + with self.assertWarns(DeprecationWarning): + EventCollector().error('test') def test_tolerant_parsing(self): self._run_check('<html <html>te>>xt&a<<bc</a></html>\n' @@ -558,17 +623,12 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): for html, expected in data: self._run_check(html, expected) - def test_unescape_function(self): + def test_unescape_method(self): + from html import unescape p = self.get_collector() - self.assertEqual(p.unescape('&#bad;'),'&#bad;') - self.assertEqual(p.unescape('&#0038;'),'&') - # see #12888 - self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) - # see #15156 - self.assertEqual(p.unescape('&Eacute;ric&Eacute;ric' - '&alphacentauri&alpha;centauri'), - 'ÉricÉric&alphacentauriαcentauri') - self.assertEqual(p.unescape('&amp;co;'), '&co;') + with self.assertWarns(DeprecationWarning): + s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;' + self.assertEqual(p.unescape(s), unescape(s)) def test_broken_comments(self): html = ('<! not really a comment >' @@ -624,7 +684,7 @@ class AttributesStrictTestCase(TestCaseBase): def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True) + return EventCollector(strict=True, convert_charrefs=False) def test_attr_syntax(self): output = [ @@ -685,7 +745,7 @@ class AttributesStrictTestCase(TestCaseBase): class AttributesTolerantTestCase(AttributesStrictTestCase): def get_collector(self): - return EventCollector(strict=False) + return EventCollector(convert_charrefs=False) def test_attr_funky_names2(self): self._run_check( |