"""Tests for HTMLParser.py."""
import html.parser
import pprint
import unittest
from test import support
class EventCollector(html.parser.HTMLParser):
def __init__(self, *args, **kw):
self.events = []
self.append = self.events.append
html.parser.HTMLParser.__init__(self, *args, **kw)
def get_events(self):
# Normalize the list of events so that buffer artefacts don't
# separate runs of contiguous characters.
L = []
prevtype = None
for event in self.events:
type = event[0]
if type == prevtype == "data":
L[-1] = ("data", L[-1][1] + event[1])
else:
L.append(event)
prevtype = type
self.events = L
return L
# structure markup
def handle_starttag(self, tag, attrs):
self.append(("starttag", tag, attrs))
def handle_startendtag(self, tag, attrs):
self.append(("startendtag", tag, attrs))
def handle_endtag(self, tag):
self.append(("endtag", tag))
# all other markup
def handle_comment(self, data):
self.append(("comment", data))
def handle_charref(self, data):
self.append(("charref", data))
def handle_data(self, data):
self.append(("data", data))
def handle_decl(self, data):
self.append(("decl", data))
def handle_entityref(self, data):
self.append(("entityref", data))
def handle_pi(self, data):
self.append(("pi", data))
def unknown_decl(self, decl):
self.append(("unknown decl", decl))
class EventCollectorExtra(EventCollector):
def handle_starttag(self, tag, attrs):
EventCollector.handle_starttag(self, tag, attrs)
self.append(("starttag_text", self.get_starttag_text()))
class TestCaseBase(unittest.TestCase):
def get_collector(self):
raise NotImplementedError
def _run_check(self, source, expected_events, collector=None):
if collector is None:
collector = self.get_collector()
parser = collector
for s in source:
parser.feed(s)
parser.close()
events = parser.get_events()
if events != expected_events:
self.fail("received events did not match expected events\n"
"Expected:\n" + pprint.pformat(expected_events) +
"\nReceived:\n" + pprint.pformat(events))
def _run_check_extra(self, source, events):
self._run_check(source, events, EventCollectorExtra())
def _parse_error(self, source):
def parse(source=source):
parser = html.parser.HTMLParser()
parser.feed(source)
parser.close()
self.assertRaises(html.parser.HTMLParseError, parse)
class HTMLParserStrictTestCase(TestCaseBase):
def get_collector(self):
return EventCollector(strict=True)
def test_processing_instruction_only(self):
self._run_check("", [
("pi", "processing instruction"),
])
self._run_check("", [
("pi", "processing instruction ?"),
])
def test_simple_html(self):
self._run_check("""
&entity;
sample
text
“
""", [
("data", "\n"),
("decl", "DOCTYPE html PUBLIC 'foo'"),
("data", "\n"),
("starttag", "html", []),
("entityref", "entity"),
("charref", "32"),
("data", "\n"),
("comment", "comment1a\n->
',
'foo = "";',
'foo = "";',
'foo = <\n/script> ',
'',
('\n//<\\/s\'+\'cript>\');\n//]]>'),
'\n\n',
'foo = "";',
'',
# these two should be invalid according to the HTML 5 spec,
# section 8.1.2.2
#'foo = \nscript>',
#'foo = script>',
]
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
for content in contents:
for element in elements:
element_lower = element.lower()
s = '<{element}>{content}{element}>'.format(element=element,
content=content)
self._run_check(s, [("starttag", element_lower, []),
("data", content),
("endtag", element_lower)])
def test_entityrefs_in_attributes(self):
self._run_check("",
[("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
def get_collector(self):
return EventCollector(strict=False)
def test_tolerant_parsing(self):
self._run_check('te>>xt&a</img>"
"
")
expected = [
('starttag', 'html', []),
('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
('starttag', 'table',
[('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
('starttag', 'tr', []),
('starttag', 'td', [('align', 'left')]),
('starttag', 'font', [('size', '-1')]),
('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
('endtag', 'span'), ('endtag', 'a'),
('data', '- '), ('starttag', 'a', [('href', '/1/')]),
('starttag', 'span', [('class', 'en')]), ('data', ' library'),
('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
]
self._run_check(html, expected)
def test_comma_between_attributes(self):
self._run_check('"
" "
"- software-and-i"
"- library