summary | refs | log | tree | commit | diff | stats
path: root/Lib/test/test_htmlparser.py
diff options
context:
space:
mode:
author: Ezio Melotti <ezio.melotti@gmail.com> 2013-11-23 17:52:05 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2013-11-23 17:52:05 (GMT)
commit: 95401c5f6b9f07b094924559177c9b30a1c38998 (patch)
tree: 3029ea3bbffc0c53c64275a2e587bbf696a740cb /Lib/test/test_htmlparser.py
parent: e7f87e12626d6ae3b9ed8cae8904a6afad580ffc (diff)
download: cpython-95401c5f6b9f07b094924559177c9b30a1c38998.zip
cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.gz
cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.bz2
#13633: Added a new convert_charrefs keyword arg to HTMLParser that, when True, automatically converts all character references.
Diffstat (limited to 'Lib/test/test_htmlparser.py')
-rw-r--r-- Lib/test/test_htmlparser.py | 70
1 files changed, 62 insertions, 8 deletions
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 509b3cd..1a480c8 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector):
self.append(("starttag_text", self.get_starttag_text()))
+class EventCollectorCharrefs(EventCollector):
+
+ def get_events(self):
+ return self.events
+
+ def handle_charref(self, data):
+ self.fail('This should never be called with convert_charrefs=True')
+
+ def handle_entityref(self, data):
+ self.fail('This should never be called with convert_charrefs=True')
+
+
class TestCaseBase(unittest.TestCase):
def get_collector(self):
@@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase):
parser.close()
events = parser.get_events()
if events != expected_events:
- self.fail("received events did not match expected events\n"
- "Expected:\n" + pprint.pformat(expected_events) +
+ self.fail("received events did not match expected events" +
+ "\nSource:\n" + repr(source) +
+ "\nExpected:\n" + pprint.pformat(expected_events) +
"\nReceived:\n" + pprint.pformat(events))
def _run_check_extra(self, source, events):
- self._run_check(source, events, EventCollectorExtra())
+ self._run_check(source, events,
+ EventCollectorExtra(convert_charrefs=False))
def _parse_error(self, source):
def parse(source=source):
@@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseBase):
def get_collector(self):
with support.check_warnings(("", DeprecationWarning), quite=False):
- return EventCollector(strict=True)
+ return EventCollector(strict=True, convert_charrefs=False)
def test_processing_instruction_only(self):
self._run_check("<?processing instruction>", [
@@ -335,7 +349,7 @@ text
self._run_check(s, [("starttag", element_lower, []),
("data", content),
("endtag", element_lower)],
- collector=Collector())
+ collector=Collector(convert_charrefs=False))
def test_comments(self):
html = ("<!-- I'm a valid comment -->"
@@ -363,14 +377,54 @@ text
('comment', '[if lte IE 7]>pretty?<![endif]')]
self._run_check(html, expected)
+ def test_convert_charrefs(self):
+ collector = lambda: EventCollectorCharrefs(convert_charrefs=True)
+ self.assertTrue(collector().convert_charrefs)
+ charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
+ # check charrefs in the middle of the text/attributes
+ expected = [('starttag', 'a', [('href', 'foo"zar')]),
+ ('data', 'a"z'), ('endtag', 'a')]
+ for charref in charrefs:
+ self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+ expected, collector=collector())
+ # check charrefs at the beginning/end of the text/attributes
+ expected = [('data', '"'),
+ ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+ ('data', '"'), ('endtag', 'a'), ('data', '"')]
+ for charref in charrefs:
+ self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+ '{0}</a>{0}'.format(charref),
+ expected, collector=collector())
+ # check charrefs in <script>/<style> elements
+ for charref in charrefs:
+ text = 'X'.join([charref]*3)
+ expected = [('data', '"'),
+ ('starttag', 'script', []), ('data', text),
+ ('endtag', 'script'), ('data', '"'),
+ ('starttag', 'style', []), ('data', text),
+ ('endtag', 'style'), ('data', '"')]
+ self._run_check('{1}<script>{0}</script>{1}'
+ '<style>{0}</style>{1}'.format(text, charref),
+ expected, collector=collector())
+ # check truncated charrefs at the end of the file
+ html = '&quo &# &#x'
+ for x in range(1, len(html)):
+ self._run_check(html[:x], [('data', html[:x])],
+ collector=collector())
+ # check a string with no charrefs
+ self._run_check('no charrefs here', [('data', 'no charrefs here')],
+ collector=collector())
+
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
def get_collector(self):
- return EventCollector()
+ return EventCollector(convert_charrefs=False)
def test_deprecation_warnings(self):
with self.assertWarns(DeprecationWarning):
+ EventCollector() # convert_charrefs not passed explicitly
+ with self.assertWarns(DeprecationWarning):
EventCollector(strict=True)
with self.assertWarns(DeprecationWarning):
EventCollector(strict=False)
@@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseBase):
def get_collector(self):
with support.check_warnings(("", DeprecationWarning), quite=False):
- return EventCollector(strict=True)
+ return EventCollector(strict=True, convert_charrefs=False)
def test_attr_syntax(self):
output = [
@@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseBase):
class AttributesTolerantTestCase(AttributesStrictTestCase):
def get_collector(self):
- return EventCollector()
+ return EventCollector(convert_charrefs=False)
def test_attr_funky_names2(self):
self._run_check(