diff options
Diffstat (limited to 'Lib/test/test_sgmllib.py')
-rw-r--r-- | Lib/test/test_sgmllib.py | 111 |
1 files changed, 96 insertions, 15 deletions
diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index 8e8b02f..28a21a4 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -1,4 +1,6 @@ +import htmlentitydefs import pprint +import re import sgmllib import unittest from test import test_support @@ -64,6 +66,37 @@ class CDATAEventCollector(EventCollector): self.setliteral() +class HTMLEntityCollector(EventCollector): + + entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)' + '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)') + + def convert_charref(self, name): + self.append(("charref", "convert", name)) + if name[0] != "x": + return EventCollector.convert_charref(self, name) + + def convert_codepoint(self, codepoint): + self.append(("codepoint", "convert", codepoint)) + EventCollector.convert_codepoint(self, codepoint) + + def convert_entityref(self, name): + self.append(("entityref", "convert", name)) + return EventCollector.convert_entityref(self, name) + + # These to record that they were called, then pass the call along + # to the default implementation so that it's actions can be + # recorded. + + def handle_charref(self, data): + self.append(("charref", data)) + sgmllib.SGMLParser.handle_charref(self, data) + + def handle_entityref(self, data): + self.append(("entityref", data)) + sgmllib.SGMLParser.handle_entityref(self, data) + + class SGMLParserTestCase(unittest.TestCase): collector = EventCollector @@ -218,7 +251,9 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' """Substitution of entities and charrefs in attribute values""" # SF bug #1452246 self.check_events("""<a b=< c=<> d=<-> e='< ' - f="&xxx;" g=' !' h='Ǵ' i='x?a=b&c=d;'>""", + f="&xxx;" g=' !' h='Ǵ' + i='x?a=b&c=d;' + j='&#42;' k='&#42;'>""", [("starttag", "a", [("b", "<"), ("c", "<>"), ("d", "<->"), @@ -226,13 +261,59 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ("f", "&xxx;"), ("g", " !"), ("h", "Ǵ"), - ("i", "x?a=b&c=d;"), ])]) + ("i", "x?a=b&c=d;"), + ("j", "*"), + ("k", "*"), + ])]) + + def test_convert_overrides(self): + # This checks that the character and entity reference + # conversion helpers are called at the documented times. No + # attempt is made to really change what the parser accepts. + # + self.collector = HTMLEntityCollector + self.check_events(('<a title="“test”">foo</a>' + '&foobar;*'), [ + ('entityref', 'convert', 'ldquo'), + ('charref', 'convert', 'x201d'), + ('starttag', 'a', [('title', '“test”')]), + ('data', 'foo'), + ('endtag', 'a'), + ('entityref', 'foobar'), + ('entityref', 'convert', 'foobar'), + ('charref', '42'), + ('charref', 'convert', '42'), + ('codepoint', 'convert', 42), + ]) + + def test_attr_values_quoted_markup(self): + """Multi-line and markup in attribute values""" + self.check_events("""<a title='foo\n<br>bar'>text</a>""", + [("starttag", "a", [("title", "foo\n<br>bar")]), + ("data", "text"), + ("endtag", "a")]) + self.check_events("""<a title='less < than'>text</a>""", + [("starttag", "a", [("title", "less < than")]), + ("data", "text"), + ("endtag", "a")]) + self.check_events("""<a title='greater > than'>text</a>""", + [("starttag", "a", [("title", "greater > than")]), + ("data", "text"), + ("endtag", "a")]) def test_attr_funky_names(self): self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), ]) + def test_attr_value_ip6_url(self): + # http://www.python.org/sf/853506 + self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>" + "<a href=http://[1080::8:800:200C:417A]/>"), [ + ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]), + ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]), + ]) + def test_illegal_declarations(self): s = 'abc<!spacer type="block" height="25">def' self.check_events(s, [ @@ -301,8 +382,8 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' # that needs to be carefully considered before changing it. def _test_starttag_end_boundary(self): - self.check_events("""<a b='<'>""", [("starttag", "a", [("b", "<")])]) - self.check_events("""<a b='>'>""", [("starttag", "a", [("b", ">")])]) + self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])]) + self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])]) def _test_buffer_artefacts(self): output = [("starttag", "a", [("b", "<")])] @@ -322,17 +403,17 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' self.check_events(["<a b='>'", ">"], output) output = [("comment", "abc")] - self._run_check(["", "<!--abc-->"], output) - self._run_check(["<", "!--abc-->"], output) - self._run_check(["<!", "--abc-->"], output) - self._run_check(["<!-", "-abc-->"], output) - self._run_check(["<!--", "abc-->"], output) - self._run_check(["<!--a", "bc-->"], output) - self._run_check(["<!--ab", "c-->"], output) - self._run_check(["<!--abc", "-->"], output) - self._run_check(["<!--abc-", "->"], output) - self._run_check(["<!--abc--", ">"], output) - self._run_check(["<!--abc-->", ""], output) + self.check_events(["", "<!--abc-->"], output) + self.check_events(["<", "!--abc-->"], output) + self.check_events(["<!", "--abc-->"], output) + self.check_events(["<!-", "-abc-->"], output) + self.check_events(["<!--", "abc-->"], output) + self.check_events(["<!--a", "bc-->"], output) + self.check_events(["<!--ab", "c-->"], output) + self.check_events(["<!--abc", "-->"], output) + self.check_events(["<!--abc-", "->"], output) + self.check_events(["<!--abc--", ">"], output) + self.check_events(["<!--abc-->", ""], output) def _test_starttag_junk_chars(self): self.check_parse_error("<") |