From 4b92cc3f7924e455b7e41cf1a66034a44ede0cc0 Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Mon, 13 Feb 2012 16:10:44 +0200 Subject: #13960: HTMLParser is now able to handle broken comments. --- Lib/HTMLParser.py | 36 +++++++++++++++++++++++++++- Lib/test/test_htmlparser.py | 58 +++++++++++++++++++++++++++++---------------- Misc/NEWS | 2 ++ 3 files changed, 74 insertions(+), 22 deletions(-) diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 1c6989e..516bc70 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -160,7 +160,7 @@ class HTMLParser(markupbase.ParserBase): elif startswith(" + gtpos = rawdata.find('>', 9) + if gtpos == -1: + return -1 + self.handle_decl(rawdata[i+2:gtpos]) + return gtpos+1 + else: + return self.parse_bogus_comment(i) + + # Internal -- parse bogus comment, return length or -1 if not terminated + # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state + def parse_bogus_comment(self, i, report=1): + rawdata = self.rawdata + if rawdata[i:i+2] != '', i+2) + if pos == -1: + return -1 + if report: + self.handle_comment(rawdata[i+2:pos]) + return pos + 1 + # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 14ed80c..29a721c 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -114,7 +114,7 @@ comment1b--> sample text “ - + """, [ ("data", "\n"), @@ -142,24 +142,6 @@ text ("data", " foo"), ]) - def test_doctype_decl(self): - inside = """\ -DOCTYPE html [ - - - - - - - %paramEntity; - -]""" - self._run_check("" % inside, [ - ("decl", inside), - ]) - def test_bad_nesting(self): # Strangely, this *is* supposed to test that overlapping # elements are allowed. HTMLParser is more geared toward @@ -182,7 +164,8 @@ DOCTYPE html [ ]) def test_illegal_declarations(self): - self._parse_error('') + self._run_check('', + [('comment', 'spacer type="block" height="25"')]) def test_starttag_end_boundary(self): self._run_check("""""", [("starttag", "a", [("b", "<")])]) @@ -233,7 +216,7 @@ DOCTYPE html [ self._parse_error("", [ @@ -449,6 +432,39 @@ class AttributesTestCase(TestCaseBase): [("href", "http://www.example.org/\">;")]), ("data", "spam"), ("endtag", "a")]) + def test_comments(self): + html = ("" + '' + '' + '' + '' + '' + '') + expected = [('comment', " I'm a valid comment "), + ('comment', 'me too!'), + ('comment', '--'), + ('comment', ''), + ('comment', '--I have many hyphens--'), + ('comment', ' I have a > in the middle '), + ('comment', ' and I have -- in the middle! ')] + self._run_check(html, expected) + + def test_broken_comments(self): + html = ('' + '' + '' + '' + '') + expected = [ + ('comment', ' not really a comment '), + ('comment', ' not a comment either --'), + ('comment', ' -- close enough --'), + ('comment', ''), + ('comment', '<-- this was an empty comment'), + ('comment', '!! another bogus comment !!!'), + ] + self._run_check(html, expected) + def test_condcoms(self): html = ('' '' diff --git a/Misc/NEWS b/Misc/NEWS index 3318913..55b19b1 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -90,6 +90,8 @@ Core and Builtins Library ------- +- Issue #13960: HTMLParser is now able to handle broken comments. + - Issue #9750: Fix sqlite3.Connection.iterdump on tables and fields with a name that is a keyword or contains quotes. Patch by Marko Kohtala. -- cgit v0.12