diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2012-02-10 08:45:44 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2012-02-10 08:45:44 (GMT) |
commit | fa3702dc28fa8aef291785c560832c9af60305a8 (patch) | |
tree | 3f83918e3a492d96c097ea85df7fda26559ab43d /Lib/html | |
parent | 5b14d732d8790a6a19cc8aa410740575ff94c85a (diff) | |
download | cpython-fa3702dc28fa8aef291785c560832c9af60305a8.zip cpython-fa3702dc28fa8aef291785c560832c9af60305a8.tar.gz cpython-fa3702dc28fa8aef291785c560832c9af60305a8.tar.bz2 |
#13960: HTMLParser is now able to handle broken comments when strict=False.
Diffstat (limited to 'Lib/html')
-rw-r--r-- | Lib/html/parser.py | 25 |
1 file changed, 24 insertions, 1 deletion
```diff
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index dd9c2e1..5c4a7ef 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase):
                 elif startswith("<?", i):
                     k = self.parse_pi(i)
                 elif startswith("<!", i):
-                    k = self.parse_declaration(i)
+                    # this might fail with things like <! not a comment > or
+                    # <! -- space before '--' -->. When strict is True an
+                    # error is raised, when it's False they will be considered
+                    # as bogus comments and parsed (see parse_bogus_comment).
+                    if self.strict:
+                        k = self.parse_declaration(i)
+                    else:
+                        try:
+                            k = self.parse_declaration(i)
+                        except HTMLParseError:
+                            k = self.parse_bogus_comment(i)
                 elif (i + 1) < n:
                     self.handle_data("<")
                     k = i + 1
@@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase):
         i = self.updatepos(i, n)
         self.rawdata = rawdata[i:]
 
+    # Internal -- parse bogus comment, return length or -1 if not terminated
+    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
+    def parse_bogus_comment(self, i, report=1):
+        rawdata = self.rawdata
+        if rawdata[i:i+2] != '<!':
+            self.error('unexpected call to parse_comment()')
+        pos = rawdata.find('>', i+2)
+        if pos == -1:
+            return -1
+        if report:
+            self.handle_comment(rawdata[i+2:pos])
+        return pos + 1
+
     # Internal -- parse processing instr, return end or -1 if not terminated
     def parse_pi(self, i):
         rawdata = self.rawdata
```