diff options
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/html/parser.py | 99 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 48 |
2 files changed, 125 insertions, 22 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py index c2c7f6b..8d275ab 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -24,10 +24,14 @@ starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') +# Note, the strict one of this pair isn't really strict, but we can't +# make it correctly strict without breaking backward compatibility. attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?') - +attrfind_tolerant = re.compile( + r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name @@ -42,6 +46,21 @@ locatestarttagend = re.compile(r""" )* \s* # trailing whitespace """, re.VERBOSE) +locatestarttagend_tolerant = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s* # optional whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + (?:\s*,)* # possibly followed by a comma + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) endendtag = re.compile('>') endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') @@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase): CDATA_CONTENT_ELEMENTS = ("script", "style") + def __init__(self, strict=True): + """Initialize and reset this instance. - def __init__(self): - """Initialize and reset this instance.""" + If strict is set to True (the default), errors are raised when invalid + HTML is encountered. If set to False, an attempt is instead made to + continue parsing, making "best guesses" about the intended meaning, in + a fashion similar to what browsers typically do. + """ + self.strict = strict self.reset() def reset(self): @@ -160,9 +185,18 @@ class HTMLParser(_markupbase.ParserBase): else: break if k < 0: - if end: + if not end: + break + if self.strict: self.error("EOF in middle of construct") - break + k = rawdata.find('>', i + 1) + if k < 0: + k = rawdata.find('<', i + 1) + if k < 0: + k = i + 1 + else: + k += 1 + self.handle_data(rawdata[i:k]) i = self.updatepos(i, k) elif startswith("&#", i): match = charref.match(rawdata, i) @@ -193,7 +227,12 @@ class HTMLParser(_markupbase.ParserBase): if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: - self.error("EOF in middle of entity or char ref") + if self.strict: + self.error("EOF in middle of entity or char ref") + else: + if k <= i: + k = n + i = self.updatepos(i, i + 1) # incomplete break elif (i + 1) < n: @@ -240,7 +279,10 @@ class HTMLParser(_markupbase.ParserBase): self.lasttag = tag = rawdata[i+1:k].lower() while k < endpos: - m = attrfind.match(rawdata, k) + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.search(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) @@ -262,8 +304,11 @@ class HTMLParser(_markupbase.ParserBase): - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos if end.endswith('/>'): # XHTML-style empty tag: <span attr="value" /> self.handle_startendtag(tag, attrs) @@ -277,7 +322,10 @@ class HTMLParser(_markupbase.ParserBase): # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata - m = locatestarttagend.match(rawdata, i) + if self.strict: + m = locatestarttagend.match(rawdata, i) + else: + m = locatestarttagend_tolerant.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] @@ -290,8 +338,13 @@ class HTMLParser(_markupbase.ParserBase): # buffer boundary return -1 # else bogus input - self.updatepos(i, j + 1) - self.error("malformed empty start tag") + if self.strict: + self.updatepos(i, j + 1) + self.error("malformed empty start tag") + if j > i: + return j + else: + return i + 1 if next == "": # end of input return -1 @@ -300,8 +353,13 @@ class HTMLParser(_markupbase.ParserBase): # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 - self.updatepos(i, j) - self.error("malformed start tag") + if self.strict: + self.updatepos(i, j) + self.error("malformed start tag") + if j > i: + return j + else: + return i + 1 raise AssertionError("we should not get here!") # Internal -- parse endtag, return end or -1 if incomplete @@ -314,7 +372,15 @@ class HTMLParser(_markupbase.ParserBase): j = match.end() match = endtagfind.match(rawdata, i) # </ + tag + > if not match: - self.error("bad end tag: %r" % (rawdata[i:j],)) + if self.strict: + self.error("bad end tag: %r" % (rawdata[i:j],)) + k = rawdata.find('<', i + 1, j) + if k > i: + j = k + if j <= i: + j = i + 1 + self.handle_data(rawdata[i:j]) + return j tag = match.group(1) self.handle_endtag(tag.lower()) self.clear_cdata_mode() @@ -358,7 +424,8 @@ class HTMLParser(_markupbase.ParserBase): pass def unknown_decl(self, data): - self.error("unknown declaration: %r" % (data,)) + if self.strict: + self.error("unknown declaration: %r" % (data,)) # Internal -- helper to remove special character quoting entitydefs = None diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index e982218..beaf6b6 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -8,10 +8,10 @@ from test import support class EventCollector(html.parser.HTMLParser): - def __init__(self): + def __init__(self, *args, **kw): self.events = [] self.append = self.events.append - html.parser.HTMLParser.__init__(self) + html.parser.HTMLParser.__init__(self, *args, **kw) def get_events(self): # Normalize the list of events so that buffer artefacts don't @@ -72,8 +72,10 @@ class EventCollectorExtra(EventCollector): class TestCaseBase(unittest.TestCase): - def _run_check(self, source, expected_events, collector=EventCollector): - parser = collector() + def _run_check(self, source, expected_events, collector=None): + if collector is None: + collector = EventCollector() + parser = collector for s in source: parser.feed(s) parser.close() @@ -84,7 +86,7 @@ class TestCaseBase(unittest.TestCase): "\nReceived:\n" + pprint.pformat(events)) def _run_check_extra(self, source, events): - self._run_check(source, events, EventCollectorExtra) + self._run_check(source, events, EventCollectorExtra()) def _parse_error(self, source): def parse(source=source): @@ -321,8 +323,42 @@ DOCTYPE html [ ]) +class HTMLParserTolerantTestCase(TestCaseBase): + + def setUp(self): + self.collector = EventCollector(strict=False) + + def test_tolerant_parsing(self): + self._run_check('<html <html>te>>xt&a<<bc</a></html>\n' + '<img src="URL><//img></html</html>', [ + ('data', '<html '), + ('starttag', 'html', []), + ('data', 'te>>xt'), + ('entityref', 'a'), + ('data', '<<bc'), + ('endtag', 'a'), + ('endtag', 'html'), + ('data', '\n<img src="URL><//img></html'), + ('endtag', 'html')], + collector = self.collector) + + def test_comma_between_attributes(self): + self._run_check('<form action="/xxx.php?a=1&b=2&", ' + 'method="post">', [ + ('starttag', 'form', + [('action', '/xxx.php?a=1&b=2&'), + ('method', 'post')])], + collector = self.collector) + + def test_weird_chars_in_unquoted_attribute_values(self): + self._run_check('<form action=bogus|&#()value>', [ + ('starttag', 'form', + [('action', 'bogus|&#()value')])], + collector = self.collector) + + def test_main(): - support.run_unittest(HTMLParserTestCase) + support.run_unittest(HTMLParserTestCase, HTMLParserTolerantTestCase) if __name__ == "__main__": |