diff options
-rw-r--r-- | Doc/library/html.parser.rst | 42 | ||||
-rw-r--r-- | Lib/html/parser.py | 106 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 69 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
4 files changed, 23 insertions, 197 deletions
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index 44b7d6e..67ae139 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -16,9 +16,9 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=False, *, convert_charrefs=False) +.. class:: HTMLParser(*, convert_charrefs=False) - Create a parser instance. + Create a parser instance able to parse invalid markup. If *convert_charrefs* is ``True`` (default: ``False``), all character references (except the ones in ``script``/``style`` elements) are @@ -26,12 +26,6 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. The use of ``convert_charrefs=True`` is encouraged and will become the default in Python 3.5. - If *strict* is ``False`` (the default), the parser will accept and parse - invalid markup. If *strict* is ``True`` the parser will raise an - :exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not - able to parse the markup. The use of ``strict=True`` is discouraged and - the *strict* argument is deprecated. - An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered. The user should subclass :class:`.HTMLParser` and override its @@ -40,32 +34,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. This parser does not check that end tags match start tags or call the end-tag handler for elements which are closed implicitly by closing an outer element. - .. versionchanged:: 3.2 - *strict* argument added. - - .. deprecated-removed:: 3.3 3.5 - The *strict* argument and the strict mode have been deprecated. - The parser is now able to accept and parse invalid markup too. - .. versionchanged:: 3.4 *convert_charrefs* keyword argument added. -An exception is defined as well: - - -.. exception:: HTMLParseError - - Exception raised by the :class:`HTMLParser` class when it encounters an error - while parsing and *strict* is ``True``. This exception provides three - attributes: :attr:`msg` is a brief message explaining the error, - :attr:`lineno` is the number of the line on which the broken construct was - detected, and :attr:`offset` is the number of characters into the line at - which the construct starts. - - .. deprecated-removed:: 3.3 3.5 - This exception has been deprecated because it's never raised by the parser - (when the default non-strict mode is used). - Example HTML Parser Application ------------------------------- @@ -246,8 +217,7 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): The *data* parameter will be the entire contents of the declaration inside the ``<![...]>`` markup. It is sometimes useful to be overridden by a - derived class. The base class implementation raises an :exc:`HTMLParseError` - when *strict* is ``True``. + derived class. The base class implementation does nothing. .. _htmlparser-examples: @@ -358,9 +328,3 @@ Parsing invalid HTML (e.g. unquoted attributes) also works:: Data : tag soup End tag : p End tag : a - -.. rubric:: Footnotes - -.. [#] For backward compatibility reasons *strict* mode does not raise - exceptions for all non-compliant HTML. That is, some invalid HTML - is tolerated even in *strict* mode. diff --git a/Lib/html/parser.py b/Lib/html/parser.py index a650d5e..5a4f9e1 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -29,35 +29,15 @@ starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') # Note: -# 1) the strict attrfind isn't really strict, but we can't make it -# correctly strict without breaking backward compatibility; -# 2) if you change tagfind/attrfind remember to update locatestarttagend too; -# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will +# 1) if you change tagfind/attrfind remember to update locatestarttagend too; +# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will # explode, so don't do it. -tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*') # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') -attrfind = re.compile( - r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') -locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name @@ -79,24 +59,6 @@ endendtag = re.compile('>') endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') -class HTMLParseError(Exception): - """Exception raised for all parse errors.""" - - def __init__(self, msg, position=(None, None)): - assert msg - self.msg = msg - self.lineno = position[0] - self.offset = position[1] - - def __str__(self): - result = self.msg - if self.lineno is not None: - result = result + ", at line %d" % self.lineno - if self.offset is not None: - result = result + ", column %d" % (self.offset + 1) - return result - - _default_sentinel = object() class HTMLParser(_markupbase.ParserBase): @@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase): CDATA_CONTENT_ELEMENTS = ("script", "style") - def __init__(self, strict=_default_sentinel, *, - convert_charrefs=_default_sentinel): + def __init__(self, *, convert_charrefs=_default_sentinel): """Initialize and reset this instance. If convert_charrefs is True (default: False), all character references are automatically converted to the corresponding Unicode characters. - If strict is set to False (the default) the parser will parse invalid - markup, otherwise it will raise an error. Note that the strict mode - and argument are deprecated. """ - if strict is not _default_sentinel: - warnings.warn("The strict argument and mode are deprecated.", - DeprecationWarning, stacklevel=2) - else: - strict = False # default - self.strict = strict if convert_charrefs is _default_sentinel: convert_charrefs = False # default warnings.warn("The value of convert_charrefs will become True in " @@ -168,11 +120,6 @@ class HTMLParser(_markupbase.ParserBase): """Handle any buffered data.""" self.goahead(1) - def error(self, message): - warnings.warn("The 'error' method is deprecated.", - DeprecationWarning, stacklevel=2) - raise HTMLParseError(message, self.getpos()) - __starttag_text = None def get_starttag_text(self): @@ -227,10 +174,7 @@ class HTMLParser(_markupbase.ParserBase): elif startswith("<?", i): k = self.parse_pi(i) elif startswith("<!", i): - if self.strict: - k = self.parse_declaration(i) - else: - k = self.parse_html_declaration(i) + k = self.parse_html_declaration(i) elif (i + 1) < n: self.handle_data("<") k = i + 1 @@ -239,8 +183,6 @@ class HTMLParser(_markupbase.ParserBase): if k < 0: if not end: break - if self.strict: - self.error("EOF in middle of construct") k = rawdata.find('>', i + 1) if k < 0: k = rawdata.find('<', i + 1) @@ -282,13 +224,10 @@ class HTMLParser(_markupbase.ParserBase): if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: - if self.strict: - self.error("EOF in middle of entity or char ref") - else: - k = match.end() - if k <= i: - k = n - i = self.updatepos(i, i + 1) + k = match.end() + if k <= i: + k = n + i = self.updatepos(i, i + 1) # incomplete break elif (i + 1) < n: @@ -367,18 +306,12 @@ class HTMLParser(_markupbase.ParserBase): # Now parse the data between i+1 and j into a tag and attrs attrs = [] - if self.strict: - match = tagfind.match(rawdata, i+1) - else: - match = tagfind_tolerant.match(rawdata, i+1) + match = tagfind_tolerant.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = match.group(1).lower() while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) + m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) @@ -401,9 +334,6 @@ class HTMLParser(_markupbase.ParserBase): - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): @@ -419,10 +349,7 @@ class HTMLParser(_markupbase.ParserBase): # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata - if self.strict: - m = locatestarttagend.match(rawdata, i) - else: - m = locatestarttagend_tolerant.match(rawdata, i) + m = locatestarttagend_tolerant.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] @@ -435,9 +362,6 @@ class HTMLParser(_markupbase.ParserBase): # buffer boundary return -1 # else bogus input - if self.strict: - self.updatepos(i, j + 1) - self.error("malformed empty start tag") if j > i: return j else: @@ -450,9 +374,6 @@ class HTMLParser(_markupbase.ParserBase): # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 - if self.strict: - self.updatepos(i, j) - self.error("malformed start tag") if j > i: return j else: @@ -472,8 +393,6 @@ class HTMLParser(_markupbase.ParserBase): if self.cdata_elem is not None: self.handle_data(rawdata[i:gtpos]) return gtpos - if self.strict: - self.error("bad end tag: %r" % (rawdata[i:gtpos],)) # find the name: w3.org/TR/html5/tokenization.html#tag-name-state namematch = tagfind_tolerant.match(rawdata, i+2) if not namematch: @@ -539,8 +458,7 @@ class HTMLParser(_markupbase.ParserBase): pass def unknown_decl(self, data): - if self.strict: - self.error("unknown declaration: %r" % (data,)) + pass # Internal -- helper to remove special character quoting def unescape(self, s): diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 2d771a2..1aa150803 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -85,7 +85,7 @@ class EventCollectorCharrefs(EventCollector): class TestCaseBase(unittest.TestCase): def get_collector(self): - raise NotImplementedError + return EventCollector(convert_charrefs=False) def _run_check(self, source, expected_events, collector=None): if collector is None: @@ -105,21 +105,8 @@ class TestCaseBase(unittest.TestCase): self._run_check(source, events, EventCollectorExtra(convert_charrefs=False)) - def _parse_error(self, source): - def parse(source=source): - parser = self.get_collector() - parser.feed(source) - parser.close() - with self.assertRaises(html.parser.HTMLParseError): - with self.assertWarns(DeprecationWarning): - parse() - - -class HTMLParserStrictTestCase(TestCaseBase): - def get_collector(self): - with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True, convert_charrefs=False) +class HTMLParserTestCase(TestCaseBase): def test_processing_instruction_only(self): self._run_check("<?processing instruction>", [ @@ -201,9 +188,6 @@ text ("data", "this < text > contains < bare>pointy< brackets"), ]) - def test_illegal_declarations(self): - self._parse_error('<!spacer type="block" height="25">') - def test_starttag_end_boundary(self): self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])]) self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])]) @@ -238,25 +222,6 @@ text self._run_check(["<!--abc--", ">"], output) self._run_check(["<!--abc-->", ""], output) - def test_starttag_junk_chars(self): - self._parse_error("</>") - self._parse_error("</$>") - self._parse_error("</") - self._parse_error("</a") - self._parse_error("<a<a>") - self._parse_error("</a<a>") - self._parse_error("<!") - self._parse_error("<a") - self._parse_error("<a foo='bar'") - self._parse_error("<a foo='bar") - self._parse_error("<a foo='>'") - self._parse_error("<a foo='>") - self._parse_error("<a$>") - self._parse_error("<a$b>") - self._parse_error("<a$b/>") - self._parse_error("<a$b >") - self._parse_error("<a$b />") - def test_valid_doctypes(self): # from http://www.w3.org/QA/2002/04/valid-dtd-list.html dtds = ['HTML', # HTML5 doctype @@ -281,9 +246,6 @@ text self._run_check("<!DOCTYPE %s>" % dtd, [('decl', 'DOCTYPE ' + dtd)]) - def test_declaration_junk_chars(self): - self._parse_error("<!DOCTYPE foo $ >") - def test_startendtag(self): self._run_check("<p/>", [ ("startendtag", "p", []), @@ -421,23 +383,12 @@ text self._run_check('no charrefs here', [('data', 'no charrefs here')], collector=collector()) - -class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): - - def get_collector(self): - return EventCollector(convert_charrefs=False) - def test_deprecation_warnings(self): with self.assertWarns(DeprecationWarning): EventCollector() # convert_charrefs not passed explicitly - with self.assertWarns(DeprecationWarning): - EventCollector(strict=True) - with self.assertWarns(DeprecationWarning): - EventCollector(strict=False) - with self.assertRaises(html.parser.HTMLParseError): - with self.assertWarns(DeprecationWarning): - EventCollector().error('test') + # the remaining tests were for the "tolerant" parser (which is now + # the default), and check various kind of broken markup def test_tolerant_parsing(self): self._run_check('<html <html>te>>xt&a<<bc</a></html>\n' '<img src="URL><//img></html</html>', [ @@ -686,11 +637,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): self._run_check(html, expected) -class AttributesStrictTestCase(TestCaseBase): - - def get_collector(self): - with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True, convert_charrefs=False) +class AttributesTestCase(TestCaseBase): def test_attr_syntax(self): output = [ @@ -747,12 +694,6 @@ class AttributesStrictTestCase(TestCaseBase): [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) - -class AttributesTolerantTestCase(AttributesStrictTestCase): - - def get_collector(self): - return EventCollector(convert_charrefs=False) - def test_attr_funky_names2(self): self._run_check( "<a $><b $=%><c \=/>", @@ -121,6 +121,9 @@ Core and Builtins Library ------- +- Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error, + and the HTMLParserError exception have been removed. + - Issue #22085: Dropped support of Tk 8.3 in Tkinter. - Issue #21580: Now Tkinter correctly handles bytes arguments passed to Tk. |