From 3861d8b27127a261391ee49ff8634a4ef3ba1dd3 Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Sat, 23 Jun 2012 15:27:51 +0200 Subject: #15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup. --- Doc/library/html.parser.rst | 21 +++++++++++++++------ Lib/html/parser.py | 21 ++++++++++++--------- Lib/test/test_htmlparser.py | 6 ++++-- Misc/NEWS | 5 ++++- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index f3c36ec..4715185 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -16,13 +16,14 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=True) +.. class:: HTMLParser(strict=False) - Create a parser instance. If *strict* is ``True`` (the default), invalid - HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If - *strict* is ``False``, the parser uses heuristics to make a best guess at - the intention of any invalid HTML it encounters, similar to the way most - browsers do. Using ``strict=False`` is advised. + Create a parser instance. If *strict* is ``False`` (the default), the parser + will accept and parse invalid markup. If *strict* is ``True`` the parser + will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when + it's not able to parse the markup. + The use of ``strict=True`` is discouraged and the *strict* argument is + deprecated. An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are @@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. .. versionchanged:: 3.2 *strict* keyword added + .. deprecated-removed:: 3.3 3.5 + The *strict* argument and the strict mode have been deprecated. + The parser is now able to accept and parse invalid markup too. + An exception is defined as well: @@ -46,6 +51,10 @@ An exception is defined as well: detected, and :attr:`offset` is the number of characters into the line at which the construct starts. + .. deprecated-removed:: 3.3 3.5 + This exception has been deprecated because it's never raised by the parser + (when the default non-strict mode is used). + Example HTML Parser Application ------------------------------- diff --git a/Lib/html/parser.py b/Lib/html/parser.py index de504ab..494cf24 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -10,6 +10,7 @@ import _markupbase import re +import warnings # Regular expressions used for parsing @@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase): CDATA_CONTENT_ELEMENTS = ("script", "style") - def __init__(self, strict=True): + def __init__(self, strict=False): """Initialize and reset this instance. - If strict is set to True (the default), errors are raised when invalid - HTML is encountered. If set to False, an attempt is instead made to - continue parsing, making "best guesses" about the intended meaning, in - a fashion similar to what browsers typically do. + If strict is set to False (the default) the parser will parse invalid + markup, otherwise it will raise an error. Note that the strict mode + is deprecated. """ + if strict: + warnings.warn("The strict mode is deprecated.", + DeprecationWarning, stacklevel=2) self.strict = strict self.reset() @@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase): # See also parse_declaration in _markupbase def parse_html_declaration(self, i): rawdata = self.rawdata - if rawdata[i:i+2] != '', i+2) if pos == -1: return -1 diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index c4f80cc..64a4f5d 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase): class HTMLParserStrictTestCase(TestCaseBase): def get_collector(self): - return EventCollector(strict=True) + with support.check_warnings(("", DeprecationWarning), quite=False): + return EventCollector(strict=True) def test_processing_instruction_only(self): self._run_check("", [ @@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): class AttributesStrictTestCase(TestCaseBase): def get_collector(self): - return EventCollector(strict=True) + with support.check_warnings(("", DeprecationWarning), quite=False): + return EventCollector(strict=True) def test_attr_syntax(self): output = [ diff --git a/Misc/NEWS b/Misc/NEWS index 0325058..d76aeeb 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -43,6 +43,9 @@ Core and Builtins Library ------- +- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception + are deprecated now that the parser is able to parse invalid markup. + - Issue #3665: \u and \U escapes are now supported in unicode regular expressions. Patch by Serhiy Storchaka. @@ -78,7 +81,7 @@ Library - Issue #9527: datetime.astimezone() method will now supply a class timezone instance corresponding to the system local timezone when called with no arguments. - + - Issue #14653: email.utils.mktime_tz() no longer relies on system mktime() when timezone offest is supplied. -- cgit v0.12