summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/library/html.parser.rst 21
-rw-r--r-- Lib/html/parser.py 21
-rw-r--r-- Lib/test/test_htmlparser.py 6
-rw-r--r-- Misc/NEWS 5
4 files changed, 35 insertions, 18 deletions
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index f3c36ec..4715185 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -16,13 +16,14 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(strict=True)
+.. class:: HTMLParser(strict=False)
- Create a parser instance. If *strict* is ``True`` (the default), invalid
- HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If
- *strict* is ``False``, the parser uses heuristics to make a best guess at
- the intention of any invalid HTML it encounters, similar to the way most
- browsers do. Using ``strict=False`` is advised.
+ Create a parser instance. If *strict* is ``False`` (the default), the parser
+ will accept and parse invalid markup. If *strict* is ``True`` the parser
+ will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
+ it's not able to parse the markup.
+ The use of ``strict=True`` is discouraged and the *strict* argument is
+ deprecated.
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
@@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.2 *strict* keyword added
+ .. deprecated-removed:: 3.3 3.5
+ The *strict* argument and the strict mode have been deprecated.
+ The parser is now able to accept and parse invalid markup too.
+
An exception is defined as well:
@@ -46,6 +51,10 @@ An exception is defined as well:
detected, and :attr:`offset` is the number of characters into the line at
which the construct starts.
+ .. deprecated-removed:: 3.3 3.5
+ This exception has been deprecated because it's never raised by the parser
+ (when the default non-strict mode is used).
+
Example HTML Parser Application
-------------------------------
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index de504ab..494cf24 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -10,6 +10,7 @@
import _markupbase
import re
+import warnings
# Regular expressions used for parsing
@@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase):
CDATA_CONTENT_ELEMENTS = ("script", "style")
- def __init__(self, strict=True):
+ def __init__(self, strict=False):
"""Initialize and reset this instance.
- If strict is set to True (the default), errors are raised when invalid
- HTML is encountered. If set to False, an attempt is instead made to
- continue parsing, making "best guesses" about the intended meaning, in
- a fashion similar to what browsers typically do.
+ If strict is set to False (the default) the parser will parse invalid
+ markup, otherwise it will raise an error. Note that the strict mode
+ is deprecated.
"""
+ if strict:
+ warnings.warn("The strict mode is deprecated.",
+ DeprecationWarning, stacklevel=2)
self.strict = strict
self.reset()
@@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase):
# See also parse_declaration in _markupbase
def parse_html_declaration(self, i):
rawdata = self.rawdata
- if rawdata[i:i+2] != '<!':
- self.error('unexpected call to parse_html_declaration()')
+ assert rawdata[i:i+2] == '<!', ('unexpected call to '
+ 'parse_html_declaration()')
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
@@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase):
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
- if rawdata[i:i+2] not in ('<!', '</'):
- self.error('unexpected call to parse_comment()')
+ assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
+ 'parse_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
return -1
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index c4f80cc..64a4f5d 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase):
class HTMLParserStrictTestCase(TestCaseBase):
def get_collector(self):
- return EventCollector(strict=True)
+ with support.check_warnings(("", DeprecationWarning), quiet=False):
+ return EventCollector(strict=True)
def test_processing_instruction_only(self):
self._run_check("<?processing instruction>", [
@@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
class AttributesStrictTestCase(TestCaseBase):
def get_collector(self):
- return EventCollector(strict=True)
+ with support.check_warnings(("", DeprecationWarning), quiet=False):
+ return EventCollector(strict=True)
def test_attr_syntax(self):
output = [
diff --git a/Misc/NEWS b/Misc/NEWS
index 0325058..d76aeeb 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -43,6 +43,9 @@ Core and Builtins
Library
-------
+- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
+ are deprecated now that the parser is able to parse invalid markup.
+
- Issue #3665: \u and \U escapes are now supported in unicode regular
expressions. Patch by Serhiy Storchaka.
@@ -78,7 +81,7 @@ Library
- Issue #9527: datetime.astimezone() method will now supply a class
timezone instance corresponding to the system local timezone when
called with no arguments.
-
+
- Issue #14653: email.utils.mktime_tz() no longer relies on system
mktime() when timezone offest is supplied.