author     Ezio Melotti <ezio.melotti@gmail.com>   2014-08-02 11:10:30 (GMT)
committer  Ezio Melotti <ezio.melotti@gmail.com>   2014-08-02 11:10:30 (GMT)
commit     73a4359eb0eb624c588c5d52083ea4944f9787ea (patch)
tree       348fcf0642ccec54e7bd2e466cce6b7d063f7d7c /Lib
parent     ffff1440d118cae189a6c2baf595dda52cdc7c3a (diff)
#15114: the strict mode and argument of HTMLParser, HTMLParser.error, and the HTMLParseError exception have been removed.
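For context, a minimal sketch of what client code looks like after this change (not part of the patch; the LinkExtractor class and the sample markup are made up for illustration): HTMLParser() no longer accepts a strict argument, HTMLParser.error() is gone, and there is no HTMLParseError to catch, since malformed markup is always handled by the tolerant parser; convert_charrefs is passed explicitly to avoid the DeprecationWarning exercised by test_deprecation_warnings in the diff below.

from html.parser import HTMLParser

class LinkExtractor(HTMLParser):
    """Collect href values; a hypothetical subclass, not part of this commit."""

    def __init__(self):
        # No 'strict' argument anymore; pass convert_charrefs explicitly
        # because its default is scheduled to change to True.
        super().__init__(convert_charrefs=True)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.links.extend(v for k, v in attrs if k == "href" and v)

parser = LinkExtractor()
# Broken markup no longer raises HTMLParseError; it is parsed tolerantly.
parser.feed('<a href="http://example.com">ok</a> <a href=other.html><unclosed')
parser.close()
print(parser.links)  # expected: ['http://example.com', 'other.html']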
Diffstat (limited to 'Lib')
-rw-r--r--  Lib/html/parser.py           | 106
-rw-r--r--  Lib/test/test_htmlparser.py  |  69
2 files changed, 17 insertions(+), 158 deletions(-)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index a650d5e..5a4f9e1 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -29,35 +29,15 @@ starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 # Note:
-# 1) the strict attrfind isn't really strict, but we can't make it
-#    correctly strict without breaking backward compatibility;
-# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
-# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
+# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
+# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
 #    explode, so don't do it.
-tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
-attrfind = re.compile(
-    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
     r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
-locatestarttagend = re.compile(r"""
-  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s+                             # whitespace before attribute name
-    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
-      (?:\s*=\s*                     # value indicator
-        (?:'[^']*'                   # LITA-enclosed value
-          |\"[^\"]*\"                # LIT-enclosed value
-          |[^'\">\s]+                # bare value
-         )
-       )?
-     )
-   )*
-  \s*                                # trailing whitespace
-""", re.VERBOSE)
 locatestarttagend_tolerant = re.compile(r"""
   <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
   (?:[\s/]*                          # optional whitespace before attribute name
@@ -79,24 +59,6 @@ endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
 
-class HTMLParseError(Exception):
-    """Exception raised for all parse errors."""
-
-    def __init__(self, msg, position=(None, None)):
-        assert msg
-        self.msg = msg
-        self.lineno = position[0]
-        self.offset = position[1]
-
-    def __str__(self):
-        result = self.msg
-        if self.lineno is not None:
-            result = result + ", at line %d" % self.lineno
-        if self.offset is not None:
-            result = result + ", column %d" % (self.offset + 1)
-        return result
-
-
 _default_sentinel = object()
 
 class HTMLParser(_markupbase.ParserBase):
@@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase):
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
-    def __init__(self, strict=_default_sentinel, *,
-                 convert_charrefs=_default_sentinel):
+    def __init__(self, *, convert_charrefs=_default_sentinel):
         """Initialize and reset this instance.
 
         If convert_charrefs is True (default: False), all character references
         are automatically converted to the corresponding Unicode characters.
-        If strict is set to False (the default) the parser will parse invalid
-        markup, otherwise it will raise an error.  Note that the strict mode
-        and argument are deprecated.
         """
-        if strict is not _default_sentinel:
-            warnings.warn("The strict argument and mode are deprecated.",
-                          DeprecationWarning, stacklevel=2)
-        else:
-            strict = False  # default
-        self.strict = strict
         if convert_charrefs is _default_sentinel:
             convert_charrefs = False  # default
             warnings.warn("The value of convert_charrefs will become True in "
@@ -168,11 +120,6 @@ class HTMLParser(_markupbase.ParserBase):
         """Handle any buffered data."""
         self.goahead(1)
 
-    def error(self, message):
-        warnings.warn("The 'error' method is deprecated.",
-                      DeprecationWarning, stacklevel=2)
-        raise HTMLParseError(message, self.getpos())
-
     __starttag_text = None
 
     def get_starttag_text(self):
@@ -227,10 +174,7 @@ class HTMLParser(_markupbase.ParserBase):
                 elif startswith("<?", i):
                     k = self.parse_pi(i)
                 elif startswith("<!", i):
-                    if self.strict:
-                        k = self.parse_declaration(i)
-                    else:
-                        k = self.parse_html_declaration(i)
+                    k = self.parse_html_declaration(i)
                 elif (i + 1) < n:
                     self.handle_data("<")
                     k = i + 1
@@ -239,8 +183,6 @@ class HTMLParser(_markupbase.ParserBase):
                 if k < 0:
                     if not end:
                         break
-                    if self.strict:
-                        self.error("EOF in middle of construct")
                     k = rawdata.find('>', i + 1)
                     if k < 0:
                         k = rawdata.find('<', i + 1)
@@ -282,13 +224,10 @@ class HTMLParser(_markupbase.ParserBase):
                 if match:
                     # match.group() will contain at least 2 chars
                     if end and match.group() == rawdata[i:]:
-                        if self.strict:
-                            self.error("EOF in middle of entity or char ref")
-                        else:
-                            k = match.end()
-                            if k <= i:
-                                k = n
-                            i = self.updatepos(i, i + 1)
+                        k = match.end()
+                        if k <= i:
+                            k = n
+                        i = self.updatepos(i, i + 1)
                     # incomplete
                     break
                 elif (i + 1) < n:
@@ -367,18 +306,12 @@ class HTMLParser(_markupbase.ParserBase):
 
         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
-        if self.strict:
-            match = tagfind.match(rawdata, i+1)
-        else:
-            match = tagfind_tolerant.match(rawdata, i+1)
+        match = tagfind_tolerant.match(rawdata, i+1)
         assert match, 'unexpected call to parse_starttag()'
         k = match.end()
         self.lasttag = tag = match.group(1).lower()
         while k < endpos:
-            if self.strict:
-                m = attrfind.match(rawdata, k)
-            else:
-                m = attrfind_tolerant.match(rawdata, k)
+            m = attrfind_tolerant.match(rawdata, k)
             if not m:
                 break
             attrname, rest, attrvalue = m.group(1, 2, 3)
@@ -401,9 +334,6 @@ class HTMLParser(_markupbase.ParserBase):
                          - self.__starttag_text.rfind("\n")
             else:
                 offset = offset + len(self.__starttag_text)
-            if self.strict:
-                self.error("junk characters in start tag: %r"
-                           % (rawdata[k:endpos][:20],))
             self.handle_data(rawdata[i:endpos])
             return endpos
         if end.endswith('/>'):
@@ -419,10 +349,7 @@ class HTMLParser(_markupbase.ParserBase):
     # or -1 if incomplete.
     def check_for_whole_start_tag(self, i):
         rawdata = self.rawdata
-        if self.strict:
-            m = locatestarttagend.match(rawdata, i)
-        else:
-            m = locatestarttagend_tolerant.match(rawdata, i)
+        m = locatestarttagend_tolerant.match(rawdata, i)
         if m:
             j = m.end()
             next = rawdata[j:j+1]
@@ -435,9 +362,6 @@ class HTMLParser(_markupbase.ParserBase):
                     # buffer boundary
                     return -1
                 # else bogus input
-                if self.strict:
-                    self.updatepos(i, j + 1)
-                    self.error("malformed empty start tag")
                 if j > i:
                     return j
                 else:
@@ -450,9 +374,6 @@ class HTMLParser(_markupbase.ParserBase):
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
-            if self.strict:
-                self.updatepos(i, j)
-                self.error("malformed start tag")
             if j > i:
                 return j
             else:
@@ -472,8 +393,6 @@ class HTMLParser(_markupbase.ParserBase):
             if self.cdata_elem is not None:
                 self.handle_data(rawdata[i:gtpos])
                 return gtpos
-            if self.strict:
-                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
             # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
             namematch = tagfind_tolerant.match(rawdata, i+2)
             if not namematch:
@@ -539,8 +458,7 @@ class HTMLParser(_markupbase.ParserBase):
         pass
 
     def unknown_decl(self, data):
-        if self.strict:
-            self.error("unknown declaration: %r" % (data,))
+        pass
 
     # Internal -- helper to remove special character quoting
     def unescape(self, s):
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 2d771a2..1aa150803 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -85,7 +85,7 @@ class EventCollectorCharrefs(EventCollector):
 class TestCaseBase(unittest.TestCase):
 
     def get_collector(self):
-        raise NotImplementedError
+        return EventCollector(convert_charrefs=False)
 
     def _run_check(self, source, expected_events, collector=None):
         if collector is None:
@@ -105,21 +105,8 @@ class TestCaseBase(unittest.TestCase):
         self._run_check(source, events,
                         EventCollectorExtra(convert_charrefs=False))
 
-    def _parse_error(self, source):
-        def parse(source=source):
-            parser = self.get_collector()
-            parser.feed(source)
-            parser.close()
-        with self.assertRaises(html.parser.HTMLParseError):
-            with self.assertWarns(DeprecationWarning):
-                parse()
-
-
-class HTMLParserStrictTestCase(TestCaseBase):
 
-    def get_collector(self):
-        with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True, convert_charrefs=False)
+class HTMLParserTestCase(TestCaseBase):
 
     def test_processing_instruction_only(self):
         self._run_check("<?processing instruction>", [
@@ -201,9 +188,6 @@ text
             ("data", "this < text > contains < bare>pointy< brackets"),
         ])
 
-    def test_illegal_declarations(self):
-        self._parse_error('<!spacer type="block" height="25">')
-
     def test_starttag_end_boundary(self):
         self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
         self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
@@ -238,25 +222,6 @@ text
         self._run_check(["<!--abc--", ">"], output)
         self._run_check(["<!--abc-->", ""], output)
 
-    def test_starttag_junk_chars(self):
-        self._parse_error("</>")
-        self._parse_error("</$>")
-        self._parse_error("</")
-        self._parse_error("</a")
-        self._parse_error("<a<a>")
-        self._parse_error("</a<a>")
-        self._parse_error("<!")
-        self._parse_error("<a")
-        self._parse_error("<a foo='bar'")
-        self._parse_error("<a foo='bar")
-        self._parse_error("<a foo='>'")
-        self._parse_error("<a foo='>")
-        self._parse_error("<a$>")
-        self._parse_error("<a$b>")
-        self._parse_error("<a$b/>")
-        self._parse_error("<a$b >")
-        self._parse_error("<a$b />")
-
     def test_valid_doctypes(self):
         # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
         dtds = ['HTML',  # HTML5 doctype
@@ -281,9 +246,6 @@ text
             self._run_check("<!DOCTYPE %s>" % dtd,
                             [('decl', 'DOCTYPE ' + dtd)])
 
-    def test_declaration_junk_chars(self):
-        self._parse_error("<!DOCTYPE foo $ >")
-
     def test_startendtag(self):
         self._run_check("<p/>", [
             ("startendtag", "p", []),
@@ -421,23 +383,12 @@ text
         self._run_check('no charrefs here', [('data', 'no charrefs here')],
                         collector=collector())
 
-
-class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
-
-    def get_collector(self):
-        return EventCollector(convert_charrefs=False)
-
     def test_deprecation_warnings(self):
         with self.assertWarns(DeprecationWarning):
             EventCollector()  # convert_charrefs not passed explicitly
-        with self.assertWarns(DeprecationWarning):
-            EventCollector(strict=True)
-        with self.assertWarns(DeprecationWarning):
-            EventCollector(strict=False)
-        with self.assertRaises(html.parser.HTMLParseError):
-            with self.assertWarns(DeprecationWarning):
-                EventCollector().error('test')
+    # the remaining tests were for the "tolerant" parser (which is now
+    # the default), and check various kind of broken markup
 
     def test_tolerant_parsing(self):
         self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
                         '<img src="URL><//img></html</html>', [
@@ -686,11 +637,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
         self._run_check(html, expected)
 
 
-class AttributesStrictTestCase(TestCaseBase):
-
-    def get_collector(self):
-        with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True, convert_charrefs=False)
+class AttributesTestCase(TestCaseBase):
 
     def test_attr_syntax(self):
         output = [
@@ -747,12 +694,6 @@ class AttributesStrictTestCase(TestCaseBase):
                         [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
 
-
-class AttributesTolerantTestCase(AttributesStrictTestCase):
-
-    def get_collector(self):
-        return EventCollector(convert_charrefs=False)
-
     def test_attr_funky_names2(self):
         self._run_check(
             "<a $><b $=%><c \=/>",