diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-14 16:56:11 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-14 16:56:11 (GMT) |
commit | d5d4406c8ebbbdf8a8961fc119be22b15a1c40ad (patch) | |
tree | 585f97fa7990268f7ef46a60d78407838e334bc3 | |
parent | 84b48a6c46ce7720a23d92f4d64961812d00ce1b (diff) | |
parent | c2fe57762b6cfa8849908e1a0475036cd0b058ba (diff) | |
download | cpython-d5d4406c8ebbbdf8a8961fc119be22b15a1c40ad.zip cpython-d5d4406c8ebbbdf8a8961fc119be22b15a1c40ad.tar.gz cpython-d5d4406c8ebbbdf8a8961fc119be22b15a1c40ad.tar.bz2 |
#1745761, #755670, #13357, #12629, #1200313: merge with 3.2.
-rw-r--r-- | Lib/html/parser.py | 19 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 226 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
3 files changed, 167 insertions, 81 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py index afdb305..662e855 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -30,8 +30,8 @@ attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( - r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name @@ -49,16 +49,16 @@ locatestarttagend = re.compile(r""" locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s* # optional whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator + (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name + (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma - )? - ) - )* + )?\s* + )* + )? 
\s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') @@ -295,6 +295,7 @@ class HTMLParser(_markupbase.ParserBase): elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] + if attrvalue: attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index b587ab8..1ce4594 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -196,60 +196,6 @@ DOCTYPE html [ ("data", "this < text > contains < bare>pointy< brackets"), ]) - def test_attr_syntax(self): - output = [ - ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) - ] - self._run_check("""<a b='v' c="v" d=v e>""", output) - self._run_check("""<a b = 'v' c = "v" d = v e>""", output) - self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output) - self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output) - - def test_attr_values(self): - self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""", - [("starttag", "a", [("b", "xxx\n\txxx"), - ("c", "yyy\t\nyyy"), - ("d", "\txyz\n")]) - ]) - self._run_check("""<a b='' c="">""", [ - ("starttag", "a", [("b", ""), ("c", "")]), - ]) - # Regression test for SF patch #669683. - self._run_check("<e a=rgb(1,2,3)>", [ - ("starttag", "e", [("a", "rgb(1,2,3)")]), - ]) - # Regression test for SF bug #921657. 
- self._run_check("<a href=mailto:xyz@example.com>", [ - ("starttag", "a", [("href", "mailto:xyz@example.com")]), - ]) - - def test_attr_nonascii(self): - # see issue 7311 - self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [ - ("starttag", "img", [("src", "/foo/bar.png"), - ("alt", "\u4e2d\u6587")]), - ]) - self._run_check("<a title='\u30c6\u30b9\u30c8' " - "href='\u30c6\u30b9\u30c8.html'>", [ - ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), - ("href", "\u30c6\u30b9\u30c8.html")]), - ]) - self._run_check('<a title="\u30c6\u30b9\u30c8" ' - 'href="\u30c6\u30b9\u30c8.html">', [ - ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), - ("href", "\u30c6\u30b9\u30c8.html")]), - ]) - - def test_attr_entity_replacement(self): - self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [ - ("starttag", "a", [("b", "&><\"'")]), - ]) - - def test_attr_funky_names(self): - self._run_check("""<a a.b='v' c:d=v e-f=v>""", [ - ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), - ]) - def test_illegal_declarations(self): self._parse_error('<!spacer type="block" height="25">') @@ -295,13 +241,11 @@ DOCTYPE html [ self._parse_error("<a<a>") self._parse_error("</a<a>") self._parse_error("<!") - self._parse_error("<a $>") self._parse_error("<a") self._parse_error("<a foo='bar'") self._parse_error("<a foo='bar") self._parse_error("<a foo='>'") self._parse_error("<a foo='>") - self._parse_error("<a foo=>") def test_declaration_junk_chars(self): self._parse_error("<!DOCTYPE foo $ >") @@ -358,10 +302,6 @@ DOCTYPE html [ ("endtag", element_lower)]) - def test_entityrefs_in_attributes(self): - self._run_check("<html foo='&euro;&amp;aa&amp;unsupported;'>", - [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) - class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): @@ -371,15 +311,14 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): def test_tolerant_parsing(self): self._run_check('<html <html>te>>xt&a<<bc</a></html>\n' '<img src="URL><//img></html</html>', [ - ('data', 
'<html '), - ('starttag', 'html', []), - ('data', 'te>>xt'), - ('entityref', 'a'), - ('data', '<<bc'), - ('endtag', 'a'), - ('endtag', 'html'), - ('data', '\n<img src="URL><//img></html'), - ('endtag', 'html')]) + ('starttag', 'html', [('<html', None)]), + ('data', 'te>>xt'), + ('entityref', 'a'), + ('data', '<<bc'), + ('endtag', 'a'), + ('endtag', 'html'), + ('data', '\n<img src="URL><//img></html'), + ('endtag', 'html')]) def test_with_unquoted_attributes(self): # see #12008 @@ -410,7 +349,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): 'method="post">', [ ('starttag', 'form', [('action', '/xxx.php?a=1&b=2&'), - ('method', 'post')])]) + (',', None), ('method', 'post')])]) def test_weird_chars_in_unquoted_attribute_values(self): self._run_check('<form action=bogus|&#()value>', [ @@ -441,7 +380,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>' expected = [ - ('starttag', 'div', [('style', ''), ('foo', 'bar')]), + ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]), ('starttag', 'b', []), ('data', 'The '), ('starttag', 'a', [('href', 'some_url')]), @@ -458,8 +397,151 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) +class AttributesStrictTestCase(TestCaseBase): + + def get_collector(self): + return EventCollector(strict=True) + + def test_attr_syntax(self): + output = [ + ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) + ] + self._run_check("""<a b='v' c="v" d=v e>""", output) + self._run_check("""<a b = 'v' c = "v" d = v e>""", output) + self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output) + self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output) + + def test_attr_values(self): + self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""", + [("starttag", "a", [("b", "xxx\n\txxx"), + ("c", "yyy\t\nyyy"), + ("d", 
"\txyz\n")])]) + self._run_check("""<a b='' c="">""", + [("starttag", "a", [("b", ""), ("c", "")])]) + # Regression test for SF patch #669683. + self._run_check("<e a=rgb(1,2,3)>", + [("starttag", "e", [("a", "rgb(1,2,3)")])]) + # Regression test for SF bug #921657. + self._run_check( + "<a href=mailto:xyz@example.com>", + [("starttag", "a", [("href", "mailto:xyz@example.com")])]) + + def test_attr_nonascii(self): + # see issue 7311 + self._run_check( + "<img src=/foo/bar.png alt=\u4e2d\u6587>", + [("starttag", "img", [("src", "/foo/bar.png"), + ("alt", "\u4e2d\u6587")])]) + self._run_check( + "<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>", + [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), + ("href", "\u30c6\u30b9\u30c8.html")])]) + self._run_check( + '<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">', + [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), + ("href", "\u30c6\u30b9\u30c8.html")])]) + + def test_attr_entity_replacement(self): + self._run_check( + "<a b='&amp;&gt;&lt;&quot;&apos;'>", + [("starttag", "a", [("b", "&><\"'")])]) + + def test_attr_funky_names(self): + self._run_check( + "<a a.b='v' c:d=v e-f=v>", + [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])]) + + def test_entityrefs_in_attributes(self): + self._run_check( + "<html foo='&euro;&amp;aa&amp;unsupported;'>", + [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) + + + +class AttributesTolerantTestCase(AttributesStrictTestCase): + + def get_collector(self): + return EventCollector(strict=False) + + def test_attr_funky_names2(self): + self._run_check( + "<a $><b $=%><c \=/>", + [("starttag", "a", [("$", None)]), + ("starttag", "b", [("$", "%")]), + ("starttag", "c", [("\\", "/")])]) + + def test_entities_in_attribute_value(self): + # see #1200313 + for entity in ['&', '&amp;', '&#38;', '&#x26;']: + self._run_check('<a href="%s">' % entity, + [("starttag", "a", [("href", "&")])]) + self._run_check("<a href='%s'>" % entity, + [("starttag", "a", [("href", "&")])]) + 
self._run_check("<a href=%s>" % entity, + [("starttag", "a", [("href", "&")])]) + + def test_malformed_attributes(self): + # see #13357 + html = ( + "<a href=test'style='color:red;bad1'>test - bad1</a>" + "<a href=test'+style='color:red;ba2'>test - bad2</a>" + "<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>" + "<a href = test'&nbsp;style='color:red;bad4' >test - bad4</a>" + ) + expected = [ + ('starttag', 'a', [('href', "test'style='color:red;bad1'")]), + ('data', 'test - bad1'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]), + ('data', 'test - bad2'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]), + ('data', 'test - bad3'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]), + ('data', 'test - bad4'), ('endtag', 'a') + ] + self._run_check(html, expected) + + def test_malformed_adjacent_attributes(self): + # see #12629 + self._run_check('<x><y z=""o"" /></x>', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('o""', None)]), + ('endtag', 'x')]) + self._run_check('<x><y z="""" /></x>', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('""', None)]), + ('endtag', 'x')]) + + # see #755670 for the following 3 tests + def test_adjacent_attributes(self): + self._run_check('<a width="100%"cellspacing=0>', + [("starttag", "a", + [("width", "100%"), ("cellspacing","0")])]) + + self._run_check('<a id="foo"class="bar">', + [("starttag", "a", + [("id", "foo"), ("class","bar")])]) + + def test_missing_attribute_value(self): + self._run_check('<a v=>', + [("starttag", "a", [("v", "")])]) + + def test_javascript_attribute_value(self): + self._run_check("<a href=javascript:popup('/popup/help.html')>", + [("starttag", "a", + [("href", "javascript:popup('/popup/help.html')")])]) + + def test_end_tag_in_attribute_value(self): + # see #1745761 + self._run_check("<a href='http://www.example.org/\">;'>spam</a>", + [("starttag", "a", + [("href", 
"http://www.example.org/\">;")]), + ("data", "spam"), ("endtag", "a")]) + + + def test_main(): - support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase) + support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase, + AttributesStrictTestCase, AttributesTolerantTestCase) if __name__ == "__main__": @@ -365,6 +365,9 @@ Core and Builtins Library ------- +- Issues #1745761, #755670, #13357, #12629, #1200313: HTMLParser now correctly + handles non-valid attributes, including adjacent and unquoted attributes. + - Issue #13193: Fix distutils.filelist.FileList and packaging.manifest.Manifest under Windows. The "recursive-include" directive now recognizes both legal path separators. |