diff options
author | Ezio Melotti <none@none> | 2011-04-05 17:40:52 (GMT) |
---|---|---|
committer | Ezio Melotti <none@none> | 2011-04-05 17:40:52 (GMT) |
commit | 9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c (patch) | |
tree | 934c7dffb54ca6e5d3e30b611cba8ddae4a56fef /Lib | |
parent | 104c3f1020213fc2d0a5da6b23d72dd042d6c413 (diff) | |
download | cpython-9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c.zip cpython-9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c.tar.gz cpython-9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c.tar.bz2 |
#7311: fix HTMLParser to accept non-ASCII attribute values.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/HTMLParser.py | 2 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 17 |
2 files changed, 18 insertions, 1 deletion
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 4fdc09a..e018901 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -26,7 +26,7 @@ commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?') + r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 717585c..0620d0b 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -208,6 +208,23 @@ DOCTYPE html [ ("starttag", "a", [("href", "mailto:xyz@example.com")]), ]) + def test_attr_nonascii(self): + # see issue 7311 + self._run_check(u"<img src=/foo/bar.png alt=\u4e2d\u6587>", [ + ("starttag", "img", [("src", "/foo/bar.png"), + ("alt", u"\u4e2d\u6587")]), + ]) + self._run_check(u"<a title='\u30c6\u30b9\u30c8' " + u"href='\u30c6\u30b9\u30c8.html'>", [ + ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"), + ("href", u"\u30c6\u30b9\u30c8.html")]), + ]) + self._run_check(u'<a title="\u30c6\u30b9\u30c8" ' + u'href="\u30c6\u30b9\u30c8.html">', [ + ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"), + ("href", u"\u30c6\u30b9\u30c8.html")]), + ]) + def test_attr_entity_replacement(self): self._run_check("""<a b='&><"''>""", [ ("starttag", "a", [("b", "&><\"'")]), |