summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Ezio Melotti <none@none> 2011-04-05 17:40:52 (GMT)
committer: Ezio Melotti <none@none> 2011-04-05 17:40:52 (GMT)
commit: 9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c (patch)
tree: 934c7dffb54ca6e5d3e30b611cba8ddae4a56fef /Lib
parent: 104c3f1020213fc2d0a5da6b23d72dd042d6c413 (diff)
download: cpython-9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c.zip
cpython-9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c.tar.gz
cpython-9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c.tar.bz2
#7311: fix HTMLParser to accept non-ASCII attribute values.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/HTMLParser.py 2
-rw-r--r-- Lib/test/test_htmlparser.py 17
2 files changed, 18 insertions, 1 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 4fdc09a..e018901 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -26,7 +26,7 @@ commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 717585c..0620d0b 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -208,6 +208,23 @@ DOCTYPE html [
("starttag", "a", [("href", "mailto:xyz@example.com")]),
])
+ def test_attr_nonascii(self):
+ # see issue 7311
+ self._run_check(u"<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+ ("starttag", "img", [("src", "/foo/bar.png"),
+ ("alt", u"\u4e2d\u6587")]),
+ ])
+ self._run_check(u"<a title='\u30c6\u30b9\u30c8' "
+ u"href='\u30c6\u30b9\u30c8.html'>", [
+ ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
+ ("href", u"\u30c6\u30b9\u30c8.html")]),
+ ])
+ self._run_check(u'<a title="\u30c6\u30b9\u30c8" '
+ u'href="\u30c6\u30b9\u30c8.html">', [
+ ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
+ ("href", u"\u30c6\u30b9\u30c8.html")]),
+ ])
+
def test_attr_entity_replacement(self):
self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
("starttag", "a", [("b", "&><\"'")]),