summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Ezio Melotti <none@none> 2011-04-07 19:03:31 (GMT)
committer Ezio Melotti <none@none> 2011-04-07 19:03:31 (GMT)
commit 2e3607c1e758865519b28066b8925f37203e2197 (patch)
tree ec09ce29a9dcc1bfa91e7e0f65428666332eda41
parent 9b5ac3efa64d72b54d4f1ab32a95c260b39ab98d (diff)
download cpython-2e3607c1e758865519b28066b8925f37203e2197.zip
cpython-2e3607c1e758865519b28066b8925f37203e2197.tar.gz
cpython-2e3607c1e758865519b28066b8925f37203e2197.tar.bz2
#7311: fix html.parser to accept non-ASCII attribute values.
-rw-r--r-- Lib/html/parser.py 2
-rw-r--r-- Lib/test/test_htmlparser.py 17
-rw-r--r-- Misc/NEWS 2
3 files changed, 20 insertions, 1 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 21ebbc3..a3586eb 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 5ecd016..637ab01 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -217,6 +217,23 @@ DOCTYPE html [
("starttag", "a", [("href", "mailto:xyz@example.com")]),
])
+ def test_attr_nonascii(self):
+ # see issue 7311
+ self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+ ("starttag", "img", [("src", "/foo/bar.png"),
+ ("alt", "\u4e2d\u6587")]),
+ ])
+ self._run_check("<a title='\u30c6\u30b9\u30c8' "
+ "href='\u30c6\u30b9\u30c8.html'>", [
+ ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")]),
+ ])
+ self._run_check('<a title="\u30c6\u30b9\u30c8" '
+ 'href="\u30c6\u30b9\u30c8.html">', [
+ ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")]),
+ ])
+
def test_attr_entity_replacement(self):
self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
("starttag", "a", [("b", "&><\"'")]),
diff --git a/Misc/NEWS b/Misc/NEWS
index 2ae7eaf..80d46ed 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -49,6 +49,8 @@ Core and Builtins
Library
-------
+- Issue #7311: fix html.parser to accept non-ASCII attribute values.
+
- Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart
subparts with an 8bit CTE into unicode instead of preserving the bytes.