#7311: merge with 3.2.

author: Ezio Melotti <ezio.melotti@gmail.com> 2011-04-07 19:27:44 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2011-04-07 19:27:44 (GMT)
commit: 6537be7fb20ca6e2c8617e8369e45c7f484b3724 (patch)
tree: 4140ed931562119e2af720c7ded74be811f36ab7
parent: cece8cfe8a54bb8977ba74599c3861147367a377 (diff)
parent: 2e3607c1e758865519b28066b8925f37203e2197 (diff)
download: cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.zip
cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.tar.gz
cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.tar.bz2
3 files changed, 20 insertions, 1 deletion
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 21ebbc3..a3586eb 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 # make it correctly strict without breaking backward compatibility.
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 5ecd016..637ab01 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -217,6 +217,23 @@ DOCTYPE html [
             ("starttag", "a", [("href", "mailto:xyz@example.com")]),
             ])
 
+    def test_attr_nonascii(self):
+        # see issue 7311
+        self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+            ("starttag", "img", [("src", "/foo/bar.png"),
+                                 ("alt", "\u4e2d\u6587")]),
+            ])
+        self._run_check("<a title='\u30c6\u30b9\u30c8' "
+                        "href='\u30c6\u30b9\u30c8.html'>", [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+        self._run_check('<a title="\u30c6\u30b9\u30c8" '
+                        'href="\u30c6\u30b9\u30c8.html">', [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+
     def test_attr_entity_replacement(self):
         self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
             ("starttag", "a", [("b", "&><\"'")]),
diff --git a/Misc/NEWS b/Misc/NEWS
index 37eb250..fa03fc1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -94,6 +94,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #7311: fix html.parser to accept non-ASCII attribute values.
+
 - Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart
   subpararts with an 8bit CTE into unicode instead of preserving the bytes.
author	Ezio Melotti <ezio.melotti@gmail.com>	2011-04-07 19:27:44 (GMT)
committer	Ezio Melotti <ezio.melotti@gmail.com>	2011-04-07 19:27:44 (GMT)
commit	6537be7fb20ca6e2c8617e8369e45c7f484b3724 (patch)
tree	4140ed931562119e2af720c7ded74be811f36ab7
parent	cece8cfe8a54bb8977ba74599c3861147367a377 (diff)
parent	2e3607c1e758865519b28066b8925f37203e2197 (diff)
download	cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.zip cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.tar.gz cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.tar.bz2