diff options
| author | Ezio Melotti <ezio.melotti@gmail.com> | 2011-04-07 19:27:44 (GMT) |
|---|---|---|
| committer | Ezio Melotti <ezio.melotti@gmail.com> | 2011-04-07 19:27:44 (GMT) |
| commit | 6537be7fb20ca6e2c8617e8369e45c7f484b3724 (patch) | |
| tree | 4140ed931562119e2af720c7ded74be811f36ab7 | |
| parent | cece8cfe8a54bb8977ba74599c3861147367a377 (diff) | |
| parent | 2e3607c1e758865519b28066b8925f37203e2197 (diff) | |
| download | cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.zip cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.tar.gz cpython-6537be7fb20ca6e2c8617e8369e45c7f484b3724.tar.bz2 | |
#7311: merge with 3.2.
| -rw-r--r-- | Lib/html/parser.py | 2 | ||||
| -rw-r--r-- | Lib/test/test_htmlparser.py | 17 | ||||
| -rw-r--r-- | Misc/NEWS | 2 |
3 files changed, 20 insertions, 1 deletion
```diff
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 21ebbc3..a3586eb 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 # make it correctly strict without breaking backward compatibility.
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 5ecd016..637ab01 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -217,6 +217,23 @@ DOCTYPE html [
             ("starttag", "a", [("href", "mailto:xyz@example.com")]),
             ])
 
+    def test_attr_nonascii(self):
+        # see issue 7311
+        self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+            ("starttag", "img", [("src", "/foo/bar.png"),
+                                 ("alt", "\u4e2d\u6587")]),
+            ])
+        self._run_check("<a title='\u30c6\u30b9\u30c8' "
+                        "href='\u30c6\u30b9\u30c8.html'>", [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+        self._run_check('<a title="\u30c6\u30b9\u30c8" '
+                        'href="\u30c6\u30b9\u30c8.html">', [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+
     def test_attr_entity_replacement(self):
         self._run_check("""<a b='&><"''>""", [
             ("starttag", "a", [("b", "&><\"'")]),
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -94,6 +94,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #7311: fix html.parser to accept non-ASCII attribute values.
+
 - Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart
   subpararts with an 8bit CTE into unicode instead of preserving the bytes.
 
```
