summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ezio Melotti <ezio.melotti@gmail.com> 2012-02-21 07:22:16 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2012-02-21 07:22:16 (GMT)
commit: 36b7361fe76733b3a4944ef92b49bcea4584b740 (patch)
tree: 37ae2ab2af68f8334f6de81980ebbdef535b2662
parent: 9be6c3ddf09ce4bc5768f708867e6b30a5bc78cb (diff)
download: cpython-36b7361fe76733b3a4944ef92b49bcea4584b740.zip
download: cpython-36b7361fe76733b3a4944ef92b49bcea4584b740.tar.gz
download: cpython-36b7361fe76733b3a4944ef92b49bcea4584b740.tar.bz2
HTMLParser is now able to handle slashes in the start tag.
-rw-r--r-- Lib/HTMLParser.py 10
-rw-r--r-- Lib/test/test_htmlparser.py 21
-rw-r--r-- Misc/NEWS 2
3 files changed, 28 insertions, 5 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 5081a62..d4e14d4 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -28,19 +28,19 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
attrfind = re.compile(
- r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+ r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
+ r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:\s+ # whitespace before attribute name
- (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name
+ (?:[\s/]* # optional whitespace before attribute name
+ (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
- )?\s*
+ )?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 8136bca..41f4340 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -240,6 +240,27 @@ text
self._run_check("<!DOCTYPE %s>" % dtd,
[('decl', 'DOCTYPE ' + dtd)])
+ def test_slashes_in_starttag(self):
+ self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
+ html = ('<img width=902 height=250px '
+ 'src="/sites/default/files/images/homepage/foo.jpg" '
+ '/*what am I doing here*/ />')
+ expected = [(
+ 'startendtag', 'img',
+ [('width', '902'), ('height', '250px'),
+ ('src', '/sites/default/files/images/homepage/foo.jpg'),
+ ('*what', None), ('am', None), ('i', None),
+ ('doing', None), ('here*', None)]
+ )]
+ self._run_check(html, expected)
+ html = ('<a / /foo/ / /=/ / /bar/ / />'
+ '<a / /foo/ / /=/ / /bar/ / >')
+ expected = [
+ ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
+ ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
+ ]
+ self._run_check(html, expected)
+
def test_declaration_junk_chars(self):
self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
diff --git a/Misc/NEWS b/Misc/NEWS
index 1c04987..45f22b1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -98,6 +98,8 @@ Core and Builtins
Library
-------
+- HTMLParser is now able to handle slashes in the start tag.
+
- Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in
SimpleXMLRPCServer upon malformed POST request.