From 29877e8e04755c919b42ee012495f2e9671f3251 Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Tue, 21 Feb 2012 09:25:00 +0200 Subject: HTMLParser is now able to handle slashes in the start tag. --- Lib/html/parser.py | 18 +++++++++++------- Lib/test/test_htmlparser.py | 21 +++++++++++++++++++++ Misc/NEWS | 2 ++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index aa31fbc..2bfd187 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -26,14 +26,18 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') -# Note, the strict one of this pair isn't really strict, but we can't -# make it correctly strict without breaking backward compatibility. +# Note: +# 1) the strict attrfind isn't really strict, but we can't make it +# correctly strict without breaking backward compatibility; +# 2) if you change attrfind remember to update locatestarttagend too; +# 3) if you change attrfind and/or locatestarttagend the parser will +# explode, so don't do it. attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name @@ -50,15 +54,15 @@ locatestarttagend = re.compile(r""" """, re.VERBOSE) locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s* # optional whitespace before attribute name - (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name + (?:[\s/]* # optional whitespace before attribute name + (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma - )?\s* + )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index e2b09a9..3e2a590 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -389,6 +389,27 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): self._run_check("', [('startendtag', 'a', [('foo', 'var')])]) + html = ('') + expected = [( + 'startendtag', 'img', + [('width', '902'), ('height', '250px'), + ('src', '/sites/default/files/images/homepage/foo.jpg'), + ('*what', None), ('am', None), ('i', None), + ('doing', None), ('here*', None)] + )] + self._run_check(html, expected) + html = ('' + '') + expected = [ + ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]), + ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)]) + ] + self._run_check(html, expected) + def test_declaration_junk_chars(self): self._run_check("", [('decl', 'DOCTYPE foo $ ')]) diff --git a/Misc/NEWS b/Misc/NEWS index 0e3595a..4574455 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -121,6 +121,8 @@ Core and Builtins Library ------- +- HTMLParser is now able to handle slashes in the start tag. + - Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in SimpleXMLRPCServer upon malformed POST request. -- cgit v0.12