diff options
-rw-r--r-- | Lib/html/parser.py | 18 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 21 | ||||
-rw-r--r-- | Misc/NEWS | 2 |
3 files changed, 34 insertions, 7 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py index aa31fbc..2bfd187 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -26,14 +26,18 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') -# Note, the strict one of this pair isn't really strict, but we can't -# make it correctly strict without breaking backward compatibility. +# Note: +# 1) the strict attrfind isn't really strict, but we can't make it +# correctly strict without breaking backward compatibility; +# 2) if you change attrfind remember to update locatestarttagend too; +# 3) if you change attrfind and/or locatestarttagend the parser will +# explode, so don't do it. attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name @@ -50,15 +54,15 @@ locatestarttagend = re.compile(r""" """, re.VERBOSE) locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s* # optional whitespace before attribute name - (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name + (?:[\s/]* # optional whitespace before attribute name + (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma - )?\s* + )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index e2b09a9..3e2a590 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -389,6 +389,27 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): self._run_check("<a foo='>'", [('data', "<a foo='>'")]) self._run_check("<a foo='>", [('data', "<a foo='>")]) + def test_slashes_in_starttag(self): + self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])]) + html = ('<img width=902 height=250px ' + 'src="/sites/default/files/images/homepage/foo.jpg" ' + '/*what am I doing here*/ />') + expected = [( + 'startendtag', 'img', + [('width', '902'), ('height', '250px'), + ('src', '/sites/default/files/images/homepage/foo.jpg'), + ('*what', None), ('am', None), ('i', None), + ('doing', None), ('here*', None)] + )] + self._run_check(html, expected) + html = ('<a / /foo/ / /=/ / /bar/ / />' + '<a / /foo/ / /=/ / /bar/ / >') + expected = [ + ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]), + ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)]) + ] + self._run_check(html, expected) + def test_declaration_junk_chars(self): self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')]) @@ -479,6 +479,8 @@ Core and Builtins Library ------- +- HTMLParser is now able to handle slashes in the start tag. + - Issue #13641: Decoding functions in the base64 module now accept ASCII-only unicode strings. Patch by Catalin Iacob. |