diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2012-02-21 07:25:00 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2012-02-21 07:25:00 (GMT) |
commit | 29877e8e04755c919b42ee012495f2e9671f3251 (patch) | |
tree | 05d086dff8880990b213a0f4af769c53895c998e /Lib | |
parent | 178e5ea305848015d514f4038118777374e44c87 (diff) | |
download | cpython-29877e8e04755c919b42ee012495f2e9671f3251.zip cpython-29877e8e04755c919b42ee012495f2e9671f3251.tar.gz cpython-29877e8e04755c919b42ee012495f2e9671f3251.tar.bz2 |
HTMLParser is now able to handle slashes in the start tag.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/html/parser.py | 18 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 21 |
2 files changed, 32 insertions, 7 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index aa31fbc..2bfd187 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -26,14 +26,18 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
-# Note, the strict one of this pair isn't really strict, but we can't
-# make it correctly strict without breaking backward compatibility.
+# Note:
+#  1) the strict attrfind isn't really strict, but we can't make it
+#     correctly strict without breaking backward compatibility;
+#  2) if you change attrfind remember to update locatestarttagend too;
+#  3) if you change attrfind and/or locatestarttagend the parser will
+#     explode, so don't do it.
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
-    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
-    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
+    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
@@ -50,15 +54,15 @@ locatestarttagend = re.compile(r"""
 """, re.VERBOSE)
 locatestarttagend_tolerant = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s*                             # optional whitespace before attribute name
-    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
+  (?:[\s/]*                          # optional whitespace before attribute name
+    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
       (?:\s*=+\s*                    # value indicator
         (?:'[^']*'                   # LITA-enclosed value
           |"[^"]*"                   # LIT-enclosed value
           |(?!['"])[^>\s]*           # bare value
          )
          (?:\s*,)*                   # possibly followed by a comma
-       )?\s*
+       )?(?:\s|/(?!>))*
      )*
    )?
   \s*                                # trailing whitespace
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index e2b09a9..3e2a590 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -389,6 +389,27 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
         self._run_check("<a foo='>'", [('data', "<a foo='>'")])
         self._run_check("<a foo='>", [('data', "<a foo='>")])
 
+    def test_slashes_in_starttag(self):
+        self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
+        html = ('<img width=902 height=250px '
+                'src="/sites/default/files/images/homepage/foo.jpg" '
+                '/*what am I doing here*/ />')
+        expected = [(
+            'startendtag', 'img',
+            [('width', '902'), ('height', '250px'),
+             ('src', '/sites/default/files/images/homepage/foo.jpg'),
+             ('*what', None), ('am', None), ('i', None),
+             ('doing', None), ('here*', None)]
+        )]
+        self._run_check(html, expected)
+        html = ('<a / /foo/ / /=/ / /bar/ / />'
+                '<a / /foo/ / /=/ / /bar/ / >')
+        expected = [
+            ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
+            ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
+        ]
+        self._run_check(html, expected)
+
     def test_declaration_junk_chars(self):
         self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
 