diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-14 16:53:33 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-14 16:53:33 (GMT) |
commit | c2fe57762b6cfa8849908e1a0475036cd0b058ba (patch) | |
tree | 0ff2ec6b5db1284216d740461af1a83c1c7d01f2 /Lib/html | |
parent | b245ed1cdf769354b2454cc3fbd34e9b9233cb0a (diff) | |
download | cpython-c2fe57762b6cfa8849908e1a0475036cd0b058ba.zip cpython-c2fe57762b6cfa8849908e1a0475036cd0b058ba.tar.gz cpython-c2fe57762b6cfa8849908e1a0475036cd0b058ba.tar.bz2 |
#1745761, #755670, #13357, #12629, #1200313: improve attribute handling in HTMLParser.
Diffstat (limited to 'Lib/html')
-rw-r--r-- | Lib/html/parser.py | 19 |
1 files changed, 10 insertions, 9 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py index afdb305..662e855 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -30,8 +30,8 @@ attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( - r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name @@ -49,16 +49,16 @@ locatestarttagend = re.compile(r""" locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s* # optional whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator + (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name + (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma - )? - ) - )* + )?\s* + )* + )? \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') @@ -295,6 +295,7 @@ class HTMLParser(_markupbase.ParserBase): elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] + if attrvalue: attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() |