summary | refs | log | tree | commit | diff | stats
path: root/Lib/html/parser.py
diff options
context:
space:
mode:
author: Ezio Melotti <ezio.melotti@gmail.com>, 2011-11-14 16:53:33 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com>, 2011-11-14 16:53:33 (GMT)
commit: c2fe57762b6cfa8849908e1a0475036cd0b058ba (patch)
tree: 0ff2ec6b5db1284216d740461af1a83c1c7d01f2 /Lib/html/parser.py
parent: b245ed1cdf769354b2454cc3fbd34e9b9233cb0a (diff)
download: cpython-c2fe57762b6cfa8849908e1a0475036cd0b058ba.zip
cpython-c2fe57762b6cfa8849908e1a0475036cd0b058ba.tar.gz
cpython-c2fe57762b6cfa8849908e1a0475036cd0b058ba.tar.bz2
#1745761, #755670, #13357, #12629, #1200313: improve attribute handling in HTMLParser.
Diffstat (limited to 'Lib/html/parser.py')
-rw-r--r-- Lib/html/parser.py 19
1 files changed, 10 insertions, 9 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index afdb305..662e855 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -30,8 +30,8 @@ attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
- r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
+ r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
+ r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
@@ -49,16 +49,16 @@ locatestarttagend = re.compile(r"""
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s* # optional whitespace before attribute name
- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
- (?:\s*=\s* # value indicator
+ (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name
+ (?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
- |\"[^\"]*\" # LIT-enclosed value
- |[^'\">\s]+ # bare value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
- )?
- )
- )*
+ )?\s*
+ )*
+ )?
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
@@ -295,6 +295,7 @@ class HTMLParser(_markupbase.ParserBase):
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
+ if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()