Merge the HTMLParser fix with 3.2.

author: Ezio Melotti <ezio.melotti@gmail.com> 2012-02-21 07:29:10 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2012-02-21 07:29:10 (GMT)
commit: 307da2b07040d0eea179cf989e3de8d9e75a9b2f (patch)
tree: c3043ecf176f5d84e7c31bf8c354594f3b3998da /Lib/html/parser.py
parent: 79d38788ee5e00571db751d312612faf94f09eef (diff)
parent: 29877e8e04755c919b42ee012495f2e9671f3251 (diff)
download: cpython-307da2b07040d0eea179cf989e3de8d9e75a9b2f.zip cpython-307da2b07040d0eea179cf989e3de8d9e75a9b2f.tar.gz cpython-307da2b07040d0eea179cf989e3de8d9e75a9b2f.tar.bz2
1 file changed, 11 insertions, 7 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index aa31fbc..2bfd187 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -26,14 +26,18 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
-# Note, the strict one of this pair isn't really strict, but we can't
-# make it correctly strict without breaking backward compatibility.
+# Note:
+#  1) the strict attrfind isn't really strict, but we can't make it
+#     correctly strict without breaking backward compatibility;
+#  2) if you change attrfind remember to update locatestarttagend too;
+#  3) if you change attrfind and/or locatestarttagend the parser will
+#     explode, so don't do it.
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
-    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
-    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
+    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
@@ -50,15 +54,15 @@ locatestarttagend = re.compile(r"""
 """, re.VERBOSE)
 locatestarttagend_tolerant = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s*                             # optional whitespace before attribute name
-    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
+  (?:[\s/]*                          # optional whitespace before attribute name
+    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
       (?:\s*=+\s*                    # value indicator
         (?:'[^']*'                   # LITA-enclosed value
           |"[^"]*"                   # LIT-enclosed value
           |(?!['"])[^>\s]*           # bare value
          )
          (?:\s*,)*                   # possibly followed by a comma
-       )?\s*
+       )?(?:\s|/(?!>))*
      )*
    )?
   \s*                                # trailing whitespace
author	Ezio Melotti <ezio.melotti@gmail.com>	2012-02-21 07:29:10 (GMT)
committer	Ezio Melotti <ezio.melotti@gmail.com>	2012-02-21 07:29:10 (GMT)
commit	307da2b07040d0eea179cf989e3de8d9e75a9b2f (patch)
tree	c3043ecf176f5d84e7c31bf8c354594f3b3998da /Lib/html/parser.py
parent	79d38788ee5e00571db751d312612faf94f09eef (diff)
parent	29877e8e04755c919b42ee012495f2e9671f3251 (diff)
download	cpython-307da2b07040d0eea179cf989e3de8d9e75a9b2f.zip cpython-307da2b07040d0eea179cf989e3de8d9e75a9b2f.tar.gz cpython-307da2b07040d0eea179cf989e3de8d9e75a9b2f.tar.bz2