#15156: HTMLParser now uses the new "html.entities.html5" dictionary.

author: Ezio Melotti <ezio.melotti@gmail.com> 2012-06-24 20:02:56 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2012-06-24 20:02:56 (GMT)
commit: 46495182d0fc58b519d10315f1bf392f08f33a2e (patch)
tree: 0503e0a7032d33e98954331d3a2d5c6e19607392 /Lib
parent: a504a7a7d1fd6056e067027354d31595aa4b8958 (diff)
download: cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.zip cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.gz cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.bz2
2 files changed, 21 insertions, 18 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 494cf24..f8ac828 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase):
             self.error("unknown declaration: %r" % (data,))
 
     # Internal -- helper to remove special character quoting
-    entitydefs = None
     def unescape(self, s):
         if '&' not in s:
             return s
@@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase):
                 if s[0] == "#":
                     s = s[1:]
                     if s[0] in ['x','X']:
-                        c = int(s[1:], 16)
+                        c = int(s[1:].rstrip(';'), 16)
                     else:
-                        c = int(s)
+                        c = int(s.rstrip(';'))
                     return chr(c)
             except ValueError:
-                return '&#'+ s +';'
+                return '&#' + s
             else:
-                # Cannot use name2codepoint directly, because HTMLParser
-                # supports apos, which is not part of HTML 4
-                import html.entities
-                if HTMLParser.entitydefs is None:
-                    entitydefs = HTMLParser.entitydefs = {'apos':"'"}
-                    for k, v in html.entities.name2codepoint.items():
-                        entitydefs[k] = chr(v)
-                try:
-                    return self.entitydefs[s]
-                except KeyError:
-                    return '&'+s+';'
-
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
+                from html.entities import html5
+                if s in html5:
+                    return html5[s]
+                elif s.endswith(';'):
+                    return '&' + s
+                for x in range(2, len(s)):
+                    if s[:x] in html5:
+                        return html5[s[:x]] + s[x:]
+                else:
+                    return '&' + s
+
+        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                       replaceEntities, s, flags=re.ASCII)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 64a4f5d..c5d878d 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
         self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
                         'method="post">', [
                             ('starttag', 'form',
-                                [('action', '/xxx.php?a=1&b=2&amp'),
+                                [('action', '/xxx.php?a=1&b=2&'),
                                  (',', None), ('method', 'post')])])
 
     def test_weird_chars_in_unquoted_attribute_values(self):
@@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
         self.assertEqual(p.unescape('&#0038;'),'&')
         # see #12888
         self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
+        # see #15156
+        self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'
+                                    '&alphacentauri&alpha;centauri'),
+                                    'ÉricÉric&alphacentauriαcentauri')
+        self.assertEqual(p.unescape('&co;'), '&co;')
 
     def test_broken_comments(self):
         html = ('<! not really a comment >'
author	Ezio Melotti <ezio.melotti@gmail.com>	2012-06-24 20:02:56 (GMT)
committer	Ezio Melotti <ezio.melotti@gmail.com>	2012-06-24 20:02:56 (GMT)
commit	46495182d0fc58b519d10315f1bf392f08f33a2e (patch)
tree	0503e0a7032d33e98954331d3a2d5c6e19607392 /Lib
parent	a504a7a7d1fd6056e067027354d31595aa4b8958 (diff)
download	cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.zip cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.gz cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.bz2