diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2012-06-24 20:02:56 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2012-06-24 20:02:56 (GMT) |
commit | 46495182d0fc58b519d10315f1bf392f08f33a2e (patch) | |
tree | 0503e0a7032d33e98954331d3a2d5c6e19607392 /Lib | |
parent | a504a7a7d1fd6056e067027354d31595aa4b8958 (diff) | |
download | cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.zip cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.gz cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.bz2 |
#15156: HTMLParser now uses the new "html.entities.html5" dictionary.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/html/parser.py | 32 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 7 |
2 files changed, 21 insertions, 18 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 494cf24..f8ac828 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase): self.error("unknown declaration: %r" % (data,)) # Internal -- helper to remove special character quoting - entitydefs = None def unescape(self, s): if '&' not in s: return s @@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase): if s[0] == "#": s = s[1:] if s[0] in ['x','X']: - c = int(s[1:], 16) + c = int(s[1:].rstrip(';'), 16) else: - c = int(s) + c = int(s.rstrip(';')) return chr(c) except ValueError: - return '&#'+ s +';' + return '&#' + s else: - # Cannot use name2codepoint directly, because HTMLParser - # supports apos, which is not part of HTML 4 - import html.entities - if HTMLParser.entitydefs is None: - entitydefs = HTMLParser.entitydefs = {'apos':"'"} - for k, v in html.entities.name2codepoint.items(): - entitydefs[k] = chr(v) - try: - return self.entitydefs[s] - except KeyError: - return '&'+s+';' - - return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", + from html.entities import html5 + if s in html5: + return html5[s] + elif s.endswith(';'): + return '&' + s + for x in range(2, len(s)): + if s[:x] in html5: + return html5[s[:x]] + s[x:] + else: + return '&' + s + + return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))", replaceEntities, s, flags=re.ASCII) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 64a4f5d..c5d878d 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", ' 'method="post">', [ ('starttag', 'form', - [('action', '/xxx.php?a=1&b=2&amp'), + [('action', '/xxx.php?a=1&b=2&'), (',', None), ('method', 'post')])]) def test_weird_chars_in_unquoted_attribute_values(self): @@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): 
self.assertEqual(p.unescape('&#0038;'),'&') # see #12888 self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) + # see #15156 + self.assertEqual(p.unescape('&Eacuteric&Eacute;ric' + '&alphacentauri&alpha;centauri'), + 'ÉricÉric&alphacentauriαcentauri') + self.assertEqual(p.unescape('&co;'), '&co;') def test_broken_comments(self): html = ('<! not really a comment >' |