From 46495182d0fc58b519d10315f1bf392f08f33a2e Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Sun, 24 Jun 2012 22:02:56 +0200 Subject: #15156: HTMLParser now uses the new "html.entities.html5" dictionary. --- Doc/library/html.entities.rst | 4 ---- Lib/html/parser.py | 32 +++++++++++++++----------------- Lib/test/test_htmlparser.py | 7 ++++++- Misc/NEWS | 2 ++ 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/Doc/library/html.entities.rst b/Doc/library/html.entities.rst index f0dd7aa..65ce817 100644 --- a/Doc/library/html.entities.rst +++ b/Doc/library/html.entities.rst @@ -11,10 +11,6 @@ This module defines four dictionaries, :data:`html5`, :data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`. -:data:`entitydefs` is used to provide the :attr:`entitydefs` -attribute of the :class:`html.parser.HTMLParser` class. The definition provided -here contains all the entities defined by XHTML 1.0 that can be handled using -simple textual substitution in the Latin-1 character set (ISO-8859-1). .. 
data:: html5 diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 494cf24..f8ac828 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase): self.error("unknown declaration: %r" % (data,)) # Internal -- helper to remove special character quoting - entitydefs = None def unescape(self, s): if '&' not in s: return s @@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase): if s[0] == "#": s = s[1:] if s[0] in ['x','X']: - c = int(s[1:], 16) + c = int(s[1:].rstrip(';'), 16) else: - c = int(s) + c = int(s.rstrip(';')) return chr(c) except ValueError: - return '&#'+ s +';' + return '&#' + s else: - # Cannot use name2codepoint directly, because HTMLParser - # supports apos, which is not part of HTML 4 - import html.entities - if HTMLParser.entitydefs is None: - entitydefs = HTMLParser.entitydefs = {'apos':"'"} - for k, v in html.entities.name2codepoint.items(): - entitydefs[k] = chr(v) - try: - return self.entitydefs[s] - except KeyError: - return '&'+s+';' - - return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", + from html.entities import html5 + if s in html5: + return html5[s] + elif s.endswith(';'): + return '&' + s + for x in range(2, len(s)): + if s[:x] in html5: + return html5[s[:x]] + s[x:] + else: + return '&' + s + + return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))", replaceEntities, s, flags=re.ASCII) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 64a4f5d..c5d878d 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): self._run_check('
<form action="/xxx.php?a=1&amp;b=2&amp", ' 'method="post">', [ ('starttag', 'form', - [('action', '/xxx.php?a=1&b=2&amp'), + [('action', '/xxx.php?a=1&b=2&'), (',', None), ('method', 'post')])]) def test_weird_chars_in_unquoted_attribute_values(self): @@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): self.assertEqual(p.unescape('&#0038;'),'&') # see #12888 self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) + # see #15156 + self.assertEqual(p.unescape('&Eacuteric&Eacute;ric' + '&alphacentauri&alpha;centauri'), + 'ÉricÉric&alphacentauriαcentauri') + self.assertEqual(p.unescape('&co;'), '&co;') def test_broken_comments(self): html = ('<! not really a comment >' diff --git a/Misc/NEWS b/Misc/NEWS index 0ccdce5..da574b0 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -76,6 +76,8 @@ Library It is used automatically on platforms supporting the necessary os.openat() and os.unlinkat() functions. Main code by Martin von Löwis. +- Issue #15156: HTMLParser now uses the new "html.entities.html5" dictionary. + - Issue #11113: add a new "html5" dictionary containing the named character references defined by the HTML5 standard and the equivalent Unicode character(s) to the html.entities module. -- cgit v0.12