diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2012-06-24 20:02:56 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2012-06-24 20:02:56 (GMT) |
commit | 46495182d0fc58b519d10315f1bf392f08f33a2e (patch) | |
tree | 0503e0a7032d33e98954331d3a2d5c6e19607392 /Lib/html | |
parent | a504a7a7d1fd6056e067027354d31595aa4b8958 (diff) | |
download | cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.zip cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.gz cpython-46495182d0fc58b519d10315f1bf392f08f33a2e.tar.bz2 |
#15156: HTMLParser now uses the new "html.entities.html5" dictionary.
Diffstat (limited to 'Lib/html')
-rw-r--r-- | Lib/html/parser.py | 32 |
1 file changed, 15 insertions, 17 deletions
```diff
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 494cf24..f8ac828 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase):
             self.error("unknown declaration: %r" % (data,))
 
     # Internal -- helper to remove special character quoting
-    entitydefs = None
     def unescape(self, s):
         if '&' not in s:
             return s
@@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase):
                 if s[0] == "#":
                     s = s[1:]
                     if s[0] in ['x','X']:
-                        c = int(s[1:], 16)
+                        c = int(s[1:].rstrip(';'), 16)
                     else:
-                        c = int(s)
+                        c = int(s.rstrip(';'))
                     return chr(c)
             except ValueError:
-                return '&#'+ s +';'
+                return '&#' + s
             else:
-                # Cannot use name2codepoint directly, because HTMLParser
-                # supports apos, which is not part of HTML 4
-                import html.entities
-                if HTMLParser.entitydefs is None:
-                    entitydefs = HTMLParser.entitydefs = {'apos':"'"}
-                    for k, v in html.entities.name2codepoint.items():
-                        entitydefs[k] = chr(v)
-                try:
-                    return self.entitydefs[s]
-                except KeyError:
-                    return '&'+s+';'
-
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
+                from html.entities import html5
+                if s in html5:
+                    return html5[s]
+                elif s.endswith(';'):
+                    return '&' + s
+                for x in range(2, len(s)):
+                    if s[:x] in html5:
+                        return html5[s[:x]] + s[x:]
+                else:
+                    return '&' + s
+
+        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                       replaceEntities, s, flags=re.ASCII)
```