diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2013-11-19 18:28:45 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2013-11-19 18:28:45 (GMT) |
commit | 4a9ee26750aa8cb37b5072b2bb4dd328819febb4 (patch) | |
tree | bc714725cf478795c34bd9f8200a52424a47474b /Lib/html/parser.py | |
parent | 5160da1afc07ab759a95d2b863134a88b9318e65 (diff) | |
download | cpython-4a9ee26750aa8cb37b5072b2bb4dd328819febb4.zip cpython-4a9ee26750aa8cb37b5072b2bb4dd328819febb4.tar.gz cpython-4a9ee26750aa8cb37b5072b2bb4dd328819febb4.tar.bz2 |
#2927: Added the unescape() function to the html module.
Diffstat (limited to 'Lib/html/parser.py')
-rw-r--r-- | Lib/html/parser.py | 38 |
1 files changed, 5 insertions, 33 deletions
```diff
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 22498db..e793c37 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -8,9 +8,12 @@
 # and CDATA (character data -- only end tags are special).
 
 
-import _markupbase
 import re
 import warnings
+import _markupbase
+
+from html import unescape
+
 
 __all__ = ['HTMLParser']
 
@@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
-                attrvalue = self.unescape(attrvalue)
+                attrvalue = unescape(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
 
@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase):
     def unknown_decl(self, data):
         if self.strict:
             self.error("unknown declaration: %r" % (data,))
-
-    # Internal -- helper to remove special character quoting
-    def unescape(self, s):
-        if '&' not in s:
-            return s
-        def replaceEntities(s):
-            s = s.groups()[0]
-            try:
-                if s[0] == "#":
-                    s = s[1:]
-                    if s[0] in ['x','X']:
-                        c = int(s[1:].rstrip(';'), 16)
-                    else:
-                        c = int(s.rstrip(';'))
-                    return chr(c)
-            except ValueError:
-                return '&#' + s
-            else:
-                from html.entities import html5
-                if s in html5:
-                    return html5[s]
-                elif s.endswith(';'):
-                    return '&' + s
-                for x in range(2, len(s)):
-                    if s[:x] in html5:
-                        return html5[s[:x]] + s[x:]
-                else:
-                    return '&' + s
-
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
-                      replaceEntities, s, flags=re.ASCII)
```