summary | refs | log | tree | commit | diff | stats
path: root/Lib/html/parser.py
diff options
context:
space:
mode:
author: Ezio Melotti <ezio.melotti@gmail.com> 2013-11-19 18:28:45 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2013-11-19 18:28:45 (GMT)
commit: 4a9ee26750aa8cb37b5072b2bb4dd328819febb4 (patch)
tree: bc714725cf478795c34bd9f8200a52424a47474b /Lib/html/parser.py
parent: 5160da1afc07ab759a95d2b863134a88b9318e65 (diff)
download: cpython-4a9ee26750aa8cb37b5072b2bb4dd328819febb4.zip
download: cpython-4a9ee26750aa8cb37b5072b2bb4dd328819febb4.tar.gz
download: cpython-4a9ee26750aa8cb37b5072b2bb4dd328819febb4.tar.bz2
#2927: Added the unescape() function to the html module.
Diffstat (limited to 'Lib/html/parser.py')
-rw-r--r-- Lib/html/parser.py 38
1 files changed, 5 insertions, 33 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 22498db..e793c37 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -8,9 +8,12 @@
# and CDATA (character data -- only end tags are special).
-import _markupbase
import re
import warnings
+import _markupbase
+
+from html import unescape
+
__all__ = ['HTMLParser']
@@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
- attrvalue = self.unescape(attrvalue)
+ attrvalue = unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase):
def unknown_decl(self, data):
if self.strict:
self.error("unknown declaration: %r" % (data,))
-
- # Internal -- helper to remove special character quoting
- def unescape(self, s):
- if '&' not in s:
- return s
- def replaceEntities(s):
- s = s.groups()[0]
- try:
- if s[0] == "#":
- s = s[1:]
- if s[0] in ['x','X']:
- c = int(s[1:].rstrip(';'), 16)
- else:
- c = int(s.rstrip(';'))
- return chr(c)
- except ValueError:
- return '&#' + s
- else:
- from html.entities import html5
- if s in html5:
- return html5[s]
- elif s.endswith(';'):
- return '&' + s
- for x in range(2, len(s)):
- if s[:x] in html5:
- return html5[s[:x]] + s[x:]
- else:
- return '&' + s
-
- return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
- replaceEntities, s, flags=re.ASCII)