summary | refs | log | tree | commit | diff | stats
path: root/Lib/html
diff options
context:
space:
mode:
author: Ezio Melotti <ezio.melotti@gmail.com> 2011-11-18 16:01:49 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2011-11-18 16:01:49 (GMT)
commit: 15cb48923449bdd1325a7736a5f9bb73c8529cec (patch)
tree: ea68bd3978d6449acc9a3b7c65972922f6706ced /Lib/html
parent: 8008f2aba0c063a882c33ebd4b39a5a560deb8c0 (diff)
download: cpython-15cb48923449bdd1325a7736a5f9bb73c8529cec.zip
cpython-15cb48923449bdd1325a7736a5f9bb73c8529cec.tar.gz
cpython-15cb48923449bdd1325a7736a5f9bb73c8529cec.tar.bz2
#13358: HTMLParser now calls handle_data only once for each CDATA.
Diffstat (limited to 'Lib/html')
-rw-r--r-- Lib/html/parser.py 7
1 files changed, 4 insertions, 3 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 662e855..dd9c2e1 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -14,7 +14,6 @@ import re
# Regular expressions used for parsing
interesting_normal = re.compile('[&<]')
-interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -149,8 +148,8 @@ class HTMLParser(_markupbase.ParserBase):
return self.__starttag_text
def set_cdata_mode(self, elem):
- self.interesting = interesting_cdata
self.cdata_elem = elem.lower()
+ self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
def clear_cdata_mode(self):
self.interesting = interesting_normal
@@ -168,6 +167,8 @@ class HTMLParser(_markupbase.ParserBase):
if match:
j = match.start()
else:
+ if self.cdata_elem:
+ break
j = n
if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
@@ -250,7 +251,7 @@ class HTMLParser(_markupbase.ParserBase):
else:
assert 0, "interesting.search() lied"
# end while
- if end and i < n:
+ if end and i < n and not self.cdata_elem:
self.handle_data(rawdata[i:n])
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]