diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-18 16:01:49 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-18 16:01:49 (GMT) |
commit | 15cb48923449bdd1325a7736a5f9bb73c8529cec (patch) | |
tree | ea68bd3978d6449acc9a3b7c65972922f6706ced /Lib/html | |
parent | 8008f2aba0c063a882c33ebd4b39a5a560deb8c0 (diff) | |
download | cpython-15cb48923449bdd1325a7736a5f9bb73c8529cec.zip cpython-15cb48923449bdd1325a7736a5f9bb73c8529cec.tar.gz cpython-15cb48923449bdd1325a7736a5f9bb73c8529cec.tar.bz2 |
#13358: HTMLParser now calls handle_data only once for each CDATA.
Diffstat (limited to 'Lib/html')
-rw-r--r-- | Lib/html/parser.py | 7 |
1 file changed, 4 insertions, 3 deletions
```diff
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 662e855..dd9c2e1 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -14,7 +14,6 @@ import re
 # Regular expressions used for parsing
 
 interesting_normal = re.compile('[&<]')
-interesting_cdata = re.compile(r'<(/|\Z)')
 incomplete = re.compile('&[a-zA-Z#]')
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -149,8 +148,8 @@ class HTMLParser(_markupbase.ParserBase):
         return self.__starttag_text
 
     def set_cdata_mode(self, elem):
-        self.interesting = interesting_cdata
         self.cdata_elem = elem.lower()
+        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
 
     def clear_cdata_mode(self):
         self.interesting = interesting_normal
@@ -168,6 +167,8 @@ class HTMLParser(_markupbase.ParserBase):
             if match:
                 j = match.start()
             else:
+                if self.cdata_elem:
+                    break
                 j = n
             if i < j: self.handle_data(rawdata[i:j])
             i = self.updatepos(i, j)
@@ -250,7 +251,7 @@ class HTMLParser(_markupbase.ParserBase):
             else:
                 assert 0, "interesting.search() lied"
         # end while
-        if end and i < n:
+        if end and i < n and not self.cdata_elem:
             self.handle_data(rawdata[i:n])
             i = self.updatepos(i, n)
         self.rawdata = rawdata[i:]
```