#13358: HTMLParser now calls handle_data only once for each CDATA.

author: Ezio Melotti <ezio.melotti@gmail.com> 2011-11-18 16:00:40 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2011-11-18 16:00:40 (GMT)
commit: 00dc60beee3bf4b68fd658716616f25503a3a9eb (patch)
tree: f229f62cf4c74e6692181a0cc91a1fc12fd06e63 /Lib
parent: 93bbb6a9a641d54c242651e97948c15be911c9bb (diff)
download: cpython-00dc60beee3bf4b68fd658716616f25503a3a9eb.zip
cpython-00dc60beee3bf4b68fd658716616f25503a3a9eb.tar.gz
cpython-00dc60beee3bf4b68fd658716616f25503a3a9eb.tar.bz2
2 files changed, 25 insertions, 3 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index cd353f8..1c6989e 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -14,7 +14,6 @@ import re
 # Regular expressions used for parsing
 
 interesting_normal = re.compile('[&<]')
-interesting_cdata = re.compile(r'<(/|\Z)')
 incomplete = re.compile('&[a-zA-Z#]')
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -125,8 +124,8 @@ class HTMLParser(markupbase.ParserBase):
         return self.__starttag_text
 
     def set_cdata_mode(self, elem):
-        self.interesting = interesting_cdata
         self.cdata_elem = elem.lower()
+        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
 
     def clear_cdata_mode(self):
         self.interesting = interesting_normal
@@ -144,6 +143,8 @@ class HTMLParser(markupbase.ParserBase):
             if match:
                 j = match.start()
             else:
+                if self.cdata_elem:
+                    break
                 j = n
             if i < j: self.handle_data(rawdata[i:j])
             i = self.updatepos(i, j)
@@ -212,7 +213,7 @@ class HTMLParser(markupbase.ParserBase):
             else:
                 assert 0, "interesting.search() lied"
         # end while
-        if end and i < n:
+        if end and i < n and not self.cdata_elem:
             self.handle_data(rawdata[i:n])
             i = self.updatepos(i, n)
         self.rawdata = rawdata[i:]
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index b84e7dc..5dfe466 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -286,6 +286,27 @@ DOCTYPE html [
                                     ("data", content),
                                     ("endtag", element_lower)])
 
+    def test_cdata_with_closing_tags(self):
+        # see issue #13358
+        # make sure that HTMLParser calls handle_data only once for each CDATA.
+        # The normal event collector normalizes the events in get_events,
+        # so we override it to return the original list of events.
+        class Collector(EventCollector):
+            def get_events(self):
+                return self.events
+
+        content = """<!-- not a comment --> &not-an-entity-ref;
+                  <a href="" /> </p><p> &amp; <span></span></style>
+                  '</script' + '>' </html> </head> </scripter>!"""
+        for element in [' script', 'script ', ' script ',
+                        '\nscript', 'script\n', '\nscript\n']:
+            s = u'<script>{content}</{element}>'.format(element=element,
+                                                        content=content)
+            self._run_check(s, [("starttag", "script", []),
+                                ("data", content),
+                                ("endtag", "script")],
+                            collector=Collector)
+
     def test_malformatted_charref(self):
         self._run_check("<p>&#bad;</p>", [
             ("starttag", "p", []),
author	Ezio Melotti <ezio.melotti@gmail.com>	2011-11-18 16:00:40 (GMT)
committer	Ezio Melotti <ezio.melotti@gmail.com>	2011-11-18 16:00:40 (GMT)
commit	00dc60beee3bf4b68fd658716616f25503a3a9eb (patch)
tree	f229f62cf4c74e6692181a0cc91a1fc12fd06e63 /Lib
parent	93bbb6a9a641d54c242651e97948c15be911c9bb (diff)
download	cpython-00dc60beee3bf4b68fd658716616f25503a3a9eb.zip cpython-00dc60beee3bf4b68fd658716616f25503a3a9eb.tar.gz cpython-00dc60beee3bf4b68fd658716616f25503a3a9eb.tar.bz2