#23144: merge with 3.5.

author: Ezio Melotti <ezio.melotti@gmail.com> 2015-09-06 18:49:48 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2015-09-06 18:49:48 (GMT)
commit: 564cf7b62ced76a3cc87b16a8a278f0612673690 (patch)
tree: 4595fab1818ef207ad3abd6b0f0323ee6e9b5a5a
parent: 56f6e76c680f47ad2b11bed9406305a000a1889a (diff)
parent: 20a2c6482e28a2ca8d257ba646f2b8ead4837387 (diff)
download: cpython-564cf7b62ced76a3cc87b16a8a278f0612673690.zip
cpython-564cf7b62ced76a3cc87b16a8a278f0612673690.tar.gz
cpython-564cf7b62ced76a3cc87b16a8a278f0612673690.tar.bz2
3 files changed, 26 insertions, 5 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 390d4cc..43e6411 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -139,7 +139,15 @@ class HTMLParser(_markupbase.ParserBase):
             if self.convert_charrefs and not self.cdata_elem:
                 j = rawdata.find('<', i)
                 if j < 0:
-                    if not end:
+                    # if we can't find the next <, either we are at the end
+                    # or there's more text incoming.  If the latter is True,
+                    # we can't pass the text to handle_data in case we have
+                    # a charref cut in half at end.  Try to determine if
+                    # this is the case before proceeding by looking for an
+                    # & near the end and see if it's followed by a space or ;.
+                    amppos = rawdata.rfind('&', max(i, n-34))
+                    if (amppos >= 0 and
+                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                         break  # wait till we get all the text
                     j = n
             else:
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index de8f3e8..11420b2c 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -72,9 +72,6 @@ class EventCollectorExtra(EventCollector):
 
 class EventCollectorCharrefs(EventCollector):
 
-    def get_events(self):
-        return self.events
-
     def handle_charref(self, data):
         self.fail('This should never be called with convert_charrefs=True')
 
@@ -633,6 +630,18 @@ text
         ]
         self._run_check(html, expected)
 
+    def test_convert_charrefs_dropped_text(self):
+        # #23144: make sure that all the events are triggered when
+        # convert_charrefs is True, even if we don't call .close()
+        parser = EventCollector(convert_charrefs=True)
+        # before the fix, bar & baz was missing
+        parser.feed("foo <a>link</a> bar &amp; baz")
+        self.assertEqual(
+            parser.get_events(),
+            [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
+             ('endtag', 'a'), ('data', ' bar & baz')]
+        )
+
 
 class AttributesTestCase(TestCaseBase):
 
diff --git a/Misc/NEWS b/Misc/NEWS
index 096ab67..155f061 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -1,4 +1,4 @@
-+++++++++++
++++++++++++
 Python News
 +++++++++++
 
@@ -181,9 +181,13 @@ Core and Builtins
 Library
 -------
 
+- Issue #23144: Make sure that HTMLParser.feed() passes all the data to the
+  handlers, even when convert_charrefs is True.
+
 - Issue #24635: Fixed a bug in typing.py where isinstance([], typing.Iterable)
   would return True once, then False on subsequent calls.
 
+
 - Issue #24989: Fixed buffer overread in BytesIO.readline() if a position is
   set beyond size.  Based on patch by John Leitch.
author	Ezio Melotti <ezio.melotti@gmail.com>	2015-09-06 18:49:48 (GMT)
committer	Ezio Melotti <ezio.melotti@gmail.com>	2015-09-06 18:49:48 (GMT)
commit	564cf7b62ced76a3cc87b16a8a278f0612673690 (patch)
tree	4595fab1818ef207ad3abd6b0f0323ee6e9b5a5a
parent	56f6e76c680f47ad2b11bed9406305a000a1889a (diff)
parent	20a2c6482e28a2ca8d257ba646f2b8ead4837387 (diff)
download	cpython-564cf7b62ced76a3cc87b16a8a278f0612673690.zip cpython-564cf7b62ced76a3cc87b16a8a278f0612673690.tar.gz cpython-564cf7b62ced76a3cc87b16a8a278f0612673690.tar.bz2