diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2015-09-06 18:38:06 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2015-09-06 18:38:06 (GMT) |
commit | 6f2bb98966853edcf1855b9dd310529d071241a2 (patch) | |
tree | 088d7d3cdd103b5b01a6d9c081ec3b40fcd110bd | |
parent | 527ef0792f79fda93f568830bba396b22fbcec6a (diff) | |
download | cpython-6f2bb98966853edcf1855b9dd310529d071241a2.zip cpython-6f2bb98966853edcf1855b9dd310529d071241a2.tar.gz cpython-6f2bb98966853edcf1855b9dd310529d071241a2.tar.bz2 |
#23144: Make sure that HTMLParser.feed() returns all the data, even when convert_charrefs is True.
-rw-r--r-- | Lib/html/parser.py | 10 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 15 | ||||
-rw-r--r-- | Misc/NEWS | 5 |
3 files changed, 25 insertions, 5 deletions
```diff
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index a650d5e..9ae31b9 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -198,7 +198,15 @@ class HTMLParser(_markupbase.ParserBase):
             if self.convert_charrefs and not self.cdata_elem:
                 j = rawdata.find('<', i)
                 if j < 0:
-                    if not end:
+                    # if we can't find the next <, either we are at the end
+                    # or there's more text incoming. If the latter is True,
+                    # we can't pass the text to handle_data in case we have
+                    # a charref cut in half at end. Try to determine if
+                    # this is the case before proceding by looking for an
+                    # & near the end and see if it's followed by a space or ;.
+                    amppos = rawdata.rfind('&', max(i, n-34))
+                    if (amppos >= 0 and
+                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                         break # wait till we get all the text
                     j = n
             else:
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 2d771a2..144f820 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -72,9 +72,6 @@ class EventCollectorExtra(EventCollector):

 class EventCollectorCharrefs(EventCollector):

-    def get_events(self):
-        return self.events
-
     def handle_charref(self, data):
         self.fail('This should never be called with convert_charrefs=True')

@@ -685,6 +682,18 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
         ]
         self._run_check(html, expected)

+    def test_convert_charrefs_dropped_text(self):
+        # #23144: make sure that all the events are triggered when
+        # convert_charrefs is True, even if we don't call .close()
+        parser = EventCollector(convert_charrefs=True)
+        # before the fix, bar & baz was missing
+        parser.feed("foo <a>link</a> bar & baz")
+        self.assertEqual(
+            parser.get_events(),
+            [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
+             ('endtag', 'a'), ('data', ' bar & baz')]
+        )
+

 class AttributesStrictTestCase(TestCaseBase):

@@ -1,4 +1,4 @@
-+++++++++++
++++++++++++
 Python News
 +++++++++++

@@ -81,6 +81,9 @@ Core and Builtins
 Library
 -------

+- Issue #23144: Make sure that HTMLParser.feed() returns all the data, even
+  when convert_charrefs is True.
+
 - Issue #16180: Exit pdb if file has syntax error, instead of trapping user
   in an infinite loop. Patch by Xavier de Gaye.

```