Issue #21448: Fixed FeedParser feed() to avoid O(N**2) behavior when parsing long line.

Original patch by Raymond Hettinger.
author: Serhiy Storchaka <storchaka@gmail.com> 2014-08-12 10:59:11 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2014-08-12 10:59:11 (GMT)
commit: 320a1c0ff715b6c04034722393efe510973430ee (patch)
tree: 6f9d2e40433afd332a4a9e4c26f2136394ffc8ac /Lib
parent: 6f2017076293f0e1ea807260434053a58be6667b (diff)
download: cpython-320a1c0ff715b6c04034722393efe510973430ee.zip
cpython-320a1c0ff715b6c04034722393efe510973430ee.tar.gz
cpython-320a1c0ff715b6c04034722393efe510973430ee.tar.bz2
2 files changed, 77 insertions, 12 deletions
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 6cf9b91..0c3b572 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -50,8 +50,8 @@ class BufferedSubFile(object):
     simple abstraction -- it parses until EOF closes the current message.
     """
     def __init__(self):
-        # The last partial line pushed into this object.
-        self._partial = ''
+        # Chunks of the last partial line pushed into this object.
+        self._partial = []
         # The list of full, pushed lines, in reverse order
         self._lines = []
         # The stack of false-EOF checking predicates.
@@ -67,8 +67,8 @@ class BufferedSubFile(object):
 
     def close(self):
         # Don't forget any trailing partial line.
-        self._lines.append(self._partial)
-        self._partial = ''
+        self.pushlines(''.join(self._partial).splitlines(True))
+        self._partial = []
         self._closed = True
 
     def readline(self):
@@ -96,16 +96,26 @@ class BufferedSubFile(object):
 
     def push(self, data):
         """Push some new data into this object."""
-        # Handle any previous leftovers
-        data, self._partial = self._partial + data, ''
         # Crack into lines, but preserve the linesep characters on the end of each
         parts = data.splitlines(True)
+
+        if not parts or not parts[0].endswith(('\n', '\r')):
+            # No new complete lines, so just accumulate partials
+            self._partial += parts
+            return
+
+        if self._partial:
+            # If there are previous leftovers, complete them now
+            self._partial.append(parts[0])
+            parts[0:1] = ''.join(self._partial).splitlines(True)
+            del self._partial[:]
+
         # If the last element of the list does not end in a newline, then treat
         # it as a partial line.  We only check for '\n' here because a line
         # ending with '\r' might be a line that was split in the middle of a
         # '\r\n' sequence (see bugs 1555570 and 1721862).
-        if parts and not parts[-1].endswith('\n'):
-            self._partial = parts.pop()
+        if not parts[-1].endswith('\n'):
+            self._partial = [parts.pop()]
         self.pushlines(parts)
 
     def pushlines(self, lines):
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py
index af8bc7b..48730d8 100644
--- a/Lib/test/test_email/test_email.py
+++ b/Lib/test/test_email/test_email.py
@@ -10,6 +10,7 @@ import textwrap
 
 from io import StringIO, BytesIO
 from itertools import chain
+from random import choice
 
 import email
 import email.policy
@@ -3353,16 +3354,70 @@ Do you like this message?
             bsf.push(il)
             nt += n
             n1 = 0
-            while True:
-                ol = bsf.readline()
-                if ol == NeedMoreData:
-                    break
+            for ol in iter(bsf.readline, NeedMoreData):
                 om.append(ol)
                 n1 += 1
             self.assertEqual(n, n1)
         self.assertEqual(len(om), nt)
         self.assertEqual(''.join([il for il, n in imt]), ''.join(om))
 
+    def test_push_random(self):
+        from email.feedparser import BufferedSubFile, NeedMoreData
+
+        n = 10000
+        chunksize = 5
+        chars = 'abcd \t\r\n'
+
+        s = ''.join(choice(chars) for i in range(n)) + '\n'
+        target = s.splitlines(True)
+
+        bsf = BufferedSubFile()
+        lines = []
+        for i in range(0, len(s), chunksize):
+            chunk = s[i:i+chunksize]
+            bsf.push(chunk)
+            lines.extend(iter(bsf.readline, NeedMoreData))
+        self.assertEqual(lines, target)
+
+
+class TestFeedParsers(TestEmailBase):
+
+    def parse(self, chunks):
+        from email.feedparser import FeedParser
+        feedparser = FeedParser()
+        for chunk in chunks:
+            feedparser.feed(chunk)
+        return feedparser.close()
+
+    def test_newlines(self):
+        m = self.parse(['a:\nb:\rc:\r\nd:\n'])
+        self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
+        m = self.parse(['a:\nb:\rc:\r\nd:'])
+        self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
+        m = self.parse(['a:\rb', 'c:\n'])
+        self.assertEqual(m.keys(), ['a', 'bc'])
+        m = self.parse(['a:\r', 'b:\n'])
+        self.assertEqual(m.keys(), ['a', 'b'])
+        m = self.parse(['a:\r', '\nb:\n'])
+        self.assertEqual(m.keys(), ['a', 'b'])
+        m = self.parse(['a:\x85b:\u2028c:\n'])
+        self.assertEqual(m.items(), [('a', '\x85'), ('b', '\u2028'), ('c', '')])
+        m = self.parse(['a:\r', 'b:\x85', 'c:\n'])
+        self.assertEqual(m.items(), [('a', ''), ('b', '\x85'), ('c', '')])
+
+    def test_long_lines(self):
+        M, N = 1000, 100000
+        m = self.parse(['a:b\n\n'] + ['x'*M] * N)
+        self.assertEqual(m.items(), [('a', 'b')])
+        self.assertEqual(m.get_payload(), 'x'*M*N)
+        m = self.parse(['a:b\r\r'] + ['x'*M] * N)
+        self.assertEqual(m.items(), [('a', 'b')])
+        self.assertEqual(m.get_payload(), 'x'*M*N)
+        m = self.parse(['a:b\r\r'] + ['x'*M+'\x85'] * N)
+        self.assertEqual(m.items(), [('a', 'b')])
+        self.assertEqual(m.get_payload(), ('x'*M+'\x85')*N)
+        m = self.parse(['a:\r', 'b: '] + ['x'*M] * N)
+        self.assertEqual(m.items(), [('a', ''), ('b', 'x'*M*N)])
 
 
 class TestParsers(TestEmailBase):
author	Serhiy Storchaka <storchaka@gmail.com>	2014-08-12 10:59:11 (GMT)
committer	Serhiy Storchaka <storchaka@gmail.com>	2014-08-12 10:59:11 (GMT)
commit	320a1c0ff715b6c04034722393efe510973430ee (patch)
tree	6f9d2e40433afd332a4a9e4c26f2136394ffc8ac /Lib
parent	6f2017076293f0e1ea807260434053a58be6667b (diff)
download	cpython-320a1c0ff715b6c04034722393efe510973430ee.zip cpython-320a1c0ff715b6c04034722393efe510973430ee.tar.gz cpython-320a1c0ff715b6c04034722393efe510973430ee.tar.bz2