summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2014-08-12 10:58:23 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2014-08-12 10:58:23 (GMT)
commitbc6f8de73badfbc1a0662cdf0b3c7539ebdb7bf6 (patch)
treed8b61cf5d9b0f9fc68ac4f3fb0851715cc4f9347
parent3fdffc9fb69b8c2893c833c494132b00c26f5e8f (diff)
downloadcpython-bc6f8de73badfbc1a0662cdf0b3c7539ebdb7bf6.zip
cpython-bc6f8de73badfbc1a0662cdf0b3c7539ebdb7bf6.tar.gz
cpython-bc6f8de73badfbc1a0662cdf0b3c7539ebdb7bf6.tar.bz2
Issue #21448: Fixed FeedParser feed() to avoid O(N**2) behavior when parsing long line.
Original patch by Raymond Hettinger.
-rw-r--r--Lib/email/feedparser.py33
-rw-r--r--Lib/email/test/test_email.py57
-rw-r--r--Misc/NEWS3
3 files changed, 82 insertions, 11 deletions
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 15db26d..8031ca6 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -49,8 +49,8 @@ class BufferedSubFile(object):
simple abstraction -- it parses until EOF closes the current message.
"""
def __init__(self):
- # The last partial line pushed into this object.
- self._partial = ''
+ # Chunks of the last partial line pushed into this object.
+ self._partial = []
# The list of full, pushed lines, in reverse order
self._lines = []
# The stack of false-EOF checking predicates.
@@ -66,8 +66,8 @@ class BufferedSubFile(object):
def close(self):
# Don't forget any trailing partial line.
- self._lines.append(self._partial)
- self._partial = ''
+ self.pushlines(''.join(self._partial).splitlines(True))
+ self._partial = []
self._closed = True
def readline(self):
@@ -95,8 +95,29 @@ class BufferedSubFile(object):
def push(self, data):
"""Push some new data into this object."""
- # Handle any previous leftovers
- data, self._partial = self._partial + data, ''
+ # Crack into lines, but preserve the linesep characters on the end of each
+ parts = data.splitlines(True)
+
+ if not parts or not parts[0].endswith(('\n', '\r')):
+ # No new complete lines, so just accumulate partials
+ self._partial += parts
+ return
+
+ if self._partial:
+ # If there are previous leftovers, complete them now
+ self._partial.append(parts[0])
+ parts[0:1] = ''.join(self._partial).splitlines(True)
+ del self._partial[:]
+
+ # If the last element of the list does not end in a newline, then treat
+ # it as a partial line. We only check for '\n' here because a line
+ # ending with '\r' might be a line that was split in the middle of a
+ # '\r\n' sequence (see bugs 1555570 and 1721862).
+ if not parts[-1].endswith('\n'):
+ self._partial = [parts.pop()]
+ self.pushlines(parts)
+
+ def pushlines(self, lines):
# Crack into lines, but preserve the newlines on the end of each
parts = NLCRE_crack.split(data)
# The *ahem* interesting behaviour of re.split when supplied grouping
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index c4a90d8..9ed1a0a9 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -11,6 +11,7 @@ import unittest
import warnings
import textwrap
from cStringIO import StringIO
+from random import choice
import email
@@ -2578,16 +2579,63 @@ Do you like this message?
bsf.push(il)
nt += n
n1 = 0
- while True:
- ol = bsf.readline()
- if ol == NeedMoreData:
- break
+ for ol in iter(bsf.readline, NeedMoreData):
om.append(ol)
n1 += 1
self.assertEqual(n, n1)
self.assertEqual(len(om), nt)
self.assertEqual(''.join([il for il, n in imt]), ''.join(om))
+ def test_push_random(self):
+ from email.feedparser import BufferedSubFile, NeedMoreData
+
+ n = 10000
+ chunksize = 5
+ chars = 'abcd \t\r\n'
+
+ s = ''.join(choice(chars) for i in range(n)) + '\n'
+ target = s.splitlines(True)
+
+ bsf = BufferedSubFile()
+ lines = []
+ for i in range(0, len(s), chunksize):
+ chunk = s[i:i+chunksize]
+ bsf.push(chunk)
+ lines.extend(iter(bsf.readline, NeedMoreData))
+ self.assertEqual(lines, target)
+
+
+class TestFeedParsers(TestEmailBase):
+
+ def parse(self, chunks):
+ from email.feedparser import FeedParser
+ feedparser = FeedParser()
+ for chunk in chunks:
+ feedparser.feed(chunk)
+ return feedparser.close()
+
+ def test_newlines(self):
+ m = self.parse(['a:\nb:\rc:\r\nd:\n'])
+ self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
+ m = self.parse(['a:\nb:\rc:\r\nd:'])
+ self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
+ m = self.parse(['a:\rb', 'c:\n'])
+ self.assertEqual(m.keys(), ['a', 'bc'])
+ m = self.parse(['a:\r', 'b:\n'])
+ self.assertEqual(m.keys(), ['a', 'b'])
+ m = self.parse(['a:\r', '\nb:\n'])
+ self.assertEqual(m.keys(), ['a', 'b'])
+
+ def test_long_lines(self):
+ M, N = 1000, 100000
+ m = self.parse(['a:b\n\n'] + ['x'*M] * N)
+ self.assertEqual(m.items(), [('a', 'b')])
+ self.assertEqual(m.get_payload(), 'x'*M*N)
+ m = self.parse(['a:b\r\r'] + ['x'*M] * N)
+ self.assertEqual(m.items(), [('a', 'b')])
+ self.assertEqual(m.get_payload(), 'x'*M*N)
+ m = self.parse(['a:\r', 'b: '] + ['x'*M] * N)
+ self.assertEqual(m.items(), [('a', ''), ('b', 'x'*M*N)])
class TestParsers(TestEmailBase):
@@ -3180,7 +3228,6 @@ A very long line that must get split to something other than at the
self.assertEqual(res, '=?iso-8859-2?q?abc?=')
self.assertIsInstance(res, str)
-
# Test RFC 2231 header parameters (en/de)coding
class TestRFC2231(TestEmailBase):
def test_get_param(self):
diff --git a/Misc/NEWS b/Misc/NEWS
index fc88269..86deb58 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -19,6 +19,9 @@ Core and Builtins
Library
-------
+- Issue #21448: Changed FeedParser feed() to avoid O(N**2) behavior when
+ parsing long line. Original patch by Raymond Hettinger.
+
- Issue #17923: glob() patterns ending with a slash no longer match non-dirs on
AIX. Based on patch by Delhallt.