#15220: simplify and speed up feedparser's line splitting.

Original patch submitted by QNX, modified for clarity by me (mostly comments). QNX reports a 30% speedup in average email parsing time.
author: R David Murray <rdmurray@bitdance.com> 2013-02-14 02:17:13 (GMT)
committer: R David Murray <rdmurray@bitdance.com> 2013-02-14 02:17:13 (GMT)
commit: 2940e71add4a7145129429fa46a401abb9459674 (patch)
tree: d378e919600358df804c2fe78d749773eca36be7 /Lib/email
parent: f0bf84c84bff4e635501319e6d6bfa141c59b41e (diff)
download: cpython-2940e71add4a7145129429fa46a401abb9459674.zip cpython-2940e71add4a7145129429fa46a401abb9459674.tar.gz cpython-2940e71add4a7145129429fa46a401abb9459674.tar.bz2
1 files changed, 9 insertions, 18 deletions
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index ea41e95..eb75fe3 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -98,24 +98,15 @@ class BufferedSubFile(object):
         """Push some new data into this object."""
         # Handle any previous leftovers
         data, self._partial = self._partial + data, ''
-        # Crack into lines, but preserve the newlines on the end of each
-        parts = NLCRE_crack.split(data)
-        # The *ahem* interesting behaviour of re.split when supplied grouping
-        # parentheses is that the last element of the resulting list is the
-        # data after the final RE.  In the case of a NL/CR terminated string,
-        # this is the empty string.
-        self._partial = parts.pop()
-        #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending with \r:
-        # is there a \n to follow later?
-        if not self._partial and parts and parts[-1].endswith('\r'):
-            self._partial = parts.pop(-2)+parts.pop()
-        # parts is a list of strings, alternating between the line contents
-        # and the eol character(s).  Gather up a list of lines after
-        # re-attaching the newlines.
-        lines = []
-        for i in range(len(parts) // 2):
-            lines.append(parts[i*2] + parts[i*2+1])
-        self.pushlines(lines)
+        # Crack into lines, but preserve the linesep characters on the end of each
+        parts = data.splitlines(True)
+        # If the last element of the list does not end in a newline, then treat
+        # it as a partial line.  We only check for '\n' here because a line
+        # ending with '\r' might be a line that was split in the middle of a
+        # '\r\n' sequence (see bugs 1555570 and 1721862).
+        if parts and not parts[-1].endswith('\n'):
+            self._partial = parts.pop()
+        self.pushlines(parts)
 
     def pushlines(self, lines):
         # Reverse and insert at the front of the lines.
author	R David Murray <rdmurray@bitdance.com>	2013-02-14 02:17:13 (GMT)
committer	R David Murray <rdmurray@bitdance.com>	2013-02-14 02:17:13 (GMT)
commit	2940e71add4a7145129429fa46a401abb9459674 (patch)
tree	d378e919600358df804c2fe78d749773eca36be7 /Lib/email
parent	f0bf84c84bff4e635501319e6d6bfa141c59b41e (diff)
download	cpython-2940e71add4a7145129429fa46a401abb9459674.zip cpython-2940e71add4a7145129429fa46a401abb9459674.tar.gz cpython-2940e71add4a7145129429fa46a401abb9459674.tar.bz2