summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: R David Murray <rdmurray@bitdance.com> 2013-02-14 02:17:13 (GMT)
committer: R David Murray <rdmurray@bitdance.com> 2013-02-14 02:17:13 (GMT)
commit: 2940e71add4a7145129429fa46a401abb9459674 (patch)
tree: d378e919600358df804c2fe78d749773eca36be7
parent: f0bf84c84bff4e635501319e6d6bfa141c59b41e (diff)
download: cpython-2940e71add4a7145129429fa46a401abb9459674.zip
cpython-2940e71add4a7145129429fa46a401abb9459674.tar.gz
cpython-2940e71add4a7145129429fa46a401abb9459674.tar.bz2
#15220: simplify and speed up feedparser's line splitting.
Original patch submitted by QNX, modified for clarity by me (mostly comments). QNX reports a 30% speedup in average email parsing time.
-rw-r--r-- Lib/email/feedparser.py 27
-rw-r--r-- Misc/NEWS 3
2 files changed, 12 insertions, 18 deletions
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index ea41e95..eb75fe3 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -98,24 +98,15 @@ class BufferedSubFile(object):
"""Push some new data into this object."""
# Handle any previous leftovers
data, self._partial = self._partial + data, ''
- # Crack into lines, but preserve the newlines on the end of each
- parts = NLCRE_crack.split(data)
- # The *ahem* interesting behaviour of re.split when supplied grouping
- # parentheses is that the last element of the resulting list is the
- # data after the final RE. In the case of a NL/CR terminated string,
- # this is the empty string.
- self._partial = parts.pop()
- #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
- # is there a \n to follow later?
- if not self._partial and parts and parts[-1].endswith('\r'):
- self._partial = parts.pop(-2)+parts.pop()
- # parts is a list of strings, alternating between the line contents
- # and the eol character(s). Gather up a list of lines after
- # re-attaching the newlines.
- lines = []
- for i in range(len(parts) // 2):
- lines.append(parts[i*2] + parts[i*2+1])
- self.pushlines(lines)
+ # Crack into lines, but preserve the linesep characters on the end of each
+ parts = data.splitlines(True)
+ # If the last element of the list does not end in a newline, then treat
+ # it as a partial line. We only check for '\n' here because a line
+ # ending with '\r' might be a line that was split in the middle of a
+ # '\r\n' sequence (see bugs 1555570 and 1721862).
+ if parts and not parts[-1].endswith('\n'):
+ self._partial = parts.pop()
+ self.pushlines(parts)
def pushlines(self, lines):
# Reverse and insert at the front of the lines.
diff --git a/Misc/NEWS b/Misc/NEWS
index 199fd8f..909375a 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -253,6 +253,9 @@ Core and Builtins
Library
-------
+- Issue #15220: email.feedparser's line splitting algorithm is now simpler and
+ faster.
+
- Issue #16743: Fix mmap overflow check on 32 bit Windows.
- Issue #16996: webbrowser module now uses shutil.which() to find a