diff options
author | R David Murray <rdmurray@bitdance.com> | 2013-02-14 02:17:13 (GMT) |
---|---|---|
committer | R David Murray <rdmurray@bitdance.com> | 2013-02-14 02:17:13 (GMT) |
commit | 2940e71add4a7145129429fa46a401abb9459674 (patch) | |
tree | d378e919600358df804c2fe78d749773eca36be7 /Lib/email | |
parent | f0bf84c84bff4e635501319e6d6bfa141c59b41e (diff) | |
download | cpython-2940e71add4a7145129429fa46a401abb9459674.zip cpython-2940e71add4a7145129429fa46a401abb9459674.tar.gz cpython-2940e71add4a7145129429fa46a401abb9459674.tar.bz2 |
#15220: simplify and speed up feedparser's line splitting.
Original patch submitted by QNX, modified for clarity by me (mostly comments).
QNX reports a 30% speed up in average email parsing time.
Diffstat (limited to 'Lib/email')
-rw-r--r-- | Lib/email/feedparser.py | 27 |
1 file changed, 9 insertions, 18 deletions
```diff
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index ea41e95..eb75fe3 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -98,24 +98,15 @@ class BufferedSubFile(object):
         """Push some new data into this object."""
         # Handle any previous leftovers
         data, self._partial = self._partial + data, ''
-        # Crack into lines, but preserve the newlines on the end of each
-        parts = NLCRE_crack.split(data)
-        # The *ahem* interesting behaviour of re.split when supplied grouping
-        # parentheses is that the last element of the resulting list is the
-        # data after the final RE.  In the case of a NL/CR terminated string,
-        # this is the empty string.
-        self._partial = parts.pop()
-        #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending with \r:
-        # is there a \n to follow later?
-        if not self._partial and parts and parts[-1].endswith('\r'):
-            self._partial = parts.pop(-2)+parts.pop()
-        # parts is a list of strings, alternating between the line contents
-        # and the eol character(s).  Gather up a list of lines after
-        # re-attaching the newlines.
-        lines = []
-        for i in range(len(parts) // 2):
-            lines.append(parts[i*2] + parts[i*2+1])
-        self.pushlines(lines)
+        # Crack into lines, but preserve the linesep characters on the end of each
+        parts = data.splitlines(True)
+        # If the last element of the list does not end in a newline, then treat
+        # it as a partial line.  We only check for '\n' here because a line
+        # ending with '\r' might be a line that was split in the middle of a
+        # '\r\n' sequence (see bugs 1555570 and 1721862).
+        if parts and not parts[-1].endswith('\n'):
+            self._partial = parts.pop()
+        self.pushlines(parts)
 
     def pushlines(self, lines):
         # Reverse and insert at the front of the lines.
```