diff options
Diffstat (limited to 'Lib/email/feedparser.py')
| -rw-r--r-- | Lib/email/feedparser.py | 85 | 
1 files changed, 52 insertions, 33 deletions
| diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index ea41e95..c95b27f 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -33,7 +33,7 @@ NLCRE_eol = re.compile('(\r\n|\r|\n)\Z')  NLCRE_crack = re.compile('(\r\n|\r|\n)')  # RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character  # except controls, SP, and ":". -headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])') +headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')  EMPTYSTRING = ''  NL = '\n' @@ -50,8 +50,8 @@ class BufferedSubFile(object):      simple abstraction -- it parses until EOF closes the current message.      """      def __init__(self): -        # The last partial line pushed into this object. -        self._partial = '' +        # Chunks of the last partial line pushed into this object. +        self._partial = []          # The list of full, pushed lines, in reverse order          self._lines = []          # The stack of false-EOF checking predicates. @@ -67,8 +67,8 @@ class BufferedSubFile(object):      def close(self):          # Don't forget any trailing partial line. -        self._lines.append(self._partial) -        self._partial = '' +        self.pushlines(''.join(self._partial).splitlines(True)) +        self._partial = []          self._closed = True      def readline(self): @@ -96,26 +96,27 @@ class BufferedSubFile(object):      def push(self, data):          """Push some new data into this object.""" -        # Handle any previous leftovers -        data, self._partial = self._partial + data, '' -        # Crack into lines, but preserve the newlines on the end of each -        parts = NLCRE_crack.split(data) -        # The *ahem* interesting behaviour of re.split when supplied grouping -        # parentheses is that the last element of the resulting list is the -        # data after the final RE.  In the case of a NL/CR terminated string, -        # this is the empty string. -        self._partial = parts.pop() -        #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending with \r: -        # is there a \n to follow later? -        if not self._partial and parts and parts[-1].endswith('\r'): -            self._partial = parts.pop(-2)+parts.pop() -        # parts is a list of strings, alternating between the line contents -        # and the eol character(s).  Gather up a list of lines after -        # re-attaching the newlines. -        lines = [] -        for i in range(len(parts) // 2): -            lines.append(parts[i*2] + parts[i*2+1]) -        self.pushlines(lines) +        # Crack into lines, but preserve the linesep characters on the end of each +        parts = data.splitlines(True) + +        if not parts or not parts[0].endswith(('\n', '\r')): +            # No new complete lines, so just accumulate partials +            self._partial += parts +            return + +        if self._partial: +            # If there are previous leftovers, complete them now +            self._partial.append(parts[0]) +            parts[0:1] = ''.join(self._partial).splitlines(True) +            del self._partial[:] + +        # If the last element of the list does not end in a newline, then treat +        # it as a partial line.  We only check for '\n' here because a line +        # ending with '\r' might be a line that was split in the middle of a +        # '\r\n' sequence (see bugs 1555570 and 1721862). +        if not parts[-1].endswith('\n'): +            self._partial = [parts.pop()] +        self.pushlines(parts)      def pushlines(self, lines):          # Reverse and insert at the front of the lines. @@ -135,7 +136,7 @@ class BufferedSubFile(object):  class FeedParser:      """A feed-style parser of email.""" -    def __init__(self, _factory=message.Message, *, policy=compat32): +    def __init__(self, _factory=None, *, policy=compat32):          """_factory is called with no arguments to create a new message obj          The policy keyword specifies a policy object that controls a number of @@ -143,14 +144,23 @@ class FeedParser:          backward compatibility.          """ -        self._factory = _factory          self.policy = policy -        try: -            _factory(policy=self.policy) -            self._factory_kwds = lambda: {'policy': self.policy} -        except TypeError: -            # Assume this is an old-style factory -            self._factory_kwds = lambda: {} +        self._factory_kwds = lambda: {'policy': self.policy} +        if _factory is None: +            # What this should be: +            #self._factory = policy.default_message_factory +            # but, because we are post 3.4 feature freeze, fix with temp hack: +            if self.policy is compat32: +                self._factory = message.Message +            else: +                self._factory = message.EmailMessage +        else: +            self._factory = _factory +            try: +                _factory(policy=self.policy) +            except TypeError: +                # Assume this is an old-style factory +                self._factory_kwds = lambda: {}          self._input = BufferedSubFile()          self._msgstack = []          self._parse = self._parsegen().__next__ @@ -501,6 +511,15 @@ class FeedParser:              # There will always be a colon, because if there wasn't the part of              # the parser that calls us would have started parsing the body.              i = line.find(':') + +            # If the colon is on the start of the line the header is clearly +            # malformed, but we might be able to salvage the rest of the +            # message. Track the error but keep going. +            if i == 0: +                defect = errors.InvalidHeaderDefect("Missing header name.") +                self._cur.defects.append(defect) +                continue +              assert i>0, "_parse_headers fed line with no : and no leading WS"              lastheader = line[:i]              lastvalue = [line] | 
