diff options
Diffstat (limited to 'Lib/email/feedparser.py')
-rw-r--r-- | Lib/email/feedparser.py | 73 |
1 files changed, 49 insertions, 24 deletions
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index aa8a2ff..56f50df 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -25,6 +25,7 @@ import re from email import errors from email import message +from email._policybase import compat32 NLCRE = re.compile('\r\n|\r|\n') NLCRE_bol = re.compile('(\r\n|\r|\n)') @@ -120,9 +121,6 @@ class BufferedSubFile(object): # Reverse and insert at the front of the lines. self._lines[:0] = lines[::-1] - def is_closed(self): - return self._closed - def __iter__(self): return self @@ -137,9 +135,22 @@ class BufferedSubFile(object): class FeedParser: """A feed-style parser of email.""" - def __init__(self, _factory=message.Message): - """_factory is called with no arguments to create a new message obj""" + def __init__(self, _factory=message.Message, *, policy=compat32): + """_factory is called with no arguments to create a new message obj + + The policy keyword specifies a policy object that controls a number of + aspects of the parser's operation. The default policy maintains + backward compatibility. + + """ self._factory = _factory + self.policy = policy + try: + _factory(policy=self.policy) + self._factory_kwds = lambda: {'policy': self.policy} + except TypeError: + # Assume this is an old-style factory + self._factory_kwds = lambda: {} self._input = BufferedSubFile() self._msgstack = [] self._parse = self._parsegen().__next__ @@ -171,11 +182,12 @@ class FeedParser: # Look for final set of defects if root.get_content_maintype() == 'multipart' \ and not root.is_multipart(): - root.defects.append(errors.MultipartInvariantViolationDefect()) + defect = errors.MultipartInvariantViolationDefect() + self.policy.handle_defect(root, defect) return root def _new_message(self): - msg = self._factory() + msg = self._factory(**self._factory_kwds()) if self._cur and self._cur.get_content_type() == 'multipart/digest': msg.set_default_type('message/rfc822') if self._msgstack: @@ -207,6 +219,8 @@ class FeedParser: # (i.e. newline), just throw it away. Otherwise the line is # part of the body so push it back. if not NLCRE.match(line): + defect = errors.MissingHeaderBodySeparatorDefect() + self.policy.handle_defect(self._cur, defect) self._input.unreadline(line) break headers.append(line) @@ -284,7 +298,8 @@ class FeedParser: # defined a boundary. That's a problem which we'll handle by # reading everything until the EOF and marking the message as # defective. - self._cur.defects.append(errors.NoBoundaryInMultipartDefect()) + defect = errors.NoBoundaryInMultipartDefect() + self.policy.handle_defect(self._cur, defect) lines = [] for line in self._input: if line is NeedMoreData: @@ -293,6 +308,11 @@ class FeedParser: lines.append(line) self._cur.set_payload(EMPTYSTRING.join(lines)) return + # Make sure a valid content type was specified per RFC 2045:6.4. + if (self._cur.get('content-transfer-encoding', '8bit').lower() + not in ('7bit', '8bit', 'binary')): + defect = errors.InvalidMultipartContentTransferEncodingDefect() + self.policy.handle_defect(self._cur, defect) # Create a line match predicate which matches the inter-part # boundary as well as the end-of-multipart boundary. Don't push # this onto the input stream until we've scanned past the @@ -304,6 +324,7 @@ class FeedParser: capturing_preamble = True preamble = [] linesep = False + close_boundary_seen = False while True: line = self._input.readline() if line is NeedMoreData: @@ -318,6 +339,7 @@ class FeedParser: # the closing boundary, then we need to initialize the # epilogue with the empty string (see below). if mo.group('end'): + close_boundary_seen = True linesep = mo.group('linesep') break # We saw an inter-part boundary. Were we in the preamble? @@ -386,9 +408,9 @@ class FeedParser: # We've seen either the EOF or the end boundary. If we're still # capturing the preamble, we never saw the start boundary. Note # that as a defect and store the captured text as the payload. - # Everything from here to the EOF is epilogue. if capturing_preamble: - self._cur.defects.append(errors.StartBoundaryNotFoundDefect()) + defect = errors.StartBoundaryNotFoundDefect() + self.policy.handle_defect(self._cur, defect) self._cur.set_payload(EMPTYSTRING.join(preamble)) epilogue = [] for line in self._input: @@ -397,8 +419,15 @@ class FeedParser: continue self._cur.epilogue = EMPTYSTRING.join(epilogue) return - # If the end boundary ended in a newline, we'll need to make sure - # the epilogue isn't None + # If we're not processing the preamble, then we might have seen + # EOF without seeing that end boundary...that is also a defect. + if not close_boundary_seen: + defect = errors.CloseBoundaryNotFoundDefect() + self.policy.handle_defect(self._cur, defect) + return + # Everything from here to the EOF is epilogue. If the end boundary + # ended in a newline, we'll need to make sure the epilogue isn't + # None if linesep: epilogue = [''] else: @@ -440,14 +469,12 @@ class FeedParser: # is illegal, so let's note the defect, store the illegal # line, and ignore it for purposes of headers. defect = errors.FirstHeaderLineIsContinuationDefect(line) - self._cur.defects.append(defect) + self.policy.handle_defect(self._cur, defect) continue lastvalue.append(line) continue if lastheader: - # XXX reconsider the joining of folded lines - lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n') - self._cur[lastheader] = lhdr + self._cur.set_raw(*self.policy.header_source_parse(lastvalue)) lastheader, lastvalue = '', [] # Check for envelope header, i.e. unix-from if line.startswith('From '): @@ -471,19 +498,17 @@ class FeedParser: self._cur.defects.append(defect) continue # Split the line on the colon separating field name from value. + # There will always be a colon, because if there wasn't the part of + # the parser that calls us would have started parsing the body. i = line.find(':') - if i < 0: - defect = errors.MalformedHeaderDefect(line) - self._cur.defects.append(defect) - continue + assert i>0, "_parse_headers fed line with no : and no leading WS" lastheader = line[:i] - lastvalue = [line[i+1:].lstrip()] + lastvalue = [line] # Done with all the lines, so handle the last header. if lastheader: - # XXX reconsider the joining of folded lines - self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n') + self._cur.set_raw(*self.policy.header_source_parse(lastvalue)) + - class BytesFeedParser(FeedParser): """Like FeedParser, but feed accepts bytes.""" |