diff options
Diffstat (limited to 'Lib/email/FeedParser.py')
-rw-r--r-- | Lib/email/FeedParser.py | 362 |
1 files changed, 362 insertions, 0 deletions
# Source: diff --git a/Lib/email/FeedParser.py b/Lib/email/FeedParser.py
#         (new file mode 100644, index 0000000..a82d305)
# A new Feed-style Parser

try:
    from email import Errors, Message
except ImportError:
    # Later releases of the email package spell these modules in lowercase.
    from email import errors as Errors, message as Message
import re

# Matches any of the three newline conventions at the start of a string;
# used to recognise blank lines.
NLCRE = re.compile('\r\n|\r|\n')

EMPTYSTRING = ''
NL = '\n'

# Sentinel handed back by the input buffer when it has run dry but end()
# has not been called yet.
NeedMoreData = object()


class FeedableLumpOfText:
    "A file-like object that can have new data loaded into it"

    def __init__(self):
        # Trailing text that has not yet been terminated by a newline.
        self._partial = ''
        # Set by end(); once true an empty buffer means end-of-file.
        self._done = False
        # _pending is a list of lines, in reverse order
        self._pending = []

    def readline(self):
        """ Return a line of data.

        If data has been pushed back with unreadline(), the most recently
        unreadline()d data will be returned.  Returns NeedMoreData when
        nothing is buffered and end() has not been called, and '' when
        nothing is buffered after end().
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending.pop()

    def unreadline(self, line):
        """ Push a line back into the object.
        """
        self._pending.append(line)

    def peekline(self):
        """ Non-destructively look at the next line.

        Returns the same values as readline() without consuming the line.
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending[-1]

    # Typical use from a generator:
    #
    # for r in self._input.readuntil(regexp):
    #     if r is NeedMoreData:
    #         yield NeedMoreData
    #     preamble, matchobj = r
    def readuntil(self, matchre, afterblank=False, includematch=False):
        """ Read a line at a time until we get the specified RE.

        This is a generator.  It yields NeedMoreData whenever the buffer
        is empty and end() has not been called; otherwise it yields a
        (text, matchobj) tuple.

        Returns the text up to (and including, if includematch is true) the
        matched text, and the RE match object.  If afterblank is true,
        there must be a blank line before the matched text.  Moves current
        filepointer to the line following the matched line.  If we reach
        end-of-file, return what we've got so far, and return None as the
        RE match object.
        """
        prematch = []
        # True when the line consumed just before the current one was blank.
        blankseen = False
        while True:
            if not self._pending:
                if self._done:
                    # end of file
                    yield EMPTYSTRING.join(prematch), None
                else:
                    yield NeedMoreData
                continue
            line = self._pending.pop()
            m = matchre.match(line)
            if m and (not afterblank or blankseen):
                if includematch:
                    prematch.append(line)
                yield EMPTYSTRING.join(prematch), m
            if afterblank:
                # Record blankness only *after* testing the match, so that
                # the test above sees the state of the preceding line.
                blankseen = NLCRE.match(line) is not None
            prematch.append(line)

    NLatend = re.compile('(\r\n|\r|\n)$').match
    NLCRE_crack = re.compile('(\r\n|\r|\n)')

    def push(self, data):
        """ Push some new data into this object """
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but leave the newlines on the end of each
        lines = self.NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied
        # groups means that the last element is the data after the
        # final RE.  In the case of a NL/CR terminated string, this is
        # the empty string.
        self._partial = lines.pop()
        # What remains alternates text, newline, text, newline, ...; glue
        # each pair back together.  Stepping by two avoids relying on the
        # result type of the division operator.
        whole = [lines[i] + lines[i + 1] for i in range(0, len(lines), 2)]
        self.pushlines(whole)

    def pushlines(self, lines):
        """ Push a list of new lines into the object """
        # Reverse and insert at the front of _pending
        self._pending[:0] = lines[::-1]

    def end(self):
        """ There is no more data """
        self._done = True

    def is_done(self):
        """ Return true once end() has been called """
        return self._done

    def __iter__(self):
        return self

    def next(self):
        """ Iterator protocol: return the next line.

        May return NeedMoreData; stops only when readline() returns ''.
        """
        l = self.readline()
        if l == '':
            raise StopIteration
        return l

    # Newer interpreters look for __next__ rather than next.
    __next__ = next


class FeedParser:
    """A feed-style parser of email.

    Text is supplied incrementally with feed(); end() signals that no more
    text is coming and returns the root message object.
    """

    def __init__(self, _class=Message.Message):
        """Create a parser.

        _class is called with no arguments to create the root message
        object and every sub-object.
        """
        self._class = _class
        self._input = FeedableLumpOfText()
        self._root = None
        # Stack of message objects currently being built; the last entry
        # is the innermost one.
        self._objectstack = []
        parsegen = self._parsegen()
        # Generators spell their advance method next() on older
        # interpreters and __next__() on newer ones; bind whichever exists.
        self._parse = getattr(parsegen, 'next', None) or parsegen.__next__

    def end(self):
        """Mark the input as complete, finish parsing, return the root."""
        self._input.end()
        self._call_parse()
        return self._root

    def feed(self, data):
        """Push more text into the parser and parse as far as possible."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Advance the parsing generator to its next yield.  StopIteration
        # just means the generator has already finished.
        try:
            self._parse()
        except StopIteration:
            pass

    # A header line, a Unix-From line, or a continuation line.
    headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

    def _parse_headers(self, headerlist):
        """Store the given header lines on the current object.

        headerlist is a list of strings that are the headers for the
        current object.  Raises Errors.HeaderParseError when the first
        line is a continuation, or when a line is neither a header nor a
        continuation.
        """
        lastheader = ''
        lastvalue = []

        for lineno, line in enumerate(headerlist):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError('First line must not be a continuation')
                lastvalue.append(line)
                continue

            if lastheader:
                # XXX reconsider the joining of folded lines
                self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
                lastheader, lastvalue = '', []

            # Check for Unix-From
            if line.startswith('From '):
                if lineno == 0:
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(headerlist) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Ignore it.
                    continue

            i = line.find(':')
            if i < 0:
                # The older parser had various special-cases here.  We've
                # already handled them
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''" % line)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]

        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()

    def _parsegen(self):
        """Generator that drives the parse.

        It yields whenever it has to wait for more input (the yielded
        value is ignored by _call_parse) and finishes once the input is
        exhausted.
        """
        # Parse any currently available text
        self._new_sub_object()
        self._root = self._cur
        # True while we are collecting the text that follows a finished
        # sub-object rather than the headers of a new one.
        completing = False
        # The object most recently popped off the stack.
        last = None

        for line in self._input:
            if line is NeedMoreData:
                yield None  # Need More Data
                continue
            self._input.unreadline(line)
            if not completing:
                headers = []
                # Now collect all headers.
                for line in self._input:
                    if line is NeedMoreData:
                        yield None  # Need More Data
                        continue
                    if not self.headerRE.match(line):
                        self._parse_headers(headers)
                        # A message/rfc822 has no body and no internal
                        # boundary.
                        if self._cur.get_content_maintype() == "message":
                            self._new_sub_object()
                            completing = False
                            headers = []
                            continue
                        if line.strip():
                            # No blank line between headers and body.
                            # Push this line back, it's the first line of
                            # the body.
                            self._input.unreadline(line)
                        break
                    else:
                        headers.append(line)
                else:
                    # We're done with the data and are still inside the headers
                    self._parse_headers(headers)

            # Now we're dealing with the body
            boundary = self._cur.get_boundary()
            isdigest = (self._cur.get_content_type() == 'multipart/digest')
            if boundary and not self._cur._finishing:
                separator = '--' + boundary
                self._cur._boundaryRE = re.compile(
                    r'(?P<sep>' + re.escape(separator) +
                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
                for r in self._input.readuntil(self._cur._boundaryRE):
                    if r is NeedMoreData:
                        yield NeedMoreData
                    else:
                        preamble, matchobj = r
                        break
                if not matchobj:
                    # Broken - we hit the end of file.  Just set the body
                    # to the text.
                    if completing:
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                    # XXX move back to the parent container.
                    self._pop_container()
                    completing = True
                    continue
                if preamble:
                    if completing:
                        # Drop the trailing line separator from the text.
                        preamble = preamble[:-len(matchobj.group('linesep'))]
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                elif not completing:
                    # The module docs specify an empty preamble is None, not ''
                    self._cur.preamble = None
                    # If we _are_ completing, the last object gets no payload

                if matchobj.group('end'):
                    # That was the end boundary tag.  Bounce back to the
                    # parent container
                    last = self._pop_container()
                    self._input.unreadline(matchobj.group('linesep'))
                    completing = True
                    continue

                # A number of MTAs produced by a nameless large company
                # we shall call "SicroMoft" produce repeated boundary
                # lines.
                while True:
                    line = self._input.peekline()
                    if line is NeedMoreData:
                        yield None
                        continue
                    if self._cur._boundaryRE.match(line):
                        self._input.readline()
                    else:
                        break

                self._new_sub_object()

                completing = False
                if isdigest:
                    self._cur.set_default_type('message/rfc822')
                    continue
            else:
                # non-multipart or after end-boundary
                if last is not self._root:
                    last = self._pop_container()
                if self._cur.get_content_maintype() == "message":
                    # We double-pop to leave the RFC822 object
                    self._pop_container()
                    completing = True
                elif self._cur._boundaryRE and last != self._root:
                    completing = True
                else:
                    # Non-multipart top level, or in the trailer of the
                    # top level multipart
                    while not self._input.is_done():
                        yield None
                    data = list(self._input)
                    body = EMPTYSTRING.join(data)
                    self._attach_trailer(last, body)

    def _attach_trailer(self, obj, trailer):
        # Text following an object: the epilogue of a multipart/message
        # container, otherwise the payload.
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)

    def _attach_preamble(self, obj, trailer):
        # Text preceding the first boundary: the preamble of a
        # multipart/message container, otherwise the payload.  (The
        # parameter is named 'trailer' although it carries preamble text.)
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.preamble = trailer
        else:
            obj.set_payload(trailer)

    def _new_sub_object(self):
        # Create a new message object, attach it to the object on top of
        # the stack (if there is one), and make it the current object.
        new = self._class()
        if self._objectstack:
            self._objectstack[-1].attach(new)
        self._objectstack.append(new)
        # Parser bookkeeping stored on the message object itself.
        new._boundaryRE = None
        new._finishing = False
        self._cur = new

    def _pop_container(self):
        # Move the pointer to the container of the current object.
        # Returns the (old) current object
        last = self._objectstack.pop()
        if self._objectstack:
            self._cur = self._objectstack[-1]
        else:
            # The stack is empty, so self._cur is left unchanged; flag it
            # so its boundary is not searched for again.
            self._cur._finishing = True
        return last