summary | refs | log | tree | commit | diff | stats
path: root/Lib/email/FeedParser.py
diff options
context:
space:
mode:
author: Anthony Baxter <anthonybaxter@gmail.com>, 2004-03-22 00:33:28 (GMT)
committer: Anthony Baxter <anthonybaxter@gmail.com>, 2004-03-22 00:33:28 (GMT)
commit: 39a0f044210b82f3352b9824c1f1625c7bdb9f29 (patch)
tree: 68ed866e0a75c9c8f8a1aba1fa86849a6ba25f05 /Lib/email/FeedParser.py
parent: e62c5c88f179e5f6b445e40603ef7b7b2e706be9 (diff)
download: cpython-39a0f044210b82f3352b9824c1f1625c7bdb9f29.zip
cpython-39a0f044210b82f3352b9824c1f1625c7bdb9f29.tar.gz
cpython-39a0f044210b82f3352b9824c1f1625c7bdb9f29.tar.bz2
New parser. Next up, making the current parser use this parser
Diffstat (limited to 'Lib/email/FeedParser.py')
-rw-r--r-- Lib/email/FeedParser.py 362
1 files changed, 362 insertions, 0 deletions
diff --git a/Lib/email/FeedParser.py b/Lib/email/FeedParser.py
new file mode 100644
index 0000000..a82d305
--- /dev/null
+++ b/Lib/email/FeedParser.py
@@ -0,0 +1,362 @@
+# A new Feed-style Parser
+
+from email import Errors, Message
+import re
+
+# Matches a single line ending (CRLF, bare CR or bare LF). Applied with
+# .match(), i.e. anchored at the start of a line, it recognises blank lines.
+NLCRE = re.compile('\r\n|\r|\n')
+
+# Joiner for lines that already carry their own line endings.
+EMPTYSTRING = ''
+NL = '\n'
+
+# Unique sentinel handed out by the input buffer when no complete line is
+# queued yet but end-of-input has not been signalled; compared with `is`.
+NeedMoreData = object()
+
+class FeedableLumpOfText:
+    """A file-like object that can have new data loaded into it.
+
+    Text arrives in arbitrary chunks through push().  Complete lines (each
+    still carrying its line ending) are queued and handed out again by
+    readline(), peekline(), readuntil() or by iterating over the object.
+
+    While the queue is empty and end() has not been called, the readers
+    hand out the NeedMoreData sentinel instead of a line.  After end()
+    they return '' to signal end-of-file.
+    """
+
+    # Matches a line ending at the very end of a string.  Not used by the
+    # methods of this class; kept because it is part of the class namespace.
+    NLatend = re.compile('(\r\n|\r|\n)$').match
+    # Splits text on line endings.  The capturing group makes re.split()
+    # return the separators as well, so they can be glued back onto the
+    # line they terminate.
+    NLCRE_crack = re.compile('(\r\n|\r|\n)')
+
+    def __init__(self):
+        # Text received after the last complete line (an unfinished line).
+        self._partial = ''
+        # Set by end(): no further data will arrive.
+        self._done = False
+        # _pending is a list of lines, in reverse order
+        self._pending = []
+
+    def readline(self):
+        """ Return a line of data.
+
+        If data has been pushed back with unreadline(), the most recently
+        unreadline()d data will be returned.  Returns NeedMoreData when no
+        line is queued and more input may still arrive, or '' when no line
+        is queued and end() has been called.
+        """
+        if not self._pending:
+            if self._done:
+                return ''
+            return NeedMoreData
+        return self._pending.pop()
+
+    def unreadline(self, line):
+        """ Push a line back into the object; it is the next line read. """
+        self._pending.append(line)
+
+    def peekline(self):
+        """ Non-destructively look at the next line.
+
+        Returns the same values as readline() would, without consuming
+        anything.
+        """
+        if not self._pending:
+            if self._done:
+                return ''
+            return NeedMoreData
+        return self._pending[-1]
+
+    # Typical use:
+    #     for r in self._input.readuntil(regexp):
+    #         if r is NeedMoreData:
+    #             yield NeedMoreData
+    #         else:
+    #             preamble, matchobj = r
+    #             break
+    def readuntil(self, matchre, afterblank=False, includematch=False):
+        """ Read a line at a time until we get the specified RE.
+
+        This is a generator.  It yields NeedMoreData every time the queue
+        runs dry before end() has been called, so the caller can suspend
+        itself as well.  When a line matches, it yields a tuple of the text
+        up to (and including, if includematch is true) the matched line,
+        and the RE match object.  If afterblank is true, a match only
+        counts when the line immediately before it was blank.  The matched
+        line is consumed, so the next read starts on the line following it.
+        If we reach end-of-file, the yielded tuple holds what we've got so
+        far and None as the RE match object.
+
+        Callers are expected to stop iterating once they receive a tuple.
+        """
+        prematch = []
+        blankseen = False
+        while True:
+            if not self._pending:
+                if self._done:
+                    # end of file
+                    yield EMPTYSTRING.join(prematch), None
+                else:
+                    yield NeedMoreData
+                continue
+            line = self._pending.pop()
+            m = matchre.match(line)
+            if m and (blankseen or not afterblank):
+                if includematch:
+                    yield EMPTYSTRING.join(prematch + [line]), m
+                else:
+                    yield EMPTYSTRING.join(prematch), m
+            # Every consumed line, blank ones included, belongs to the text
+            # returned for a later match or for end-of-file.
+            prematch.append(line)
+            # Remember whether this line was blank for the afterblank test
+            # on the next line.
+            blankseen = NLCRE.match(line) is not None
+
+    def push(self, data):
+        """ Push some new data into this object """
+        # Handle any previous leftovers
+        data, self._partial = self._partial + data, ''
+        # Crack into lines, but leave the newlines on the end of each
+        parts = self.NLCRE_crack.split(data)
+        # The *ahem* interesting behaviour of re.split when supplied
+        # groups means that the last element is the data after the
+        # final RE.  In the case of a NL/CR terminated string, this is
+        # the empty string.
+        self._partial = parts.pop()
+        # What is left alternates text, separator, text, separator, ...
+        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
+        # A chunk ending in a bare '\r' may be the first half of a '\r\n'
+        # pair whose '\n' arrives with the next chunk.  Hold that line back
+        # so the pair is not split into a line plus a spurious blank line.
+        if lines and not self._partial and lines[-1].endswith('\r'):
+            self._partial = lines.pop()
+        self.pushlines(lines)
+
+    def pushlines(self, lines):
+        """ Push a list of new lines into the object """
+        # Reverse and insert at the front of _pending
+        self._pending[:0] = lines[::-1]
+
+    def end(self):
+        """ There is no more data """
+        # Whatever is still held back (a final line without a line ending,
+        # or one ending in a bare '\r') is a complete line now; queue it so
+        # it is not silently lost.
+        if self._partial:
+            self.pushlines([self._partial])
+            self._partial = ''
+        self._done = True
+
+    def is_done(self):
+        """ Return true once end() has been called. """
+        return self._done
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """ Return the next line, or NeedMoreData; stop at end-of-file. """
+        line = self.readline()
+        if line == '':
+            raise StopIteration
+        return line
+
+    # Same method under the name newer interpreters use for iteration.
+    __next__ = next
+
+class FeedParser:
+ "A feed-style parser of email. copy docstring here"
+ # NOTE(review): the docstring above is still a placeholder.
+ #
+ # Incremental parser: feed() hands text to a FeedableLumpOfText buffer and
+ # resumes the _parsegen() generator; end() marks end-of-input, resumes the
+ # generator once more and returns the root message object.
+ #
+ # NOTE(review): in this listing the indentation has been collapsed to a
+ # single space, so block nesting cannot be read off the columns. Check
+ # the original file before relying on the exact nesting of _parsegen().
+
+ def __init__(self, _class=Message.Message):
+ "fnord fnord fnord"
+ # NOTE(review): placeholder docstring. _class is called with no
+ # arguments (see _new_sub_object) to create every message object.
+ self._class = _class
+ self._input = FeedableLumpOfText()
+ # Top-level message object; set when _parsegen() first runs.
+ self._root = None
+ # Stack of message objects currently being built, innermost last.
+ self._objectstack = []
+ # Bound next() of the parser generator: each call resumes parsing
+ # until the generator yields again or finishes.
+ self._parse = self._parsegen().next
+
+ def end(self):
+ # Signal end-of-input, let the parser run once more, return the root.
+ self._input.end()
+ self._call_parse()
+ return self._root
+
+ def feed(self, data):
+ # Queue the new text and let the parser consume what it can.
+ self._input.push(data)
+ self._call_parse()
+
+ def _call_parse(self):
+ # Resume the parser generator once. The yielded value is discarded;
+ # StopIteration (generator finished) is deliberately ignored.
+ try:
+ self._parse()
+ except StopIteration:
+ pass
+
+ # Lines that may belong to a header block: a Unix "From " line, a field
+ # name of two or more word/hyphen characters followed by ':', or a line
+ # starting with a tab or space (continuation line).
+ headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')
+
+ def _parse_headers(self,headerlist):
+ # Passed a list of strings that are the headers for the
+ # current object
+ #
+ # Each header is stored on self._cur by item assignment. Raises
+ # Errors.HeaderParseError for a leading continuation line or for a
+ # line that has no ':' and is neither a continuation nor a "From "
+ # line.
+ lastheader = ''
+ lastvalue = []
+
+
+ for lineno, line in enumerate(headerlist):
+ # Check for continuation
+ if line[0] in ' \t':
+ if not lastheader:
+ raise Errors.HeaderParseError('First line must not be a continuation')
+ lastvalue.append(line)
+ continue
+
+ # A new header starts here, so store the one collected so far.
+ if lastheader:
+ # XXX reconsider the joining of folded lines
+ self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
+ lastheader, lastvalue = '', []
+
+ # Check for Unix-From
+ if line.startswith('From '):
+ if lineno == 0:
+ self._cur.set_unixfrom(line)
+ continue
+ elif lineno == len(headerlist) - 1:
+ # Something looking like a unix-from at the end - it's
+ # probably the first line of the body
+ self._input.unreadline(line)
+ return
+ else:
+ # Weirdly placed unix-from line. Ignore it.
+ continue
+
+ i = line.find(':')
+ if i < 0:
+ # The older parser had various special-cases here. We've
+ # already handled them
+ raise Errors.HeaderParseError(
+ "Not a header, not a continuation: ``%s''" % line)
+ # Field name is the text before the first ':'; the value starts
+ # after it with leading whitespace removed.
+ lastheader = line[:i]
+ lastvalue = [line[i+1:].lstrip()]
+
+ # Store the final header of the list.
+ if lastheader:
+ # XXX reconsider the joining of folded lines
+ self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
+
+
+ def _parsegen(self):
+ # Parse any currently available text
+ #
+ # Generator driving the whole parse. It yields whenever the input
+ # buffer reports NeedMoreData and is resumed by _call_parse().
+ self._new_sub_object()
+ self._root = self._cur
+ # completing: set to True after popping back to a container, False
+ # after a new sub-object is started; when True the header-collection
+ # step below is skipped.
+ completing = False
+ # last: the object most recently returned by _pop_container().
+ last = None
+
+ # Each pass reads one line only to learn whether input is available,
+ # then pushes it straight back for the real consumers below.
+ for line in self._input:
+ if line is NeedMoreData:
+ yield None # Need More Data
+ continue
+ self._input.unreadline(line)
+ if not completing:
+ headers = []
+ # Now collect all headers.
+ for line in self._input:
+ if line is NeedMoreData:
+ yield None # Need More Data
+ continue
+ if not self.headerRE.match(line):
+ self._parse_headers(headers)
+ # A message/rfc822 has no body and no internal
+ # boundary.
+ if self._cur.get_content_maintype() == "message":
+ self._new_sub_object()
+ completing = False
+ headers = []
+ continue
+ if line.strip():
+ # No blank line between headers and body.
+ # Push this line back, it's the first line of
+ # the body.
+ self._input.unreadline(line)
+ break
+ else:
+ headers.append(line)
+ else:
+ # We're done with the data and are still inside the headers
+ self._parse_headers(headers)
+
+ # Now we're dealing with the body
+ boundary = self._cur.get_boundary()
+ isdigest = (self._cur.get_content_type() == 'multipart/digest')
+ # _finishing is set by _pop_container() once the object stack has
+ # been emptied.
+ if boundary and not self._cur._finishing:
+ separator = '--' + boundary
+ # Boundary line: the separator, an optional closing '--' (group
+ # 'end'), optional trailing blanks, then the line ending (group
+ # 'linesep').
+ self._cur._boundaryRE = re.compile(
+ r'(?P<sep>' + re.escape(separator) +
+ r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
+ # readuntil() is a generator: pass its NeedMoreData on to our
+ # caller until it delivers a (text, match) tuple. The value
+ # yielded here differs from the None yielded elsewhere, but
+ # _call_parse() ignores yielded values either way.
+ for r in self._input.readuntil(self._cur._boundaryRE):
+ if r is NeedMoreData:
+ yield NeedMoreData
+ else:
+ preamble, matchobj = r
+ break
+ if not matchobj:
+ # Broken - we hit the end of file. Just set the body
+ # to the text.
+ if completing:
+ self._attach_trailer(last, preamble)
+ else:
+ self._attach_preamble(self._cur, preamble)
+ # XXX move back to the parent container.
+ self._pop_container()
+ completing = True
+ continue
+ if preamble:
+ if completing:
+ # Drop as many trailing characters as the boundary line's
+ # own line ending is long.
+ preamble = preamble[:-len(matchobj.group('linesep'))]
+ self._attach_trailer(last, preamble)
+ else:
+ self._attach_preamble(self._cur, preamble)
+ elif not completing:
+ # The module docs specify an empty preamble is None, not ''
+ self._cur.preamble = None
+ # If we _are_ completing, the last object gets no payload
+
+ if matchobj.group('end'):
+ # That was the end boundary tag. Bounce back to the
+ # parent container
+ last = self._pop_container()
+ # Push the boundary line's line ending back as a line of its
+ # own. NOTE(review): presumably so the text after the end
+ # boundary starts with it - confirm.
+ self._input.unreadline(matchobj.group('linesep'))
+ completing = True
+ continue
+
+ # A number of MTAs produced by a nameless large company
+ # we shall call "SicroMoft" produce repeated boundary
+ # lines.
+ # Consume further lines for as long as they match the boundary.
+ while True:
+ line = self._input.peekline()
+ if line is NeedMoreData:
+ yield None
+ continue
+ if self._cur._boundaryRE.match(line):
+ self._input.readline()
+ else:
+ break
+
+ self._new_sub_object()
+
+ completing = False
+ if isdigest:
+ self._cur.set_default_type('message/rfc822')
+ continue
+ else:
+ # non-multipart or after end-boundary
+ if last is not self._root:
+ last = self._pop_container()
+ if self._cur.get_content_maintype() == "message":
+ # We double-pop to leave the RFC822 object
+ self._pop_container()
+ completing = True
+ # `<>` is the Python 2 spelling of `!=`.
+ elif self._cur._boundaryRE and last <> self._root:
+ completing = True
+ else:
+ # Non-multipart top level, or in the trailer of the
+ # top level multipart
+ # Wait for end-of-input, then take every remaining line.
+ while not self._input.is_done():
+ yield None
+ data = list(self._input)
+ body = EMPTYSTRING.join(data)
+ self._attach_trailer(last, body)
+
+
+ def _attach_trailer(self, obj, trailer):
+ # Store trailing text on obj: as the epilogue for multipart/message
+ # objects, otherwise as the payload.
+ #import pdb ; pdb.set_trace()
+ if obj.get_content_maintype() in ( "multipart", "message" ):
+ obj.epilogue = trailer
+ else:
+ obj.set_payload(trailer)
+
+ def _attach_preamble(self, obj, trailer):
+ # Store leading text on obj: as the preamble for multipart/message
+ # objects, otherwise as the payload. The text is passed in the
+ # parameter named `trailer`.
+ if obj.get_content_maintype() in ( "multipart", "message" ):
+ obj.preamble = trailer
+ else:
+ obj.set_payload(trailer)
+
+
+ def _new_sub_object(self):
+ # Create a message object, attach it to the object on top of the
+ # stack (if there is one), push it and make it the current object.
+ new = self._class()
+ #print "pushing", self._objectstack, repr(new)
+ if self._objectstack:
+ self._objectstack[-1].attach(new)
+ self._objectstack.append(new)
+ # Parser-private bookkeeping stored on the message object itself.
+ new._boundaryRE = None
+ new._finishing = False
+ self._cur = new
+
+ def _pop_container(self):
+ # Move the pointer to the container of the current object.
+ # Returns the (old) current object
+ #import pdb ; pdb.set_trace()
+ #print "popping", self._objectstack
+ last = self._objectstack.pop()
+ if self._objectstack:
+ self._cur = self._objectstack[-1]
+ else:
+ # Stack is empty: _cur is left unchanged and flagged as finishing,
+ # which stops _parsegen() from looking for boundaries in it.
+ self._cur._finishing = True
+ return last
+
+