summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/rfc822.py228
1 files changed, 123 insertions, 105 deletions
diff --git a/Lib/rfc822.py b/Lib/rfc822.py
index 3a02224..71a445b 100644
--- a/Lib/rfc822.py
+++ b/Lib/rfc822.py
@@ -1,52 +1,66 @@
-"""RFC-822 message manipulation class.
+"""RFC 2822 message manipulation.
-XXX This is only a very rough sketch of a full RFC-822 parser;
-in particular the tokenizing of addresses does not adhere to all the
-quoting rules.
+Note: This is only a very rough sketch of a full RFC-822 parser; in particular
+the tokenizing of addresses does not adhere to all the quoting rules.
+
+Note: RFC 2822 is a long-awaited update to RFC 822. This module should
+conform to RFC 2822, and is thus mis-named (it's not worth renaming it). Some
+effort at RFC 2822 updates has been made, but a thorough audit has not been
+performed. Consider any RFC 2822 non-conformance to be a bug.
+
+ RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
+ RFC 822: http://www.faqs.org/rfcs/rfc822.html (obsolete)
Directions for use:
To create a Message object: first open a file, e.g.:
+
fp = open(file, 'r')
+
You can use any other legal way of getting an open file object, e.g. use
-sys.stdin or call os.popen().
-Then pass the open file object to the Message() constructor:
+sys.stdin or call os.popen(). Then pass the open file object to the Message()
+constructor:
+
m = Message(fp)
-This class can work with any input object that supports a readline
-method. If the input object has seek and tell capability, the
-rewindbody method will work; also illegal lines will be pushed back
-onto the input stream. If the input object lacks seek but has an
-`unread' method that can push back a line of input, Message will use
-that to push back illegal lines. Thus this class can be used to parse
-messages coming from a buffered stream.
-
-The optional `seekable' argument is provided as a workaround for
-certain stdio libraries in which tell() discards buffered data before
-discovering that the lseek() system call doesn't work. For maximum
-portability, you should set the seekable argument to zero to prevent
-that initial \code{tell} when passing in an unseekable object such as
-a a file object created from a socket object. If it is 1 on entry --
-which it is by default -- the tell() method of the open file object is
-called once; if this raises an exception, seekable is reset to 0. For
-other nonzero values of seekable, this test is not made.
+This class can work with any input object that supports a readline method. If
+the input object has seek and tell capability, the rewindbody method will
+work; also illegal lines will be pushed back onto the input stream. If the
+input object lacks seek but has an `unread' method that can push back a line
+of input, Message will use that to push back illegal lines. Thus this class
+can be used to parse messages coming from a buffered stream.
+
+The optional `seekable' argument is provided as a workaround for certain stdio
+libraries in which tell() discards buffered data before discovering that the
+lseek() system call doesn't work. For maximum portability, you should set the
+seekable argument to zero to prevent that initial tell() call when passing in
+an unseekable object such as a file object created from a socket object. If
+it is 1 on entry -- which it is by default -- the tell() method of the open
+file object is called once; if this raises an exception, seekable is reset to
+0. For other nonzero values of seekable, this test is not made.
To get the text of a particular header there are several methods:
+
str = m.getheader(name)
str = m.getrawheader(name)
-where name is the name of the header, e.g. 'Subject'.
-The difference is that getheader() strips the leading and trailing
-whitespace, while getrawheader() doesn't. Both functions retain
-embedded whitespace (including newlines) exactly as they are
-specified in the header, and leave the case of the text unchanged.
+
+where name is the name of the header, e.g. 'Subject'. The difference is that
+getheader() strips the leading and trailing whitespace, while getrawheader()
+doesn't. Both functions retain embedded whitespace (including newlines)
+exactly as they are specified in the header, and leave the case of the text
+unchanged.
For addresses and address lists there are functions
- realname, mailaddress = m.getaddr(name) and
+
+ realname, mailaddress = m.getaddr(name)
list = m.getaddrlist(name)
+
where the latter returns a list of (realname, mailaddr) tuples.
There is also a method
+
time = m.getdate(name)
+
which parses a Date-like field and returns a time-compatible tuple,
i.e. a tuple such as returned by time.localtime() or accepted by
time.mktime().
@@ -65,7 +79,7 @@ _blanklines = ('\r\n', '\n') # Optimization for islast()
class Message:
- """Represents a single RFC-822-compliant message."""
+ """Represents a single RFC 2822-compliant message."""
def __init__(self, fp, seekable = 1):
"""Initialize the class instance and read the headers."""
@@ -106,18 +120,17 @@ class Message:
def readheaders(self):
"""Read header lines.
- Read header lines up to the entirely blank line that
- terminates them. The (normally blank) line that ends the
- headers is skipped, but not included in the returned list.
- If a non-header line ends the headers, (which is an error),
- an attempt is made to backspace over it; it is never
- included in the returned list.
-
- The variable self.status is set to the empty string if all
- went well, otherwise it is an error message.
- The variable self.headers is a completely uninterpreted list
- of lines contained in the header (so printing them will
- reproduce the header exactly as it appears in the file).
+ Read header lines up to the entirely blank line that terminates them.
+ The (normally blank) line that ends the headers is skipped, but not
+ included in the returned list. If a non-header line ends the headers,
+ (which is an error), an attempt is made to backspace over it; it is
+ never included in the returned list.
+
+ The variable self.status is set to the empty string if all went well,
+ otherwise it is an error message. The variable self.headers is a
+ completely uninterpreted list of lines contained in the header (so
+ printing them will reproduce the header exactly as it appears in the
+ file).
"""
self.dict = {}
self.unixfrom = ''
@@ -183,8 +196,8 @@ class Message:
"""Determine whether a given line is a legal header.
This method should return the header name, suitably canonicalized.
- You may override this method in order to use Message parsing
- on tagged data in RFC822-like formats with special header formats.
+ You may override this method in order to use Message parsing on tagged
+ data in RFC 2822-like formats with special header formats.
"""
i = line.find(':')
if i > 0:
@@ -193,35 +206,32 @@ class Message:
return None
def islast(self, line):
- """Determine whether a line is a legal end of RFC-822 headers.
+ """Determine whether a line is a legal end of RFC 2822 headers.
- You may override this method if your application wants
- to bend the rules, e.g. to strip trailing whitespace,
- or to recognize MH template separators ('--------').
- For convenience (e.g. for code reading from sockets) a
- line consisting of \r\n also matches.
+ You may override this method if your application wants to bend the
+ rules, e.g. to strip trailing whitespace, or to recognize MH template
+ separators ('--------'). For convenience (e.g. for code reading from
+ sockets) a line consisting of \r\n also matches.
"""
return line in _blanklines
def iscomment(self, line):
"""Determine whether a line should be skipped entirely.
- You may override this method in order to use Message parsing
- on tagged data in RFC822-like formats that support embedded
- comments or free-text data.
+ You may override this method in order to use Message parsing on tagged
+ data in RFC 2822-like formats that support embedded comments or
+ free-text data.
"""
return None
def getallmatchingheaders(self, name):
"""Find all header lines matching a given header name.
- Look through the list of headers and find all lines
- matching a given header name (and their continuation
- lines). A list of the lines is returned, without
- interpretation. If the header does not occur, an
- empty list is returned. If the header occurs multiple
- times, all occurrences are returned. Case is not
- important in the header name.
+ Look through the list of headers and find all lines matching a given
+ header name (and their continuation lines). A list of the lines is
+ returned, without interpretation. If the header does not occur, an
+ empty list is returned. If the header occurs multiple times, all
+ occurrences are returned. Case is not important in the header name.
"""
name = name.lower() + ':'
n = len(name)
@@ -239,9 +249,8 @@ class Message:
def getfirstmatchingheader(self, name):
"""Get the first header line matching name.
- This is similar to getallmatchingheaders, but it returns
- only the first matching header (and its continuation
- lines).
+ This is similar to getallmatchingheaders, but it returns only the
+ first matching header (and its continuation lines).
"""
name = name.lower() + ':'
n = len(name)
@@ -260,11 +269,10 @@ class Message:
def getrawheader(self, name):
"""A higher-level interface to getfirstmatchingheader().
- Return a string containing the literal text of the
- header but with the keyword stripped. All leading,
- trailing and embedded whitespace is kept in the
- string, however.
- Return None if the header does not occur.
+ Return a string containing the literal text of the header but with the
+ keyword stripped. All leading, trailing and embedded whitespace is
+ kept in the string, however. Return None if the header does not
+ occur.
"""
list = self.getfirstmatchingheader(name)
@@ -276,10 +284,9 @@ class Message:
def getheader(self, name, default=None):
"""Get the header value for a name.
- This is the normal interface: it returns a stripped
- version of the header value for a given header name,
- or None if it doesn't exist. This uses the dictionary
- version which finds the *last* such header.
+ This is the normal interface: it returns a stripped version of the
+ header value for a given header name, or None if it doesn't exist.
+ This uses the dictionary version which finds the *last* such header.
"""
try:
return self.dict[name.lower()]
@@ -290,10 +297,9 @@ class Message:
def getheaders(self, name):
"""Get all values for a header.
- This returns a list of values for headers given more than once;
- each value in the result list is stripped in the same way as the
- result of getheader(). If the header is not given, return an
- empty list.
+ This returns a list of values for headers given more than once; each
+ value in the result list is stripped in the same way as the result of
+ getheader(). If the header is not given, return an empty list.
"""
result = []
current = ''
@@ -332,7 +338,6 @@ class Message:
Retrieves a list of addresses from a header, where each address is a
tuple as returned by getaddr(). Scans all named headers, so it works
properly with multiple To: or Cc: headers for example.
-
"""
raw = []
for h in self.getallmatchingheaders(name):
@@ -352,8 +357,8 @@ class Message:
def getdate(self, name):
"""Retrieve a date field from a header.
- Retrieves a date field from the named header, returning
- a tuple compatible with time.mktime().
+ Retrieves a date field from the named header, returning a tuple
+ compatible with time.mktime().
"""
try:
data = self[name]
@@ -364,9 +369,8 @@ class Message:
def getdate_tz(self, name):
"""Retrieve a date field from a header as a 10-tuple.
- The first 9 elements make up a tuple compatible with
- time.mktime(), and the 10th is the offset of the poster's
- time zone from GMT/UTC.
+ The first 9 elements make up a tuple compatible with time.mktime(),
+ and the 10th is the offset of the poster's time zone from GMT/UTC.
"""
try:
data = self[name]
@@ -388,9 +392,9 @@ class Message:
def __setitem__(self, name, value):
"""Set the value of a header.
- Note: This is not a perfect inversion of __getitem__, because
- any changed headers get stuck at the end of the raw-headers list
- rather than where the altered header was.
+ Note: This is not a perfect inversion of __getitem__, because any
+ changed headers get stuck at the end of the raw-headers list rather
+ than where the altered header was.
"""
del self[name] # Won't fail if it doesn't exist
self.dict[name.lower()] = value
@@ -502,7 +506,9 @@ class AddrlistClass:
"""Address parser class by Ben Escoto.
To understand what this class does, it helps to have a copy of
- RFC-822 in front of you.
+ RFC 2822 in front of you.
+
+ http://www.faqs.org/rfcs/rfc2822.html
Note: this class interface is deprecated and may be removed in the future.
Use rfc822.AddressList instead.
@@ -511,14 +517,18 @@ class AddrlistClass:
def __init__(self, field):
"""Initialize a new instance.
- `field' is an unparsed address header field, containing
- one or more addresses.
+ `field' is an unparsed address header field, containing one or more
+ addresses.
"""
self.specials = '()<>@,:;.\"[]'
self.pos = 0
self.LWS = ' \t'
self.CR = '\r\n'
self.atomends = self.specials + self.LWS + self.CR
+ # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
+ # is obsolete syntax. RFC 2822 requires that we recognize obsolete
+ # syntax, so allow dots in phrases.
+ self.phraseends = self.atomends.replace('.', '')
self.field = field
self.commentlist = []
@@ -633,7 +643,7 @@ class AddrlistClass:
return adlist
def getaddrspec(self):
- """Parse an RFC-822 addr-spec."""
+ """Parse an RFC 2822 addr-spec."""
aslist = []
self.gotonext()
@@ -677,15 +687,15 @@ class AddrlistClass:
def getdelimited(self, beginchar, endchars, allowcomments = 1):
"""Parse a header fragment delimited by special characters.
- `beginchar' is the start character for the fragment.
- If self is not looking at an instance of `beginchar' then
- getdelimited returns the empty string.
+ `beginchar' is the start character for the fragment. If self is not
+ looking at an instance of `beginchar' then getdelimited returns the
+ empty string.
`endchars' is a sequence of allowable end-delimiting characters.
Parsing stops when one of these is encountered.
- If `allowcomments' is non-zero, embedded RFC-822 comments
- are allowed within the parsed fragment.
+ If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
+ within the parsed fragment.
"""
if self.field[self.pos] != beginchar:
return ''
@@ -719,15 +729,22 @@ class AddrlistClass:
return self.getdelimited('(', ')\r', 1)
def getdomainliteral(self):
- """Parse an RFC-822 domain-literal."""
+ """Parse an RFC 2822 domain-literal."""
return '[%s]' % self.getdelimited('[', ']\r', 0)
- def getatom(self):
- """Parse an RFC-822 atom."""
+ def getatom(self, atomends=None):
+ """Parse an RFC 2822 atom.
+
+ Optional atomends specifies a different set of end token delimiters
+ (the default is to use self.atomends). This is used e.g. in
+ getphraselist() since phrase endings must not include the `.' (which
+ is legal in phrases)."""
atomlist = ['']
+ if atomends is None:
+ atomends = self.atomends
while self.pos < len(self.field):
- if self.field[self.pos] in self.atomends:
+ if self.field[self.pos] in atomends:
break
else: atomlist.append(self.field[self.pos])
self.pos = self.pos + 1
@@ -735,11 +752,11 @@ class AddrlistClass:
return ''.join(atomlist)
def getphraselist(self):
- """Parse a sequence of RFC-822 phrases.
+ """Parse a sequence of RFC 2822 phrases.
- A phrase is a sequence of words, which are in turn either
- RFC-822 atoms or quoted-strings. Phrases are canonicalized
- by squeezing all runs of continuous whitespace into one space.
+ A phrase is a sequence of words, which are in turn either RFC 2822
+ atoms or quoted-strings. Phrases are canonicalized by squeezing all
+ runs of continuous whitespace into one space.
"""
plist = []
@@ -750,14 +767,15 @@ class AddrlistClass:
plist.append(self.getquote())
elif self.field[self.pos] == '(':
self.commentlist.append(self.getcomment())
- elif self.field[self.pos] in self.atomends:
+ elif self.field[self.pos] in self.phraseends:
break
- else: plist.append(self.getatom())
+ else:
+ plist.append(self.getatom(self.phraseends))
return plist
class AddressList(AddrlistClass):
- """An AddressList encapsulates a list of parsed RFC822 addresses."""
+ """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
def __init__(self, field):
AddrlistClass.__init__(self, field)
if field: