diff options
-rw-r--r-- | Lib/rfc822.py | 228 |
1 files changed, 123 insertions, 105 deletions
diff --git a/Lib/rfc822.py b/Lib/rfc822.py index 3a02224..71a445b 100644 --- a/Lib/rfc822.py +++ b/Lib/rfc822.py @@ -1,52 +1,66 @@ -"""RFC-822 message manipulation class. +"""RFC 2822 message manipulation. -XXX This is only a very rough sketch of a full RFC-822 parser; -in particular the tokenizing of addresses does not adhere to all the -quoting rules. +Note: This is only a very rough sketch of a full RFC-822 parser; in particular +the tokenizing of addresses does not adhere to all the quoting rules. + +Note: RFC 2822 is a long-awaited update to RFC 822. This module should +conform to RFC 2822, and is thus mis-named (it's not worth renaming it). Some +effort at RFC 2822 updates has been made, but a thorough audit has not been +performed. Consider any RFC 2822 non-conformance to be a bug. + + RFC 2822: http://www.faqs.org/rfcs/rfc2822.html + RFC 822: http://www.faqs.org/rfcs/rfc822.html (obsolete) Directions for use: To create a Message object: first open a file, e.g.: + fp = open(file, 'r') + You can use any other legal way of getting an open file object, e.g. use -sys.stdin or call os.popen(). -Then pass the open file object to the Message() constructor: +sys.stdin or call os.popen(). Then pass the open file object to the Message() +constructor: + m = Message(fp) -This class can work with any input object that supports a readline -method. If the input object has seek and tell capability, the -rewindbody method will work; also illegal lines will be pushed back -onto the input stream. If the input object lacks seek but has an -`unread' method that can push back a line of input, Message will use -that to push back illegal lines. Thus this class can be used to parse -messages coming from a buffered stream. - -The optional `seekable' argument is provided as a workaround for -certain stdio libraries in which tell() discards buffered data before -discovering that the lseek() system call doesn't work. 
For maximum -portability, you should set the seekable argument to zero to prevent -that initial \code{tell} when passing in an unseekable object such as -a a file object created from a socket object. If it is 1 on entry -- -which it is by default -- the tell() method of the open file object is -called once; if this raises an exception, seekable is reset to 0. For -other nonzero values of seekable, this test is not made. +This class can work with any input object that supports a readline method. If +the input object has seek and tell capability, the rewindbody method will +work; also illegal lines will be pushed back onto the input stream. If the +input object lacks seek but has an `unread' method that can push back a line +of input, Message will use that to push back illegal lines. Thus this class +can be used to parse messages coming from a buffered stream. + +The optional `seekable' argument is provided as a workaround for certain stdio +libraries in which tell() discards buffered data before discovering that the +lseek() system call doesn't work. For maximum portability, you should set the +seekable argument to zero to prevent that initial tell() call when passing in +an unseekable object such as a file object created from a socket object. If +it is 1 on entry -- which it is by default -- the tell() method of the open +file object is called once; if this raises an exception, seekable is reset to +0. For other nonzero values of seekable, this test is not made. To get the text of a particular header there are several methods: + str = m.getheader(name) str = m.getrawheader(name) -where name is the name of the header, e.g. 'Subject'. 
The difference is that +getheader() strips the leading and trailing whitespace, while getrawheader() +doesn't. Both functions retain embedded whitespace (including newlines) +exactly as they are specified in the header, and leave the case of the text +unchanged. For addresses and address lists there are functions - realname, mailaddress = m.getaddr(name) and + + realname, mailaddress = m.getaddr(name) list = m.getaddrlist(name) + where the latter returns a list of (realname, mailaddr) tuples. There is also a method + time = m.getdate(name) + which parses a Date-like field and returns a time-compatible tuple, i.e. a tuple such as returned by time.localtime() or accepted by time.mktime(). @@ -65,7 +79,7 @@ _blanklines = ('\r\n', '\n') # Optimization for islast() class Message: - """Represents a single RFC-822-compliant message.""" + """Represents a single RFC 2822-compliant message.""" def __init__(self, fp, seekable = 1): """Initialize the class instance and read the headers.""" @@ -106,18 +120,17 @@ class Message: def readheaders(self): """Read header lines. - Read header lines up to the entirely blank line that - terminates them. The (normally blank) line that ends the - headers is skipped, but not included in the returned list. - If a non-header line ends the headers, (which is an error), - an attempt is made to backspace over it; it is never - included in the returned list. - - The variable self.status is set to the empty string if all - went well, otherwise it is an error message. - The variable self.headers is a completely uninterpreted list - of lines contained in the header (so printing them will - reproduce the header exactly as it appears in the file). + Read header lines up to the entirely blank line that terminates them. + The (normally blank) line that ends the headers is skipped, but not + included in the returned list. 
If a non-header line ends the headers, + (which is an error), an attempt is made to backspace over it; it is + never included in the returned list. + + The variable self.status is set to the empty string if all went well, + otherwise it is an error message. The variable self.headers is a + completely uninterpreted list of lines contained in the header (so + printing them will reproduce the header exactly as it appears in the + file). """ self.dict = {} self.unixfrom = '' @@ -183,8 +196,8 @@ class Message: """Determine whether a given line is a legal header. This method should return the header name, suitably canonicalized. - You may override this method in order to use Message parsing - on tagged data in RFC822-like formats with special header formats. + You may override this method in order to use Message parsing on tagged + data in RFC 2822-like formats with special header formats. """ i = line.find(':') if i > 0: @@ -193,35 +206,32 @@ class Message: return None def islast(self, line): - """Determine whether a line is a legal end of RFC-822 headers. + """Determine whether a line is a legal end of RFC 2822 headers. - You may override this method if your application wants - to bend the rules, e.g. to strip trailing whitespace, - or to recognize MH template separators ('--------'). - For convenience (e.g. for code reading from sockets) a - line consisting of \r\n also matches. + You may override this method if your application wants to bend the + rules, e.g. to strip trailing whitespace, or to recognize MH template + separators ('--------'). For convenience (e.g. for code reading from + sockets) a line consisting of \r\n also matches. """ return line in _blanklines def iscomment(self, line): """Determine whether a line should be skipped entirely. - You may override this method in order to use Message parsing - on tagged data in RFC822-like formats that support embedded - comments or free-text data. 
+ You may override this method in order to use Message parsing on tagged + data in RFC 2822-like formats that support embedded comments or + free-text data. """ return None def getallmatchingheaders(self, name): """Find all header lines matching a given header name. - Look through the list of headers and find all lines - matching a given header name (and their continuation - lines). A list of the lines is returned, without - interpretation. If the header does not occur, an - empty list is returned. If the header occurs multiple - times, all occurrences are returned. Case is not - important in the header name. + Look through the list of headers and find all lines matching a given + header name (and their continuation lines). A list of the lines is + returned, without interpretation. If the header does not occur, an + empty list is returned. If the header occurs multiple times, all + occurrences are returned. Case is not important in the header name. """ name = name.lower() + ':' n = len(name) @@ -239,9 +249,8 @@ class Message: def getfirstmatchingheader(self, name): """Get the first header line matching name. - This is similar to getallmatchingheaders, but it returns - only the first matching header (and its continuation - lines). + This is similar to getallmatchingheaders, but it returns only the + first matching header (and its continuation lines). """ name = name.lower() + ':' n = len(name) @@ -260,11 +269,10 @@ class Message: def getrawheader(self, name): """A higher-level interface to getfirstmatchingheader(). - Return a string containing the literal text of the - header but with the keyword stripped. All leading, - trailing and embedded whitespace is kept in the - string, however. - Return None if the header does not occur. + Return a string containing the literal text of the header but with the + keyword stripped. All leading, trailing and embedded whitespace is + kept in the string, however. Return None if the header does not + occur. 
""" list = self.getfirstmatchingheader(name) @@ -276,10 +284,9 @@ class Message: def getheader(self, name, default=None): """Get the header value for a name. - This is the normal interface: it returns a stripped - version of the header value for a given header name, - or None if it doesn't exist. This uses the dictionary - version which finds the *last* such header. + This is the normal interface: it returns a stripped version of the + header value for a given header name, or None if it doesn't exist. + This uses the dictionary version which finds the *last* such header. """ try: return self.dict[name.lower()] @@ -290,10 +297,9 @@ class Message: def getheaders(self, name): """Get all values for a header. - This returns a list of values for headers given more than once; - each value in the result list is stripped in the same way as the - result of getheader(). If the header is not given, return an - empty list. + This returns a list of values for headers given more than once; each + value in the result list is stripped in the same way as the result of + getheader(). If the header is not given, return an empty list. """ result = [] current = '' @@ -332,7 +338,6 @@ class Message: Retrieves a list of addresses from a header, where each address is a tuple as returned by getaddr(). Scans all named headers, so it works properly with multiple To: or Cc: headers for example. - """ raw = [] for h in self.getallmatchingheaders(name): @@ -352,8 +357,8 @@ class Message: def getdate(self, name): """Retrieve a date field from a header. - Retrieves a date field from the named header, returning - a tuple compatible with time.mktime(). + Retrieves a date field from the named header, returning a tuple + compatible with time.mktime(). """ try: data = self[name] @@ -364,9 +369,8 @@ class Message: def getdate_tz(self, name): """Retrieve a date field from a header as a 10-tuple. 
- The first 9 elements make up a tuple compatible with - time.mktime(), and the 10th is the offset of the poster's - time zone from GMT/UTC. + The first 9 elements make up a tuple compatible with time.mktime(), + and the 10th is the offset of the poster's time zone from GMT/UTC. """ try: data = self[name] @@ -388,9 +392,9 @@ class Message: def __setitem__(self, name, value): """Set the value of a header. - Note: This is not a perfect inversion of __getitem__, because - any changed headers get stuck at the end of the raw-headers list - rather than where the altered header was. + Note: This is not a perfect inversion of __getitem__, because any + changed headers get stuck at the end of the raw-headers list rather + than where the altered header was. """ del self[name] # Won't fail if it doesn't exist self.dict[name.lower()] = value @@ -502,7 +506,9 @@ class AddrlistClass: """Address parser class by Ben Escoto. To understand what this class does, it helps to have a copy of - RFC-822 in front of you. + RFC 2822 in front of you. + + http://www.faqs.org/rfcs/rfc2822.html Note: this class interface is deprecated and may be removed in the future. Use rfc822.AddressList instead. @@ -511,14 +517,18 @@ class AddrlistClass: def __init__(self, field): """Initialize a new instance. - `field' is an unparsed address header field, containing - one or more addresses. + `field' is an unparsed address header field, containing one or more + addresses. """ self.specials = '()<>@,:;.\"[]' self.pos = 0 self.LWS = ' \t' self.CR = '\r\n' self.atomends = self.specials + self.LWS + self.CR + # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it + # is obsolete syntax. RFC 2822 requires that we recognize obsolete + # syntax, so allow dots in phrases. 
+ self.phraseends = self.atomends.replace('.', '') self.field = field self.commentlist = [] @@ -633,7 +643,7 @@ class AddrlistClass: return adlist def getaddrspec(self): - """Parse an RFC-822 addr-spec.""" + """Parse an RFC 2822 addr-spec.""" aslist = [] self.gotonext() @@ -677,15 +687,15 @@ class AddrlistClass: def getdelimited(self, beginchar, endchars, allowcomments = 1): """Parse a header fragment delimited by special characters. - `beginchar' is the start character for the fragment. - If self is not looking at an instance of `beginchar' then - getdelimited returns the empty string. + `beginchar' is the start character for the fragment. If self is not + looking at an instance of `beginchar' then getdelimited returns the + empty string. `endchars' is a sequence of allowable end-delimiting characters. Parsing stops when one of these is encountered. - If `allowcomments' is non-zero, embedded RFC-822 comments - are allowed within the parsed fragment. + If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed + within the parsed fragment. """ if self.field[self.pos] != beginchar: return '' @@ -719,15 +729,22 @@ class AddrlistClass: return self.getdelimited('(', ')\r', 1) def getdomainliteral(self): - """Parse an RFC-822 domain-literal.""" + """Parse an RFC 2822 domain-literal.""" return '[%s]' % self.getdelimited('[', ']\r', 0) - def getatom(self): - """Parse an RFC-822 atom.""" + def getatom(self, atomends=None): + """Parse an RFC 2822 atom. + + Optional atomends specifies a different set of end token delimiters + (the default is to use self.atomends). This is used e.g. in + getphraselist() since phrase endings must not include the `.' 
(which + is legal in phrases).""" atomlist = [''] + if atomends is None: + atomends = self.atomends while self.pos < len(self.field): - if self.field[self.pos] in self.atomends: + if self.field[self.pos] in atomends: break else: atomlist.append(self.field[self.pos]) self.pos = self.pos + 1 @@ -735,11 +752,11 @@ class AddrlistClass: return ''.join(atomlist) def getphraselist(self): - """Parse a sequence of RFC-822 phrases. + """Parse a sequence of RFC 2822 phrases. - A phrase is a sequence of words, which are in turn either - RFC-822 atoms or quoted-strings. Phrases are canonicalized - by squeezing all runs of continuous whitespace into one space. + A phrase is a sequence of words, which are in turn either RFC 2822 + atoms or quoted-strings. Phrases are canonicalized by squeezing all + runs of continuous whitespace into one space. """ plist = [] @@ -750,14 +767,15 @@ class AddrlistClass: plist.append(self.getquote()) elif self.field[self.pos] == '(': self.commentlist.append(self.getcomment()) - elif self.field[self.pos] in self.atomends: + elif self.field[self.pos] in self.phraseends: break - else: plist.append(self.getatom()) + else: + plist.append(self.getatom(self.phraseends)) return plist class AddressList(AddrlistClass): - """An AddressList encapsulates a list of parsed RFC822 addresses.""" + """An AddressList encapsulates a list of parsed RFC 2822 addresses.""" def __init__(self, field): AddrlistClass.__init__(self, field) if field: |