diff options
author | Barry Warsaw <barry@python.org> | 2002-04-10 21:01:31 (GMT) |
---|---|---|
committer | Barry Warsaw <barry@python.org> | 2002-04-10 21:01:31 (GMT) |
commit | 409a4c08b545aa064cf8fe3b8de51404756a301e (patch) | |
tree | 06cf8fe44e1fe28fbc0147635ec41961f2df6515 /Lib/email/Header.py | |
parent | 68e69338ae19c37bd3e69cb76e107bfa76231e06 (diff) | |
download | cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.zip cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.gz cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.bz2 |
Sync'ing with standalone email package 2.0.1. This adds support for
non-us-ascii character sets in headers and bodies. Some API changes
(with DeprecationWarnings for the old APIs). Better RFC-compliant
implementations of base64 and quoted-printable.
Updated test cases. Documentation updates to follow (after I finish
writing them ;).
Diffstat (limited to 'Lib/email/Header.py')
-rw-r--r-- | Lib/email/Header.py | 210 |
1 files changed, 210 insertions, 0 deletions
# Lib/email/Header.py -- added as a new file (mode 100644, blob 097b978).
# Copyright (C) 2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)

"""Header encoding and decoding functionality."""

import re
import email.quopriMIME
import email.base64MIME
from email.Charset import Charset

CRLFSPACE = '\r\n '
CRLF = '\r\n'
NLSPACE = '\n '

MAXLINELEN = 76

ENCODE = 1
DECODE = 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)



# Helpers
_max_append = email.quopriMIME._max_append



def decode_header(header):
    """Decode a message header value without converting charset.

    The header is converted with str() and processed line by line.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.  Adjacent encoded parts that share a
    charset are merged into one pair, and unencoded text found on a line that
    also contains an encoded word is joined (with a single space) onto a
    directly preceding unencoded pair.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]

    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue

        # ecre has three groups, so split() returns a flat list of the form
        # [text, charset, encoding, encoded, text, charset, encoding, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Append the unencoded text itself.  (This used to append
                    # the previously decoded word, which dropped `unenc'.)
                    # The text was stripped above, so restore one separator.
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # The pattern only admits q/b; kept as a safe fallback.
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
                # Drop the charset/encoding/encoded triple just consumed.
                del parts[0:3]
    return decoded



class Header:
    def __init__(self, s, charset=None, maxlinelen=MAXLINELEN,
                 header_name=None):
        """Create a MIME-compliant header that can contain many languages.

        Specify the initial header value in s.  Specify its character set as a
        Charset object in the charset argument.  If None, a default Charset
        instance will be used.

        You can later append to the header with append(s, charset) below;
        charset does not have to be the same as the one initially specified
        here.  In fact, it's optional, and if not given, defaults to the
        charset specified in the constructor.

        The maximum line length can either be specified by maxlinelen, or you
        can pass in the name of the header field (e.g. "Subject") to let this
        class guess the best line length to use to prevent wrapping.  When
        header_name is given it takes precedence over maxlinelen.  The
        default maxlinelen is 76.
        """
        if charset is None:
            charset = Charset()
        self._charset = charset
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        self.append(s, charset)
        self._maxlinelen = maxlinelen
        if header_name is not None:
            self.guess_maxlinelen(header_name)

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def guess_maxlinelen(self, s=None):
        """Guess the maximum length to make each header line.

        Given a header name (e.g. "Subject"), set this header's maximum line
        length to an appropriate length to avoid line wrapping.  If s is not
        given, return the previous maximum line length and don't set it.

        Returns the (possibly updated) maximum line length.
        """
        # BAW: is this semantic necessary?
        if s is not None:
            # Presumably the 2 reserves room for the ": " that follows the
            # field name on the first line -- confirm against the generator.
            self._maxlinelen = MAXLINELEN - len(s) - 2
        return self._maxlinelen

    def append(self, s, charset=None):
        """Append string s with Charset charset to the MIME header.

        charset defaults to the one given in the class constructor.
        """
        if charset is None:
            charset = self._charset
        self._chunks.append((s, charset))

    def _split(self, s, charset):
        # Split up a header safely for use with _encode_chunks.  Returns a
        # list of (string, charset) pairs.  BAW: this appears to be a private
        # convenience method.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)

        if charset.encoded_header_len(encoded) < self._maxlinelen:
            return [(encoded, charset)]
        if len(splittable) <= 1:
            # Nothing left to halve: halfway would be 0 and the second half
            # would be the whole input again, recursing forever.  Emit the
            # over-long piece as-is instead.
            return [(encoded, charset)]
        # Divide and conquer.  The // operator makes the integer division
        # explicit.
        halfway = len(splittable) // 2
        first = charset.from_splittable(splittable[:halfway], 0)
        last = charset.from_splittable(splittable[halfway:], 0)
        return self._split(first, charset) + self._split(last, charset)

    def encode(self):
        """Encode a message header, possibly converting charset and encoding.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Every appended chunk is passed through _split() so that each piece
        fits the maximum line length, and the pieces are then MIME-encoded
        and joined.  The stored chunks are left untouched, so calling this
        method (or str()) repeatedly yields the same result and append() can
        still be used afterwards.
        """
        newchunks = []
        for s, charset in self._chunks:
            newchunks += self._split(s, charset)
        # Do not store newchunks back on self: they have already been through
        # _split() and must not be fed through it again on a later call.
        return self._encode_chunks(newchunks)

    def encode_chunks(self):
        """MIME-encode the stored chunks without splitting them first.

        Each stored (string, charset) pair is encoded with its own charset,
        so one header may mix several charsets and encodings.  The encoded
        pieces are packed into lines of at most the maximum line length, and
        the lines are joined with a newline followed by a space, e.g. an
        encoded word for charset1, a line break plus a space, then an encoded
        word for charset2.
        """
        return self._encode_chunks(self._chunks)

    def _encode_chunks(self, pairs):
        # Encode a list of (string, charset) pairs and fold the result.
        # A charset of None means the string is emitted unencoded.
        chunks = []
        for header, charset in pairs:
            if charset is None:
                _max_append(chunks, header, self._maxlinelen, ' ')
            else:
                _max_append(chunks, charset.header_encode(header, 0),
                            self._maxlinelen, ' ')
        return NLSPACE.join(chunks)