summaryrefslogtreecommitdiffstats
path: root/Lib/email/Header.py
diff options
context:
space:
mode:
authorBarry Warsaw <barry@python.org>2002-04-10 21:01:31 (GMT)
committerBarry Warsaw <barry@python.org>2002-04-10 21:01:31 (GMT)
commit409a4c08b545aa064cf8fe3b8de51404756a301e (patch)
tree06cf8fe44e1fe28fbc0147635ec41961f2df6515 /Lib/email/Header.py
parent68e69338ae19c37bd3e69cb76e107bfa76231e06 (diff)
downloadcpython-409a4c08b545aa064cf8fe3b8de51404756a301e.zip
cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.gz
cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.bz2
Sync'ing with standalone email package 2.0.1. This adds support for
non-us-ascii character sets in headers and bodies. Some API changes (with DeprecationWarnings for the old APIs). Better RFC-compliant implementations of base64 and quoted-printable. Updated test cases. Documentation updates to follow (after I finish writing them ;).
Diffstat (limited to 'Lib/email/Header.py')
-rw-r--r--Lib/email/Header.py210
1 files changed, 210 insertions, 0 deletions
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
new file mode 100644
index 0000000..097b978
--- /dev/null
+++ b/Lib/email/Header.py
@@ -0,0 +1,210 @@
+# Copyright (C) 2002 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield)
+
+"""Header encoding and decoding functionality."""
+
+import re
+import email.quopriMIME
+import email.base64MIME
+from email.Charset import Charset
+
+# Line-ending/continuation strings.  Only NLSPACE is referenced by the code
+# in this module (Header.encode_chunks joins its output lines with it);
+# CRLFSPACE and CRLF are defined here but not used below.
+CRLFSPACE = '\r\n '
+CRLF = '\r\n'
+NLSPACE = '\n '
+
+# Default maximum line length for Header, and the base value from which
+# Header.guess_maxlinelen() subtracts the header name length.
+MAXLINELEN = 76
+
+# NOTE(review): ENCODE and DECODE are not referenced anywhere in this module;
+# presumably they are kept for external callers -- confirm before removing.
+ENCODE = 1
+DECODE = 2
+
+# Match encoded-word strings in the form =?charset?q?Hello_World?=
+# The pattern has exactly three groups (charset, encoding, encoded), so
+# ecre.split() returns the unencoded text interleaved with those three values;
+# decode_header() relies on that layout.
+ecre = re.compile(r'''
+ =\? # literal =?
+ (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
+ \? # literal ?
+ (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
+ \? # literal ?
+ (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
+ \?= # literal ?=
+ ''', re.VERBOSE | re.IGNORECASE)
+
+
+
+# Helpers
+# Alias for the quopriMIME helper; Header.encode_chunks() calls it with an
+# output list, a string, the maximum line length and a ' ' separator.
+_max_append = email.quopriMIME._max_append
+
+
+
+def decode_header(header):
+    """Decode a message header value without converting charset.
+
+    header is converted with str() and scanned for encoded-words of the form
+    =?charset?q?...?= or =?charset?b?...?=.
+
+    Returns a list of (decoded_string, charset) pairs containing each of the
+    decoded parts of the header.  Charset is None for non-encoded parts of the
+    header, otherwise a lower-case string containing the name of the character
+    set specified in the encoded string.
+
+    If the header contains no encoded-word at all, the result is the single
+    pair (header, None).  Within a line that does contain encoded-words,
+    adjacent encoded-words with the same charset are concatenated into one
+    pair, and unencoded text that directly follows an unencoded pair is
+    appended to that pair, separated by a single space.
+    """
+    # If no encoding, just return the header
+    header = str(header)
+    if not ecre.search(header):
+        return [(header, None)]
+
+    decoded = []
+    for line in header.splitlines():
+        # This line might not have an encoding in it
+        if not ecre.search(line):
+            decoded.append((line, None))
+            continue
+
+        # ecre has three groups, so split() yields
+        # [text, charset, encoding, encoded, text, charset, ...].
+        parts = ecre.split(line)
+        while parts:
+            unenc = parts.pop(0).strip()
+            if unenc:
+                # Should we continue a long line?  Append the unencoded text
+                # itself; appending the last decoded encoded-word here would
+                # drop this text and could duplicate earlier output.
+                if decoded and decoded[-1][1] is None:
+                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
+                else:
+                    decoded.append((unenc, None))
+            if parts:
+                charset, encoding = [s.lower() for s in parts[0:2]]
+                encoded = parts[2]
+                if encoding == 'q':
+                    dec = email.quopriMIME.header_decode(encoded)
+                elif encoding == 'b':
+                    dec = email.base64MIME.decode(encoded)
+                else:
+                    # ecre only matches "q" or "b", so this is purely
+                    # defensive: pass the text through undecoded.
+                    dec = encoded
+
+                # Glue together consecutive encoded-words that share a charset
+                if decoded and decoded[-1][1] == charset:
+                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
+                else:
+                    decoded.append((dec, charset))
+            # Drop the (charset, encoding, encoded) triple just consumed
+            del parts[0:3]
+    return decoded
+
+
+
+class Header:
+    """A header value assembled from (string, Charset) chunks.
+
+    Chunks are collected with append() and turned into a single encoded,
+    line-wrapped string by encode() (also available through str()).
+    """
+
+    def __init__(self, s, charset=None, maxlinelen=MAXLINELEN,
+                 header_name=None):
+        """Create a MIME-compliant header that can contain many languages.
+
+        Specify the initial header value in s.  Specify its character set as a
+        Charset object in the charset argument.  If none, a default Charset
+        instance will be used.
+
+        You can later append to the header with append(s, charset) below;
+        charset does not have to be the same as the one initially specified
+        here.  In fact, it's optional, and if not given, defaults to the
+        charset specified in the constructor.
+
+        The maximum line length can either be specified by maxlinelen, or you
+        can pass in the name of the header field (e.g. "Subject") to let this
+        class guess the best line length to use to prevent wrapping.  The
+        default maxlinelen is 76.  When header_name is given it takes
+        precedence over maxlinelen.
+        """
+        if charset is None:
+            charset = Charset()
+        self._charset = charset
+        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
+        self._chunks = []
+        self.append(s, charset)
+        self._maxlinelen = maxlinelen
+        if header_name is not None:
+            self.guess_maxlinelen(header_name)
+
+    def __str__(self):
+        """A synonym for self.encode()."""
+        return self.encode()
+
+    def guess_maxlinelen(self, s=None):
+        """Guess the maximum length to make each header line.
+
+        Given a header name (e.g. "Subject"), set this header's maximum line
+        length to MAXLINELEN - len(s) - 2, leaving room for the name and two
+        more characters.  If s is not given, the current maximum line length
+        is left unchanged.
+
+        Returns the (possibly updated) maximum line length.
+        """
+        # BAW: is this semantic necessary?
+        if s is not None:
+            self._maxlinelen = MAXLINELEN - len(s) - 2
+        return self._maxlinelen
+
+    def append(self, s, charset=None):
+        """Append string s with Charset charset to the MIME header.
+
+        charset defaults to the one given in the class constructor.
+        """
+        if charset is None:
+            charset = self._charset
+        self._chunks.append((s, charset))
+
+    def _split(self, s, charset):
+        # Split up a header safely for use with encode_chunks.  BAW: this
+        # appears to be a private convenience method.
+        #
+        # Returns a list of (string, charset) pairs.  s is halved recursively
+        # until charset.encoded_header_len() of each piece is below
+        # self._maxlinelen.
+        splittable = charset.to_splittable(s)
+        encoded = charset.from_splittable(splittable)
+
+        # A piece of length 0 or 1 cannot be divided any further; return it
+        # as is even if it is too long, otherwise halfway would be 0 and the
+        # recursion on the second half would never terminate.
+        if (charset.encoded_header_len(encoded) < self._maxlinelen
+                or len(splittable) <= 1):
+            return [(encoded, charset)]
+
+        # Divide and conquer.  halfway must be an integer, hence the floor
+        # division.
+        halfway = len(splittable) // 2
+        first = charset.from_splittable(splittable[:halfway], 0)
+        last = charset.from_splittable(splittable[halfway:], 0)
+        return self._split(first, charset) + self._split(last, charset)
+
+    def encode(self):
+        """Encode a message header, possibly converting charset and encoding.
+
+        There are many issues involved in converting a given string for use in
+        an email header.  Only certain character sets are readable in most
+        email clients, and as header strings can only contain a subset of
+        7-bit ASCII, care must be taken to properly convert and encode (with
+        Base64 or quoted-printable) header strings.  In addition, there is a
+        75-character length limit on any given encoded header field, so
+        line-wrapping must be performed, even with double-byte character sets.
+
+        Every appended chunk is first split with _split() so that each piece
+        fits the maximum line length, and the pieces are then passed to
+        encode_chunks().  The chunks stored on the instance are not modified,
+        so calling encode() (or str()) repeatedly starts from the originally
+        appended strings every time.
+
+        NOTE(review): nothing in this method catches conversion errors; any
+        exception raised by the Charset methods propagates to the caller.
+        """
+        newchunks = []
+        for s, charset in self._chunks:
+            newchunks += self._split(s, charset)
+        return self.encode_chunks(newchunks)
+
+    def encode_chunks(self, chunks=None):
+        """MIME-encode a header with many different charsets and/or encodings.
+
+        Given a list of pairs (string, charset), return a MIME-encoded string
+        suitable for use in a header field.  Each pair may have different
+        charsets and/or encodings, and the resulting header will accurately
+        reflect each setting.
+
+        chunks is the list of pairs to encode; when omitted, the chunks
+        appended to this header so far are used.  A pair whose charset is
+        None is emitted unencoded; every other pair is encoded with
+        charset.header_encode(string, 0).
+
+        Output lines are joined with a newline followed by a space, so the
+        resulting string will be in the format:
+
+        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
+        =?charset2?b?SvxyZ2VuIEL2aW5n?="
+        """
+        if chunks is None:
+            chunks = self._chunks
+        lines = []
+        for header, charset in chunks:
+            if charset is None:
+                _max_append(lines, header, self._maxlinelen, ' ')
+            else:
+                _max_append(lines, charset.header_encode(header, 0),
+                            self._maxlinelen, ' ')
+        return NLSPACE.join(lines)