summaryrefslogtreecommitdiffstats
path: root/Lib/email/generator.py
diff options
context:
space:
mode:
authorR. David Murray <rdmurray@bitdance.com>2010-10-08 15:55:28 (GMT)
committerR. David Murray <rdmurray@bitdance.com>2010-10-08 15:55:28 (GMT)
commit96fd54eaec700cc50e5960f45ee79bc25c2c48c5 (patch)
tree4e4fc3f48d8957b6b0fccc372410e8374ce4fb70 /Lib/email/generator.py
parent59fdd6736bbf1ba14083a4bb777abaefc364f876 (diff)
downloadcpython-96fd54eaec700cc50e5960f45ee79bc25c2c48c5.zip
cpython-96fd54eaec700cc50e5960f45ee79bc25c2c48c5.tar.gz
cpython-96fd54eaec700cc50e5960f45ee79bc25c2c48c5.tar.bz2
#4661: add bytes parsing and generation to email (email version bump to 5.1.0)
The work on this is not 100% complete, but everything is present to allow real-world testing of the code. The only remaining major todo item is to (hopefully!) enhance the handling of non-ASCII bytes in headers converted to unicode by RFC2047 encoding them rather than replacing them with '?'s.
Diffstat (limited to 'Lib/email/generator.py')
-rw-r--r--Lib/email/generator.py186
1 files changed, 143 insertions, 43 deletions
diff --git a/Lib/email/generator.py b/Lib/email/generator.py
index e05b67d..40b95c4 100644
--- a/Lib/email/generator.py
+++ b/Lib/email/generator.py
@@ -12,8 +12,9 @@ import time
import random
import warnings
-from io import StringIO
+from io import StringIO, BytesIO
from email.header import Header
+from email.message import _has_surrogates
UNDERSCORE = '_'
NL = '\n'
@@ -72,7 +73,7 @@ class Generator:
ufrom = msg.get_unixfrom()
if not ufrom:
ufrom = 'From nobody ' + time.ctime(time.time())
- print(ufrom, file=self._fp)
+ self.write(ufrom + NL)
self._write(msg)
def clone(self, fp):
@@ -83,6 +84,29 @@ class Generator:
# Protected interface - undocumented ;/
#
+ # Note that we use 'self.write' when what we are writing is coming from
+ # the source, and self._fp.write when what we are writing is coming from a
+ # buffer (because the Bytes subclass has already had a chance to transform
+ # the data in its write method in that case). This is an entirely
+ # pragmatic split determined by experiment; we could be more general by
+ # always using write and having the Bytes subclass write method detect when
+ # it has already transformed the input; but, since this whole thing is a
+ # hack anyway this seems good enough.
+
+ # We use these class constants when we need to manipulate data that has
+ # already been written to a buffer (ex: constructing a re to check the
+ # boundary), and the module level NL constant when adding new output to a
+ # buffer via self.write, because 'write' always takes strings.
+ # Having write always take strings makes the code simpler, but there are
+ # a few occasions when we need to write previously created data back
+ # to the buffer or to a new buffer; for those cases we use self._fp.write.
+ _NL = NL
+ _EMPTY = ''
+
+ def _new_buffer(self):
+ # BytesGenerator overrides this to return BytesIO.
+ return StringIO()
+
def _write(self, msg):
# We can't write the headers yet because of the following scenario:
# say a multipart message includes the boundary string somewhere in
@@ -91,13 +115,13 @@ class Generator:
# parameter.
#
# The way we do this, so as to make the _handle_*() methods simpler,
- # is to cache any subpart writes into a StringIO. The we write the
- # headers and the StringIO contents. That way, subpart handlers can
+ # is to cache any subpart writes into a buffer. The we write the
+ # headers and the buffer contents. That way, subpart handlers can
# Do The Right Thing, and can still modify the Content-Type: header if
# necessary.
oldfp = self._fp
try:
- self._fp = sfp = StringIO()
+ self._fp = sfp = self._new_buffer()
self._dispatch(msg)
finally:
self._fp = oldfp
@@ -132,16 +156,16 @@ class Generator:
def _write_headers(self, msg):
for h, v in msg.items():
- print('%s:' % h, end=' ', file=self._fp)
+ self.write('%s: ' % h)
if isinstance(v, Header):
- print(v.encode(maxlinelen=self._maxheaderlen), file=self._fp)
+ self.write(v.encode(maxlinelen=self._maxheaderlen)+NL)
else:
# Header's got lots of smarts, so use it.
header = Header(v, maxlinelen=self._maxheaderlen,
header_name=h)
- print(header.encode(), file=self._fp)
+ self.write(header.encode()+NL)
# A blank line always separates headers from body
- print(file=self._fp)
+ self.write(NL)
#
# Handlers for writing types and subtypes
@@ -153,9 +177,15 @@ class Generator:
return
if not isinstance(payload, str):
raise TypeError('string payload expected: %s' % type(payload))
+ if _has_surrogates(msg._payload):
+ charset = msg.get_param('charset')
+ if charset is not None:
+ del msg['content-transfer-encoding']
+ msg.set_payload(payload, charset)
+ payload = msg.get_payload()
if self._mangle_from_:
payload = fcre.sub('>From ', payload)
- self._fp.write(payload)
+ self.write(payload)
# Default body handler
_writeBody = _handle_text
@@ -170,21 +200,21 @@ class Generator:
subparts = []
elif isinstance(subparts, str):
# e.g. a non-strict parse of a message with no starting boundary.
- self._fp.write(subparts)
+ self.write(subparts)
return
elif not isinstance(subparts, list):
# Scalar payload
subparts = [subparts]
for part in subparts:
- s = StringIO()
+ s = self._new_buffer()
g = self.clone(s)
g.flatten(part, unixfrom=False)
msgtexts.append(s.getvalue())
# Now make sure the boundary we've selected doesn't appear in any of
# the message texts.
- alltext = NL.join(msgtexts)
+ alltext = self._NL.join(msgtexts)
# BAW: What about boundaries that are wrapped in double-quotes?
- boundary = msg.get_boundary(failobj=_make_boundary(alltext))
+ boundary = msg.get_boundary(failobj=self._make_boundary(alltext))
# If we had to calculate a new boundary because the body text
# contained that string, set the new boundary. We don't do it
# unconditionally because, while set_boundary() preserves order, it
@@ -195,9 +225,9 @@ class Generator:
msg.set_boundary(boundary)
# If there's a preamble, write it out, with a trailing CRLF
if msg.preamble is not None:
- print(msg.preamble, file=self._fp)
+ self.write(msg.preamble + NL)
# dash-boundary transport-padding CRLF
- print('--' + boundary, file=self._fp)
+ self.write('--' + boundary + NL)
# body-part
if msgtexts:
self._fp.write(msgtexts.pop(0))
@@ -206,14 +236,14 @@ class Generator:
# --> CRLF body-part
for body_part in msgtexts:
# delimiter transport-padding CRLF
- print('\n--' + boundary, file=self._fp)
+ self.write('\n--' + boundary + NL)
# body-part
self._fp.write(body_part)
# close-delimiter transport-padding
- self._fp.write('\n--' + boundary + '--')
+ self.write('\n--' + boundary + '--')
if msg.epilogue is not None:
- print(file=self._fp)
- self._fp.write(msg.epilogue)
+ self.write(NL)
+ self.write(msg.epilogue)
def _handle_multipart_signed(self, msg):
# The contents of signed parts has to stay unmodified in order to keep
@@ -232,23 +262,23 @@ class Generator:
# block and the boundary. Sigh.
blocks = []
for part in msg.get_payload():
- s = StringIO()
+ s = self._new_buffer()
g = self.clone(s)
g.flatten(part, unixfrom=False)
text = s.getvalue()
- lines = text.split('\n')
+ lines = text.split(self._NL)
# Strip off the unnecessary trailing empty line
- if lines and lines[-1] == '':
- blocks.append(NL.join(lines[:-1]))
+ if lines and lines[-1] == self._EMPTY:
+ blocks.append(self._NL.join(lines[:-1]))
else:
blocks.append(text)
# Now join all the blocks with an empty line. This has the lovely
# effect of separating each block with an empty line, but not adding
# an extra one after the last one.
- self._fp.write(NL.join(blocks))
+ self._fp.write(self._NL.join(blocks))
def _handle_message(self, msg):
- s = StringIO()
+ s = self._new_buffer()
g = self.clone(s)
# The payload of a message/rfc822 part should be a multipart sequence
# of length 1. The zeroth element of the list should be the Message
@@ -265,6 +295,90 @@ class Generator:
payload = s.getvalue()
self._fp.write(payload)
+ # This used to be a module level function; we use a classmethod for this
+ # and _compile_re so we can continue to provide the module level function
+ # for backward compatibility by doing
+ # _make_boudary = Generator._make_boundary
+ # at the end of the module. It *is* internal, so we could drop that...
+ @classmethod
+ def _make_boundary(cls, text=None):
+ # Craft a random boundary. If text is given, ensure that the chosen
+ # boundary doesn't appear in the text.
+ token = random.randrange(sys.maxsize)
+ boundary = ('=' * 15) + (_fmt % token) + '=='
+ if text is None:
+ return boundary
+ b = boundary
+ counter = 0
+ while True:
+ cre = cls._compile_re('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
+ if not cre.search(text):
+ break
+ b = boundary + '.' + str(counter)
+ counter += 1
+ return b
+
+ @classmethod
+ def _compile_re(cls, s, flags):
+ return re.compile(s, flags)
+
+
+class BytesGenerator(Generator):
+ """Generates a bytes version of a Message object tree.
+
+ Functionally identical to the base Generator except that the output is
+ bytes and not string. When surrogates were used in the input to encode
+ bytes, these are decoded back to bytes for output.
+
+ The outfp object must accept bytes in its write method.
+ """
+
+ # Bytes versions of these constants for use in manipulating data from
+ # the BytesIO buffer.
+ _NL = NL.encode('ascii')
+ _EMPTY = b''
+
+ def write(self, s):
+ self._fp.write(s.encode('ascii', 'surrogateescape'))
+
+ def _new_buffer(self):
+ return BytesIO()
+
+ def _write_headers(self, msg):
+ # This is almost the same as the string version, except for handling
+ # strings with 8bit bytes.
+ for h, v in msg._headers:
+ self.write('%s: ' % h)
+ if isinstance(v, Header):
+ self.write(v.encode(maxlinelen=self._maxheaderlen)+NL)
+ elif _has_surrogates(v):
+ # If we have raw 8bit data in a byte string, we have no idea
+ # what the encoding is. There is no safe way to split this
+ # string. If it's ascii-subset, then we could do a normal
+ # ascii split, but if it's multibyte then we could break the
+ # string. There's no way to know so the least harm seems to
+ # be to not split the string and risk it being too long.
+ self.write(v+NL)
+ else:
+ # Header's got lots of smarts and this string is safe...
+ header = Header(v, maxlinelen=self._maxheaderlen,
+ header_name=h)
+ self.write(header.encode()+NL)
+ # A blank line always separates headers from body
+ self.write(NL)
+
+ def _handle_text(self, msg):
+ # If the string has surrogates the original source was bytes, so
+ # just write it back out.
+ if _has_surrogates(msg._payload):
+ self.write(msg._payload)
+ else:
+ super(BytesGenerator,self)._handle_text(msg)
+
+ @classmethod
+ def _compile_re(cls, s, flags):
+ return re.compile(s.encode('ascii'), flags)
+
_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'
@@ -325,23 +439,9 @@ class DecodedGenerator(Generator):
-# Helper
+# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width
-def _make_boundary(text=None):
- # Craft a random boundary. If text is given, ensure that the chosen
- # boundary doesn't appear in the text.
- token = random.randrange(sys.maxsize)
- boundary = ('=' * 15) + (_fmt % token) + '=='
- if text is None:
- return boundary
- b = boundary
- counter = 0
- while True:
- cre = re.compile('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
- if not cre.search(text):
- break
- b = boundary + '.' + str(counter)
- counter += 1
- return b
+# Backward compatibility
+_make_boundary = Generator._make_boundary