summaryrefslogtreecommitdiffstats
path: root/Lib/email
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/email')
-rw-r--r--Lib/email/_header_value_parser.py37
-rw-r--r--Lib/email/feedparser.py27
-rw-r--r--Lib/email/iterators.py6
-rw-r--r--Lib/email/message.py44
-rw-r--r--Lib/email/parser.py3
-rw-r--r--Lib/email/utils.py14
6 files changed, 76 insertions, 55 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 291437c..0392379 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -368,8 +368,7 @@ class TokenList(list):
yield (indent + ' !! invalid element in token '
'list: {!r}'.format(token))
else:
- for line in token._pp(indent+' '):
- yield line
+ yield from token._pp(indent+' ')
if self.defects:
extra = ' Defects: {}'.format(self.defects)
else:
@@ -1315,24 +1314,22 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
# Parser
#
-"""Parse strings according to RFC822/2047/2822/5322 rules.
-
-This is a stateless parser. Each get_XXX function accepts a string and
-returns either a Terminal or a TokenList representing the RFC object named
-by the method and a string containing the remaining unparsed characters
-from the input. Thus a parser method consumes the next syntactic construct
-of a given type and returns a token representing the construct plus the
-unparsed remainder of the input string.
-
-For example, if the first element of a structured header is a 'phrase',
-then:
-
- phrase, value = get_phrase(value)
-
-returns the complete phrase from the start of the string value, plus any
-characters left in the string after the phrase is removed.
-
-"""
+# Parse strings according to RFC822/2047/2822/5322 rules.
+#
+# This is a stateless parser. Each get_XXX function accepts a string and
+# returns either a Terminal or a TokenList representing the RFC object named
+# by the method and a string containing the remaining unparsed characters
+# from the input. Thus a parser method consumes the next syntactic construct
+# of a given type and returns a token representing the construct plus the
+# unparsed remainder of the input string.
+#
+# For example, if the first element of a structured header is a 'phrase',
+# then:
+#
+# phrase, value = get_phrase(value)
+#
+# returns the complete phrase from the start of the string value, plus any
+# characters left in the string after the phrase is removed.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index ea41e95..eb75fe3 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -98,24 +98,15 @@ class BufferedSubFile(object):
"""Push some new data into this object."""
# Handle any previous leftovers
data, self._partial = self._partial + data, ''
- # Crack into lines, but preserve the newlines on the end of each
- parts = NLCRE_crack.split(data)
- # The *ahem* interesting behaviour of re.split when supplied grouping
- # parentheses is that the last element of the resulting list is the
- # data after the final RE. In the case of a NL/CR terminated string,
- # this is the empty string.
- self._partial = parts.pop()
- #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
- # is there a \n to follow later?
- if not self._partial and parts and parts[-1].endswith('\r'):
- self._partial = parts.pop(-2)+parts.pop()
- # parts is a list of strings, alternating between the line contents
- # and the eol character(s). Gather up a list of lines after
- # re-attaching the newlines.
- lines = []
- for i in range(len(parts) // 2):
- lines.append(parts[i*2] + parts[i*2+1])
- self.pushlines(lines)
+ # Crack into lines, but preserve the linesep characters on the end of each
+ parts = data.splitlines(True)
+ # If the last element of the list does not end in a newline, then treat
+ # it as a partial line. We only check for '\n' here because a line
+ # ending with '\r' might be a line that was split in the middle of a
+ # '\r\n' sequence (see bugs 1555570 and 1721862).
+ if parts and not parts[-1].endswith('\n'):
+ self._partial = parts.pop()
+ self.pushlines(parts)
def pushlines(self, lines):
# Reverse and insert at the front of the lines.
diff --git a/Lib/email/iterators.py b/Lib/email/iterators.py
index 3adc4a0..b5502ee 100644
--- a/Lib/email/iterators.py
+++ b/Lib/email/iterators.py
@@ -26,8 +26,7 @@ def walk(self):
yield self
if self.is_multipart():
for subpart in self.get_payload():
- for subsubpart in subpart.walk():
- yield subsubpart
+ yield from subpart.walk()
@@ -40,8 +39,7 @@ def body_line_iterator(msg, decode=False):
for subpart in msg.walk():
payload = subpart.get_payload(decode=decode)
if isinstance(payload, str):
- for line in StringIO(payload):
- yield line
+ yield from StringIO(payload)
def typed_subpart_iterator(msg, maintype='text', subtype=None):
diff --git a/Lib/email/message.py b/Lib/email/message.py
index 5020a03..ebaf1c1 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -132,22 +132,50 @@ class Message:
def __str__(self):
"""Return the entire formatted message as a string.
- This includes the headers, body, and envelope header.
"""
return self.as_string()
- def as_string(self, unixfrom=False, maxheaderlen=0):
+ def as_string(self, unixfrom=False, maxheaderlen=0, policy=None):
"""Return the entire formatted message as a string.
- Optional `unixfrom' when True, means include the Unix From_ envelope
- header.
- This is a convenience method and may not generate the message exactly
- as you intend. For more flexibility, use the flatten() method of a
- Generator instance.
+ Optional 'unixfrom', when true, means include the Unix From_ envelope
+ header. For backward compatibility reasons, if maxheaderlen is
+ not specified it defaults to 0, so you must override it explicitly
+ if you want a different maxheaderlen. 'policy' is passed to the
+ Generator instance used to serialize the mesasge; if it is not
+ specified the policy associated with the message instance is used.
+
+ If the message object contains binary data that is not encoded
+ according to RFC standards, the non-compliant data will be replaced by
+ unicode "unknown character" code points.
"""
from email.generator import Generator
+ policy = self.policy if policy is None else policy
fp = StringIO()
- g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen)
+ g = Generator(fp,
+ mangle_from_=False,
+ maxheaderlen=maxheaderlen,
+ policy=policy)
+ g.flatten(self, unixfrom=unixfrom)
+ return fp.getvalue()
+
+ def __bytes__(self):
+ """Return the entire formatted message as a bytes object.
+ """
+ return self.as_bytes()
+
+ def as_bytes(self, unixfrom=False, policy=None):
+ """Return the entire formatted message as a bytes object.
+
+ Optional 'unixfrom', when true, means include the Unix From_ envelope
+ header. 'policy' is passed to the BytesGenerator instance used to
+ serialize the message; if not specified the policy associated with
+ the message instance is used.
+ """
+ from email.generator import BytesGenerator
+ policy = self.policy if policy is None else policy
+ fp = BytesIO()
+ g = BytesGenerator(fp, mangle_from_=False, policy=policy)
g.flatten(self, unixfrom=unixfrom)
return fp.getvalue()
diff --git a/Lib/email/parser.py b/Lib/email/parser.py
index 752bf35..f49d31d 100644
--- a/Lib/email/parser.py
+++ b/Lib/email/parser.py
@@ -4,7 +4,8 @@
"""A parser of RFC 2822 and MIME email messages."""
-__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser']
+__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser',
+ 'FeedParser', 'BytesFeedParser']
import warnings
from io import StringIO, TextIOWrapper
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index 93a625c..b3b42bb 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -54,10 +54,16 @@ TICK = "'"
specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[\\"]')
-# How to figure out if we are processing strings that come from a byte
-# source with undecodable characters.
-_has_surrogates = re.compile(
- '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+def _has_surrogates(s):
+ """Return True if s contains surrogate-escaped binary data."""
+ # This check is based on the fact that unless there are surrogates, utf8
+ # (Python's default encoding) can encode any string. This is the fastest
+ # way to check for surrogates, see issue 11454 for timings.
+ try:
+ s.encode()
+ return False
+ except UnicodeEncodeError:
+ return True
# How to deal with a string containing bytes before handing it to the
# application through the 'normal' interface.