diff options
Diffstat (limited to 'Lib/email')
-rw-r--r-- | Lib/email/_header_value_parser.py | 37 | ||||
-rw-r--r-- | Lib/email/feedparser.py | 27 | ||||
-rw-r--r-- | Lib/email/iterators.py | 6 | ||||
-rw-r--r-- | Lib/email/message.py | 44 | ||||
-rw-r--r-- | Lib/email/parser.py | 3 | ||||
-rw-r--r-- | Lib/email/utils.py | 14 |
6 files changed, 76 insertions, 55 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 291437c..0392379 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -368,8 +368,7 @@ class TokenList(list): yield (indent + ' !! invalid element in token ' 'list: {!r}'.format(token)) else: - for line in token._pp(indent+' '): - yield line + yield from token._pp(indent+' ') if self.defects: extra = ' Defects: {}'.format(self.defects) else: @@ -1315,24 +1314,22 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker') # Parser # -"""Parse strings according to RFC822/2047/2822/5322 rules. - -This is a stateless parser. Each get_XXX function accepts a string and -returns either a Terminal or a TokenList representing the RFC object named -by the method and a string containing the remaining unparsed characters -from the input. Thus a parser method consumes the next syntactic construct -of a given type and returns a token representing the construct plus the -unparsed remainder of the input string. - -For example, if the first element of a structured header is a 'phrase', -then: - - phrase, value = get_phrase(value) - -returns the complete phrase from the start of the string value, plus any -characters left in the string after the phrase is removed. - -""" +# Parse strings according to RFC822/2047/2822/5322 rules. +# +# This is a stateless parser. Each get_XXX function accepts a string and +# returns either a Terminal or a TokenList representing the RFC object named +# by the method and a string containing the remaining unparsed characters +# from the input. Thus a parser method consumes the next syntactic construct +# of a given type and returns a token representing the construct plus the +# unparsed remainder of the input string. 
+# +# For example, if the first element of a structured header is a 'phrase', +# then: +# +# phrase, value = get_phrase(value) +# +# returns the complete phrase from the start of the string value, plus any +# characters left in the string after the phrase is removed. _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split _non_atom_end_matcher = re.compile(r"[^{}]+".format( diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index ea41e95..eb75fe3 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -98,24 +98,15 @@ class BufferedSubFile(object): """Push some new data into this object.""" # Handle any previous leftovers data, self._partial = self._partial + data, '' - # Crack into lines, but preserve the newlines on the end of each - parts = NLCRE_crack.split(data) - # The *ahem* interesting behaviour of re.split when supplied grouping - # parentheses is that the last element of the resulting list is the - # data after the final RE. In the case of a NL/CR terminated string, - # this is the empty string. - self._partial = parts.pop() - #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r: - # is there a \n to follow later? - if not self._partial and parts and parts[-1].endswith('\r'): - self._partial = parts.pop(-2)+parts.pop() - # parts is a list of strings, alternating between the line contents - # and the eol character(s). Gather up a list of lines after - # re-attaching the newlines. - lines = [] - for i in range(len(parts) // 2): - lines.append(parts[i*2] + parts[i*2+1]) - self.pushlines(lines) + # Crack into lines, but preserve the linesep characters on the end of each + parts = data.splitlines(True) + # If the last element of the list does not end in a newline, then treat + # it as a partial line. We only check for '\n' here because a line + # ending with '\r' might be a line that was split in the middle of a + # '\r\n' sequence (see bugs 1555570 and 1721862). 
+ if parts and not parts[-1].endswith('\n'): + self._partial = parts.pop() + self.pushlines(parts) def pushlines(self, lines): # Reverse and insert at the front of the lines. diff --git a/Lib/email/iterators.py b/Lib/email/iterators.py index 3adc4a0..b5502ee 100644 --- a/Lib/email/iterators.py +++ b/Lib/email/iterators.py @@ -26,8 +26,7 @@ def walk(self): yield self if self.is_multipart(): for subpart in self.get_payload(): - for subsubpart in subpart.walk(): - yield subsubpart + yield from subpart.walk() @@ -40,8 +39,7 @@ def body_line_iterator(msg, decode=False): for subpart in msg.walk(): payload = subpart.get_payload(decode=decode) if isinstance(payload, str): - for line in StringIO(payload): - yield line + yield from StringIO(payload) def typed_subpart_iterator(msg, maintype='text', subtype=None): diff --git a/Lib/email/message.py b/Lib/email/message.py index 5020a03..ebaf1c1 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -132,22 +132,50 @@ class Message: def __str__(self): """Return the entire formatted message as a string. - This includes the headers, body, and envelope header. """ return self.as_string() - def as_string(self, unixfrom=False, maxheaderlen=0): + def as_string(self, unixfrom=False, maxheaderlen=0, policy=None): """Return the entire formatted message as a string. - Optional `unixfrom' when True, means include the Unix From_ envelope - header. - This is a convenience method and may not generate the message exactly - as you intend. For more flexibility, use the flatten() method of a - Generator instance. + Optional 'unixfrom', when true, means include the Unix From_ envelope + header. For backward compatibility reasons, if maxheaderlen is + not specified it defaults to 0, so you must override it explicitly + if you want a different maxheaderlen. 'policy' is passed to the + Generator instance used to serialize the message; if it is not + specified the policy associated with the message instance is used.
+ + If the message object contains binary data that is not encoded + according to RFC standards, the non-compliant data will be replaced by + unicode "unknown character" code points. """ from email.generator import Generator + policy = self.policy if policy is None else policy fp = StringIO() - g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen) + g = Generator(fp, + mangle_from_=False, + maxheaderlen=maxheaderlen, + policy=policy) + g.flatten(self, unixfrom=unixfrom) + return fp.getvalue() + + def __bytes__(self): + """Return the entire formatted message as a bytes object. + """ + return self.as_bytes() + + def as_bytes(self, unixfrom=False, policy=None): + """Return the entire formatted message as a bytes object. + + Optional 'unixfrom', when true, means include the Unix From_ envelope + header. 'policy' is passed to the BytesGenerator instance used to + serialize the message; if not specified the policy associated with + the message instance is used. + """ + from email.generator import BytesGenerator + policy = self.policy if policy is None else policy + fp = BytesIO() + g = BytesGenerator(fp, mangle_from_=False, policy=policy) g.flatten(self, unixfrom=unixfrom) return fp.getvalue() diff --git a/Lib/email/parser.py b/Lib/email/parser.py index 752bf35..f49d31d 100644 --- a/Lib/email/parser.py +++ b/Lib/email/parser.py @@ -4,7 +4,8 @@ """A parser of RFC 2822 and MIME email messages.""" -__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser'] +__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser', + 'FeedParser', 'BytesFeedParser'] import warnings from io import StringIO, TextIOWrapper diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 93a625c..b3b42bb 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -54,10 +54,16 @@ TICK = "'" specialsre = re.compile(r'[][\\()<>@,:;".]') escapesre = re.compile(r'[\\"]') -# How to figure out if we are processing strings that come from a byte -# source with 
undecodable characters. -_has_surrogates = re.compile( - '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search +def _has_surrogates(s): + """Return True if s contains surrogate-escaped binary data.""" + # This check is based on the fact that unless there are surrogates, utf8 + # (Python's default encoding) can encode any string. This is the fastest + # way to check for surrogates, see issue 11454 for timings. + try: + s.encode() + return False + except UnicodeEncodeError: + return True # How to deal with a string containing bytes before handing it to the # application through the 'normal' interface. |