summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/email/_header_value_parser.py729
-rw-r--r--Lib/email/headerregistry.py9
-rw-r--r--Lib/test/test_email/test__header_value_parser.py48
-rw-r--r--Lib/test/test_email/test_generator.py56
-rw-r--r--Lib/test/test_email/test_headerregistry.py229
-rw-r--r--Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst3
6 files changed, 511 insertions, 563 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index b4737c8..b34c58b 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -97,96 +97,14 @@ def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
#
-# Accumulator for header folding
-#
-
-class _Folded:
-
- def __init__(self, maxlen, policy):
- self.maxlen = maxlen
- self.policy = policy
- self.lastlen = 0
- self.stickyspace = None
- self.firstline = True
- self.done = []
- self.current = []
-
- def newline(self):
- self.done.extend(self.current)
- self.done.append(self.policy.linesep)
- self.current.clear()
- self.lastlen = 0
-
- def finalize(self):
- if self.current:
- self.newline()
-
- def __str__(self):
- return ''.join(self.done)
-
- def append(self, stoken):
- self.current.append(stoken)
-
- def append_if_fits(self, token, stoken=None):
- if stoken is None:
- stoken = str(token)
- l = len(stoken)
- if self.stickyspace is not None:
- stickyspace_len = len(self.stickyspace)
- if self.lastlen + stickyspace_len + l <= self.maxlen:
- self.current.append(self.stickyspace)
- self.lastlen += stickyspace_len
- self.current.append(stoken)
- self.lastlen += l
- self.stickyspace = None
- self.firstline = False
- return True
- if token.has_fws:
- ws = token.pop_leading_fws()
- if ws is not None:
- self.stickyspace += str(ws)
- stickyspace_len += len(ws)
- token._fold(self)
- return True
- if stickyspace_len and l + 1 <= self.maxlen:
- margin = self.maxlen - l
- if 0 < margin < stickyspace_len:
- trim = stickyspace_len - margin
- self.current.append(self.stickyspace[:trim])
- self.stickyspace = self.stickyspace[trim:]
- stickyspace_len = trim
- self.newline()
- self.current.append(self.stickyspace)
- self.current.append(stoken)
- self.lastlen = l + stickyspace_len
- self.stickyspace = None
- self.firstline = False
- return True
- if not self.firstline:
- self.newline()
- self.current.append(self.stickyspace)
- self.current.append(stoken)
- self.stickyspace = None
- self.firstline = False
- return True
- if self.lastlen + l <= self.maxlen:
- self.current.append(stoken)
- self.lastlen += l
- return True
- if l < self.maxlen:
- self.newline()
- self.current.append(stoken)
- self.lastlen = l
- return True
- return False
-
-#
# TokenList and its subclasses
#
class TokenList(list):
token_type = None
+ syntactic_break = True
+ ew_combine_allowed = True
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
@@ -207,84 +125,13 @@ class TokenList(list):
def all_defects(self):
return sum((x.all_defects for x in self), self.defects)
- #
- # Folding API
- #
- # parts():
- #
- # return a list of objects that constitute the "higher level syntactic
- # objects" specified by the RFC as the best places to fold a header line.
- # The returned objects must include leading folding white space, even if
- # this means mutating the underlying parse tree of the object. Each object
- # is only responsible for returning *its* parts, and should not drill down
- # to any lower level except as required to meet the leading folding white
- # space constraint.
- #
- # _fold(folded):
- #
- # folded: the result accumulator. This is an instance of _Folded.
- # (XXX: I haven't finished factoring this out yet, the folding code
- # pretty much uses this as a state object.) When the folded.current
- # contains as much text as will fit, the _fold method should call
- # folded.newline.
- # folded.lastlen: the current length of the test stored in folded.current.
- # folded.maxlen: The maximum number of characters that may appear on a
- # folded line. Differs from the policy setting in that "no limit" is
- # represented by +inf, which means it can be used in the trivially
- # logical fashion in comparisons.
- #
- # Currently no subclasses implement parts, and I think this will remain
- # true. A subclass only needs to implement _fold when the generic version
- # isn't sufficient. _fold will need to be implemented primarily when it is
- # possible for encoded words to appear in the specialized token-list, since
- # there is no generic algorithm that can know where exactly the encoded
- # words are allowed. A _fold implementation is responsible for filling
- # lines in the same general way that the top level _fold does. It may, and
- # should, call the _fold method of sub-objects in a similar fashion to that
- # of the top level _fold.
- #
- # XXX: I'm hoping it will be possible to factor the existing code further
- # to reduce redundancy and make the logic clearer.
-
- @property
- def parts(self):
- klass = self.__class__
- this = []
- for token in self:
- if token.startswith_fws():
- if this:
- yield this[0] if len(this)==1 else klass(this)
- this.clear()
- end_ws = token.pop_trailing_ws()
- this.append(token)
- if end_ws:
- yield klass(this)
- this = [end_ws]
- if this:
- yield this[0] if len(this)==1 else klass(this)
-
def startswith_fws(self):
return self[0].startswith_fws()
- def pop_leading_fws(self):
- if self[0].token_type == 'fws':
- return self.pop(0)
- return self[0].pop_leading_fws()
-
- def pop_trailing_ws(self):
- if self[-1].token_type == 'cfws':
- return self.pop(-1)
- return self[-1].pop_trailing_ws()
-
@property
- def has_fws(self):
- for part in self:
- if part.has_fws:
- return True
- return False
-
- def has_leading_comment(self):
- return self[0].has_leading_comment()
+ def as_ew_allowed(self):
+ """True if all top level tokens of this part may be RFC2047 encoded."""
+ return all(part.as_ew_allowed for part in self)
@property
def comments(self):
@@ -294,69 +141,13 @@ class TokenList(list):
return comments
def fold(self, *, policy):
- # max_line_length 0/None means no limit, ie: infinitely long.
- maxlen = policy.max_line_length or float("+inf")
- folded = _Folded(maxlen, policy)
- self._fold(folded)
- folded.finalize()
- return str(folded)
-
- def as_encoded_word(self, charset):
- # This works only for things returned by 'parts', which include
- # the leading fws, if any, that should be used.
- res = []
- ws = self.pop_leading_fws()
- if ws:
- res.append(ws)
- trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
- res.append(_ew.encode(str(self), charset))
- res.append(trailer)
- return ''.join(res)
-
- def cte_encode(self, charset, policy):
- res = []
- for part in self:
- res.append(part.cte_encode(charset, policy))
- return ''.join(res)
-
- def _fold(self, folded):
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- tlen = len(tstr)
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- # XXX: this should be a policy setting when utf8 is False.
- charset = 'utf-8'
- tstr = part.cte_encode(charset, folded.policy)
- tlen = len(tstr)
- if folded.append_if_fits(part, tstr):
- continue
- # Peel off the leading whitespace if any and make it sticky, to
- # avoid infinite recursion.
- ws = part.pop_leading_fws()
- if ws is not None:
- folded.stickyspace = str(ws)
- if folded.append_if_fits(part):
- continue
- if part.has_fws:
- part._fold(folded)
- continue
- # There are no fold points in this one; it is too long for a single
- # line and can't be split...we just have to put it on its own line.
- folded.append(tstr)
- folded.newline()
+ return _refold_parse_tree(self, policy=policy)
def pprint(self, indent=''):
- print('\n'.join(self._pp(indent='')))
+ print(self.ppstr(indent=indent))
def ppstr(self, indent=''):
- return '\n'.join(self._pp(indent=''))
+ return '\n'.join(self._pp(indent=indent))
def _pp(self, indent=''):
yield '{}{}/{}('.format(
@@ -391,173 +182,11 @@ class UnstructuredTokenList(TokenList):
token_type = 'unstructured'
- def _fold(self, folded):
- last_ew = None
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- is_ew = False
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- charset = 'utf-8'
- if last_ew is not None:
- # We've already done an EW, combine this one with it
- # if there's room.
- chunk = get_unstructured(
- ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
- oldlastlen = sum(len(x) for x in folded.current[:last_ew])
- schunk = str(chunk)
- lchunk = len(schunk)
- if oldlastlen + lchunk <= folded.maxlen:
- del folded.current[last_ew:]
- folded.append(schunk)
- folded.lastlen = oldlastlen + lchunk
- continue
- tstr = part.as_encoded_word(charset)
- is_ew = True
- if folded.append_if_fits(part, tstr):
- if is_ew:
- last_ew = len(folded.current) - 1
- continue
- if is_ew or last_ew:
- # It's too big to fit on the line, but since we've
- # got encoded words we can use encoded word folding.
- part._fold_as_ew(folded)
- continue
- # Peel off the leading whitespace if any and make it sticky, to
- # avoid infinite recursion.
- ws = part.pop_leading_fws()
- if ws is not None:
- folded.stickyspace = str(ws)
- if folded.append_if_fits(part):
- continue
- if part.has_fws:
- part._fold(folded)
- continue
- # It can't be split...we just have to put it on its own line.
- folded.append(tstr)
- folded.newline()
- last_ew = None
-
- def cte_encode(self, charset, policy):
- res = []
- last_ew = None
- for part in self:
- spart = str(part)
- try:
- spart.encode('us-ascii')
- res.append(spart)
- except UnicodeEncodeError:
- if last_ew is None:
- res.append(part.cte_encode(charset, policy))
- last_ew = len(res)
- else:
- tl = get_unstructured(''.join(res[last_ew:] + [spart]))
- res.append(tl.as_encoded_word(charset))
- return ''.join(res)
-
class Phrase(TokenList):
token_type = 'phrase'
- def _fold(self, folded):
- # As with Unstructured, we can have pure ASCII with or without
- # surrogateescape encoded bytes, or we could have unicode. But this
- # case is more complicated, since we have to deal with the various
- # sub-token types and how they can be composed in the face of
- # unicode-that-needs-CTE-encoding, and the fact that if a token a
- # comment that becomes a barrier across which we can't compose encoded
- # words.
- last_ew = None
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- tlen = len(tstr)
- has_ew = False
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- charset = 'utf-8'
- if last_ew is not None and not part.has_leading_comment():
- # We've already done an EW, let's see if we can combine
- # this one with it. The last_ew logic ensures that all we
- # have at this point is atoms, no comments or quoted
- # strings. So we can treat the text between the last
- # encoded word and the content of this token as
- # unstructured text, and things will work correctly. But
- # we have to strip off any trailing comment on this token
- # first, and if it is a quoted string we have to pull out
- # the content (we're encoding it, so it no longer needs to
- # be quoted).
- if part[-1].token_type == 'cfws' and part.comments:
- remainder = part.pop(-1)
- else:
- remainder = ''
- for i, token in enumerate(part):
- if token.token_type == 'bare-quoted-string':
- part[i] = UnstructuredTokenList(token[:])
- chunk = get_unstructured(
- ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
- schunk = str(chunk)
- lchunk = len(schunk)
- if last_ew + lchunk <= folded.maxlen:
- del folded.current[last_ew:]
- folded.append(schunk)
- folded.lastlen = sum(len(x) for x in folded.current)
- continue
- tstr = part.as_encoded_word(charset)
- tlen = len(tstr)
- has_ew = True
- if folded.append_if_fits(part, tstr):
- if has_ew and not part.comments:
- last_ew = len(folded.current) - 1
- elif part.comments or part.token_type == 'quoted-string':
- # If a comment is involved we can't combine EWs. And if a
- # quoted string is involved, it's not worth the effort to
- # try to combine them.
- last_ew = None
- continue
- part._fold(folded)
-
- def cte_encode(self, charset, policy):
- res = []
- last_ew = None
- is_ew = False
- for part in self:
- spart = str(part)
- try:
- spart.encode('us-ascii')
- res.append(spart)
- except UnicodeEncodeError:
- is_ew = True
- if last_ew is None:
- if not part.comments:
- last_ew = len(res)
- res.append(part.cte_encode(charset, policy))
- elif not part.has_leading_comment():
- if part[-1].token_type == 'cfws' and part.comments:
- remainder = part.pop(-1)
- else:
- remainder = ''
- for i, token in enumerate(part):
- if token.token_type == 'bare-quoted-string':
- part[i] = UnstructuredTokenList(token[:])
- tl = get_unstructured(''.join(res[last_ew:] + [spart]))
- res[last_ew:] = [tl.as_encoded_word(charset)]
- if part.comments or (not is_ew and part.token_type == 'quoted-string'):
- last_ew = None
- return ''.join(res)
-
class Word(TokenList):
token_type = 'word'
@@ -567,9 +196,6 @@ class CFWSList(WhiteSpaceTokenList):
token_type = 'cfws'
- def has_leading_comment(self):
- return bool(self.comments)
-
class Atom(TokenList):
@@ -579,6 +205,7 @@ class Atom(TokenList):
class Token(TokenList):
token_type = 'token'
+ encode_as_ew = False
class EncodedWord(TokenList):
@@ -588,13 +215,6 @@ class EncodedWord(TokenList):
charset = None
lang = None
- @property
- def encoded(self):
- if self.cte is not None:
- return self.cte
- _ew.encode(str(self), self.charset)
-
-
class QuotedString(TokenList):
@@ -865,6 +485,7 @@ class InvalidMailbox(TokenList):
class Domain(TokenList):
token_type = 'domain'
+ as_ew_allowed = False
@property
def domain(self):
@@ -879,11 +500,13 @@ class DotAtom(TokenList):
class DotAtomText(TokenList):
token_type = 'dot-atom-text'
+ as_ew_allowed = True
class AddrSpec(TokenList):
token_type = 'addr-spec'
+ as_ew_allowed = False
@property
def local_part(self):
@@ -916,11 +539,13 @@ class AddrSpec(TokenList):
class ObsLocalPart(TokenList):
token_type = 'obs-local-part'
+ as_ew_allowed = False
class DisplayName(Phrase):
token_type = 'display-name'
+ ew_combine_allowed = False
@property
def display_name(self):
@@ -960,6 +585,7 @@ class DisplayName(Phrase):
class LocalPart(TokenList):
token_type = 'local-part'
+ as_ew_allowed = False
@property
def value(self):
@@ -995,6 +621,7 @@ class LocalPart(TokenList):
class DomainLiteral(TokenList):
token_type = 'domain-literal'
+ as_ew_allowed = False
@property
def domain(self):
@@ -1081,6 +708,7 @@ class Value(TokenList):
class MimeParameters(TokenList):
token_type = 'mime-parameters'
+ syntactic_break = False
@property
def params(self):
@@ -1165,6 +793,10 @@ class MimeParameters(TokenList):
class ParameterizedHeaderValue(TokenList):
+ # Set this false so that the value doesn't wind up on a new line even
+ # if it and the parameters would fit there but not on the first line.
+ syntactic_break = False
+
@property
def params(self):
for token in reversed(self):
@@ -1172,18 +804,11 @@ class ParameterizedHeaderValue(TokenList):
return token.params
return {}
- @property
- def parts(self):
- if self and self[-1].token_type == 'mime-parameters':
- # We don't want to start a new line if all of the params don't fit
- # after the value, so unwrap the parameter list.
- return TokenList(self[:-1] + self[-1])
- return TokenList(self).parts
-
class ContentType(ParameterizedHeaderValue):
token_type = 'content-type'
+ as_ew_allowed = False
maintype = 'text'
subtype = 'plain'
@@ -1191,40 +816,27 @@ class ContentType(ParameterizedHeaderValue):
class ContentDisposition(ParameterizedHeaderValue):
token_type = 'content-disposition'
+ as_ew_allowed = False
content_disposition = None
class ContentTransferEncoding(TokenList):
token_type = 'content-transfer-encoding'
+ as_ew_allowed = False
cte = '7bit'
class HeaderLabel(TokenList):
token_type = 'header-label'
+ as_ew_allowed = False
class Header(TokenList):
token_type = 'header'
- def _fold(self, folded):
- folded.append(str(self.pop(0)))
- folded.lastlen = len(folded.current[0])
- # The first line of the header is different from all others: we don't
- # want to start a new object on a new line if it has any fold points in
- # it that would allow part of it to be on the first header line.
- # Further, if the first fold point would fit on the new line, we want
- # to do that, but if it doesn't we want to put it on the first line.
- # Folded supports this via the stickyspace attribute. If this
- # attribute is not None, it does the special handling.
- folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
- rest = self.pop(0)
- if self:
- raise ValueError("Malformed Header token list")
- rest._fold(folded)
-
#
# Terminal classes and instances
@@ -1232,6 +844,10 @@ class Header(TokenList):
class Terminal(str):
+ as_ew_allowed = True
+ ew_combine_allowed = True
+ syntactic_break = True
+
def __new__(cls, value, token_type):
self = super().__new__(cls, value)
self.token_type = token_type
@@ -1241,6 +857,9 @@ class Terminal(str):
def __repr__(self):
return "{}({})".format(self.__class__.__name__, super().__repr__())
+ def pprint(self):
+ print(self.__class__.__name__ + '/' + self.token_type)
+
@property
def all_defects(self):
return list(self.defects)
@@ -1254,29 +873,14 @@ class Terminal(str):
'' if not self.defects else ' {}'.format(self.defects),
)]
- def cte_encode(self, charset, policy):
- value = str(self)
- try:
- value.encode('us-ascii')
- return value
- except UnicodeEncodeError:
- return _ew.encode(value, charset)
-
def pop_trailing_ws(self):
# This terminates the recursion.
return None
- def pop_leading_fws(self):
- # This terminates the recursion.
- return None
-
@property
def comments(self):
return []
- def has_leading_comment(self):
- return False
-
def __getnewargs__(self):
return(str(self), self.token_type)
@@ -1290,8 +894,6 @@ class WhiteSpaceTerminal(Terminal):
def startswith_fws(self):
return True
- has_fws = True
-
class ValueTerminal(Terminal):
@@ -1302,11 +904,6 @@ class ValueTerminal(Terminal):
def startswith_fws(self):
return False
- has_fws = False
-
- def as_encoded_word(self, charset):
- return _ew.encode(str(self), charset)
-
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
@@ -1314,15 +911,9 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
def value(self):
return ''
- @property
- def encoded(self):
- return self[:]
-
def __str__(self):
return ''
- has_fws = True
-
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
@@ -2751,7 +2342,7 @@ def get_parameter(value):
if value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {!r}".format(value))
- appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if value and value[0] != "'":
token, value = get_attrtext(value)
@@ -2760,7 +2351,7 @@ def get_parameter(value):
if not value or value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {}".format(value))
- appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if remainder is not None:
# Treat the rest of value as bare quoted string content.
@@ -2965,3 +2556,255 @@ def parse_content_transfer_encoding_header(value):
token, value = get_phrase(value)
cte_header.append(token)
return cte_header
+
+
+#
+# Header folding
+#
+# Header folding is complex, with lots of rules and corner cases. The
+# following code does its best to obey the rules and handle the corner
+# cases, but you can be sure there are a few bugs:)
+#
+# This folder generally canonicalizes as it goes, preferring the stringified
+# version of each token. The tokens contain information that supports the
+# folder, including which tokens can be encoded in which ways.
+#
+# Folded text is accumulated in a simple list of strings ('lines'), each
+# one of which should be less than policy.max_line_length ('maxlen').
+#
+
+def _steal_trailing_WSP_if_exists(lines):
+ wsp = ''
+ if lines and lines[-1] and lines[-1][-1] in WSP:
+ wsp = lines[-1][-1]
+ lines[-1] = lines[-1][:-1]
+ return wsp
+
+def _refold_parse_tree(parse_tree, *, policy):
+ """Return string of contents of parse_tree folded according to RFC rules.
+
+ """
+ # max_line_length 0/None means no limit, ie: infinitely long.
+ maxlen = policy.max_line_length or float("+inf")
+ encoding = 'utf-8' if policy.utf8 else 'us-ascii'
+ lines = ['']
+ last_ew = None
+ wrap_as_ew_blocked = 0
+ want_encoding = False
+ end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
+ parts = list(parse_tree)
+ while parts:
+ part = parts.pop(0)
+ if part is end_ew_not_allowed:
+ wrap_as_ew_blocked -= 1
+ continue
+ tstr = str(part)
+ try:
+ tstr.encode(encoding)
+ charset = encoding
+ except UnicodeEncodeError:
+ if any(isinstance(x, errors.UndecodableBytesDefect)
+ for x in part.all_defects):
+ charset = 'unknown-8bit'
+ else:
+ # If policy.utf8 is false this should really be taken from a
+ # 'charset' property on the policy.
+ charset = 'utf-8'
+ want_encoding = True
+ if part.token_type == 'mime-parameters':
+ # Mime parameter folding (using RFC2231) is extra special.
+ _fold_mime_parameters(part, lines, maxlen, encoding)
+ continue
+ if want_encoding and not wrap_as_ew_blocked:
+ if not part.as_ew_allowed:
+ want_encoding = False
+ last_ew = None
+ if part.syntactic_break:
+ encoded_part = part.fold(policy=policy)[:-1] # strip nl
+ if policy.linesep not in encoded_part:
+ # It fits on a single line
+ if len(encoded_part) > maxlen - len(lines[-1]):
+ # But not on this one, so start a new one.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ # XXX what if encoded_part has no leading FWS?
+ lines.append(newline)
+ lines[-1] += encoded_part
+ continue
+ # Either this is not a major syntactic break, so we don't
+ # want it on a line by itself even if it fits, or it
+ # doesn't fit on a line by itself. Either way, fall through
+ # to unpacking the subparts and wrapping them.
+ if not hasattr(part, 'encode'):
+ # It's not a Terminal, do each piece individually.
+ parts = list(part) + parts
+ else:
+ # It's a terminal, wrap it as an encoded word, possibly
+ # combining it with previously encoded words if allowed.
+ last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
+ part.ew_combine_allowed, charset)
+ want_encoding = False
+ continue
+ if len(tstr) <= maxlen - len(lines[-1]):
+ lines[-1] += tstr
+ continue
+ # This part is too long to fit. The RFC wants us to break at
+ # "major syntactic breaks", so unless we don't consider this
+ # to be one, check if it will fit on the next line by itself.
+ if (part.syntactic_break and
+ len(tstr) + 1 <= maxlen):
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ continue
+ if not hasattr(part, 'encode'):
+ # It's not a terminal, try folding the subparts.
+ newparts = list(part)
+ if not part.as_ew_allowed:
+ wrap_as_ew_blocked += 1
+ newparts.append(end_ew_not_allowed)
+ parts = newparts + parts
+ continue
+ if part.as_ew_allowed and not wrap_as_ew_blocked:
+ # It doesn't need CTE encoding, but encode it anyway so we can
+ # wrap it.
+ parts.insert(0, part)
+ want_encoding = True
+ continue
+    # We can't figure out how to wrap it, so give up.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ else:
+ # We can't fold it onto the next line either...
+ lines[-1] += tstr
+ return policy.linesep.join(lines) + policy.linesep
+
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
+ """Fold string to_encode into lines as encoded word, combining if allowed.
+ Return the new value for last_ew, or None if ew_combine_allowed is False.
+
+ If there is already an encoded word in the last line of lines (indicated by
+ a non-None value for last_ew) and ew_combine_allowed is true, decode the
+ existing ew, combine it with to_encode, and re-encode. Otherwise, encode
+ to_encode. In either case, split to_encode as necessary so that the
+ encoded segments fit within maxlen.
+
+ """
+ if last_ew is not None and ew_combine_allowed:
+ to_encode = str(
+ get_unstructured(lines[-1][last_ew:] + to_encode))
+ lines[-1] = lines[-1][:last_ew]
+ if to_encode[0] in WSP:
+ # We're joining this to non-encoded text, so don't encode
+ # the leading blank.
+ leading_wsp = to_encode[0]
+ to_encode = to_encode[1:]
+ if (len(lines[-1]) == maxlen):
+ lines.append(_steal_trailing_WSP_if_exists(lines))
+ lines[-1] += leading_wsp
+ trailing_wsp = ''
+ if to_encode[-1] in WSP:
+ # Likewise for the trailing space.
+ trailing_wsp = to_encode[-1]
+ to_encode = to_encode[:-1]
+ new_last_ew = len(lines[-1]) if last_ew is None else last_ew
+ while to_encode:
+ remaining_space = maxlen - len(lines[-1])
+ # The RFC2047 chrome takes up 7 characters plus the length
+ # of the charset name.
+ encode_as = 'utf-8' if charset == 'us-ascii' else charset
+ text_space = remaining_space - len(encode_as) - 7
+ if text_space <= 0:
+ lines.append(' ')
+ # XXX We'll get an infinite loop here if maxlen is <= 7
+ continue
+ first_part = to_encode[:text_space]
+ ew = _ew.encode(first_part, charset=encode_as)
+ excess = len(ew) - remaining_space
+ if excess > 0:
+ # encode always chooses the shortest encoding, so this
+ # is guaranteed to fit at this point.
+ first_part = first_part[:-excess]
+ ew = _ew.encode(first_part)
+ lines[-1] += ew
+ to_encode = to_encode[len(first_part):]
+ if to_encode:
+ lines.append(' ')
+ new_last_ew = len(lines[-1])
+ lines[-1] += trailing_wsp
+ return new_last_ew if ew_combine_allowed else None
+
+def _fold_mime_parameters(part, lines, maxlen, encoding):
+ """Fold TokenList 'part' into the 'lines' list as mime parameters.
+
+ Using the decoded list of parameters and values, format them according to
+ the RFC rules, including using RFC2231 encoding if the value cannot be
+    expressed in 'encoding' and/or the parameter+value is too long to fit within
+ 'maxlen'.
+
+ """
+ # Special case for RFC2231 encoding: start from decoded values and use
+ # RFC2231 encoding iff needed.
+ #
+ # Note that the 1 and 2s being added to the length calculations are
+ # accounting for the possibly-needed spaces and semicolons we'll be adding.
+ #
+ for name, value in part.params:
+ # XXX What if this ';' puts us over maxlen the first time through the
+ # loop? We should split the header value onto a newline in that case,
+ # but to do that we need to recognize the need earlier or reparse the
+ # header, so I'm going to ignore that bug for now. It'll only put us
+ # one character over.
+ if not lines[-1].rstrip().endswith(';'):
+ lines[-1] += ';'
+ charset = encoding
+ error_handler = 'strict'
+ try:
+ value.encode(encoding)
+ encoding_required = False
+ except UnicodeEncodeError:
+ encoding_required = True
+ if utils._has_surrogates(value):
+ charset = 'unknown-8bit'
+ error_handler = 'surrogateescape'
+ else:
+ charset = 'utf-8'
+ if encoding_required:
+ encoded_value = urllib.parse.quote(
+ value, safe='', errors=error_handler)
+ tstr = "{}*={}''{}".format(name, charset, encoded_value)
+ else:
+ tstr = '{}={}'.format(name, quote_string(value))
+ if len(lines[-1]) + len(tstr) + 1 < maxlen:
+ lines[-1] = lines[-1] + ' ' + tstr
+ continue
+ elif len(tstr) + 2 <= maxlen:
+ lines.append(' ' + tstr)
+ continue
+ # We need multiple sections. We are allowed to mix encoded and
+ # non-encoded sections, but we aren't going to. We'll encode them all.
+ section = 0
+ extra_chrome = charset + "''"
+ while value:
+ chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
+ if maxlen <= chrome_len + 3:
+ # We need room for the leading blank, the trailing semicolon,
+ # and at least one character of the value. If we don't
+ # have that, we'd be stuck, so in that case fall back to
+ # the RFC standard width.
+ maxlen = 78
+ splitpoint = maxchars = maxlen - chrome_len - 2
+ while True:
+ partial = value[:splitpoint]
+ encoded_value = urllib.parse.quote(
+ partial, safe='', errors=error_handler)
+ if len(encoded_value) <= maxchars:
+ break
+ splitpoint -= 1
+ lines.append(" {}*{}*={}{}".format(
+ name, section, extra_chrome, encoded_value))
+ extra_chrome = ''
+ section += 1
+ value = value[splitpoint:]
+ if value:
+ lines[-1] += ';'
diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py
index 81fee14..0065204 100644
--- a/Lib/email/headerregistry.py
+++ b/Lib/email/headerregistry.py
@@ -245,13 +245,16 @@ class BaseHeader(str):
the header name and the ': ' separator.
"""
- # At some point we need to only put fws here if it was in the source.
+    # At some point we need to put fws here iff it was in the source.
header = parser.Header([
parser.HeaderLabel([
parser.ValueTerminal(self.name, 'header-name'),
parser.ValueTerminal(':', 'header-sep')]),
- parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]),
- self._parse_tree])
+ ])
+ if self._parse_tree:
+ header.append(
+ parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]))
+ header.append(self._parse_tree)
return header.fold(policy=policy)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index e0ec87d..1667617 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -14,18 +14,7 @@ class TestTokens(TestEmailBase):
self.assertEqual(x, ' \t')
self.assertEqual(str(x), '')
self.assertEqual(x.value, '')
- self.assertEqual(x.encoded, ' \t')
-
- # UnstructuredTokenList
-
- def test_undecodable_bytes_error_preserved(self):
- badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape')
- unst = parser.get_unstructured(badstr)
- self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect])
- parts = list(unst.parts)
- self.assertDefectsEqual(parts[0].all_defects, [])
- self.assertDefectsEqual(parts[1].all_defects, [])
- self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
+ self.assertEqual(x.token_type, 'fws')
class TestParserMixin:
@@ -139,7 +128,6 @@ class TestParser(TestParserMixin, TestEmailBase):
'first second',
[],
'')
- self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, 'jive')
@@ -150,7 +138,6 @@ class TestParser(TestParserMixin, TestEmailBase):
'first second',
[],
'')
- self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, '')
@@ -2700,28 +2687,37 @@ class TestFolding(TestEmailBase):
# and with unicode tokens in the comments. Spaces inside the quotes
# currently don't do the right thing.
- def test_initial_whitespace_splitting(self):
+ def test_split_at_whitespace_after_header_before_long_token(self):
body = parser.get_unstructured(' ' + 'x'*77)
header = parser.Header([
parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]),
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body])
self._test(header, 'test: \n ' + 'x'*77 + '\n')
- def test_whitespace_splitting(self):
+ def test_split_at_whitespace_before_long_token(self):
self._test(parser.get_unstructured('xxx ' + 'y'*77),
'xxx \n ' + 'y'*77 + '\n')
+ def test_overlong_encodeable_is_wrapped(self):
+ first_token_with_whitespace = 'xxx '
+ chrome_leader = '=?utf-8?q?'
+ len_chrome = len(chrome_leader) + 2
+ len_non_y = len_chrome + len(first_token_with_whitespace)
+ self._test(parser.get_unstructured(first_token_with_whitespace +
+ 'y'*80),
+ first_token_with_whitespace + chrome_leader +
+ 'y'*(78-len_non_y) + '?=\n' +
+ ' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n')
+
def test_long_filename_attachment(self):
- folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"')
- self.assertEqual(
- 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n',
- folded
- )
- folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"')
- self.assertEqual(
- 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n',
- folded
- )
+ self._test(parser.parse_content_disposition_header(
+ 'attachment; filename="TEST_TEST_TEST_TEST'
+ '_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'),
+ "attachment;\n"
+ " filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST"
+ "_TEST_TEST_TEST_TEST_TEST;\n"
+ " filename*1*=_TEST_TES.txt\n",
+ )
if __name__ == '__main__':
unittest.main()
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
index c4f1829..c1aeaef 100644
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -27,7 +27,6 @@ class TestGeneratorBase:
None
"""),
- # From is wrapped because wrapped it fits in 40.
40: textwrap.dedent("""\
To: whom_it_may_concern@example.com
From:
@@ -40,11 +39,11 @@ class TestGeneratorBase:
None
"""),
- # Neither to nor from fit even if put on a new line,
- # so we leave them sticking out on the first line.
20: textwrap.dedent("""\
- To: whom_it_may_concern@example.com
- From: nobody_you_want_to_know@example.com
+ To:
+ whom_it_may_concern@example.com
+ From:
+ nobody_you_want_to_know@example.com
Subject: We the
willing led by the
unknowing are doing
@@ -169,6 +168,53 @@ class TestGeneratorBase:
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0]))
+ def test_rfc2231_wrapping(self):
+ # This is pretty much just to make sure we don't have an infinite
+ # loop; I don't expect anyone to hit this in the field.
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename="afilenamelongenoghtowraphere"
+
+ None
+ """)))
+ expected = textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename*0*=us-ascii''afilename;
+ filename*1*=longenoghtowraphere
+
+ None
+ """)
+ s = self.ioclass()
+ g = self.genclass(s, policy=self.policy.clone(max_line_length=33))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), self.typ(expected))
+
+ def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self):
+ # This is just to make sure we don't have an infinite loop; I don't
+ # expect anyone to hit this in the field, so I'm not bothering to make
+ # the result optimal (the encoding isn't needed).
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename="afilenamelongenoghtowraphere"
+
+ None
+ """)))
+ expected = textwrap.dedent("""\
+ To: nobody
+ Content-Disposition:
+ attachment;
+ filename*0*=us-ascii''afilenamelongenoghtowraphere
+
+ None
+ """)
+ s = self.ioclass()
+ g = self.genclass(s, policy=self.policy.clone(max_line_length=20))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), self.typ(expected))
+
class TestGenerator(TestGeneratorBase, TestEmailBase):
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index af836dc..30ce0ba 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -229,14 +229,14 @@ class TestContentTypeHeader(TestHeaderBase):
defects = args[1] if l>1 else []
decoded = args[2] if l>2 and args[2] is not DITTO else source
header = 'Content-Type:' + ' ' if source else ''
- folded = args[3] if l>3 else header + source + '\n'
+ folded = args[3] if l>3 else header + decoded + '\n'
h = self.make_header('Content-Type', source)
self.assertEqual(h.content_type, content_type)
self.assertEqual(h.maintype, maintype)
self.assertEqual(h.subtype, subtype)
self.assertEqual(h.params, parmdict)
with self.assertRaises(TypeError):
- h.params['abc'] = 'xyz' # params is read-only.
+ h.params['abc'] = 'xyz' # make sure params is read-only.
self.assertDefectsEqual(h.defects, defects)
self.assertEqual(h, decoded)
self.assertEqual(h.fold(policy=policy.default), folded)
@@ -373,9 +373,10 @@ class TestContentTypeHeader(TestHeaderBase):
'text/plain; Charset="utf-8"'),
# Since this is pretty much the ur-mimeheader, we'll put all the tests
- # that exercise the parameter parsing and formatting here.
- #
- # XXX: question: is minimal quoting preferred?
+ # that exercise the parameter parsing and formatting here. Note that
+ # when we refold we may canonicalize, so things like whitespace,
+ # quoting, and rfc2231 encoding may change from what was in the input
+ # header.
'unquoted_param_value': (
'text/plain; title=foo',
@@ -384,7 +385,8 @@ class TestContentTypeHeader(TestHeaderBase):
'plain',
{'title': 'foo'},
[],
- 'text/plain; title="foo"'),
+ 'text/plain; title="foo"',
+ ),
'param_value_with_tspecials': (
'text/plain; title="(bar)foo blue"',
@@ -415,7 +417,8 @@ class TestContentTypeHeader(TestHeaderBase):
'mixed',
{'boundary': 'CPIMSSMTPC06p5f3tG'},
[],
- 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"'),
+ 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"',
+ ),
'spaces_around_semis': (
('image/jpeg; name="wibble.JPG" ; x-mac-type="4A504547" ; '
@@ -429,14 +432,31 @@ class TestContentTypeHeader(TestHeaderBase):
[],
('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
'x-mac-creator="474B4F4E"'),
- # XXX: it could be that we will eventually prefer to fold starting
- # from the decoded value, in which case these spaces and similar
- # spaces in other tests will be wrong.
- ('Content-Type: image/jpeg; name="wibble.JPG" ; '
- 'x-mac-type="4A504547" ;\n'
+ ('Content-Type: image/jpeg; name="wibble.JPG";'
+ ' x-mac-type="4A504547";\n'
' x-mac-creator="474B4F4E"\n'),
),
+ 'lots_of_mime_params': (
+ ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+ 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
+ 'image/jpeg',
+ 'image',
+ 'jpeg',
+ {'name': 'wibble.JPG',
+ 'x-mac-type': '4A504547',
+ 'x-mac-creator': '474B4F4E',
+ 'x-extrastuff': 'make it longer'},
+ [],
+ ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+ 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
+ # In this case the whole of the MimeParameters does *not* fit
+            # on one line, so we break at a lower syntactic level.
+ ('Content-Type: image/jpeg; name="wibble.JPG";'
+ ' x-mac-type="4A504547";\n'
+ ' x-mac-creator="474B4F4E"; x-extrastuff="make it longer"\n'),
+ ),
+
'semis_inside_quotes': (
'image/jpeg; name="Jim&amp;&amp;Jill"',
'image/jpeg',
@@ -460,19 +480,25 @@ class TestContentTypeHeader(TestHeaderBase):
[],
r'image/jpeg; name="Jim \"Bob\" Jill"'),
- # XXX: This test works except for the refolding of the header. I'll
- # deal with that bug when I deal with the other folding bugs.
- #'non_ascii_in_params': (
- # ('foo\xa7/bar; b\xa7r=two; '
- # 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
- # 'surrogateescape')),
- # 'foo\uFFFD/bar',
- # 'foo\uFFFD',
- # 'bar',
- # {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
- # [errors.UndecodableBytesDefect]*3,
- # 'foo�/bar; b�r="two"; baz="thr�e"',
- # ),
+ 'non_ascii_in_params': (
+ ('foo\xa7/bar; b\xa7r=two; '
+ 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
+ 'surrogateescape')),
+ 'foo\uFFFD/bar',
+ 'foo\uFFFD',
+ 'bar',
+ {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
+ [errors.UndecodableBytesDefect]*3,
+ 'foo�/bar; b�r="two"; baz="thr�e"',
+ # XXX Two bugs here: the mime type is not allowed to be an encoded
+ # word, and we shouldn't be emitting surrogates in the parameter
+ # names. But I don't know what the behavior should be here, so I'm
+ # punting for now. In practice this is unlikely to be encountered
+ # since headers with binary in them only come from a binary source
+ # and are almost certain to be re-emitted without refolding.
+ 'Content-Type: =?unknown-8bit?q?foo=A7?=/bar; b\udca7r="two";\n'
+ " baz*=unknown-8bit''thr%A7e\n",
+ ),
# RFC 2231 parameter tests.
@@ -494,19 +520,20 @@ class TestContentTypeHeader(TestHeaderBase):
[],
r'image/jpeg; bar="baz\"foobar\"baz"'),
- # XXX: This test works except for the refolding of the header. I'll
- # deal with that bug when I deal with the other folding bugs.
- #'non_ascii_rfc2231_value': (
- # ('text/plain; charset=us-ascii; '
- # "title*=us-ascii'en'This%20is%20"
- # 'not%20f\xa7n').encode('latin-1').decode('us-ascii',
- # 'surrogateescape'),
- # 'text/plain',
- # 'text',
- # 'plain',
- # {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
- # [errors.UndecodableBytesDefect],
- # 'text/plain; charset="us-ascii"; title="This is not f�n"'),
+ 'non_ascii_rfc2231_value': (
+ ('text/plain; charset=us-ascii; '
+ "title*=us-ascii'en'This%20is%20"
+ 'not%20f\xa7n').encode('latin-1').decode('us-ascii',
+ 'surrogateescape'),
+ 'text/plain',
+ 'text',
+ 'plain',
+ {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
+ [errors.UndecodableBytesDefect],
+ 'text/plain; charset="us-ascii"; title="This is not f�n"',
+ 'Content-Type: text/plain; charset="us-ascii";\n'
+ " title*=unknown-8bit''This%20is%20not%20f%A7n\n",
+ ),
'rfc2231_encoded_charset': (
'text/plain; charset*=ansi-x3.4-1968\'\'us-ascii',
@@ -529,8 +556,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is ***fun*** is it not.pdf'},
[],
'text/plain; name="This is ***fun*** is it not.pdf"',
- ('Content-Type: text/plain;\tname*0*=\'\'This%20is%20;\n'
- '\tname*1*=%2A%2A%2Afun%2A%2A%2A%20;\tname*2="is it not.pdf"\n'),
),
# Make sure we also handle it if there are spurious double quotes.
@@ -545,9 +570,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is even more ***fun*** is it not.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it not.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="us-ascii\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it not.pdf"\n'),
),
'rfc2231_single_quote_inside_double_quotes': (
@@ -562,9 +584,8 @@ class TestContentTypeHeader(TestHeaderBase):
[errors.InvalidHeaderDefect]*2,
('text/plain; charset="us-ascii"; '
'title="This is really ***fun*** isn\'t it!"'),
- ('Content-Type: text/plain; charset=us-ascii;\n'
- '\ttitle*0*="us-ascii\'en\'This%20is%20really%20";\n'
- '\ttitle*1*="%2A%2A%2Afun%2A%2A%2A%20";\ttitle*2="isn\'t it!"\n'),
+ ('Content-Type: text/plain; charset="us-ascii";\n'
+ ' title="This is really ***fun*** isn\'t it!"\n'),
),
'rfc2231_single_quote_in_value_with_charset_and_lang': (
@@ -576,9 +597,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': "Frank's Document"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="Frank\'s Document"',
- ('Content-Type: application/x-foo;\t'
- 'name*0*="us-ascii\'en-us\'Frank\'s";\n'
- ' name*1*=" Document"\n'),
),
'rfc2231_single_quote_in_non_encoded_value': (
@@ -590,9 +608,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': "us-ascii'en-us'Frank's Document"},
[],
'application/x-foo; name="us-ascii\'en-us\'Frank\'s Document"',
- ('Content-Type: application/x-foo;\t'
- 'name*0="us-ascii\'en-us\'Frank\'s";\n'
- ' name*1=" Document"\n'),
),
'rfc2231_no_language_or_charset': (
@@ -615,12 +630,8 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
- # XXX: see below...the first name line here should be *0 not *0*.
'rfc2231_partly_encoded': (
("text/plain;"
'\tname*0*="\'\'This%20is%20even%20more%20";'
@@ -632,9 +643,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
'rfc2231_partly_encoded_2': (
@@ -647,10 +655,11 @@ class TestContentTypeHeader(TestHeaderBase):
'plain',
{'name': 'This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf'},
[errors.InvalidHeaderDefect],
- 'text/plain; name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
+ ('text/plain;'
+ ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"'),
+ ('Content-Type: text/plain;\n'
+ ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is'
+ ' it.pdf"\n'),
),
'rfc2231_unknown_charset_treated_as_ascii': (
@@ -669,9 +678,12 @@ class TestContentTypeHeader(TestHeaderBase):
'plain',
{'charset': 'utf-8\uFFFD\uFFFD\uFFFD'},
[errors.UndecodableBytesDefect],
- 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"'),
+ 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"',
+ "Content-Type: text/plain;"
+ " charset*=unknown-8bit''utf-8%F1%F2%F3\n",
+ ),
- 'rfc2231_utf_8_in_supposedly_ascii_charset_parameter_value': (
+ 'rfc2231_utf8_in_supposedly_ascii_charset_parameter_value': (
"text/plain; charset*=ascii''utf-8%E2%80%9D",
'text/plain',
'text',
@@ -679,9 +691,11 @@ class TestContentTypeHeader(TestHeaderBase):
{'charset': 'utf-8”'},
[errors.UndecodableBytesDefect],
'text/plain; charset="utf-8”"',
+ # XXX Should folding change the charset to utf8? Currently it just
+ # reproduces the original, which is arguably fine.
+ "Content-Type: text/plain;"
+ " charset*=unknown-8bit''utf-8%E2%80%9D\n",
),
- # XXX: if the above were *re*folded, it would get tagged as utf-8
- # instead of ascii in the param, since it now contains non-ASCII.
'rfc2231_encoded_then_unencoded_segments': (
('application/x-foo;'
@@ -694,9 +708,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect],
'application/x-foo; name="My Document For You"',
- ('Content-Type: application/x-foo;\t'
- 'name*0*="us-ascii\'en-us\'My";\n'
- '\tname*1=" Document";\tname*2=" For You"\n'),
),
# My reading of the RFC is that this is an invalid header. The RFC
@@ -713,11 +724,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect]*3,
'application/x-foo; name="My Document For You"',
- ("Content-Type: application/x-foo;\tname*0=us-ascii'en-us'My;\t"
- # XXX: the newline is in the wrong place, come back and fix
- # this when the rest of tests pass.
- 'name*1*=" Document"\n;'
- '\tname*2*=" For You"\n'),
),
# XXX: I would say this one should default to ascii/en for the
@@ -730,8 +736,7 @@ class TestContentTypeHeader(TestHeaderBase):
# charset'lang'value pattern exactly *and* there is at least one
# encoded segment. Implementing that algorithm will require some
# refactoring, so I haven't done it (yet).
-
- 'rfc2231_qouted_unencoded_then_encoded_segments': (
+ 'rfc2231_quoted_unencoded_then_encoded_segments': (
('application/x-foo;'
'\tname*0="us-ascii\'en-us\'My";'
'\tname*1*=" Document";'
@@ -742,9 +747,25 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': "us-ascii'en-us'My Document For You"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="us-ascii\'en-us\'My Document For You"',
- ('Content-Type: application/x-foo;\t'
- 'name*0="us-ascii\'en-us\'My";\n'
- '\tname*1*=" Document";\tname*2*=" For You"\n'),
+ ),
+
+ # Make sure our folding algorithm produces multiple sections correctly.
+ # We could mix encoded and non-encoded segments, but we don't, we just
+ # make them all encoded. It might be worth fixing that, since the
+ # sections can get used for wrapping ascii text.
+ 'rfc2231_folded_segments_correctly_formatted': (
+ ('application/x-foo;'
+ '\tname="' + "with spaces"*8 + '"'),
+ 'application/x-foo',
+ 'application',
+ 'x-foo',
+ {'name': "with spaces"*8},
+ [],
+ 'application/x-foo; name="' + "with spaces"*8 + '"',
+ "Content-Type: application/x-foo;\n"
+ " name*0*=us-ascii''with%20spaceswith%20spaceswith%20spaceswith"
+ "%20spaceswith;\n"
+ " name*1*=%20spaceswith%20spaceswith%20spaceswith%20spaces\n"
),
}
@@ -827,8 +848,8 @@ class TestContentDisposition(TestHeaderBase):
[],
('attachment; filename="genome.jpeg"; '
'modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'),
- ('Content-Disposition: attachment; filename=genome.jpeg;\n'
- ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500";\n'),
+ ('Content-Disposition: attachment; filename="genome.jpeg";\n'
+ ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500"\n'),
),
'no_value': (
@@ -873,7 +894,7 @@ class TestMIMEVersionHeader(TestHeaderBase):
if source:
source = ' ' + source
self.assertEqual(h.fold(policy=policy.default),
- 'MIME-Version:' + source + '\n')
+ 'MIME-Version:' + source + '\n')
version_string_params = {
@@ -1546,15 +1567,39 @@ class TestFolding(TestHeaderBase):
'singlewordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
- 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n')
+ 'Subject: \n'
+ ' =?utf-8?q?thisisa?=\n'
+ ' =?utf-8?q?verylon?=\n'
+ ' =?utf-8?q?glineco?=\n'
+ ' =?utf-8?q?nsistin?=\n'
+ ' =?utf-8?q?gofasin?=\n'
+ ' =?utf-8?q?gleword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit?=\n'
+ )
def test_fold_unstructured_with_two_overlong_words(self):
h = self.make_header('Subject', 'thisisaverylonglineconsistingofa'
'singlewordthatwontfit plusanotherverylongwordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
- 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n'
- ' plusanotherverylongwordthatwontfit\n')
+ 'Subject: \n'
+ ' =?utf-8?q?thisisa?=\n'
+ ' =?utf-8?q?verylon?=\n'
+ ' =?utf-8?q?glineco?=\n'
+ ' =?utf-8?q?nsistin?=\n'
+ ' =?utf-8?q?gofasin?=\n'
+ ' =?utf-8?q?gleword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit_pl?=\n'
+ ' =?utf-8?q?usanoth?=\n'
+ ' =?utf-8?q?erveryl?=\n'
+ ' =?utf-8?q?ongword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit?=\n'
+ )
+
+ # XXX Need test for when max_line_length is less than the chrome size.
def test_fold_unstructured_with_slightly_long_word(self):
h = self.make_header('Subject', 'thislongwordislessthanmaxlinelen')
@@ -1590,6 +1635,18 @@ class TestFolding(TestHeaderBase):
self.assertEqual(h.fold(policy=policy.default),
'Date: Sat, 02 Feb 2002 17:00:06 -0800\n')
+ def test_fold_overlong_words_using_RFC2047(self):
+ h = self.make_header(
+ 'X-Report-Abuse',
+ '<https://www.mailitapp.com/report_abuse.php?'
+ 'mid=xxx-xxx-xxxxxxxxxxxxxxxxxxxxxxxx==-xxx-xx-xx>')
+ self.assertEqual(
+ h.fold(policy=policy.default),
+ 'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E'
+ 'com/report=5F?=\n'
+ ' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx'
+ 'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n'
+ ' =?utf-8?q?xx-xx=3E?=\n')
if __name__ == '__main__':
diff --git a/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
new file mode 100644
index 0000000..c933ee7
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
@@ -0,0 +1,3 @@
+The header folding algorithm for the new email policies has been rewritten,
+which also fixes bpo-30788, bpo-31831, and bpo-32182. In particular, RFC2231
+folding is now done correctly.