diff options
author | Petr Viktorin <encukou@gmail.com> | 2024-07-30 22:19:48 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-30 22:19:48 (GMT) |
commit | 097633981879b3c9de9a1dd120d3aa585ecc2384 (patch) | |
tree | bd7d04f28e53f8df28dd57213e09619cc1666b9d /Lib/email | |
parent | 5912487938ac4b517209082ab9e6d2d3d0fb4f4d (diff) | |
download | cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.zip cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.tar.gz cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.tar.bz2 |
gh-121650: Encode newlines in headers, and verify headers are sound (GH-122233)
## Encode header parts that contain newlines
Per RFC 2047:
> [...] these encoding schemes allow the
> encoding of arbitrary octet values, mail readers that implement this
> decoding should also ensure that display of the decoded data on the
> recipient's terminal will not cause unwanted side-effects
It seems that the "quoted-word" scheme is a valid way to include
a newline character in a header value, just like we already allow
undecodable bytes or control characters.
They do need to be properly quoted when serialized to text, though.
## Verify that email headers are well-formed
This should fail for custom fold() implementations that aren't careful
about newlines.
Co-authored-by: Bas Bloemsaat <bas@bloemsaat.org>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Diffstat (limited to 'Lib/email')
-rw-r--r-- | Lib/email/_header_value_parser.py | 12 | ||||
-rw-r--r-- | Lib/email/_policybase.py | 8 | ||||
-rw-r--r-- | Lib/email/errors.py | 4 | ||||
-rw-r--r-- | Lib/email/generator.py | 13 |
4 files changed, 33 insertions, 4 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 7da1bba..ec2215a 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -92,6 +92,8 @@ TOKEN_ENDS = TSPECIALS | WSP ASPECIALS = TSPECIALS | set("*'%") ATTRIBUTE_ENDS = ASPECIALS | WSP EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') +NLSET = {'\n', '\r'} +SPECIALSNL = SPECIALS | NLSET def quote_string(value): return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' @@ -2802,9 +2804,13 @@ def _refold_parse_tree(parse_tree, *, policy): wrap_as_ew_blocked -= 1 continue tstr = str(part) - if part.token_type == 'ptext' and set(tstr) & SPECIALS: - # Encode if tstr contains special characters. - want_encoding = True + if not want_encoding: + if part.token_type == 'ptext': + # Encode if tstr contains special characters. + want_encoding = not SPECIALSNL.isdisjoint(tstr) + else: + # Encode if tstr contains newlines. + want_encoding = not NLSET.isdisjoint(tstr) try: tstr.encode(encoding) charset = encoding diff --git a/Lib/email/_policybase.py b/Lib/email/_policybase.py index 1c76ed6..c7694a4 100644 --- a/Lib/email/_policybase.py +++ b/Lib/email/_policybase.py @@ -157,6 +157,13 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta): message_factory -- the class to use to create new message objects. If the value is None, the default is Message. + verify_generated_headers + -- if true, the generator verifies that each header + they are properly folded, so that a parser won't + treat it as multiple headers, start-of-body, or + part of another header. + This is a check against custom Header & fold() + implementations. """ raise_on_defect = False @@ -165,6 +172,7 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta): max_line_length = 78 mangle_from_ = False message_factory = None + verify_generated_headers = True def handle_defect(self, obj, defect): """Based on policy, either raise defect or call register_defect. diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 3ad0056..02aa5ec 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -29,6 +29,10 @@ class CharsetError(MessageError): """An illegal charset was given.""" +class HeaderWriteError(MessageError): + """Error while writing headers.""" + + # These are parsing defects which the parser was able to work around. class MessageDefect(ValueError): """Base class for a message defect.""" diff --git a/Lib/email/generator.py b/Lib/email/generator.py index 9d058ce..42c84aa 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -14,12 +14,14 @@ import random from copy import deepcopy from io import StringIO, BytesIO from email.utils import _has_surrogates +from email.errors import HeaderWriteError UNDERSCORE = '_' NL = '\n' # XXX: no longer used by the code below. NLCRE = re.compile(r'\r\n|\r|\n') fcre = re.compile(r'^From ', re.MULTILINE) +NEWLINE_WITHOUT_FWSP = re.compile(r'\r\n[^ \t]|\r[^ \n\t]|\n[^ \t]') class Generator: @@ -222,7 +224,16 @@ class Generator: def _write_headers(self, msg): for h, v in msg.raw_items(): - self.write(self.policy.fold(h, v)) + folded = self.policy.fold(h, v) + if self.policy.verify_generated_headers: + linesep = self.policy.linesep + if not folded.endswith(self.policy.linesep): + raise HeaderWriteError( + f'folded header does not end with {linesep!r}: {folded!r}') + if NEWLINE_WITHOUT_FWSP.search(folded.removesuffix(linesep)): + raise HeaderWriteError( + f'folded header contains newline: {folded!r}') + self.write(folded) # A blank line always separates headers from body self.write(self._NL) |