gh-121650: Encode newlines in headers, and verify headers are sound (GH-122233)

## Encode header parts that contain newlines Per RFC 2047: > [...] these encoding schemes allow the > encoding of arbitrary octet values, mail readers that implement this > decoding should also ensure that display of the decoded data on the > recipient's terminal will not cause unwanted side-effects It seems that the "quoted-word" scheme is a valid way to include a newline character in a header value, just like we already allow undecodable bytes or control characters. They do need to be properly quoted when serialized to text, though. ## Verify that email headers are well-formed This should fail for custom fold() implementations that aren't careful about newlines. Co-authored-by: Bas Bloemsaat <bas@bloemsaat.org> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
author: Petr Viktorin <encukou@gmail.com> 2024-07-30 22:19:48 (GMT)
committer: GitHub <noreply@github.com> 2024-07-30 22:19:48 (GMT)
commit: 097633981879b3c9de9a1dd120d3aa585ecc2384 (patch)
tree: bd7d04f28e53f8df28dd57213e09619cc1666b9d /Lib/email
parent: 5912487938ac4b517209082ab9e6d2d3d0fb4f4d (diff)
download: cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.zip
cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.tar.gz
cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.tar.bz2
4 files changed, 33 insertions, 4 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 7da1bba..ec2215a 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -92,6 +92,8 @@ TOKEN_ENDS = TSPECIALS | WSP
 ASPECIALS = TSPECIALS | set("*'%")
 ATTRIBUTE_ENDS = ASPECIALS | WSP
 EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
+NLSET = {'\n', '\r'}
+SPECIALSNL = SPECIALS | NLSET
 
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
@@ -2802,9 +2804,13 @@ def _refold_parse_tree(parse_tree, *, policy):
             wrap_as_ew_blocked -= 1
             continue
         tstr = str(part)
-        if part.token_type == 'ptext' and set(tstr) & SPECIALS:
-            # Encode if tstr contains special characters.
-            want_encoding = True
+        if not want_encoding:
+            if part.token_type == 'ptext':
+                # Encode if tstr contains special characters.
+                want_encoding = not SPECIALSNL.isdisjoint(tstr)
+            else:
+                # Encode if tstr contains newlines.
+                want_encoding = not NLSET.isdisjoint(tstr)
         try:
             tstr.encode(encoding)
             charset = encoding
diff --git a/Lib/email/_policybase.py b/Lib/email/_policybase.py
index 1c76ed6..c7694a4 100644
--- a/Lib/email/_policybase.py
+++ b/Lib/email/_policybase.py
@@ -157,6 +157,13 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
     message_factory     -- the class to use to create new message objects.
                            If the value is None, the default is Message.
 
+    verify_generated_headers
+                        -- if true, the generator verifies that each header
+                           they are properly folded, so that a parser won't
+                           treat it as multiple headers, start-of-body, or
+                           part of another header.
+                           This is a check against custom Header & fold()
+                           implementations.
     """
 
     raise_on_defect = False
@@ -165,6 +172,7 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
     max_line_length = 78
     mangle_from_ = False
     message_factory = None
+    verify_generated_headers = True
 
     def handle_defect(self, obj, defect):
         """Based on policy, either raise defect or call register_defect.
diff --git a/Lib/email/errors.py b/Lib/email/errors.py
index 3ad0056..02aa5ec 100644
--- a/Lib/email/errors.py
+++ b/Lib/email/errors.py
@@ -29,6 +29,10 @@ class CharsetError(MessageError):
     """An illegal charset was given."""
 
 
+class HeaderWriteError(MessageError):
+    """Error while writing headers."""
+
+
 # These are parsing defects which the parser was able to work around.
 class MessageDefect(ValueError):
     """Base class for a message defect."""
diff --git a/Lib/email/generator.py b/Lib/email/generator.py
index 9d058ce..42c84aa 100644
--- a/Lib/email/generator.py
+++ b/Lib/email/generator.py
@@ -14,12 +14,14 @@ import random
 from copy import deepcopy
 from io import StringIO, BytesIO
 from email.utils import _has_surrogates
+from email.errors import HeaderWriteError
 
 UNDERSCORE = '_'
 NL = '\n'  # XXX: no longer used by the code below.
 
 NLCRE = re.compile(r'\r\n|\r|\n')
 fcre = re.compile(r'^From ', re.MULTILINE)
+NEWLINE_WITHOUT_FWSP = re.compile(r'\r\n[^ \t]|\r[^ \n\t]|\n[^ \t]')
 
 
 class Generator:
@@ -222,7 +224,16 @@ class Generator:
 
     def _write_headers(self, msg):
         for h, v in msg.raw_items():
-            self.write(self.policy.fold(h, v))
+            folded = self.policy.fold(h, v)
+            if self.policy.verify_generated_headers:
+                linesep = self.policy.linesep
+                if not folded.endswith(self.policy.linesep):
+                    raise HeaderWriteError(
+                        f'folded header does not end with {linesep!r}: {folded!r}')
+                if NEWLINE_WITHOUT_FWSP.search(folded.removesuffix(linesep)):
+                    raise HeaderWriteError(
+                        f'folded header contains newline: {folded!r}')
+            self.write(folded)
         # A blank line always separates headers from body
         self.write(self._NL)
author	Petr Viktorin <encukou@gmail.com>	2024-07-30 22:19:48 (GMT)
committer	GitHub <noreply@github.com>	2024-07-30 22:19:48 (GMT)
commit	097633981879b3c9de9a1dd120d3aa585ecc2384 (patch)
tree	bd7d04f28e53f8df28dd57213e09619cc1666b9d /Lib/email
parent	5912487938ac4b517209082ab9e6d2d3d0fb4f4d (diff)
download	cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.zip cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.tar.gz cpython-097633981879b3c9de9a1dd120d3aa585ecc2384.tar.bz2