gh-92081: Fix for email.generator.Generator with whitespace between encoded words. (#92281)

* Fix for email.generator.Generator with whitespace between encoded words. email.generator.Generator currently does not handle whitespace between encoded words correctly when the encoded words span multiple lines. The current generator will create an encoded word for each line. If the end of the line happens to correspond with the end real word in the plaintext, the generator will place an unencoded space at the start of the subsequent lines to represent the whitespace between the plaintext words. A compliant decoder will strip all the whitespace from between two encoded words which leads to missing spaces in the round-tripped output. The fix for this is to make sure that whitespace between two encoded words ends up inside of one or the other of the encoded words. This fix places the space inside of the second encoded word. A second problem happens with continuation lines. A continuation line that starts with whitespace and is followed by a non-encoded word is fine because the newline between such continuation lines is defined as condensing to a single space character. When the continuation line starts with whitespace followed by an encoded word, however, the RFCs specify that the word is run together with the encoded word on the previous line. This is because normal words are filded on syntactic breaks by encoded words are not. The solution to this is to add the whitespace to the start of the encoded word on the continuation line. Test cases are from #92081 * Rename a variable so it's not confused with the final variable.
author: Toshio Kuratomi <a.badger@gmail.com> 2024-05-20 19:10:47 (GMT)
committer: GitHub <noreply@github.com> 2024-05-20 19:10:47 (GMT)
commit: a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f (patch)
tree: d522bac509d603de01cd7c4557b4b83d6a5d108d /Lib/email/_header_value_parser.py
parent: c0d81b256604a1079349d82d136db43eefcb3df1 (diff)
download: cpython-a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f.zip
cpython-a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f.tar.gz
cpython-a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f.tar.bz2
1 files changed, 41 insertions, 7 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index d1b4c7d..6148801 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -2784,11 +2784,15 @@ def _refold_parse_tree(parse_tree, *, policy):
     # max_line_length 0/None means no limit, ie: infinitely long.
     maxlen = policy.max_line_length or sys.maxsize
     encoding = 'utf-8' if policy.utf8 else 'us-ascii'
-    lines = ['']
-    last_ew = None
+    lines = ['']  # Folded lines to be output
+    leading_whitespace = ''  # When we have whitespace between two encoded
+                             # words, we may need to encode the whitespace
+                             # at the beginning of the second word.
+    last_ew = None  # Points to the last encoded character if there's an ew on
+                    # the line
     last_charset = None
     wrap_as_ew_blocked = 0
-    want_encoding = False
+    want_encoding = False  # This is set to True if we need to encode this part
     end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
     parts = list(parse_tree)
     while parts:
@@ -2812,10 +2816,12 @@ def _refold_parse_tree(parse_tree, *, policy):
                 # 'charset' property on the policy.
                 charset = 'utf-8'
             want_encoding = True
+
         if part.token_type == 'mime-parameters':
             # Mime parameter folding (using RFC2231) is extra special.
             _fold_mime_parameters(part, lines, maxlen, encoding)
             continue
+
         if want_encoding and not wrap_as_ew_blocked:
             if not part.as_ew_allowed:
                 want_encoding = False
@@ -2847,21 +2853,38 @@ def _refold_parse_tree(parse_tree, *, policy):
                      last_charset == 'utf-8' and charset != 'us-ascii')):
                     last_ew = None
                 last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
-                                      part.ew_combine_allowed, charset)
+                                      part.ew_combine_allowed, charset, leading_whitespace)
+                # This whitespace has been added to the lines in _fold_as_ew()
+                # so clear it now.
+                leading_whitespace = ''
                 last_charset = charset
             want_encoding = False
             continue
+
         if len(tstr) <= maxlen - len(lines[-1]):
             lines[-1] += tstr
             continue
+
         # This part is too long to fit.  The RFC wants us to break at
         # "major syntactic breaks", so unless we don't consider this
         # to be one, check if it will fit on the next line by itself.
+        leading_whitespace = ''
         if (part.syntactic_break and
                 len(tstr) + 1 <= maxlen):
             newline = _steal_trailing_WSP_if_exists(lines)
             if newline or part.startswith_fws():
+                # We're going to fold the data onto a new line here.  Due to
+                # the way encoded strings handle continuation lines, we need to
+                # be prepared to encode any whitespace if the next line turns
+                # out to start with an encoded word.
                 lines.append(newline + tstr)
+
+                whitespace_accumulator = []
+                for char in lines[-1]:
+                    if char not in WSP:
+                        break
+                    whitespace_accumulator.append(char)
+                leading_whitespace = ''.join(whitespace_accumulator)
                 last_ew = None
                 continue
         if not hasattr(part, 'encode'):
@@ -2885,9 +2908,10 @@ def _refold_parse_tree(parse_tree, *, policy):
         else:
             # We can't fold it onto the next line either...
             lines[-1] += tstr
+
     return policy.linesep.join(lines) + policy.linesep
 
-def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
     """Fold string to_encode into lines as encoded word, combining if allowed.
     Return the new value for last_ew, or None if ew_combine_allowed is False.
 
@@ -2902,7 +2926,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
         to_encode = str(
             get_unstructured(lines[-1][last_ew:] + to_encode))
         lines[-1] = lines[-1][:last_ew]
-    if to_encode[0] in WSP:
+    elif to_encode[0] in WSP:
         # We're joining this to non-encoded text, so don't encode
         # the leading blank.
         leading_wsp = to_encode[0]
@@ -2910,6 +2934,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
         if (len(lines[-1]) == maxlen):
             lines.append(_steal_trailing_WSP_if_exists(lines))
         lines[-1] += leading_wsp
+
     trailing_wsp = ''
     if to_encode[-1] in WSP:
         # Likewise for the trailing space.
@@ -2929,11 +2954,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
 
     while to_encode:
         remaining_space = maxlen - len(lines[-1])
-        text_space = remaining_space - chrome_len
+        text_space = remaining_space - chrome_len - len(leading_whitespace)
         if text_space <= 0:
             lines.append(' ')
             continue
 
+        # If we are at the start of a continuation line, prepend whitespace
+        # (we only want to do this when the line starts with an encoded word
+        # but if we're folding in this helper function, then we know that we
+        # are going to be writing out an encoded word.)
+        if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
+            encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
+            lines[-1] += encoded_word
+            leading_whitespace = ''
+
         to_encode_word = to_encode[:text_space]
         encoded_word = _ew.encode(to_encode_word, charset=encode_as)
         excess = len(encoded_word) - remaining_space
author	Toshio Kuratomi <a.badger@gmail.com>	2024-05-20 19:10:47 (GMT)
committer	GitHub <noreply@github.com>	2024-05-20 19:10:47 (GMT)
commit	a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f (patch)
tree	d522bac509d603de01cd7c4557b4b83d6a5d108d /Lib/email/_header_value_parser.py
parent	c0d81b256604a1079349d82d136db43eefcb3df1 (diff)
download	cpython-a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f.zip cpython-a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f.tar.gz cpython-a6fdb31b6714c9f3c65fefbb3fe388b2b139a75f.tar.bz2