summary refs log tree commit diff stats
path: root/Lib
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2024-01-10 12:54:36 (GMT)
committer GitHub <noreply@github.com> 2024-01-10 12:54:36 (GMT)
commit e9d5b6ea2d68564f176fdf70c2d7028e060c62b5 (patch)
tree c309da0c1f4a51289b8c7ff6f581c68783c08329 /Lib
parent 568d220993fa9b4b812ff1b425edd80dbe17dda9 (diff)
download cpython-e9d5b6ea2d68564f176fdf70c2d7028e060c62b5.zip
cpython-e9d5b6ea2d68564f176fdf70c2d7028e060c62b5.tar.gz
cpython-e9d5b6ea2d68564f176fdf70c2d7028e060c62b5.tar.bz2
gh-113594: Fix UnicodeEncodeError in TokenList.fold() (GH-113730)
It occurred when trying to re-encode an unknown-8bit part combined with a non-unknown-8bit part.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/email/_header_value_parser.py 7
-rw-r--r-- Lib/test/test_email/test__header_value_parser.py 39
2 files changed, 46 insertions, 0 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 0d6bd81..5b653f6 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -2766,6 +2766,7 @@ def _refold_parse_tree(parse_tree, *, policy):
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = ['']
last_ew = None
+ last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
@@ -2820,8 +2821,14 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# It's a terminal, wrap it as an encoded word, possibly
# combining it with previously encoded words if allowed.
+ if (last_ew is not None and
+ charset != last_charset and
+ (last_charset == 'unknown-8bit' or
+ last_charset == 'utf-8' and charset != 'us-ascii')):
+ last_ew = None
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset)
+ last_charset = charset
want_encoding = False
continue
if len(tstr) <= maxlen - len(lines[-1]):
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 854f2ff..bdb0e55 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -2915,6 +2915,45 @@ class TestFolding(TestEmailBase):
"mich. And that's\n"
" all I'm sayin.\n")
+ def test_unicode_after_unknown_not_combined(self):
+ self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=\xa4"),
+ "=?unknown-8bit?q?=A4?==?utf-8?q?=C2=A4?=\n")
+ prefix = "0123456789 "*5
+ self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=\xa4"),
+ prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?=C2=A4?=\n")
+
+ def test_ascii_after_unknown_not_combined(self):
+ self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=abc"),
+ "=?unknown-8bit?q?=A4?=abc\n")
+ prefix = "0123456789 "*5
+ self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=abc"),
+ prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?abc?=\n")
+
+ def test_unknown_after_unicode_not_combined(self):
+ self._test(parser.get_unstructured("\xa4"
+ "=?unknown-8bit?q?=A4?="),
+ "=?utf-8?q?=C2=A4?==?unknown-8bit?q?=A4?=\n")
+ prefix = "0123456789 "*5
+ self._test(parser.get_unstructured(prefix + "\xa4=?unknown-8bit?q?=A4?="),
+ prefix + "=?utf-8?q?=C2=A4?=\n =?unknown-8bit?q?=A4?=\n")
+
+ def test_unknown_after_ascii_not_combined(self):
+ self._test(parser.get_unstructured("abc"
+ "=?unknown-8bit?q?=A4?="),
+ "abc=?unknown-8bit?q?=A4?=\n")
+ prefix = "0123456789 "*5
+ self._test(parser.get_unstructured(prefix + "abcd=?unknown-8bit?q?=A4?="),
+ prefix + "abcd\n =?unknown-8bit?q?=A4?=\n")
+
+ def test_unknown_after_unknown(self):
+ self._test(parser.get_unstructured("=?unknown-8bit?q?=C2?="
+ "=?unknown-8bit?q?=A4?="),
+ "=?unknown-8bit?q?=C2=A4?=\n")
+ prefix = "0123456789 "*5
+ self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=C2?="
+ "=?unknown-8bit?q?=A4?="),
+ prefix + "=?unknown-8bit?q?=C2?=\n =?unknown-8bit?q?=A4?=\n")
+
# XXX Need test of an encoded word so long that it needs to be wrapped
def test_simple_address(self):