From 2313e15578aa864c7b995de996e4787169f7ca8d Mon Sep 17 00:00:00 2001 From: R David Murray Date: Mon, 13 Jan 2014 13:19:21 -0500 Subject: #20206, #5803: more efficient algorithm that doesn't truncate output. This fixes an edge case (20206) where if the input ended in a character needing encoding but there was no newline on the string, the last byte of the encoded character would be dropped. The fix is to use a more efficient algorithm, provided by Serhiy Storchaka (5803), that does not have the bug. --- Lib/email/quoprimime.py | 142 ++++++++++++++++---------------------- Lib/test/test_email/test_email.py | 5 ++ Misc/NEWS | 4 ++ 3 files changed, 69 insertions(+), 82 deletions(-) diff --git a/Lib/email/quoprimime.py b/Lib/email/quoprimime.py index bc02281..30bf916 100644 --- a/Lib/email/quoprimime.py +++ b/Lib/email/quoprimime.py @@ -53,8 +53,9 @@ EMPTYSTRING = '' # space-wise. Remember that headers and bodies have different sets of safe # characters. Initialize both maps with the full expansion, and then override # the safe bytes with the more compact form. -_QUOPRI_HEADER_MAP = dict((c, '=%02X' % c) for c in range(256)) -_QUOPRI_BODY_MAP = _QUOPRI_HEADER_MAP.copy() +_QUOPRI_MAP = ['=%02X' % c for c in range(256)] +_QUOPRI_HEADER_MAP = _QUOPRI_MAP[:] +_QUOPRI_BODY_MAP = _QUOPRI_MAP[:] # Safe header bytes which need no encoding. for c in b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'): @@ -121,8 +122,7 @@ def unquote(s): def quote(c): - return '=%02X' % ord(c) - + return _QUOPRI_MAP[ord(c)] def header_encode(header_bytes, charset='iso-8859-1'): @@ -140,68 +140,16 @@ def header_encode(header_bytes, charset='iso-8859-1'): if not header_bytes: return '' # Iterate over every byte, encoding if necessary. - encoded = [] - for octet in header_bytes: - encoded.append(_QUOPRI_HEADER_MAP[octet]) + encoded = header_bytes.decode('latin1').translate(_QUOPRI_HEADER_MAP) # Now add the RFC chrome to each encoded chunk and glue the chunks # together. - return '=?%s?q?%s?=' % (charset, EMPTYSTRING.join(encoded)) - - -class _body_accumulator(io.StringIO): - - def __init__(self, maxlinelen, eol, *args, **kw): - super().__init__(*args, **kw) - self.eol = eol - self.maxlinelen = self.room = maxlinelen - - def write_str(self, s): - """Add string s to the accumulated body.""" - self.write(s) - self.room -= len(s) - - def newline(self): - """Write eol, then start new line.""" - self.write_str(self.eol) - self.room = self.maxlinelen - - def write_soft_break(self): - """Write a soft break, then start a new line.""" - self.write_str('=') - self.newline() - - def write_wrapped(self, s, extra_room=0): - """Add a soft line break if needed, then write s.""" - if self.room < len(s) + extra_room: - self.write_soft_break() - self.write_str(s) - - def write_char(self, c, is_last_char): - if not is_last_char: - # Another character follows on this line, so we must leave - # extra room, either for it or a soft break, and whitespace - # need not be quoted. - self.write_wrapped(c, extra_room=1) - elif c not in ' \t': - # For this and remaining cases, no more characters follow, - # so there is no need to reserve extra room (since a hard - # break will immediately follow). - self.write_wrapped(c) - elif self.room >= 3: - # It's a whitespace character at end-of-line, and we have room - # for the three-character quoted encoding. - self.write(quote(c)) - elif self.room == 2: - # There's room for the whitespace character and a soft break. - self.write(c) - self.write_soft_break() - else: - # There's room only for a soft break. The quoted whitespace - # will be the only content on the subsequent line. - self.write_soft_break() - self.write(quote(c)) + return '=?%s?q?%s?=' % (charset, encoded) +_QUOPRI_BODY_ENCODE_MAP = _QUOPRI_BODY_MAP[:] +for c in b'\r\n': + _QUOPRI_BODY_ENCODE_MAP[c] = chr(c) + def body_encode(body, maxlinelen=76, eol=NL): """Encode with quoted-printable, wrapping at maxlinelen characters. @@ -226,26 +174,56 @@ def body_encode(body, maxlinelen=76, eol=NL): if not body: return body - # The last line may or may not end in eol, but all other lines do. - last_has_eol = (body[-1] in '\r\n') - - # This accumulator will make it easier to build the encoded body. - encoded_body = _body_accumulator(maxlinelen, eol) - - lines = body.splitlines() - last_line_no = len(lines) - 1 - for line_no, line in enumerate(lines): - last_char_index = len(line) - 1 - for i, c in enumerate(line): - if body_check(ord(c)): - c = quote(c) - encoded_body.write_char(c, i==last_char_index) - # Add an eol if input line had eol. All input lines have eol except - # possibly the last one. - if line_no < last_line_no or last_has_eol: - encoded_body.newline() - - return encoded_body.getvalue() + # quote speacial characters + body = body.translate(_QUOPRI_BODY_ENCODE_MAP) + + soft_break = '=' + eol + # leave space for the '=' at the end of a line + maxlinelen1 = maxlinelen - 1 + + encoded_body = [] + append = encoded_body.append + + for line in body.splitlines(): + # break up the line into pieces no longer than maxlinelen - 1 + start = 0 + laststart = len(line) - 1 - maxlinelen + while start <= laststart: + stop = start + maxlinelen1 + # make sure we don't break up an escape sequence + if line[stop - 2] == '=': + append(line[start:stop - 1]) + start = stop - 2 + elif line[stop - 1] == '=': + append(line[start:stop]) + start = stop - 1 + else: + append(line[start:stop] + '=') + start = stop + + # handle rest of line, special case if line ends in whitespace + if line and line[-1] in ' \t': + room = start - laststart + if room >= 3: + # It's a whitespace character at end-of-line, and we have room + # for the three-character quoted encoding. + q = quote(line[-1]) + elif room == 2: + # There's room for the whitespace character and a soft break. + q = line[-1] + soft_break + else: + # There's room only for a soft break. The quoted whitespace + # will be the only content on the subsequent line. + q = soft_break + quote(line[-1]) + append(line[start:-1] + q) + else: + append(line[start:]) + + # add back final newline if present + if body[-1] in CRLF: + append('') + + return eol.join(encoded_body) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 6af6278..c787695 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4216,6 +4216,11 @@ class TestQuopri(unittest.TestCase): def test_encode_one_line_eol(self): self._test_encode('hello\n', 'hello\r\n', eol='\r\n') + def test_encode_one_line_eol_after_non_ascii(self): + # issue 20206; see changeset 0cf700464177 for why the encode/decode. + self._test_encode('hello\u03c5\n'.encode('utf-8').decode('latin1'), + 'hello=CF=85\r\n', eol='\r\n') + def test_encode_one_space(self): self._test_encode(' ', '=20') diff --git a/Misc/NEWS b/Misc/NEWS index 196eb23..85571b0 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -43,6 +43,10 @@ Core and Builtins Library ------- +- Issues #20206 and #5803: Fix edge case in email.quoprimime.encode where it + truncated lines ending in a character needing encoding but no newline by + using a more efficient algorithm that doesn't have the bug. + - Issue #19082: Working xmlrpc.server and xmlrpc.client examples. Both in modules and in documentation. Initial patch contributed by Vajrasky Kok. -- cgit v0.12