diff options
author | Abhilash Raj <maxking@users.noreply.github.com> | 2019-06-04 17:41:34 (GMT) |
---|---|---|
committer | Barry Warsaw <barry@python.org> | 2019-06-04 17:41:34 (GMT) |
commit | 46d88a113142b26c01c95c93846a89318ba87ffc (patch) | |
tree | 756e002f21f3fa01483b309ddf85b0ed37e2e19d /Lib/email | |
parent | bc6469f79ca13217b784fb47da7ec83484a3debe (diff) | |
download | cpython-46d88a113142b26c01c95c93846a89318ba87ffc.zip cpython-46d88a113142b26c01c95c93846a89318ba87ffc.tar.gz cpython-46d88a113142b26c01c95c93846a89318ba87ffc.tar.bz2 |
bpo-35805: Add parser for Message-ID email header. (GH-13397)
* bpo-35805: Add parser for Message-ID header.
This parser is based on the definition of identification fields in RFC 5322,
Section 3.6.4.
It should also prevent the Message-ID header from being folded using RFC 2047
encoded words, and hence fix bpo-35805.
* Prevent folding of non-ascii message-id headers.
* Add fold method to MsgID token to prevent folding.
Diffstat (limited to 'Lib/email')
-rw-r--r-- | Lib/email/_header_value_parser.py | 137 |
-rw-r--r-- | Lib/email/headerregistry.py | 13 |
2 files changed, 135 insertions, 15 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 14cc00c..34969ab 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -179,37 +179,30 @@ class WhiteSpaceTokenList(TokenList): class UnstructuredTokenList(TokenList): - token_type = 'unstructured' class Phrase(TokenList): - token_type = 'phrase' class Word(TokenList): - token_type = 'word' class CFWSList(WhiteSpaceTokenList): - token_type = 'cfws' class Atom(TokenList): - token_type = 'atom' class Token(TokenList): - token_type = 'token' encode_as_ew = False class EncodedWord(TokenList): - token_type = 'encoded-word' cte = None charset = None @@ -496,16 +489,19 @@ class Domain(TokenList): class DotAtom(TokenList): - token_type = 'dot-atom' class DotAtomText(TokenList): - token_type = 'dot-atom-text' as_ew_allowed = True +class NoFoldLiteral(TokenList): + token_type = 'no-fold-literal' + as_ew_allowed = False + + class AddrSpec(TokenList): token_type = 'addr-spec' @@ -809,7 +805,6 @@ class ParameterizedHeaderValue(TokenList): class ContentType(ParameterizedHeaderValue): - token_type = 'content-type' as_ew_allowed = False maintype = 'text' @@ -817,27 +812,35 @@ class ContentType(ParameterizedHeaderValue): class ContentDisposition(ParameterizedHeaderValue): - token_type = 'content-disposition' as_ew_allowed = False content_disposition = None class ContentTransferEncoding(TokenList): - token_type = 'content-transfer-encoding' as_ew_allowed = False cte = '7bit' class HeaderLabel(TokenList): - token_type = 'header-label' as_ew_allowed = False -class Header(TokenList): +class MsgID(TokenList): + token_type = 'msg-id' + as_ew_allowed = False + + def fold(self, policy): + # message-id tokens may not be folded. 
+ return str(self) + policy.linesep + +class MessageID(MsgID): + token_type = 'message-id' + +class Header(TokenList): token_type = 'header' @@ -1583,7 +1586,7 @@ def get_addr_spec(value): addr_spec.append(token) if not value or value[0] != '@': addr_spec.defects.append(errors.InvalidHeaderDefect( - "add-spec local part with no domain")) + "addr-spec local part with no domain")) return addr_spec, value addr_spec.append(ValueTerminal('@', 'address-at-symbol')) token, value = get_domain(value[1:]) @@ -1968,6 +1971,110 @@ def get_address_list(value): value = value[1:] return address_list, value + +def get_no_fold_literal(value): + """ no-fold-literal = "[" *dtext "]" + """ + no_fold_literal = NoFoldLiteral() + if not value: + raise errors.HeaderParseError( + "expected no-fold-literal but found '{}'".format(value)) + if value[0] != '[': + raise errors.HeaderParseError( + "expected '[' at the start of no-fold-literal " + "but found '{}'".format(value)) + no_fold_literal.append(ValueTerminal('[', 'no-fold-literal-start')) + value = value[1:] + token, value = get_dtext(value) + no_fold_literal.append(token) + if not value or value[0] != ']': + raise errors.HeaderParseError( + "expected ']' at the end of no-fold-literal " + "but found '{}'".format(value)) + no_fold_literal.append(ValueTerminal(']', 'no-fold-literal-end')) + return no_fold_literal, value[1:] + +def get_msg_id(value): + """msg-id = [CFWS] "<" id-left '@' id-right ">" [CFWS] + id-left = dot-atom-text / obs-id-left + id-right = dot-atom-text / no-fold-literal / obs-id-right + no-fold-literal = "[" *dtext "]" + """ + msg_id = MsgID() + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + msg_id.append(token) + if not value or value[0] != '<': + raise errors.HeaderParseError( + "expected msg-id but found '{}'".format(value)) + msg_id.append(ValueTerminal('<', 'msg-id-start')) + value = value[1:] + # Parse id-left. 
+ try: + token, value = get_dot_atom_text(value) + except errors.HeaderParseError: + try: + # obs-id-left is same as local-part of add-spec. + token, value = get_obs_local_part(value) + msg_id.defects.append(errors.ObsoleteHeaderDefect( + "obsolete id-left in msg-id")) + except errors.HeaderParseError: + raise errors.HeaderParseError( + "expected dot-atom-text or obs-id-left" + " but found '{}'".format(value)) + msg_id.append(token) + if not value or value[0] != '@': + msg_id.defects.append(errors.InvalidHeaderDefect( + "msg-id with no id-right")) + # Even though there is no id-right, if the local part + # ends with `>` let's just parse it too and return + # along with the defect. + if value and value[0] == '>': + msg_id.append(ValueTerminal('>', 'msg-id-end')) + value = value[1:] + return msg_id, value + msg_id.append(ValueTerminal('@', 'address-at-symbol')) + value = value[1:] + # Parse id-right. + try: + token, value = get_dot_atom_text(value) + except errors.HeaderParseError: + try: + token, value = get_no_fold_literal(value) + except errors.HeaderParseError as e: + try: + token, value = get_domain(value) + msg_id.defects.append(errors.ObsoleteHeaderDefect( + "obsolete id-right in msg-id")) + except errors.HeaderParseError: + raise errors.HeaderParseError( + "expected dot-atom-text, no-fold-literal or obs-id-right" + " but found '{}'".format(value)) + msg_id.append(token) + if value and value[0] == '>': + value = value[1:] + else: + msg_id.defects.append(errors.InvalidHeaderDefect( + "missing trailing '>' on msg-id")) + msg_id.append(ValueTerminal('>', 'msg-id-end')) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + msg_id.append(token) + return msg_id, value + + +def parse_message_id(value): + """message-id = "Message-ID:" msg-id CRLF + """ + message_id = MessageID() + try: + token, value = get_msg_id(value) + except errors.HeaderParseError: + message_id.defects.append(errors.InvalidHeaderDefect( + "Expected msg-id but found 
{!r}".format(value))) + message_id.append(token) + return message_id + # # XXX: As I begin to add additional header parsers, I'm realizing we probably # have two level of parser routines: the get_XXX methods that get a token in diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py index 0065204..452c6ad 100644 --- a/Lib/email/headerregistry.py +++ b/Lib/email/headerregistry.py @@ -520,6 +520,18 @@ class ContentTransferEncodingHeader: return self._cte +class MessageIDHeader: + + max_count = 1 + value_parser = staticmethod(parser.parse_message_id) + + @classmethod + def parse(cls, value, kwds): + kwds['parse_tree'] = parse_tree = cls.value_parser(value) + kwds['decoded'] = str(parse_tree) + kwds['defects'].extend(parse_tree.all_defects) + + # The header factory # _default_header_map = { @@ -542,6 +554,7 @@ _default_header_map = { 'content-type': ContentTypeHeader, 'content-disposition': ContentDispositionHeader, 'content-transfer-encoding': ContentTransferEncodingHeader, + 'message-id': MessageIDHeader, } class HeaderRegistry: |