From 3da240fd01e02a31e516847ba865a9f56d7cfcbc Mon Sep 17 00:00:00 2001 From: R David Murray Date: Wed, 16 Oct 2013 22:48:40 -0400 Subject: #18891: Complete new provisional email API. This adds EmailMessage and, MIMEPart subclasses of Message with new API methods, and a ContentManager class used by the new methods. Also a new policy setting, content_manager. Patch was reviewed by Stephen J. Turnbull and Serhiy Storchaka, and reflects their feedback. I will ideally add some examples of using the new API to the documentation before the final release. --- Lib/email/contentmanager.py | 249 ++++++++++++++++++++++++++++++++++++++++++++ Lib/email/message.py | 216 +++++++++++++++++++++++++++++++++++++- Lib/email/policy.py | 13 +++ Lib/email/utils.py | 10 +- 4 files changed, 480 insertions(+), 8 deletions(-) create mode 100644 Lib/email/contentmanager.py (limited to 'Lib/email') diff --git a/Lib/email/contentmanager.py b/Lib/email/contentmanager.py new file mode 100644 index 0000000000..d3636529b6 --- /dev/null +++ b/Lib/email/contentmanager.py @@ -0,0 +1,249 @@ +import binascii +import email.charset +import email.message +import email.errors +from email import quoprimime + +class ContentManager: + + def __init__(self): + self.get_handlers = {} + self.set_handlers = {} + + def add_get_handler(self, key, handler): + self.get_handlers[key] = handler + + def get_content(self, msg, *args, **kw): + content_type = msg.get_content_type() + if content_type in self.get_handlers: + return self.get_handlers[content_type](msg, *args, **kw) + maintype = msg.get_content_maintype() + if maintype in self.get_handlers: + return self.get_handlers[maintype](msg, *args, **kw) + if '' in self.get_handlers: + return self.get_handlers[''](msg, *args, **kw) + raise KeyError(content_type) + + def add_set_handler(self, typekey, handler): + self.set_handlers[typekey] = handler + + def set_content(self, msg, obj, *args, **kw): + if msg.get_content_maintype() == 'multipart': + # XXX: is this error a good idea or not? We can remove it later, + # but we can't add it later, so do it for now. + raise TypeError("set_content not valid on multipart") + handler = self._find_set_handler(msg, obj) + msg.clear_content() + handler(msg, obj, *args, **kw) + + def _find_set_handler(self, msg, obj): + full_path_for_error = None + for typ in type(obj).__mro__: + if typ in self.set_handlers: + return self.set_handlers[typ] + qname = typ.__qualname__ + modname = getattr(typ, '__module__', '') + full_path = '.'.join((modname, qname)) if modname else qname + if full_path_for_error is None: + full_path_for_error = full_path + if full_path in self.set_handlers: + return self.set_handlers[full_path] + if qname in self.set_handlers: + return self.set_handlers[qname] + name = typ.__name__ + if name in self.set_handlers: + return self.set_handlers[name] + if None in self.set_handlers: + return self.set_handlers[None] + raise KeyError(full_path_for_error) + + +raw_data_manager = ContentManager() + + +def get_text_content(msg, errors='replace'): + content = msg.get_payload(decode=True) + charset = msg.get_param('charset', 'ASCII') + return content.decode(charset, errors=errors) +raw_data_manager.add_get_handler('text', get_text_content) + + +def get_non_text_content(msg): + return msg.get_payload(decode=True) +for maintype in 'audio image video application'.split(): + raw_data_manager.add_get_handler(maintype, get_non_text_content) + + +def get_message_content(msg): + return msg.get_payload(0) +for subtype in 'rfc822 external-body'.split(): + raw_data_manager.add_get_handler('message/'+subtype, get_message_content) + + +def get_and_fixup_unknown_message_content(msg): + # If we don't understand a message subtype, we are supposed to treat it as + # if it were application/octet-stream, per + # tools.ietf.org/html/rfc2046#section-5.2.4. Feedparser doesn't do that, + # so do our best to fix things up. Note that it is *not* appropriate to + # model message/partial content as Message objects, so they are handled + # here as well. (How to reassemble them is out of scope for this comment :) + return bytes(msg.get_payload(0)) +raw_data_manager.add_get_handler('message', + get_and_fixup_unknown_message_content) + + +def _prepare_set(msg, maintype, subtype, headers): + msg['Content-Type'] = '/'.join((maintype, subtype)) + if headers: + if not hasattr(headers[0], 'name'): + mp = msg.policy + headers = [mp.header_factory(*mp.header_source_parse([header])) + for header in headers] + try: + for header in headers: + if header.defects: + raise header.defects[0] + msg[header.name] = header + except email.errors.HeaderDefect as exc: + raise ValueError("Invalid header: {}".format( + header.fold(policy=msg.policy))) from exc + + +def _finalize_set(msg, disposition, filename, cid, params): + if disposition is None and filename is not None: + disposition = 'attachment' + if disposition is not None: + msg['Content-Disposition'] = disposition + if filename is not None: + msg.set_param('filename', + filename, + header='Content-Disposition', + replace=True) + if cid is not None: + msg['Content-ID'] = cid + if params is not None: + for key, value in params.items(): + msg.set_param(key, value) + + +# XXX: This is a cleaned-up version of base64mime.body_encode. It would +# be nice to drop both this and quoprimime.body_encode in favor of +# enhanced binascii routines that accepted a max_line_length parameter. +def _encode_base64(data, max_line_length): + encoded_lines = [] + unencoded_bytes_per_line = max_line_length * 3 // 4 + for i in range(0, len(data), unencoded_bytes_per_line): + thisline = data[i:i+unencoded_bytes_per_line] + encoded_lines.append(binascii.b2a_base64(thisline).decode('ascii')) + return ''.join(encoded_lines) + + +def _encode_text(string, charset, cte, policy): + lines = string.encode(charset).splitlines() + linesep = policy.linesep.encode('ascii') + def embeded_body(lines): return linesep.join(lines) + linesep + def normal_body(lines): return b'\n'.join(lines) + b'\n' + if cte==None: + # Use heuristics to decide on the "best" encoding. + try: + return '7bit', normal_body(lines).decode('ascii') + except UnicodeDecodeError: + pass + if (policy.cte_type == '8bit' and + max(len(x) for x in lines) <= policy.max_line_length): + return '8bit', normal_body(lines).decode('ascii', 'surrogateescape') + sniff = embeded_body(lines[:10]) + sniff_qp = quoprimime.body_encode(sniff.decode('latin-1'), + policy.max_line_length) + sniff_base64 = binascii.b2a_base64(sniff) + # This is a little unfair to qp; it includes lineseps, base64 doesn't. + if len(sniff_qp) > len(sniff_base64): + cte = 'base64' + else: + cte = 'quoted-printable' + if len(lines) <= 10: + return cte, sniff_qp + if cte == '7bit': + data = normal_body(lines).decode('ascii') + elif cte == '8bit': + data = normal_body(lines).decode('ascii', 'surrogateescape') + elif cte == 'quoted-printable': + data = quoprimime.body_encode(normal_body(lines).decode('latin-1'), + policy.max_line_length) + elif cte == 'base64': + data = _encode_base64(embeded_body(lines), policy.max_line_length) + else: + raise ValueError("Unknown content transfer encoding {}".format(cte)) + return cte, data + + +def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None, + disposition=None, filename=None, cid=None, + params=None, headers=None): + _prepare_set(msg, 'text', subtype, headers) + cte, payload = _encode_text(string, charset, cte, msg.policy) + msg.set_payload(payload) + msg.set_param('charset', + email.charset.ALIASES.get(charset, charset), + replace=True) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +raw_data_manager.add_set_handler(str, set_text_content) + + +def set_message_content(msg, message, subtype="rfc822", cte=None, + disposition=None, filename=None, cid=None, + params=None, headers=None): + if subtype == 'partial': + raise ValueError("message/partial is not supported for Message objects") + if subtype == 'rfc822': + if cte not in (None, '7bit', '8bit', 'binary'): + # http://tools.ietf.org/html/rfc2046#section-5.2.1 mandate. + raise ValueError( + "message/rfc822 parts do not support cte={}".format(cte)) + # 8bit will get coerced on serialization if policy.cte_type='7bit'. We + # may end up claiming 8bit when it isn't needed, but the only negative + # result of that should be a gateway that needs to coerce to 7bit + # having to look through the whole embedded message to discover whether + # or not it actually has to do anything. + cte = '8bit' if cte is None else cte + elif subtype == 'external-body': + if cte not in (None, '7bit'): + # http://tools.ietf.org/html/rfc2046#section-5.2.3 mandate. + raise ValueError( + "message/external-body parts do not support cte={}".format(cte)) + cte = '7bit' + elif cte is None: + # http://tools.ietf.org/html/rfc2046#section-5.2.4 says all future + # subtypes should be restricted to 7bit, so assume that. + cte = '7bit' + _prepare_set(msg, 'message', subtype, headers) + msg.set_payload([message]) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +raw_data_manager.add_set_handler(email.message.Message, set_message_content) + + +def set_bytes_content(msg, data, maintype, subtype, cte='base64', + disposition=None, filename=None, cid=None, + params=None, headers=None): + _prepare_set(msg, maintype, subtype, headers) + if cte == 'base64': + data = _encode_base64(data, max_line_length=msg.policy.max_line_length) + elif cte == 'quoted-printable': + # XXX: quoprimime.body_encode won't encode newline characters in data, + # so we can't use it. This means max_line_length is ignored. Another + # bug to fix later. (Note: encoders.quopri is broken on line ends.) + data = binascii.b2a_qp(data, istext=False, header=False, quotetabs=True) + data = data.decode('ascii') + elif cte == '7bit': + # Make sure it really is only ASCII. The early warning here seems + # worth the overhead...if you care write your own content manager :). + data.encode('ascii') + elif cte in ('8bit', 'binary'): + data = data.decode('ascii', 'surrogateescape') + msg.set_payload(data) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +for typ in (bytes, bytearray, memoryview): + raw_data_manager.add_set_handler(typ, set_bytes_content) diff --git a/Lib/email/message.py b/Lib/email/message.py index ebaf1c1430..cb2be21fdf 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -8,8 +8,6 @@ __all__ = ['Message'] import re import uu -import base64 -import binascii from io import BytesIO, StringIO # Intrapackage imports @@ -679,7 +677,7 @@ class Message: return failobj def set_param(self, param, value, header='Content-Type', requote=True, - charset=None, language=''): + charset=None, language='', replace=False): """Set a parameter in the Content-Type header. If the parameter already exists in the header, its value will be @@ -723,8 +721,11 @@ class Message: else: ctype = SEMISPACE.join([ctype, append_param]) if ctype != self.get(header): - del self[header] - self[header] = ctype + if replace: + self.replace_header(header, ctype) + else: + del self[header] + self[header] = ctype def del_param(self, param, header='content-type', requote=True): """Remove the given parameter completely from the Content-Type header. @@ -905,3 +906,208 @@ class Message: # I.e. def walk(self): ... from email.iterators import walk + + +class MIMEPart(Message): + + def __init__(self, policy=None): + if policy is None: + from email.policy import default + policy = default + Message.__init__(self, policy) + + @property + def is_attachment(self): + c_d = self.get('content-disposition') + if c_d is None: + return False + return c_d.lower() == 'attachment' + + def _find_body(self, part, preferencelist): + if part.is_attachment: + return + maintype, subtype = part.get_content_type().split('/') + if maintype == 'text': + if subtype in preferencelist: + yield (preferencelist.index(subtype), part) + return + if maintype != 'multipart': + return + if subtype != 'related': + for subpart in part.iter_parts(): + yield from self._find_body(subpart, preferencelist) + return + if 'related' in preferencelist: + yield (preferencelist.index('related'), part) + candidate = None + start = part.get_param('start') + if start: + for subpart in part.iter_parts(): + if subpart['content-id'] == start: + candidate = subpart + break + if candidate is None: + subparts = part.get_payload() + candidate = subparts[0] if subparts else None + if candidate is not None: + yield from self._find_body(candidate, preferencelist) + + def get_body(self, preferencelist=('related', 'html', 'plain')): + """Return best candidate mime part for display as 'body' of message. + + Do a depth first search, starting with self, looking for the first part + matching each of the items in preferencelist, and return the part + corresponding to the first item that has a match, or None if no items + have a match. If 'related' is not included in preferencelist, consider + the root part of any multipart/related encountered as a candidate + match. Ignore parts with 'Content-Disposition: attachment'. + """ + best_prio = len(preferencelist) + body = None + for prio, part in self._find_body(self, preferencelist): + if prio < best_prio: + best_prio = prio + body = part + if prio == 0: + break + return body + + _body_types = {('text', 'plain'), + ('text', 'html'), + ('multipart', 'related'), + ('multipart', 'alternative')} + def iter_attachments(self): + """Return an iterator over the non-main parts of a multipart. + + Skip the first of each occurrence of text/plain, text/html, + multipart/related, or multipart/alternative in the multipart (unless + they have a 'Content-Disposition: attachment' header) and include all + remaining subparts in the returned iterator. When applied to a + multipart/related, return all parts except the root part. Return an + empty iterator when applied to a multipart/alternative or a + non-multipart. + """ + maintype, subtype = self.get_content_type().split('/') + if maintype != 'multipart' or subtype == 'alternative': + return + parts = self.get_payload() + if maintype == 'multipart' and subtype == 'related': + # For related, we treat everything but the root as an attachment. + # The root may be indicated by 'start'; if there's no start or we + # can't find the named start, treat the first subpart as the root. + start = self.get_param('start') + if start: + found = False + attachments = [] + for part in parts: + if part.get('content-id') == start: + found = True + else: + attachments.append(part) + if found: + yield from attachments + return + parts.pop(0) + yield from parts + return + # Otherwise we more or less invert the remaining logic in get_body. + # This only really works in edge cases (ex: non-text relateds or + # alternatives) if the sending agent sets content-disposition. + seen = [] # Only skip the first example of each candidate type. + for part in parts: + maintype, subtype = part.get_content_type().split('/') + if ((maintype, subtype) in self._body_types and + not part.is_attachment and subtype not in seen): + seen.append(subtype) + continue + yield part + + def iter_parts(self): + """Return an iterator over all immediate subparts of a multipart. + + Return an empty iterator for a non-multipart. + """ + if self.get_content_maintype() == 'multipart': + yield from self.get_payload() + + def get_content(self, *args, content_manager=None, **kw): + if content_manager is None: + content_manager = self.policy.content_manager + return content_manager.get_content(self, *args, **kw) + + def set_content(self, *args, content_manager=None, **kw): + if content_manager is None: + content_manager = self.policy.content_manager + content_manager.set_content(self, *args, **kw) + + def _make_multipart(self, subtype, disallowed_subtypes, boundary): + if self.get_content_maintype() == 'multipart': + existing_subtype = self.get_content_subtype() + disallowed_subtypes = disallowed_subtypes + (subtype,) + if existing_subtype in disallowed_subtypes: + raise ValueError("Cannot convert {} to {}".format( + existing_subtype, subtype)) + keep_headers = [] + part_headers = [] + for name, value in self._headers: + if name.lower().startswith('content-'): + part_headers.append((name, value)) + else: + keep_headers.append((name, value)) + if part_headers: + # There is existing content, move it to the first subpart. + part = type(self)(policy=self.policy) + part._headers = part_headers + part._payload = self._payload + self._payload = [part] + else: + self._payload = [] + self._headers = keep_headers + self['Content-Type'] = 'multipart/' + subtype + if boundary is not None: + self.set_param('boundary', boundary) + + def make_related(self, boundary=None): + self._make_multipart('related', ('alternative', 'mixed'), boundary) + + def make_alternative(self, boundary=None): + self._make_multipart('alternative', ('mixed',), boundary) + + def make_mixed(self, boundary=None): + self._make_multipart('mixed', (), boundary) + + def _add_multipart(self, _subtype, *args, _disp=None, **kw): + if (self.get_content_maintype() != 'multipart' or + self.get_content_subtype() != _subtype): + getattr(self, 'make_' + _subtype)() + part = type(self)(policy=self.policy) + part.set_content(*args, **kw) + if _disp and 'content-disposition' not in part: + part['Content-Disposition'] = _disp + self.attach(part) + + def add_related(self, *args, **kw): + self._add_multipart('related', *args, _disp='inline', **kw) + + def add_alternative(self, *args, **kw): + self._add_multipart('alternative', *args, **kw) + + def add_attachment(self, *args, **kw): + self._add_multipart('mixed', *args, _disp='attachment', **kw) + + def clear(self): + self._headers = [] + self._payload = None + + def clear_content(self): + self._headers = [(n, v) for n, v in self._headers + if not n.lower().startswith('content-')] + self._payload = None + + +class EmailMessage(MIMEPart): + + def set_content(self, *args, **kw): + super().set_content(*args, **kw) + if 'MIME-Version' not in self: + self['MIME-Version'] = '1.0' diff --git a/Lib/email/policy.py b/Lib/email/policy.py index 38e88afe1d..f0b20f4b19 100644 --- a/Lib/email/policy.py +++ b/Lib/email/policy.py @@ -5,6 +5,7 @@ code that adds all the email6 features. from email._policybase import Policy, Compat32, compat32, _extend_docstrings from email.utils import _has_surrogates from email.headerregistry import HeaderRegistry as HeaderRegistry +from email.contentmanager import raw_data_manager __all__ = [ 'Compat32', @@ -58,10 +59,22 @@ class EmailPolicy(Policy): special treatment, while all other fields are treated as unstructured. This list will be completed before the extension is marked stable.) + + content_manager -- an object with at least two methods: get_content + and set_content. When the get_content or + set_content method of a Message object is called, + it calls the corresponding method of this object, + passing it the message object as its first argument, + and any arguments or keywords that were passed to + it as additional arguments. The default + content_manager is + :data:`~email.contentmanager.raw_data_manager`. + """ refold_source = 'long' header_factory = HeaderRegistry() + content_manager = raw_data_manager def __init__(self, **kw): # Ensure that each new instance gets a unique header factory diff --git a/Lib/email/utils.py b/Lib/email/utils.py index b3b42bb379..25b0d561f4 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -68,9 +68,13 @@ def _has_surrogates(s): # How to deal with a string containing bytes before handing it to the # application through the 'normal' interface. def _sanitize(string): - # Turn any escaped bytes into unicode 'unknown' char. - original_bytes = string.encode('ascii', 'surrogateescape') - return original_bytes.decode('ascii', 'replace') + # Turn any escaped bytes into unicode 'unknown' char. If the escaped + # bytes happen to be utf-8 they will instead get decoded, even if they + # were invalid in the charset the source was supposed to be in. This + # seems like it is not a bad thing; a defect was still registered. + original_bytes = string.encode('utf-8', 'surrogateescape') + return original_bytes.decode('utf-8', 'replace') + # Helpers -- cgit v1.2.1