# Python source code of message

import datetime as dt
import email
import email.header
import email.policy
import encodings
import hashlib
import json
import mimetypes
import re
import uuid
from email.message import MIMEPart
from email.utils import formatdate, getaddresses, parsedate_to_datetime

import chardet

from . import conf, html, log

# Charset names met in real mail, mapped to the codec that should be used
# to decode them.
aliases = {}
# Seems Google used gb2312 in some subjects, so there is another symbol
# instead of dash, because of next bug:
# https://bugs.python.org/issue24036
aliases['gb2312'] = 'gbk'
# @naspeh got such encoding in his own mailbox
aliases['cp-1251'] = 'cp1251'
# Register the same mapping in the interpreter-wide codec alias table.
encodings.aliases.aliases.update(aliases)


class BinaryPolicy(email.policy.Compat32):
    """
    Compat32-based policy which writes headers out as they are.

    Dovecot understands UTF-8 encoding, so let's save parsed messages
    without sanitizing.
    """

    content_manager = email.policy.raw_data_manager

    def _sanitize_header(self, name, value):
        # Header values are handed back untouched.
        return value

    def _fold(self, name, value, sanitize=None):
        # Always a single "Name: value" line; "sanitize" is accepted for
        # signature compatibility and ignored.
        return f'{name!s}: {value!s}{self.linesep!s}'

    def fold_binary(self, name, value):
        # Same line as "_fold" produces, encoded as UTF-8; surrogate
        # escapes are turned back into their original bytes.
        line = self._fold(name, value)
        return line.encode('utf8', 'surrogateescape')


# Policy instance shared by all messages created in this module.
policy = BinaryPolicy()


def new():
    """Return an empty ``MIMEPart`` which uses the module-level policy."""
    return MIMEPart(policy=policy)


def binary(txt, mimetype='text/plain'):
    """
    Build a part of the given mimetype which holds ``txt`` as UTF-8
    and is marked with the "binary" transfer encoding.
    """
    part = new()
    part.set_type(mimetype)
    # NOTE(review): the transfer-encoding header is set before the payload;
    # presumably so "set_payload" doesn't pick an encoding on its own —
    # keep this order.
    part['Content-Transfer-Encoding'] = 'binary'
    part.set_payload(txt, 'utf-8')
    return part


def parse_mime(orig, uid):
    """
    Walk through a parsed message and decode its bodies, attachments
    and main headers.

    ``orig`` is a message object from the ``email`` package, ``uid`` is used
    in log records and in attachment URLs.

    Returns a tuple ``(htm, txt, files, headers, errors)``:
    - ``htm`` and ``txt`` are decoded bodies (empty strings when missing);
    - ``files`` is a list of dicts describing attachments;
    - ``headers`` maps present address headers and "Subject" to decoded
      text ("Subject" is ``None`` when the header is missing);
    - ``errors`` is a list of decoding problems met on the way.
    """
    def error(e, label):
        return 'error on %r: [%s] %s' % (label, e.__class__.__name__, e)

    def try_decode(raw, charsets, label=''):
        # Tries charsets one by one. Returns (text, charset, error): text is
        # None when nothing worked, charset is the one which worked or the
        # one to use for a lossy fallback.
        txt = err = None
        charsets = [aliases.get(c, c) for c in charsets if c]
        charset = charsets[0]
        for c in charsets:
            try:
                txt = raw.decode(c)
                charset = c
                break
            except UnicodeDecodeError as e:
                err = error(e, label)
            except LookupError as e:
                err = error(e, label)
                if len(charsets) == 1:
                    # unknown codec, so the fallback has to be utf8
                    charset = 'utf8'
        return txt, charset, err

    def decode_bytes(raw, charset, label):
        if not raw:
            return ''

        charset = charset and charset.lower()
        if charset == 'unknown-8bit' or not charset:
            # first trying to decode with charsets detected in message
            txt = try_decode(raw, charsets)[0] if charsets else None
            if txt:
                return txt

            # trying chardet
            detected = chardet.detect(raw)
            charset = detected['encoding']
            if charset:
                charset = charset.lower()
            else:
                charset = charsets[0] if charsets else 'utf8'

        txt, charset, err = try_decode(raw, [charset], label)
        if txt is None:
            # only a real failure is an error; text which decodes into
            # an empty string is fine
            log.info('UID=%s %s', uid, err)
            errors.append(err)
            txt = raw.decode(charset, 'replace')
        elif charset not in charsets:
            # if decoded without errors add to potential charsets list
            charsets.append(charset)
        return txt

    def decode_header(raw, label):
        if not raw:
            return None

        parts = []
        for chunk, charset in email.header.decode_header(raw):
            if isinstance(chunk, str):
                txt = chunk
            else:
                txt = decode_bytes(chunk, charset, label)
            parts.append(txt)
        header = ''.join(parts)
        # collapse folding and any other whitespace runs
        header = re.sub(r'\s+', ' ', header)
        return header

    def decode_addresses(raw, label):
        if not raw:
            return None

        decoded = False
        if not isinstance(raw, str):
            # not a plain string: decode the whole value first
            decoded = True
            raw = decode_header(raw, label)
        parts = []
        for name, addr in getaddresses([raw]):
            if not decoded:
                name, addr = [decode_header(i, label) for i in (name, addr)]
            parts.append(('"%s" <%s>' % (name, addr)) if name else addr)
        return ', '.join(p for p in parts if p)

    def attachment(part, content, path):
        ctype = part.get_content_type()
        label = '%s(%s)' % (ctype, path)
        # "content" is None when a multipart container has a filename
        item = {'size': len(content or b''), 'path': path}
        filename = part.get_filename()
        if filename:
            filename = decode_header(filename, label) or ''
            filename = re.sub(r'[^\w.-]', '-', filename)
        else:
            # "guess_extension" returns an extension with the leading dot,
            # so the fallback needs the dot as well
            ext = mimetypes.guess_extension(ctype) or '.txt'
            filename = 'unknown-%s%s' % (path, ext)
        end = '/'.join(i for i in (path, filename) if i)
        item['url'] = '/raw/%s/%s' % (uid, end)
        item['filename'] = filename

        content_id = part.get('Content-ID')
        if content_id:
            item['content-id'] = content_id
        if ctype.startswith('image/'):
            item['image'] = True
        return item

    def parse_part(part, path=''):
        # Returns (htm, txt, files) for a part; "path" is a dotted index of
        # the part inside of the message ("" for the root, "1.2" for nested).
        htm, txt, files = '', '', []
        ctype = part.get_content_type()
        if ctype.startswith('message/'):
            content = part.as_bytes()
            files = [attachment(part, content, path)]
            return htm, txt, files
        elif part.get_filename():
            content = part.get_payload(decode=True)
            files = [attachment(part, content, path)]
            return htm, txt, files
        elif part.is_multipart():
            parts = []
            for idx, m in enumerate(part.get_payload(), 1):
                path_ = '%s.%s' % (path, idx) if path else str(idx)
                htm_, txt_, files_ = parse_part(m, path_)
                parts.append((htm_, txt_))
                files += files_

            htm_parts = [h for h, t in parts if h]
            txt_parts = [t for h, t in parts if t]
            if part.get_content_subtype() == 'alternative':
                # first available variant of each kind
                htm = htm_parts[0] if htm_parts else ''
                txt = txt_parts[0] if txt_parts else ''
            elif htm_parts:
                # at least one html piece: text pieces are converted to
                # html and all of them are joined into one html body
                htm = '<hr>'.join(
                    h if h else html.from_text(t)
                    for h, t in parts if h or t
                )
                txt = ''
            elif txt_parts:
                txt = '\n\n'.join(txt_parts)
            else:
                htm = txt = ''
            return htm, txt, files

        if ctype.startswith('text/'):
            content = part.get_payload(decode=True)
            charset = part.get_content_charset()
            label = '%s(%s)' % (ctype, path)
            content = decode_bytes(content, charset, label)
            content = content.rstrip()
            if ctype == 'text/html':
                htm = content
            else:
                txt = content
        else:
            content = part.get_payload(decode=True)
            files = [attachment(part, content, path)]
        return htm, txt, files

    # unique charsets declared in the message, in order of appearance
    # (stable order keeps decoding deterministic between runs)
    charsets = list(dict.fromkeys(
        c.lower() for c in orig.get_charsets() if c
    ))
    errors, headers = [], {}
    htm, txt, files = parse_part(orig)

    for n in ('From', 'Sender', 'Reply-To', 'To', 'CC', 'BCC',):
        v = decode_addresses(orig[n], n)
        if v is None:
            continue
        headers[n] = v

    headers['Subject'] = decode_header(orig['subject'], 'Subject')
    return htm, txt, files, headers, errors


def normalize_msgid(mid):
    """Return message id without surrounding whitespace, in lower case."""
    trimmed = mid.strip()
    return trimmed.lower()


def preview(htm, files):
    """
    Return a short one-line preview of a message.

    It's made from the html body (via ``html.to_line`` with 200 as limit);
    when that line is shorter than 200 symbols and there are files, their
    names are added at the end as ``[name1, name2]``.
    """
    line = htm and html.to_line(htm, 200)
    if len(line) < 200 and files:
        separator = ' ' if line else ''
        names = ', '.join(item['filename'] for item in files)
        line = line + separator + '[%s]' % names
    return line


def parsed(raw, uid, time, flags):
    """
    Build a normalized copy of a raw message.

    ``raw`` is bytes of the original message, ``uid`` is its UID (saved as
    "X-UID" header and as ``origin_uid`` in meta), ``time`` is the arrival
    time as a string in ``%d-%b-%Y %H:%M:%S %z`` format (may be wrapped in
    double quotes), ``flags`` are flags of the original message (only
    ``\\Draft`` is checked).

    Returns ``(msg, flags)``: ``msg`` is a "multipart/mixed" message with
    JSON meta as the first part and "multipart/alternative" body as the
    second one; ``flags`` is ``['#err']`` when there were parsing errors,
    otherwise an empty list.
    """
    # "email.message_from_bytes" uses "email.policy.compat32" policy
    # and it's by intention, because new policies don't work well
    # with real emails which have no encodings, badly formatted addresses,
    # etc.
    orig = email.message_from_bytes(raw)
    htm, txt, files, headers, errors = parse_mime(orig, uid)
    meta = {'origin_uid': uid, 'files': [], 'errors': errors}
    if htm:
        # map of "Content-ID" to url for parts which have this header
        embeds = {
            f['content-id']: f['url']
            for f in files if 'content-id' in f
        }
        htm, extra_meta = html.clean(htm, embeds)
        meta.update(extra_meta)
    elif txt:
        htm = html.from_text(txt)

    meta['preview'] = preview(htm, files)
    meta['files'] = files

    # second item tells if only the first address should be saved
    fields = (
        ('From', 1), ('Sender', 1),
        ('Reply-To', 0), ('To', 0), ('CC', 0), ('BCC', 0)
    )
    for n, one in fields:
        v = headers.get(n)
        if not v:
            continue
        v = addresses(v)
        meta[n.lower()] = v[0] if one else v

    subj = headers['Subject']
    meta['subject'] = str(subj).strip() if subj else ''

    refs = orig['references']
    refs = [i.strip().lower() for i in refs.split()] if refs else []
    parent = refs[-1] if refs else None
    in_reply_to = orig['in-reply-to'] and normalize_msgid(orig['in-reply-to'])
    if in_reply_to:
        # "In-Reply-To" wins over the last reference
        parent = in_reply_to
        if not refs:
            refs = [in_reply_to]
    meta['parent'] = parent

    mid = orig['message-id']
    if mid is None:
        log.info('UID=%s has no "Message-ID" header', uid)
        mid = '<mailur@noid>'
    else:
        mid = normalize_msgid(mid)
    meta['msgid'] = mid

    arrived = dt.datetime.strptime(time.strip('"'), '%d-%b-%Y %H:%M:%S %z')
    meta['arrived'] = int(arrived.timestamp())

    date = orig['date']
    try:
        date = date and int(parsedate_to_datetime(date).timestamp())
    except Exception as e:
        meta['errors'].append('error on date: val=%r err=%r' % (date, e))
        log.error('UID=%s can\'t parse date: val=%r err=%r', uid, date, e)
        date = None
    meta['date'] = date or meta['arrived']

    # Original "Date" is kept only if it has been parsed fine. Missing or
    # broken one is replaced with the arrival time (the same fallback as
    # for meta), so the header is never empty or invalid.
    if date:
        date_header = orig['Date']
    else:
        date_header = formatdate(meta['arrived'], usegmt=True)

    msg = new()
    msg.add_header('X-UID', '<%s>' % uid)
    msg.add_header('Message-ID', mid)
    msg.add_header('Subject', meta['subject'])
    msg.add_header('Date', date_header)

    # decoded address headers; the ones added above are not duplicated
    for n, v in headers.items():
        if n in msg:
            continue
        msg.add_header(n, v)

    is_draft = '\\Draft' in flags
    if is_draft:
        draft_id = orig['X-Draft-ID'] or mid
        msg.add_header('X-Draft-ID', draft_id)
        meta['draft_id'] = draft_id
        # text of a draft is taken in its original form
        txt = parse_draft(orig)[0]
    elif orig['X-Draft-ID']:
        msg.add_header('X-Draft-ID', orig['X-Draft-ID'])

    thrid = None
    if not is_draft:
        # thread id made of hash of sorted addresses and subject
        addrs = [msg['from'] or msg['sender'], msg['to']]
        addrs = [a for a in addrs if a]
        addrs = ','.join(sorted(
            '"%s" <%s>' % (n, a) if n else a
            for n, a in getaddresses(addrs)
        ))
        addrs_n_subj = ' '.join(i for i in (addrs, subj) if i)
        thrid = hashlib.md5(addrs_n_subj.encode()).hexdigest()
        thrid = '<%s@mailur.link>' % thrid

    thrid = ' '.join(i for i in (thrid, orig['X-Thread-ID']) if i)
    if thrid:
        meta['thrid'] = thrid
        msg.add_header('X-Thread-ID', thrid)
        refs.insert(0, thrid)

    if refs:
        msg.add_header('In-Reply-To', refs[-1])
        msg.add_header('References', ' '.join(refs))

    msg.make_mixed()
    meta_txt = json.dumps(meta, sort_keys=True, ensure_ascii=False, indent=2)
    msg.attach(binary(meta_txt, 'application/json'))
    body = new()
    body.make_alternative()
    body.attach(binary(htm, 'text/html'))
    if txt:
        body.attach(binary(txt))
    msg.attach(body)

    # flags for the parsed message (not related to the passed ones)
    flags = []
    if meta['errors']:
        flags.append('#err')
    return msg, flags


def sending(msg, linesep='\r\n', maxlinelen=70):
    """
    Prepare a message for sending.

    "Subject", "From" and "To" headers are encoded here by hand and are
    removed from ``msg`` (so the passed message is modified), the rest of
    the message is serialized as is.

    Returns a list ``[from_addrs, to_addrs, raw]`` where the first two items
    are lists of bare addresses and ``raw`` is bytes of the whole message.

    Raises ``ValueError`` when "From" or "To" header is missing or empty.
    """
    def _fold(v, name=None):
        # ASCII value is left as is, anything else is encoded
        # as UTF-8 encoded-words
        try:
            v.encode('ascii')
        except UnicodeEncodeError:
            v = email.header.Header(v, charset='utf-8', header_name=name)
            v = v.encode(maxlinelen=maxlinelen, linesep=linesep)
        return v

    def fold(name, value):
        return '%s: %s%s' % (name, _fold(value, name), linesep)

    def fold_addrs(name, value):
        addrs = email.utils.getaddresses([value])
        parts = []
        # length of the current line (separators aren't counted)
        length = 0
        for n, a in addrs:
            part = '%s <%s>' % (_fold(n), a)
            length += len(part)
            if length > maxlinelen:
                # the line gets too long with this address,
                # so it goes to a continuation line
                part = '%s %s' % (linesep, part)
                length = len(part)
            parts.append(part)
        addrs = ','.join(parts)
        return '%s: %s%s' % (name, addrs, linesep)

    params = [
        [a for n, a in email.utils.getaddresses([msg[name]])]
        for name in ('From', 'To') if msg[name]
    ]
    if len(params) < 2:
        raise ValueError('"From" and "To" shouldn\'t be empty')

    # These new email policies work pretty strange,
    # so this machinery is to encode "Subject", "From" and "To" headers
    # and keep mime body as is
    headers = []
    for n, f in (('Subject', fold), ('From', fold_addrs), ('To', fold_addrs)):
        value = msg[n]
        if value is None:
            # only "Subject" can be missing at this point, nothing to encode
            continue
        headers.append(f(n, value))
        del msg[n]
    msg = b''.join([''.join(headers).encode(), msg.as_bytes()])
    params.append(msg)
    return params


def parse_draft(msg):
    """
    Take text and related parts out of a draft message.

    Supported layouts are "text/plain", "multipart/alternative" (the first
    variant is used as text) and "multipart/mixed" or "multipart/related"
    (text is taken from the first part, the rest are related parts).

    Returns ``(txt, parts)``; bytes of text are decoded with the default
    codec of ``bytes.decode``.
    Raises ``ValueError`` for any other content type.
    """
    containers = ('multipart/mixed', 'multipart/related')

    def extract_txt(node):
        kind = node.get_content_type()
        if kind == 'text/plain':
            return node.get_payload(decode=True), []
        if kind == 'multipart/alternative':
            variants = node.get_payload()
            body = variants[0].get_payload(decode=True) if variants else ''
            return body, []
        if kind not in containers:
            raise ValueError('Wrong content-type: %s' % kind)

        children = node.get_payload()
        body, _ = extract_txt(children[0])
        rest = children[1:]
        # a single nested container is replaced with its own parts
        if len(rest) == 1 and rest[0].get_content_type() in containers:
            rest = rest[0].get_payload()
        return body, rest

    body, related = extract_txt(msg)
    if isinstance(body, bytes):
        body = body.decode()
    return body, related


def new_draft(draft, related, msgid=None):
    """
    Build a draft message from the ``draft`` mapping.

    Text from ``draft['txt']`` becomes a "multipart/alternative" body with
    plain and html (made by ``html.markdown``) variants. When ``related`` is
    given the body and ``related`` are put into a "multipart/mixed" message.
    ``draft['draft_id']`` is required; it's also used as "Message-ID" unless
    ``msgid`` is passed.
    """
    plain = draft.get('txt', '')
    body = new()
    body.make_alternative()
    body.attach(binary(plain))
    body.attach(binary(html.markdown(plain), 'text/html'))

    if related:
        msg = new()
        msg.make_mixed()
        for part in (body, related):
            msg.attach(part)
    else:
        msg = body

    msg.add_header('X-Draft-ID', draft['draft_id'])
    msg.add_header('Message-ID', msgid or draft['draft_id'])
    msg.add_header('Date', formatdate(usegmt=True))
    # the rest of headers are copied from lower-cased keys when not empty
    for name in ('From', 'To', 'CC', 'Subject', 'In-Reply-To', 'References'):
        value = draft.get(name.lower())
        if value:
            msg.add_header(name, value)
    return msg


def gen_msgid():
    """Return a new message id: random hex plus the configured domain."""
    unique = uuid.uuid4().hex
    domain = conf['DOMAIN']
    return '<{!s}@{!s}>'.format(unique, domain)


def gen_draftid():
    """Return a new draft id: eight random hex symbols in angle brackets."""
    token = uuid.uuid4().hex[:8]
    return '<' + token + '>'


def address_name(a):
    """
    Return a display name for the ``(name, addr)`` pair.

    It's the name when it isn't empty, otherwise the part of the address
    before the first "@" (the whole address when there is no "@").
    """
    name, addr = a[0], a[1]
    if name:
        return name
    return addr.partition('@')[0]


def addresses(txt):
    """
    Parse a string with addresses into a list of dicts.

    Each dict has ``addr`` (lower-cased address), ``name`` (see
    ``address_name``), ``title`` (``"name" <addr>`` or just the address
    when there is no name) and ``hash`` (md5 hex digest of the stripped
    lower-cased address).
    """
    result = []
    for pair in getaddresses([txt]):
        name, addr = pair
        digest = hashlib.md5(addr.strip().lower().encode()).hexdigest()
        result.append({
            'addr': addr.lower(),
            'name': address_name(pair),
            'title': '"{}" <{}>'.format(name, addr) if name else addr,
            'hash': digest,
        })
    return result