python source code of parser

"""
zmail.parser
~~~~~~~~~~~~
This module provides functions to handle MIME objects.
"""
import datetime
import logging
import re
import warnings
from base64 import b64decode
from datetime import timedelta, timezone, tzinfo
from email.errors import HeaderParseError
from email.header import decode_header
from quopri import decodestring
from typing import Iterable, List, Optional
from urllib.parse import unquote

from .exceptions import ParseError
from .structures import CaseInsensitiveDict

# Content-Type markers compared against the parsed (main, sub) type.
TYPE_MULTIPART = 'multipart'
TYPE_TEXT_PLAIN = ('text', 'plain')
TYPE_TEXT_HTML = ('text', 'html')

# Date header layouts: with a leading "weekday," field and without it.
DATE_PATTERN_1 = re.compile(r'(\w+),\s+([0-9]+)\s+(\w+)\s+([0-9]+)\s+([0-9]+):([0-9]+):([0-9]+)\s+(.+)')
DATE_PATTERN_2 = re.compile(r'([0-9]+)\s+([\w]+)\s+([0-9]+)\s+([0-9]+):([0-9]+):([0-9]+)\s+(.+)')

# Sign followed by up to four digits; each digit is weighted (in minutes)
# by the entry at the same position in TIMEZONE_MINUTE_OFFSET.
TIMEZONE_PATTERN = re.compile(r"([+\-])([0-9])?([0-9])?([0-9])?([0-9])?")
TIMEZONE_MINUTE_OFFSET = (600, 60, 10, 1)

# Three quote-separated fields, unpacked by the caller as
# (encoding, language tag, percent-encoded name).
FILENAME_PATTERN = re.compile(r"([^']+)'([^']*)'(.+)")

# Month abbreviation -> month number (1-12).
MONTH_TO_INT = CaseInsensitiveDict({
    month_name: month_number
    for month_number, month_name in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start=1,
    )
})

# Characters trimmed from both ends of a header parameter key/value.
HEADER_VALUE_STRIP = '\r\n "'
logger = logging.getLogger('zmail')


def recursive_decode(_bytes: bytes, encodings: Iterable[str]) -> Optional[str]:
    """Decode bytes with the first encoding that works.

    :param _bytes: raw bytes to decode.
    :param encodings: candidate codec names, tried in order.
    :return: the decoded text, or None when no candidate succeeds
        (including when ``encodings`` is empty).
    """
    for encoding in encodings:
        try:
            return _bytes.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: the bytes are not valid in this codec.
            # LookupError: the codec name is unknown to Python; charset
            # names come straight from the mail, so they cannot be trusted.
            continue
    return None


def remove_line_feed_and_whitespace(_value: str):
    """Trim a header parameter key or value.

    First strips the characters in HEADER_VALUE_STRIP from both ends, then
    removes any leading/trailing runs of the four-character escaped text
    backslash-r-backslash-n (the literal characters, not a real CRLF).
    """
    escaped_crlf = r'\r\n'
    width = len(escaped_crlf)

    trimmed = _value.strip(HEADER_VALUE_STRIP)
    while trimmed.startswith(escaped_crlf):
        trimmed = trimmed[width:]
    while trimmed.endswith(escaped_crlf):
        trimmed = trimmed[:-width]
    return trimmed


def _decode_plain_chunk(chunk: bytes) -> str:
    """Turn a charset-less chunk returned by decode_header back into text.

    When a header mixes encoded-words with plain text, decode_header hands
    the plain text back as bytes produced with 'raw-unicode-escape', so that
    codec (not UTF-8) is the one that restores the original characters.
    """
    try:
        return chunk.decode('raw-unicode-escape')
    except UnicodeDecodeError:
        # A literal backslash-u that is not a real escape (e.g. "C:\users").
        # latin-1 maps every byte and cannot fail.
        return chunk.decode('latin-1')


def parse_header_value(bvalue, encodings) -> Optional[str]:
    """Parse mail-specified-header to real value.

    The raw bytes are decoded with the first matching entry of ``encodings``
    and any RFC 2047 encoded-words in the result are expanded.

    :param bvalue: header value as bytes.
    :param encodings: codec names tried in order for the raw bytes.
    :return: the decoded text, or None when ``bvalue`` cannot be decoded
        with any of ``encodings``.
    """
    value = recursive_decode(bvalue, encodings)
    if value is None:
        return None

    # Decode header without converting charset.
    try:
        chunks = decode_header(value)
    except HeaderParseError:
        # Malformed base64 inside an encoded-word: keep the undecoded text
        # instead of failing the whole mail.
        return value

    decoded = []
    for chunk, charset in chunks:
        if isinstance(chunk, str):
            # No encoded-word in the header: decode_header returns it as is.
            decoded.append(chunk)
        elif charset is None:
            decoded.append(_decode_plain_chunk(chunk))
        else:
            try:
                decoded.append(chunk.decode(charset))
            except UnicodeDecodeError:
                # Bytes do not match the declared charset: keep what can be
                # read rather than truncating the header at this point.
                decoded.append(chunk.decode(charset, errors='replace'))
            except LookupError:
                # Charset name unknown to Python.
                decoded.append(chunk.decode('utf-8', errors='replace'))
    return ''.join(decoded)


def _fmt_date_tz(tz: str) -> Optional[tzinfo]:
    """Convert the zone field of a Date header to a tzinfo.

    Numeric offsets such as ``+0800`` or ``-0500`` are understood (text after
    the digits, e.g. ``(CST)``, is ignored), as are the names ``UT``, ``GMT``
    and ``UTC``.

    :param tz: zone text taken from the Date header.
    :return: the matching tzinfo, or None when the zone cannot be interpreted.
    """
    match = TIMEZONE_PATTERN.match(tz)
    if match is None:
        # Not a signed numeric offset.
        if tz.strip().upper() in ('UT', 'GMT', 'UTC'):
            return timezone.utc
        return None

    sign, *digits = match.groups()
    # Digits are optional in the pattern; missing ones contribute nothing.
    minutes = sum(
        weight * int(digit)
        for weight, digit in zip(TIMEZONE_MINUTE_OFFSET, digits)
        if digit is not None
    )
    offset = timedelta(minutes=minutes)
    if sign == '-':
        offset = -offset
    try:
        return timezone(offset)
    except ValueError:
        # timezone() rejects offsets of 24 hours or more (e.g. "+9999").
        return None


def fmt_date(date_as_string: str) -> Optional[datetime.datetime]:
    """Convert mail header Date to datetime object.

    Two layouts are accepted: ``Mon, 1 Jan 2018 08:00:00 +0800`` and the same
    without the leading weekday.

    :param date_as_string: value of the Date header.
    :return: the parsed datetime (timezone-aware when the zone is understood),
        or None after emitting a warning when the value cannot be parsed.
    """
    match = DATE_PATTERN_1.fullmatch(date_as_string)
    if match:
        # Drop the weekday so both layouts share the same fields.
        fields = match.groups()[1:]
    else:
        match = DATE_PATTERN_2.fullmatch(date_as_string)
        fields = match.groups() if match else None

    if fields is not None:
        day, month_as_string, year, hour, minute, second, time_zone = fields
        month = MONTH_TO_INT.get(month_as_string)
        if month is not None:
            try:
                return datetime.datetime(int(year), month, int(day),
                                         int(hour), int(minute), int(second),
                                         tzinfo=_fmt_date_tz(time_zone))
            except ValueError:
                # Out-of-range field (day 32, hour 25, ...): fall through to
                # the same warning as any other unparsable date.
                pass

    warnings.warn('Can not parse Date:{}'.format(date_as_string))
    return None


def _get_sub_charset(raw_headers: list) -> list:
    """Hardcode for some invalid mail-encoding."""
    for k, _ in raw_headers:
        if b'X-QQ' in k:
            return ['gbk']
    return []


def parse_headers(lines: List[bytes], debug=False, log=None):
    """Parse the header section at the top of ``lines``.

    Reading stops at the first blank line or at the end of ``lines``.
    Continuation lines (starting with a space or tab) are stripped of
    spaces/tabs and appended to the previous header value without a
    separator. When a header name occurs more than once, ``headers`` keeps the
    last value while ``raw_headers`` keeps every occurrence.

    :param lines: raw lines of a mail or of a single MIME part.
    :param debug: when true, recoverable problems are reported through ``log``.
    :param log: logger to report to; defaults to the module logger.
    :return: ``(raw_headers, headers, lines_idx, main_type, sub_type,
        charsets, extra_kv)`` where ``raw_headers`` is a list of
        ``(name, value)`` byte pairs, ``headers`` the decoded headers (with
        ``date`` converted by ``fmt_date``), ``lines_idx`` the index of the
        blank line ending the headers (``len(lines)`` if there is none),
        ``main_type``/``sub_type`` the lower-cased Content-Type, ``charsets``
        the candidate charsets for the body and ``extra_kv`` the Content-Type
        parameters.
    :raises ParseError: for a header line without ``:`` or a header name that
        is not valid UTF-8.
    """
    log = log or logger
    headers = CaseInsensitiveDict()
    raw_headers = []
    unknown_value_headers = []

    line_count = len(lines)
    lines_idx = 0

    # Driven by the index so that an empty ``lines`` yields no headers
    # instead of failing on lines[0].
    while lines_idx < line_count:
        line = lines[lines_idx]
        if line in (b'', b'\r\n', b'\n'):
            break
        try:
            bname, bvalue = line.split(b':', 1)
        except ValueError:
            raise ParseError('Invalid header:' + str(line))
        except Exception as e:
            raise ParseError('Unknown parse header error' + str(e))
        bname = bname.strip(b' \t')
        lines_idx += 1

        # Consume continuation lines.
        value_chunks = [bvalue.lstrip()]
        while lines_idx < line_count:
            line = lines[lines_idx]
            if not line or line[0] not in (32, 9):  # (' ', '\t')
                break
            value_chunks.append(line.strip(b' \t'))
            lines_idx += 1
        bvalue = b''.join(value_chunks).strip()
        raw_headers.append((bname, bvalue))

        # Parse header.
        name = recursive_decode(bname, ('utf-8',))
        if name is None:
            raise ParseError('Invalid header name {}'.format(str(bname)))
        value = parse_header_value(bvalue, ('utf-8',))
        if value is not None:
            headers[name] = value
        else:
            # Not UTF-8: retried below once the declared charset is known.
            unknown_value_headers.append((name, bvalue))

    # Parse Content-Type
    try:
        content_type, *extra_pair = headers['content-type'].split(';')
    except Exception as e:
        content_type = 'application/octet-stream'
        extra_pair = []
        if debug:
            log.warning('Parse Content-Type error:{}'.format(str(e)))
    if content_type.count('/') != 1:
        # Anything that is not "main/sub" would break the unpacking below.
        if debug:
            log.warning('Invalid Content-Type:{}'.format(content_type))
        content_type = 'application/octet-stream'
    main_type, sub_type = content_type.split('/')

    # Remove whitespace and get lower type.
    main_type, sub_type = main_type.replace(' ', '').lower(), sub_type.replace(' ', '').lower()

    # Get extra key values.
    extra_kv = CaseInsensitiveDict()
    for pair in extra_pair:
        if pair:
            try:
                _k, _v = pair.split('=', 1)
                _k = remove_line_feed_and_whitespace(_k)
                _v = remove_line_feed_and_whitespace(_v)
                extra_kv[_k] = _v
            except Exception as e:
                if debug:
                    log.warning('Extra key-value decode error:' + pair + 'reason' + str(e))
                continue

    # Detect charsets
    charsets = []
    main_charset = extra_kv.get('charset')
    if main_charset is None:
        main_charset = 'utf-8'
    else:
        main_charset = main_charset.lower()
    charsets.append(main_charset)

    sub_charset = _get_sub_charset(raw_headers)  # type:list
    for charset in sub_charset:
        if charset not in charsets:
            charsets.append(charset)

    # Re-parses unknown headers
    for name, bvalue in unknown_value_headers:
        value = recursive_decode(bvalue, charsets)
        if value is not None:
            headers[name] = value
        else:
            if debug:
                # Report through the caller-supplied logger like every other
                # warning in this function.
                log.warning('Can not decode bytes-value' + str(bvalue))
    # Parse Date and convert Date to DateTimeObject.
    if headers.get('date'):
        headers['date'] = fmt_date(headers['date'])
    return raw_headers, headers, lines_idx, main_type, sub_type, charsets, extra_kv


def multiple_part_decode(lines: List[bytes], boundary: str, debug=False, log=None):
    """Split a multipart body on its boundary and parse every part.

    A part is the run of lines between two consecutive delimiter lines.
    Lines before the first delimiter and after the last one are ignored, so
    a trailing part that is not followed by a delimiter is dropped.

    :param lines: body lines of the multipart entity.
    :param boundary: boundary parameter from the Content-Type header.
    :param debug: forwarded to ``parse`` for each part.
    :param log: forwarded to ``parse`` for each part.
    :return: ``(content_text, content_html, attachments)`` lists collected
        from all parts.
    :raises ParseError: when no delimiter line is found in ``lines``.
    """
    content_text = []
    content_html = []
    attachments = []
    boundary = boundary.encode('ascii')

    # RFC 2046: a delimiter line is "--" + boundary, the closing delimiter has
    # an extra trailing "--". Matching whole lines (instead of a substring)
    # keeps nested boundaries that merely contain this one, and body text
    # mentioning it, from being taken for delimiters.
    delimiters = (b'--' + boundary, b'--' + boundary + b'--')

    # Split to parts.
    part_index = [idx for idx, line in enumerate(lines) if line.rstrip() in delimiters]
    if not part_index:
        raise ParseError('Can not find boundary on this mail.boundary{}'.format(boundary.decode("ascii")))
    parts = [lines[start + 1:end] for start, end in zip(part_index, part_index[1:])]

    for part in parts:
        if not part:
            # Two adjacent delimiter lines: nothing to parse.
            continue
        parsed_part = parse(part, debug, log)  # Recursive call
        if parsed_part['content_text']:
            content_text += parsed_part['content_text']
        if parsed_part['content_html']:
            content_html += parsed_part['content_html']
        if parsed_part['attachments']:
            attachments += parsed_part['attachments']

    return content_text, content_html, attachments


def _attachment_filename(content_disposition, debug, log):
    """Extract the file name from a Content-Disposition value, or None."""
    if not content_disposition:
        return None

    params = CaseInsensitiveDict()
    for part in content_disposition.split(';'):
        if '=' not in part:
            continue
        try:
            _k, _v = part.split('=', 1)
            _k = remove_line_feed_and_whitespace(_k)
            _v = remove_line_feed_and_whitespace(_v)
            params[_k] = _v
        except Exception as e:
            if debug:
                log.warning('Can not decode Content-Disposition extra part:' + part + ' reason:' + str(e))
            continue

    filename = params.get('filename')
    if filename is None and params.get('filename*'):  # RFC5987 and ignore language tags
        match = FILENAME_PATTERN.fullmatch(params.get('filename*'))
        if match:
            _encoding, _language_tags, _name = match.groups()
            try:
                filename = unquote(_name, _encoding)
            except LookupError:
                # Charset name unknown to Python: fall back to unquote's
                # default (UTF-8) rather than failing the whole mail.
                filename = unquote(_name)
    return filename


def parse_one_part_body(headers: CaseInsensitiveDict, body: List[bytes], main_type: str, sub_type: str,
                        transfer_encoding: str, charsets: List[str], extra_kv: CaseInsensitiveDict,
                        debug=False, log=None):
    """Parse non-multiple-part body.

    The part is treated as an attachment when its Content-Disposition starts
    with ``attachment`` (case-insensitive) or when it is neither text/plain
    nor text/html; otherwise its body is decoded to text.

    :param headers: decoded headers of this part.
    :param body: body lines of this part.
    :param main_type: lower-cased main Content-Type.
    :param sub_type: lower-cased Content-Type subtype.
    :param transfer_encoding: Content-Transfer-Encoding of this part.
    :param charsets: candidate charsets for decoding text bodies.
    :param extra_kv: Content-Type parameters (``name`` is used as a fallback
        attachment name).
    :param debug: when true, recoverable problems are reported through ``log``.
    :param log: logger to report to; defaults to the module logger.
    :return: ``(content_text, content_html, attachment)``; at most one of
        them is not None. ``attachment`` is a ``(name, bytes)`` tuple.
    """
    log = log or logger  # the default log=None must not break debug warnings
    transfer_encoding = transfer_encoding.lower()
    content_text = None  # type:None or str
    content_html = None  # type:None or str
    attachment = None  # type:None or tuple

    content_disposition = headers.get('content-disposition')  # type:None or str
    content_type = (main_type, sub_type)

    # Disposition types are case-insensitive, so "Attachment" counts too.
    is_attachment = (content_disposition is not None
                     and content_disposition.lower().startswith('attachment'))

    if not is_attachment and content_type == TYPE_TEXT_PLAIN:
        # Undecodable or empty bodies are reported as None.
        content_text = _decode_one_part_body(body, transfer_encoding, charsets) or None
    elif not is_attachment and content_type == TYPE_TEXT_HTML:
        content_html = _decode_one_part_body(body, transfer_encoding, charsets) or None
    else:
        # Explicit attachments and all other types are regarded as attachment.
        raw_attachment = _decode_one_part_body(body, transfer_encoding, charsets, _need_decode=False)
        filename = _attachment_filename(content_disposition, debug, log)
        attachment_name = filename or extra_kv.get('name') or headers.get('subject') or 'Untitled'
        attachment = (attachment_name, raw_attachment)

    return content_text, content_html, attachment


def _decode_one_part_body(lines: List[bytes], transfer_encoding: str, charsets: List[str], _need_decode=True):
    """Decode transfer-encoding then decode raw value to string."""
    if transfer_encoding == 'quoted-printable':
        decoded_bytes = decodestring(b'\r\n'.join(lines))
        if _need_decode:
            return recursive_decode(decoded_bytes, charsets)
        else:
            return b'\r\n'.join(lines)
    elif transfer_encoding == 'base64':
        decoded_bytes = b64decode(b''.join(lines))
        if _need_decode:
            return recursive_decode(decoded_bytes, charsets)
        else:
            return decoded_bytes
    elif transfer_encoding in ('binary', '8bit', '7bit'):
        if _need_decode:
            return recursive_decode(b'\r\n'.join(lines), charsets)
        else:
            return b'\r\n'.join(lines)
    else:
        raise ParseError('Invalid transfer-encoding {}'.format(transfer_encoding))


def parse(lines: List[bytes], debug=False, log=None) -> CaseInsensitiveDict:
    """Parse a mail, or one MIME part of it, into a CaseInsensitiveDict.

    Multipart entities are split on their boundary and every part is parsed
    recursively; any other entity is decoded directly. The result holds the
    collected ``content_text``, ``content_html`` and ``attachments`` lists,
    the ``headers``, ``raw_headers`` and ``charsets`` of this entity, and the
    ``subject``, ``date``, ``from`` and ``to`` header values.

    Raises ParseError when a multipart entity declares no boundary.
    """
    log = log or logger

    raw_headers, headers, eof_idx, main_type, sub_type, charsets, extra_kv = parse_headers(lines, debug, log)
    # Everything after the line that ended the headers is the body.
    body = lines[eof_idx + 1:]

    content_text, content_html, attachments = [], [], []

    if main_type != TYPE_MULTIPART:  # Recursive exit
        declared_encoding = headers.get('content-transfer-encoding')
        # Default transfer_encoding is 8bit.
        transfer_encoding = '8bit' if declared_encoding is None else declared_encoding.lower()

        text, html, attachment = parse_one_part_body(headers, body, main_type, sub_type,
                                                     transfer_encoding, charsets, extra_kv,
                                                     debug, log)
        if text:
            content_text.append(text)
        if html:
            content_html.append(html)
        if attachment:
            attachments.append(attachment)
    else:  # Include recursive call
        boundary = extra_kv.get('boundary')
        if boundary is None:
            raise ParseError('Can not find boundary in multiple-part mail.')

        texts, htmls, files = multiple_part_decode(body, boundary, debug, log)
        if texts:
            content_text += texts
        if htmls:
            content_html += htmls
        if files:
            attachments += files

    mail = CaseInsensitiveDict()
    for key, item in (
            ('content_text', content_text),
            ('content_html', content_html),
            ('attachments', attachments),
            ('headers', headers),
            ('raw_headers', raw_headers),
            ('charsets', charsets),
    ):
        mail[key] = item

    # Convenience copies of the most commonly used headers.
    for field in ('subject', 'date', 'from', 'to'):
        mail[field] = headers.get(field)

    return mail


def parse_mail(lines: List[bytes], which: int, debug=False, log=None) -> CaseInsensitiveDict:
    """Parse ``lines`` with ``parse`` and attach the mail id and raw lines.

    The returned mapping gets two extra keys: ``id`` (set to ``which``) and
    ``raw`` (the unmodified ``lines``).
    """
    mail = parse(lines, debug, log)
    mail['id'], mail['raw'] = which, lines
    return mail