# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Santiago Dueñas <sduenas@bitergia.com>
#     Germán Poo-Caamaño <gpoo@gnome.org>
#     Stephan Barth <stephan.barth@gmail.com>
#     Valerio Cosentino <valcos@bitergia.com>
#     Jesus M. Gonzalez-Barahona <jgb@gsyc.es>
#     Harshal Mittal <harshalmittal4@gmail.com>
#
# Note: some of this code was taken from the MailingListStats project

import logging
import mailbox
import os
import tempfile

import gzip
import bz2
import zipfile

from grimoirelab_toolkit.datetime import (InvalidDateError,
                                          datetime_to_utc,
                                          str_to_datetime)

from ...backend import (Backend,
                        BackendCommand,
                        BackendCommandArgumentParser)
from ...utils import (DEFAULT_DATETIME,
                      check_compressed_file_type,
                      message_to_dict)

CATEGORY_MESSAGE = "message"

logger = logging.getLogger(__name__)


class MBox(Backend):
    """MBox backend.

    This class allows to fetch the email messages stored in one or
    several mbox files. Initialize this class passing the directory
    path where the mbox files are stored. The origin of the data will
    be set to the value of `uri`.

    :param uri: URI of the mboxes; typically, the URL of their
        mailing list
    :param dirpath: directory path where the mboxes are stored
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param ssl_verify: enable/disable SSL verification
    """
    version = '0.13.1'

    CATEGORIES = [CATEGORY_MESSAGE]

    # Canonical names of the headers every message must carry
    DATE_FIELD = 'Date'
    MESSAGE_ID_FIELD = 'Message-ID'

    def __init__(self, uri, dirpath, tag=None, archive=None, ssl_verify=True):
        origin = uri

        super().__init__(origin, tag=tag, archive=archive, ssl_verify=ssl_verify)
        self.uri = uri
        self.dirpath = dirpath

    def fetch(self, category=CATEGORY_MESSAGE, from_date=DEFAULT_DATETIME):
        """Fetch the messages from a set of mbox files.

        The method retrieves, from mbox files, the messages stored in
        these containers.

        :param category: the category of items to fetch
        :param from_date: obtain messages since this date; a false
            value (e.g. `None`) is replaced by `DEFAULT_DATETIME`

        :returns: a generator of messages
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        kwargs = {'from_date': from_date}
        items = super().fetch(category, **kwargs)

        return items

    def fetch_items(self, category, **kwargs):
        """Fetch the messages

        :param category: the category of items to fetch
        :param kwargs: backend arguments; the key `from_date` is required

        :returns: a generator of items
        """
        from_date = kwargs['from_date']

        logger.info("Looking for messages from '%s' on '%s' since %s",
                    self.uri, self.dirpath, str(from_date))

        mailing_list = MailingList(self.uri, self.dirpath)

        messages = self._fetch_and_parse_messages(mailing_list, from_date)

        for message in messages:
            yield message

        logger.info("Fetch process completed")

    @classmethod
    def has_archiving(cls):
        """Returns whether it supports archiving items on the fetch process.

        :returns: this backend does not support items archive
        """
        return False

    @classmethod
    def has_resuming(cls):
        """Returns whether it supports to resume the fetch process.

        :returns: this backend supports items resuming
        """
        return True

    @staticmethod
    def metadata_id(item):
        """Extracts the identifier from a MBox item."""

        return item[MBox.MESSAGE_ID_FIELD]

    @staticmethod
    def metadata_updated_on(item):
        """Extracts the update time from a MBox item.

        The timestamp used is extracted from 'Date' field in its
        several forms. This date is converted to UNIX timestamp
        format.

        :param item: item generated by the backend

        :returns: a UNIX timestamp
        """
        ts = item[MBox.DATE_FIELD]
        ts = str_to_datetime(ts)

        return ts.timestamp()

    @staticmethod
    def metadata_category(item):
        """Extracts the category from a MBox item.

        This backend only generates one type of item which is
        'message'.
        """
        return CATEGORY_MESSAGE

    @staticmethod
    def parse_mbox(filepath):
        """Parse a mbox file.

        This method parses a mbox file and returns an iterator of dictionaries.
        Each one of this contains an email message.

        :param filepath: path of the mbox to parse

        :returns : generator of messages; each message is stored in a
            dictionary of type `requests.structures.CaseInsensitiveDict`
        """
        mbox = _MBox(filepath, create=False)

        try:
            for msg in mbox:
                yield message_to_dict(msg)
        finally:
            # Release the file handle held by the mailbox, also when the
            # consumer stops iterating before the mbox is exhausted.
            mbox.close()

    def _init_client(self, from_archive=False):
        # This backend reads local files only; there is no client to set up.
        pass

    def _fetch_and_parse_messages(self, mailing_list, from_date):
        """Fetch and parse the messages from a mailing list.

        Each mbox of the mailing list is copied to a temporary file, which
        is parsed and removed afterwards. Messages that do not pass
        `_validate_message` or were sent before `from_date` are skipped.
        Mboxes raising `OSError` or `EOFError` are logged and ignored; any
        other error is propagated once the temporary file is removed.

        :param mailing_list: `MailingList` object providing the mboxes
        :param from_date: obtain messages since this date

        :returns: a generator of messages, each one stored in a `dict`
        """
        from_date = datetime_to_utc(from_date)

        # Counters: messages yielded, invalid messages and messages seen
        # (those discarded by date are not counted as seen).
        nmsgs, imsgs, tmsgs = (0, 0, 0)

        for mbox in mailing_list.mboxes:
            tmp_path = None

            try:
                tmp_path = self._copy_mbox(mbox)

                for message in self.parse_mbox(tmp_path):
                    tmsgs += 1

                    if not self._validate_message(message):
                        imsgs += 1
                        continue

                    # Ignore those messages sent before the given date
                    dt = str_to_datetime(message[MBox.DATE_FIELD])

                    if dt < from_date:
                        logger.debug("Message %s sent before %s; skipped",
                                     message['unixfrom'], str(from_date))
                        tmsgs -= 1
                        continue

                    # Convert 'CaseInsensitiveDict' to dict
                    message = self._casedict_to_dict(message)

                    nmsgs += 1
                    logger.debug("Message %s parsed", message['unixfrom'])

                    yield message
            except (OSError, EOFError) as e:
                logger.warning("Ignoring %s mbox due to: %s",
                               mbox.filepath, str(e))
            finally:
                # Runs on success, on ignored errors and on propagated
                # errors alike, so the temporary copy never outlives the
                # processing of its mbox.
                if tmp_path and os.path.exists(tmp_path):
                    os.remove(tmp_path)

        logger.info("Done. %s/%s messages fetched; %s ignored",
                    nmsgs, tmsgs, imsgs)

    def _copy_mbox(self, mbox):
        """Copy the contents of a mbox to a temporary file.

        The temporary file is created with `tempfile.mkstemp`, which
        creates the file atomically instead of only picking a free name.
        When the copy fails, the partially written file is removed before
        the error is propagated.

        :param mbox: `MBoxArchive` object to copy

        :returns: path of the temporary file; the caller has to remove it
        """
        fd, tmp_path = tempfile.mkstemp(prefix='perceval_')

        try:
            with os.fdopen(fd, mode='wb') as f_out:
                with mbox.container as f_in:
                    for line in f_in:
                        f_out.write(line)
        except BaseException:
            # The caller does not know the path yet, so clean up here.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        return tmp_path

    def _validate_message(self, message):
        """Check if the given message has the mandatory fields.

        A message is valid when it has non-empty 'Message-ID' and 'Date'
        fields and the date can be parsed.

        :param message: message to check

        :returns: `True` when the message is valid; `False` otherwise
        """
        # This check is "case insensitive" because we're
        # using 'CaseInsensitiveDict' from requests.structures
        # module to store the contents of a message.
        if self.MESSAGE_ID_FIELD not in message:
            logger.warning("Field 'Message-ID' not found in message %s; ignoring",
                           message['unixfrom'])
            return False

        if not message[self.MESSAGE_ID_FIELD]:
            logger.warning("Field 'Message-ID' is empty in message %s; ignoring",
                           message['unixfrom'])
            return False

        if self.DATE_FIELD not in message:
            logger.warning("Field 'Date' not found in message %s; ignoring",
                           message['unixfrom'])
            return False

        if not message[self.DATE_FIELD]:
            logger.warning("Field 'Date' is empty in message %s; ignoring",
                           message['unixfrom'])
            return False

        try:
            str_to_datetime(message[self.DATE_FIELD])
        except InvalidDateError:
            logger.warning("Invalid date %s in message %s; ignoring",
                           message[self.DATE_FIELD], message['unixfrom'])
            return False

        return True

    def _casedict_to_dict(self, message):
        """Convert a message in CaseInsensitiveDict to dict.

        This method also converts well known problematic headers,
        such as Message-ID and Date to a common name.

        :param message: message to convert; its 'Message-ID' and 'Date'
            entries are removed from it in the process

        :returns: a `dict` with the contents of the message
        """
        # Pop the headers first so that, whatever their original casing,
        # they only appear under the canonical names set below.
        message_id = message.pop(self.MESSAGE_ID_FIELD)
        date = message.pop(self.DATE_FIELD)

        msg = dict(message.items())
        msg[self.MESSAGE_ID_FIELD] = message_id
        msg[self.DATE_FIELD] = date

        return msg


class _MBox(mailbox.mbox):
    """Wrapper of `mailbox.mbox` to catch unhandled errors"""

    def get_message(self, key):
        """Return a Message representation or raise a KeyError."""

        start, stop = self._lookup(key)
        self._file.seek(start)
        from_line = self._file.readline().replace(mailbox.linesep, b'')
        string = self._file.read(stop - self._file.tell())
        msg = self._message_factory(string.replace(mailbox.linesep, b'\n'))

        # The first five bytes of the separator line (the "From " marker
        # of the mbox format) are dropped. The rest is decoded trying
        # ASCII first, then UTF-8 and finally ISO-8859-1.
        try:
            msg.set_from(from_line[5:].decode('ascii'))
            return msg
        except UnicodeDecodeError:
            pass

        try:
            msg.set_from(from_line[5:].decode('utf-8'))
        except UnicodeDecodeError:
            msg.set_from(from_line[5:].decode('iso-8859-1'))

        return msg


class MBoxCommand(BackendCommand):
    """Class to run MBox backend from the command line."""

    BACKEND = MBox

    @classmethod
    def setup_cmd_parser(cls):
        """Returns the MBox argument parser."""

        parser = BackendCommandArgumentParser(cls.BACKEND,
                                              from_date=True,
                                              ssl_verify=True)

        # Required arguments
        parser.parser.add_argument('uri',
                                   help="URI of the mboxes, usually the URL to their mailing list")
        parser.parser.add_argument('dirpath',
                                   help="Path to the mbox directory")

        return parser


class MBoxArchive(object):
    """Class to access a mbox archive.

    MBOX archives can be stored into plain or compressed files
    (gzip, bz2 or zip).

    :param filepath: path to the mbox file
    """
    def __init__(self, filepath):
        self._filepath = filepath
        # `None` means the file is handled as a plain (uncompressed) mbox
        self._compressed = check_compressed_file_type(filepath)

    @property
    def filepath(self):
        """Path to the mbox file."""
        return self._filepath

    @property
    def container(self):
        """Binary file object to read the contents of the archive.

        A new file object is opened on every access; closing it is up to
        the caller. For zip files only the first member is opened. Nothing
        is returned when the compression type is none of 'bz2', 'gz' or
        'zip'.
        """
        if not self.is_compressed():
            return open(self.filepath, mode='rb')

        if self.compressed_type == 'bz2':
            return bz2.open(self.filepath, mode='rb')
        elif self.compressed_type == 'gz':
            return gzip.open(self.filepath, mode='rb')
        elif self.compressed_type == "zip":
            _zip = zipfile.ZipFile(self.filepath)
            if len(_zip.infolist()) > 1:
                logger.error("Zip %s contains more than one file, only the first uncompressed",
                             self.filepath)
            return _zip.open(_zip.infolist()[0].filename)

    @property
    def compressed_type(self):
        """Compression type detected for the file; `None` if not compressed."""
        return self._compressed

    def is_compressed(self):
        """Returns whether a compression type was detected for the file."""
        return self._compressed is not None


class MailingList(object):
    """Manage mailing lists archives.

    This class gives access to the local mboxes archives that a
    mailing list manages.

    :param uri: URI of the mailing lists, usually its URL address
    :param dirpath: path to the mboxes archives
    """
    def __init__(self, uri, dirpath):
        self.uri = uri
        self.dirpath = dirpath

    @property
    def mboxes(self):
        """Get the mboxes managed by this mailing list.

        When `dirpath` is a file, it is the only archive returned.
        Otherwise, the directory tree is walked and the files found
        in each directory are added sorted by name. Files raising an
        `OSError` are logged and ignored.

        :returns: a list of `.MBoxArchive` objects
        """
        archives = []

        if os.path.isfile(self.dirpath):
            try:
                archives.append(MBoxArchive(self.dirpath))
            except OSError as e:
                logger.warning("Ignoring %s mbox due to: %s", self.dirpath, str(e))
        else:
            for root, _, files in os.walk(self.dirpath):
                for filename in sorted(files):
                    try:
                        location = os.path.join(root, filename)
                        archives.append(MBoxArchive(location))
                    except OSError as e:
                        logger.warning("Ignoring %s mbox due to: %s", filename, str(e))
        return archives