# -*- coding: utf-8 -*- # # Copyright (C) 2014-2017 Bitergia # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # # Authors: # Santiago DueƱas <sduenas@bitergia.com> # import email.utils import logging import re from ..db.model import MIN_PERIOD_DATE, MAX_PERIOD_DATE, \ UniqueIdentity, Identity, Profile, Enrollment, Organization from ..exceptions import InvalidFormatError # List of names not considered as organizations MAILMAP_NO_ORGS = ['Unaffiliated'] logger = logging.getLogger(__name__) class MailmapParser(object): """Parse identities and organizations using mailmap format. Mailmap format is a plain stream that contains information about identities and their alias. It can also be used to match organizations and identities. Parsed unique identities will be stored in an object named 'uidentities'. The keys of this object are the UUID of the unique identities. Each unique identity object stores a list of identities and enrollments. Parsed organizations will be stored in 'organizations' object. Its keys are the name of the organizations and each organization object is related to a list of domains. :param stream: stream to parse :param has_orgs: set if the stream maps data about organizations :param source: source of the identities :raises InvalidFormatError: raised when the format of the stream is not valid. """ LINES_TO_IGNORE_REGEX = r"^\s*(?:#.*)?\s*$" def __init__(self, stream, has_orgs=False, source='mailmap'): self._identities = {} self._organizations = {} self.source = source self.__parse(stream, has_orgs) @property def identities(self): uids = [u for u in self._identities.values()] uids.sort(key=lambda u: u.uuid) return uids @property def organizations(self): orgs = [o for o in self._organizations.values()] orgs.sort(key=lambda o: o.name) return orgs def __parse(self, stream, has_orgs): """Parse identities and organizations using mailmap format. Mailmap format is a text plain document that stores on each line a map between an email address and its aliases. Each line follows any of the next formats: Proper Name <commit@email.xx> <proper@email.xx> <commit@email.xx> Proper Name <proper@email.xx> <commit@email.xx> Proper Name <proper@email.xx> Commit Name <commit@email.xx> When the flag `has_orgs` is set, the stream maps organizations an identities, following the next format: Organization Name <org@email.xx> Proper Name <proper@email.xx> :parse data: mailmap stream to parse :raise InvalidFormatError: raised when the format of the stream is not valid. """ if has_orgs: self.__parse_organizations(stream) else: self.__parse_identities(stream) def __parse_organizations(self, stream): """Parse organizations stream""" for aliases in self.__parse_stream(stream): # Parse identity identity = self.__parse_alias(aliases[1]) uuid = identity.email uid = self._identities.get(uuid, None) if not uid: uid = UniqueIdentity(uuid=uuid) identity.uuid = uuid uid.identities.append(identity) self._identities[uuid] = uid # Parse organization mailmap_id = aliases[0] name = self.__encode(mailmap_id[0]) if name in MAILMAP_NO_ORGS: continue org = Organization(name=name) self._organizations[name] = org enrollment = Enrollment(start=MIN_PERIOD_DATE, end=MAX_PERIOD_DATE, organization=org) uid.enrollments.append(enrollment) def __parse_identities(self, stream): """Parse identities stream""" for aliases in self.__parse_stream(stream): identity = self.__parse_alias(aliases[0]) uuid = identity.email uid = self._identities.get(uuid, None) if not uid: uid = UniqueIdentity(uuid=uuid) identity.uuid = uuid uid.identities.append(identity) self._identities[uuid] = uid profile = Profile(uuid=uuid, name=identity.name, email=identity.email, is_bot=False) uid.profile = profile # Aliases for alias in aliases[1:]: identity = self.__parse_alias(alias, uuid) uid.identities.append(identity) self._identities[uuid] = uid def __parse_alias(self, alias, uuid=None): name = self.__encode(alias[0]) email_addr = self.__encode(alias[1]) identity = Identity(name=name, email=email_addr, username=None, source=self.source, uuid=uuid) return identity def __parse_stream(self, stream): """Generic method to parse mailmap streams""" nline = 0 lines = stream.split('\n') for line in lines: nline += 1 # Ignore blank lines and comments m = re.match(self.LINES_TO_IGNORE_REGEX, line, re.UNICODE) if m: continue line = line.strip('\n').strip(' ') parts = line.split('>') if len(parts) == 0: cause = "line %s: invalid format" % str(nline) raise InvalidFormatError(cause=cause) aliases = [] for part in parts: part = part.replace(',', ' ') part = part.strip('\n').strip(' ') if len(part) == 0: continue if part.find('<') < 0: cause = "line %s: invalid format" % str(nline) raise InvalidFormatError(cause=cause) alias = email.utils.parseaddr(part + '>') aliases.append(alias) yield aliases def __encode(self, s): return s if s else None