# Python source code of ChEMBL

from future import standard_library
standard_library.install_aliases()
from builtins import range
from builtins import object
import functools
import logging

import itertools
import shelve
import sys#for python3 the module name has changed	import dbm
if sys.version_info >= (3, 0):	
    import dbm
    from builtins import str
else:	
    import anydbm as dbm

import tempfile
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Match

from opentargets_urlzsource import URLZSource
import simplejson as json
import unicodedata

class ChEMBLLookup(object):
    def __init__(self, target_uri, mechanism_uri, component_uri, protein_uri, 
            molecule_uri):
        super(ChEMBLLookup, self).__init__()
        self._logger = logging.getLogger(__name__)
        
        #save configuration locally for future use
        self.target_uri = target_uri
        self.mechanism_uri = mechanism_uri
        self.component_uri = component_uri
        self.protein_uri = protein_uri
        self.molecule_uri = molecule_uri

        self.protein_class = dict()
        self.target_component = dict()
        self.mechanisms = {}
        self.target2molecule = {}
        self.disease2molecule = {}
        self.targets = {}
        self.uni2chembl = {}
        self.molecule2synonyms = {}
        self.protein_classification = {}
        self.protein_class = {}
        self.protein_class_label_to_id = {}
        self.molecules_dict = self.populate_molecules_dict()

    '''
    To be removed once Python 3 is the default language.
    '''
    def str_hook(self, value):
        new_value = value
        if not isinstance(value, str):
            new_value = unicodedata.normalize('NFKD', value).encode('ascii','ignore')

        assert isinstance(new_value, str)
        return new_value
    '''
    Internal function that populates a dictionary-like object when the instance is created.
    '''
    def populate_molecules_dict(self):
        # Shelve creates a file with specific database. Using a temp file requires a workaround to open it.
        t_filename = tempfile.NamedTemporaryFile(delete=True).name
        # dbm could not work: Eg. dbm.error: cannot add item.
        # Use dumbdbm for the local execution. Python 3 should fix this issue.
        dumb_dict = dbm.open(t_filename, 'n')
        shelve_out = shelve.Shelf(dict=dumb_dict)
        for uri in self.molecule_uri:
            self._logger.debug('ChEMBL getting Molecule from %s', uri)
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    #TODO handle malformed JSON lines better
                    mol = json.loads(line)
                    shelve_out[str(mol["molecule_chembl_id"])] = mol

        self._logger.debug('ChEMBL Molecule loading done.')
        return shelve_out


    def download_molecules_linked_to_target(self):
        '''generate a dictionary with all the synonyms known for a given molecules.
         Only retrieves molecules linked to a target'''

        '''fetches all the targets from chembl and store their data and a mapping to uniprot id'''

        for uri in self.target_uri:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    if 'target_components' in i and \
                            i['target_components'] and \
                            'accession' in i['target_components'][0] and \
                            i['target_components'][0]['accession']:
                        uniprot_id = i['target_components'][0]['accession']
                        self.targets[uniprot_id] = i
                        self.uni2chembl[uniprot_id] = i['target_chembl_id']

        allowed_target_chembl_ids = set(self.uni2chembl.values())
        for uri in self.mechanism_uri:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    self.mechanisms[i['record_id']] = i
                    target_id = i['target_chembl_id']
                    if target_id in allowed_target_chembl_ids:
                        if target_id not in self.target2molecule:
                            self.target2molecule[target_id] = set()
                        self.target2molecule[target_id].add(i['molecule_chembl_id'])

        required_molecules = set()
        self._logger.info('chembl t2m mols')
        for molecules in list(self.target2molecule.values()):
            for molecule in molecules:
                required_molecules.add(molecule)
        required_molecules = list(required_molecules)
        batch_size = 100
        self._logger.debug('chembl populate synonyms')
        for i in range(0, len(required_molecules), batch_size):
            self.populate_synonyms_for_molecule(required_molecules[i:i + batch_size],
                                                 self.molecule2synonyms)

    @staticmethod
    def __extract_protein_classes_from(uris):
        """uris is a list of filenames: str
        returns ({id:[{label:l, id:id},...],...}, {label:id,...})
        """
        classes = {}
        classes_inv_idx = {}

        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    protein_class_id = i.pop('protein_class_id')

                    gen = ((k, dict(label=v, id='')) for k, v in i.items() if v)
                    protein_class_data = sorted(gen, key=lambda x: x[0], reverse=True)

                    classes[protein_class_id] = protein_class_data

                    label = protein_class_data[0][1]['label']
                    classes_inv_idx[label] = protein_class_id

        '''inject missing ids'''
        for k, v in classes.items():
            for level, data in v:
                label = data['label']
                if label in classes_inv_idx:
                    data['id'] = classes_inv_idx[label]

        return classes, classes_inv_idx


    def download_protein_classification(self):
        '''fetches targets components from chembls and inject the target class data in self.protein_classification'''

        self.protein_class, self.protein_class_label_to_id = \
            self.__extract_protein_classes_from(self.protein_uri)

        for uri in self.component_uri:
            with URLZSource(uri).open() as f_obj:
                for line in f_obj:
                    i = json.loads(line)
                    if 'accession' in i:
                        if i['accession'] not in self.protein_classification:
                            self.protein_classification[i['accession']] = []
                        for classification in i['protein_classifications']:
                            protein_class_id = classification['protein_classification_id']
                            self.protein_classification[i['accession']].append(dict(self.protein_class[protein_class_id]))

    def get_molecules_from_evidence(self, es, index):
        """Link diseases and targets to molecules using known-drug evidence.

        Scans ``index`` through the ``es`` client for documents matching
        ``type == "known_drug"``, requesting only the target id, the disease id
        and the target2drug URLs. For each hit, the last path segment of every
        URL containing ``/compound/`` is a candidate molecule id; the first
        candidate is added to ``self.disease2molecule`` and
        ``self.target2molecule``. Hits without such a URL are ignored.
        """
        fields = ['target.id','disease.id', 'evidence.target2drug.urls']
        search = Search().using(es).index(index).query(
            Match(type="known_drug")).source(include=fields)
        for hit in search.scan():
            evidence = hit.to_dict()
            #get information from URLs that we need to extract short ids
            #e.g. https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL502835
            molecule_ids = [
                self.str_hook(link['url'].split('/')[-1])
                for link in evidence['evidence']['target2drug']['urls']
                if '/compound/' in link['url']
            ]
            if not molecule_ids:
                continue

            molecule_id = molecule_ids[0]
            disease_id = self.str_hook(evidence['disease']['id'])
            target_id = self.str_hook(evidence['target']['id'])
            self.disease2molecule.setdefault(disease_id, set()).add(molecule_id)
            self.target2molecule.setdefault(target_id, set()).add(molecule_id)

    def populate_synonyms_for_molecule(self, molecule_set, molecules_syn_dict):
        def _append_to_mol2syn(m2s_dict, molecule):
            """if molecule has synonyms create a clean entry in m2s_dict with all synms for that chembl_id.
            Returns either None if goes ok or the molecule chembl id if something wrong"""
            if 'molecule_synonyms' in molecule and molecule['molecule_synonyms']:
                synonyms = []
                for syn in molecule['molecule_synonyms']:
                    synonyms.append(syn['synonyms'])
                    synonyms.append(syn['molecule_synonym'])
                synonyms = list(set(synonyms))
                m2s_dict[molecule['molecule_chembl_id']] = synonyms
                return None
            else:
                return molecule['molecule_chembl_id']

        if not molecule_set or not len(molecule_set):
            self._logger.warn("No molecules in set")
            return

        data = {'molecules':[]}
        for mol_k in molecule_set:
            if mol_k in self.molecules_dict:
                data['molecules'].append(self.molecules_dict[mol_k])
            else:
                raise ValueError('problem retrieving the molecule info from the local db', str(mol_k))

        #if the data is what we expected, process it
        if 'molecules' in data:
            map_f = functools.partial(_append_to_mol2syn, molecules_syn_dict)
            mols_without_syn = \
                list(itertools.filterfalse(lambda mol: mol is None, map(map_f, data['molecules'])))
            if mols_without_syn:
                self._logger.debug('molecule list with no synonyms %s', str(mols_without_syn))

        else:
            self._logger.error("there is no 'molecules' key in the structure")
            raise RuntimeError("unexpected chembl API response")