python source code of Drug

from future import standard_library
standard_library.install_aliases()
from builtins import object
import logging

import simplejson as json
import elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MatchAll

from opentargets_urlzsource import URLZSource
from mrtarget.common.esutil import ElasticsearchBulkIndexManager
from mrtarget.common.connection import new_es_client
from mrtarget.common.LookupHelpers import LookUpDataRetriever

import os
import sys
import tempfile
import unicodedata
#for python3 the module name has changed
if sys.version_info >= (3, 0):
    import dbm
    from builtins import str
else:
    import anydbm as dbm
import csv
import shelve
import codecs
import urllib.request, urllib.parse, urllib.error
from numbers import Number
from collections import defaultdict


"""
Generates elasticsearch action objects from the results iterator

Output suitable for use with elasticsearch.helpers 
"""
def elasticsearch_actions(items, index):
    for ident, item in items:
        action = {}
        action["_index"] = index
        action["_id"] = ident
        #elasticsearch client uses https://github.com/elastic/elasticsearch-py/blob/master/elasticsearch/serializer.py#L24
        #to turn objects into JSON bodies. This in turn calls json.dumps() using simplejson if present.
        action["_source"] = item

        yield action


def get_parent_id(mol):
    """
    Return the identifier a molecule should be grouped under.

    That is mol["molecule_hierarchy"]["parent_chembl_id"] when the hierarchy
    is present and names a parent, otherwise the molecule's own
    mol["molecule_chembl_id"].
    """
    hierarchy = mol["molecule_hierarchy"] if "molecule_hierarchy" in mol else None

    if hierarchy is not None and "parent_chembl_id" in hierarchy:
        parent_id = hierarchy["parent_chembl_id"]
        if parent_id is not None:
            #it has a parent, use the parents id
            return parent_id

    #there is no parent, use its own id
    return mol["molecule_chembl_id"]

class DrugProcess(object):

    def __init__(self, es_hosts, es_index, es_mappings, es_settings,
            es_index_gene, es_index_efo,
            workers_write, queue_write,
            cache_efo, cache_efo_contains,
            cache_target, cache_target_u2e, cache_target_contains,
            chembl_target_uris, 
            chembl_mechanism_uris, 
            chembl_component_uris,
            chembl_protein_uris, 
            chembl_molecule_uris,
            chembl_indication_uris,
            adverse_events_uris,
            drugbank_uris):
        self.es_hosts = es_hosts
        self.es_index = es_index
        self.es_mappings = es_mappings
        self.es_settings = es_settings
        self.es_index_gene = es_index_gene
        self.es_index_efo = es_index_efo
        self.workers_write = workers_write
        self.queue_write = queue_write

        self.cache_efo = cache_efo
        self.cache_efo_contains = cache_efo_contains
        self.cache_target = cache_target
        self.cache_target_u2e = cache_target_u2e
        self.cache_target_contains = cache_target_contains

        self.chembl_target_uris = chembl_target_uris
        self.chembl_mechanism_uris = chembl_mechanism_uris
        self.chembl_component_uris = chembl_component_uris
        self.chembl_protein_uris = chembl_protein_uris
        self.chembl_molecule_uris = chembl_molecule_uris
        self.chembl_indication_uris = chembl_indication_uris

        self.adverse_events_uris = adverse_events_uris

        self.drugbank_uris = drugbank_uris

        self.logger = logging.getLogger(__name__)

    def process_all(self, dry_run):
        """
        Run the whole pipeline: open an elasticsearch client for
        self.es_hosts, generate the drugs with it and pass them to
        self.store together with dry_run (forwarded unchanged).
        """
        client = new_es_client(self.es_hosts)

        generated = self.generate(client)
        self.store(client, dry_run, generated)

    # to avoid: String or Integer object expected for key, unicode found.
    # to validate assert below
    def str_hook(self, value):
        """
        Coerce value to the str type.

        A value that already is a str is returned untouched. Anything else
        is NFKD-normalised and encoded to ASCII, silently dropping whatever
        cannot be encoded. The result is asserted to be a str before it is
        returned.
        """
        if isinstance(value, str):
            converted = value
        else:
            #converted = value.encode('UTF-8')
            decomposed = unicodedata.normalize('NFKD', value)
            converted = decomposed.encode('ascii', 'ignore')

        assert isinstance(converted, str)
        return converted

    def create_shelf(self, uris, key_f):
        """
        Load JSON-lines resources into a dbm-backed shelf with one object per key.

        uris is a non-empty sequence of locations opened through URLZSource;
        every line of each resource is parsed as one JSON object. key_f maps
        a parsed object to its key, which is passed through str_hook before
        being used.

        Returns the open shelf. Raises ValueError when a key occurs more than
        once, and re-raises the JSON decode error (after logging the line
        number and uri) when a line cannot be parsed.
        """
        #sanity check inputs
        assert uris is not None
        assert len(uris) > 0

        # Shelve needs a path at which dbm can create its own database file.
        # Taking the name of a NamedTemporaryFile that is deleted straight away
        # leaves a window in which another process can claim the same path, so
        # reserve a private temporary directory and put the database inside it.

        #note: this directory (and the database in it) is never deleted!
        filename = os.path.join(tempfile.mkdtemp(), "shelf")
        shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))
        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                #for python2 we need to decode utf-8
                if sys.version_info < (3, 0):
                    f_obj = codecs.getreader("utf-8")(f_obj)
                for line_no, line in enumerate(f_obj):
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError as e:
                        self.logger.error("Unable to read line %d %s %s", line_no, uri, e)
                        # bare raise keeps the original traceback on python2 as well
                        raise

                    key = self.str_hook(key_f(obj))
                    if key is not None:
                        if key in shelf:
                            raise ValueError("Duplicate key %s in uri %s" % (key,uri))
                        shelf[key] = obj
        return shelf

    def create_shelf_multi(self, uris, key_f):
        """
        Load JSON-lines resources into a dbm-backed shelf with a list of objects per key.

        uris is a non-empty sequence of locations opened through URLZSource;
        every line of each resource is parsed as one JSON object. key_f maps
        a parsed object to its key, which is passed through str_hook before
        being used. Objects sharing a key are collected, in reading order,
        in the list stored under that key.

        Returns the open shelf. Re-raises the JSON decode error (after
        logging the line number and uri) when a line cannot be parsed.
        """
        #sanity check inputs
        assert uris is not None
        assert len(uris) > 0

        # Shelve needs a path at which dbm can create its own database file.
        # Taking the name of a NamedTemporaryFile that is deleted straight away
        # leaves a window in which another process can claim the same path, so
        # reserve a private temporary directory and put the database inside it.

        #note: this directory (and the database in it) is never deleted!
        filename = os.path.join(tempfile.mkdtemp(), "shelf")
        shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))
        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                #for python2 we need to decode utf-8
                if sys.version_info < (3, 0):
                    f_obj = codecs.getreader("utf-8")(f_obj)
                for line_no, line in enumerate(f_obj):
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        self.logger.error("Unable to read line %d %s", line_no, uri)
                        # bare raise keeps the original traceback on python2 as well
                        raise

                    key = self.str_hook(key_f(obj))
                    if key is not None:
                        # the shelf hands back a copy, so the grown list has to
                        # be written back for the change to be stored
                        existing = shelf.get(key,[])
                        existing.append(obj)
                        shelf[key] = existing
        return shelf

    def create_shelf_multi_csv(self, uris, key_col, dialect):
        """
        Load delimited text resources into a dbm-backed shelf with a list of rows per key.

        uris is a non-empty sequence of locations opened through URLZSource
        and decoded as UTF-8; each is read with csv.DictReader using dialect.
        The value in column key_col, passed through str_hook, is the key.
        Each row is stored as a dict without the key column, and rows sharing
        a key are collected, in reading order, in the list under that key.

        Returns the open shelf.
        """
        # sanity check inputs
        assert uris is not None
        assert len(uris) > 0

        # Shelve needs a path at which dbm can create its own database file.
        # Taking the name of a NamedTemporaryFile that is deleted straight away
        # leaves a window in which another process can claim the same path, so
        # reserve a private temporary directory and put the database inside it.

        #note: this directory (and the database in it) is never deleted!
        filename = os.path.join(tempfile.mkdtemp(), "shelf")
        shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))
        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                f_obj = codecs.getreader("utf-8")(f_obj)
                for row in csv.DictReader(f_obj, dialect=dialect):
                    key = self.str_hook(row[key_col])
                    if key is not None:
                        row_dict = dict(row)
                        del row_dict[key_col]
                        # the shelf hands back a copy, so the grown list has to
                        # be written back for the change to be stored
                        existing = shelf.get(key,[])
                        existing.append(row_dict)
                        shelf[key] = existing
        return shelf

    def create_shelf_csv(self, uris, key_col, dialect):
        """
        Load delimited text resources into a dbm-backed shelf with one row per key.

        uris is a non-empty sequence of locations opened through URLZSource
        and decoded as UTF-8; each is read with csv.DictReader using dialect.
        The value in column key_col, passed through str_hook, is the key and
        the row is stored as a dict without the key column.

        Returns the open shelf. Raises ValueError when a key occurs more than
        once.
        """
        # sanity check inputs
        assert uris is not None
        assert len(uris) > 0

        # Shelve needs a path at which dbm can create its own database file.
        # Taking the name of a NamedTemporaryFile that is deleted straight away
        # leaves a window in which another process can claim the same path, so
        # reserve a private temporary directory and put the database inside it.

        #note: this directory (and the database in it) is never deleted!
        filename = os.path.join(tempfile.mkdtemp(), "shelf")
        shelf = shelve.Shelf(dict=dbm.open(filename, 'n'))
        for uri in uris:
            with URLZSource(uri).open() as f_obj:
                f_obj = codecs.getreader("utf-8")(f_obj)
                for row in csv.DictReader(f_obj, dialect=dialect):
                    key = self.str_hook(row[key_col])
                    if key is not None:
                        if key in shelf:
                            raise ValueError("Duplicate key %s in uri %s" % (key,uri))
                        row_dict = dict(row)
                        del row_dict[key_col]
                        shelf[key] = row_dict
        return shelf

    def clean_ids(self, source, ids):
        """
        Normalise the identifiers of one reference source.

        Only "ClinicalTrials" needs work: its entries can be comma separated,
        so they are split, stripped of surrounding whitespace, de-duplicated
        and returned as a sorted list. For every other source ids is handed
        back unchanged.
        """
        if source != "ClinicalTrials":
            return ids

        #can be comma separated, so split em
        unique_parts = {
            part.strip()
            for combined in ids
            for part in combined.split(",")
        }
        return sorted(unique_parts)

    def build_urls(self, source, ids):
        urls = []

        if source == "FDA":
            for id in ids:
                args = {}
                args["search"] = "set_id:%s" % id
                urls.append("https://api.fda.gov/drug/label.json?"+urllib.parse.urlencode(args))
        elif source == "ATC":
            for id in ids:
                args = {}
                args["code"] = id
                urls.append("https://www.whocc.no/atc_ddd_index/?"+urllib.parse.urlencode(args))
        elif source == "DailyMed":
            for id in ids:
                #these already come from chembl with setid= in the identifer
                urls.append("https://dailymed.nlm.nih.gov/dailymed/lookup.cfm?"+id)
        elif source == "ClinicalTrials":
            args = {}
            args["id"] = "OR".join(['"%s"' % id for id in ids])
            urls.append("https://clinicaltrials.gov/search?"+urllib.parse.urlencode(args))
        elif source == "PubMed":
            args = {}
            args["query"] = " OR ".join(['EXT_ID:%s' % id for id in ids])
            urls.append("https://europepmc.org/search?"+urllib.parse.urlencode(args))
        elif source == "Wikipedia":
            for id in ids:
                urls.append("https://www.wikipedia.org/"+id)
        elif source == "DOI":
            for id in ids:
                urls.append("http://dx.doi.org/"+id)
        elif source == "Other":
            #assume this is an url
            #TODO check?
            for id in ids:
                urls.append(id)
        elif source == "ISBN":
            #we can't do anything useful with these
            pass
        elif source == "KEGG":
            for id in ids:
                urls.append("https://www.genome.jp/dbget-bin/www_bget?dr:"+id)
        elif source == "PMC":
            for id in ids:
                urls.append("https://www.ncbi.nlm.nih.gov/pmc/articles/"+id)
        else:
            # TODO only report each source once
            self.logger.warning("Unregonized source %s for %s", source, ids)
            return None

        return urls


    def handle_indication(self, indication):

        if "efo_id" in indication \
                and indication["efo_id"] is not None \
                and indication["efo_id"] is not "*":
            out = {}

            efo_id = indication["efo_id"]
            #make sure this is with an underscore not colon
            efo_id = efo_id.replace(":","_")

            out["efo_id"] = efo_id

            if efo_id not in self.lookup_data.available_efos:
                # TODO throw an exception to allow to bubble up
                # TODO only log each one once
                self.logger.warning("Unrecognized disease %s",efo_id)
                return None

            stored_efo = self.lookup_data.available_efos.get_efo(efo_id)

            #get label from our EFO index
            out["efo_label"] = stored_efo["label"]

            #get full URI from our EFO index
            out["efo_uri"] = stored_efo["code"]

            #max phase
            if "max_phase_for_ind" in indication \
                    and indication["max_phase_for_ind"] is not None:
                assert isinstance(indication["max_phase_for_ind"], int)
                out["max_phase_for_indication"] = indication["max_phase_for_ind"]

            # indication references
            if "indication_refs" in indication and indication["indication_refs"] is not None:
                references = {}
                for ref in indication["indication_refs"]:
                    if "ref_type" in ref and ref["ref_type"] is not None \
                            and "ref_id" in ref and ref["ref_id"] is not None:

                        #don't keep the URL, can build a better one later to handle multi-id
                        ref_type = self.str_hook(ref["ref_type"])
                        ref_id = self.str_hook(ref["ref_id"])

                        #create a set to ensure uniqueness
                        if ref_type not in references:
                            references[ref_type] = set()
                        references[ref_type].add(ref_id)
                    else:
                        # warn if one of these is missing
                        self.logger.warn("missing ref_type and/or ref_id")
                
                for ref_type in references:
                    if "references" not in out:
                        out["references"] = []

                    reference = {}
                    reference["source"] = ref_type
                    reference["ids"] = tuple(sorted(references[ref_type]))
                    reference["ids"] = self.clean_ids(reference["source"], reference["ids"])
                    urls = self.build_urls(reference["source"], reference["ids"])
                    if urls is not None:
                        reference["urls"] = urls
                    #TODO build a URL list that can handle multiple ids (when possible)
                    if reference not in out["references"]:
                        out["references"].append(reference)
                
                if "references" in out:
                    out["references"] = sorted(out["references"],key = lambda x:x["source"])

            return out
        else:
            #indication without EFO ID, skipping
            return None

    '''
    This will create the mechanism ES dictionary from the provided shelf dict
    '''
    def handle_mechanism(self, mech, targets):
        out = {}

        #handle target information from target endpoint
        #do this first, so we can stop early if its a target we are not interested in
        if "target_chembl_id" in mech and mech["target_chembl_id"] is not None:
            target_id = self.str_hook(mech["target_chembl_id"])
            target = targets[target_id]

            if "target_components" not in target \
                or target["target_components"] is None \
                or len(target["target_components"]) == 0:
                # we can't handle this at the moment, skipping
                #self.logger.warning("No component for %s",target_id)
                return None

            for target_component in target["target_components"]:

                out_component = {}
                assert "accession" in target_component                
                target_accession = target_component["accession"]
                if target_accession is None:
                    self.logger.warning("skipping unaccessioned component in %s", target_id)
                    continue

                #at the end of this we need a valid ensembl id that we have in the gene index
                ensembl_id = None
                if target_accession in self.lookup_data.available_genes:
                    ensembl_id = target_accession
                    out_component["ensembl"] = ensembl_id
                else:
                    try:
                        ensembl_id = self.lookup_data.available_genes.get_uniprot2ensembl(target_accession)
                    except ValueError as e:
                        #multiple ensembl ids per protein
                        #log with a warning, and ignore
                        self.logger.warning("multiple ensembl ids for uniprot id %s",target_accession)
                        continue

                    if ensembl_id is not None:
                        out_component["ensembl"] = ensembl_id
                    else:
                        # TODO only log each one once
                        self.logger.warning("Unrecognized target accession %s",target_accession)
                        continue

                gene = self.lookup_data.available_genes.get_gene(ensembl_id)

                if "approved_name" in gene \
                        and gene["approved_name"] is not None \
                        and len(gene["approved_name"]) > 0:
                    out_component["approved_name"] = gene["approved_name"]

                if "approved_symbol" in gene \
                        and gene["approved_symbol"] is not None \
                        and len(gene["approved_symbol"]) > 0:
                    out_component["approved_symbol"] = self.str_hook(gene["approved_symbol"])

                if "target_components" not in out:
                    out["target_components"] = []
                out["target_components"].append(out_component)

            #add some information from the chembl source
            # TODO what if this is different from ensembl ?
            if "target_type" in target and target["target_type"] is not None:
                #chembl stores them as all-caps, we want them to be pretty
                out["target_type"] = self.str_hook(target["target_type"].lower())

            if "pref_name" in target and target["pref_name"] is not None:
                out["target_name"] = self.str_hook(target["pref_name"])

        else:
            # no target_chembl_id - should this be dropped?
            self.logger.warning("no target_chembl_id found")
            return None

        if "action_type" in mech and mech["action_type"] is not None:
            # convert to lowercase
            out["action_type"] = mech["action_type"].lower()

        if "mechanism_of_action" in mech and mech["mechanism_of_action"] is not None:
            out["description"] = self.str_hook(mech["mechanism_of_action"])

        if "mechanism_refs" in mech and mech["mechanism_refs"] is not None:
            references = {}
            for ref in mech["mechanism_refs"]:
                if "ref_type" in ref and ref["ref_type"] is not None \
                        and "ref_id" in ref and ref["ref_id"] is not None:

                    #don't keep the URL, can build a better one later to handle multi-id
                    ref_type = self.str_hook(ref["ref_type"])
                    ref_id = self.str_hook(ref["ref_id"])

                    #create a set to ensure uniqueness
                    if ref_type not in references:
                        references[ref_type] = set()
                    references[ref_type].add(ref_id)
                else:
                    # warn if one of these is missing
                    self.logger.warn("missing ref_type and/or ref_id")
            
            for ref_type in references:
                if "references" not in out:
                    out["references"] = []

                reference = {}
                reference["source"] = ref_type
                reference["ids"] = tuple(sorted(references[ref_type]))
                reference["ids"] = self.clean_ids(reference["source"], reference["ids"])
                urls = self.build_urls(reference["source"], reference["ids"])
                if urls is not None:
                    reference["urls"] = urls
                #TODO build a URL list that can handle multiple ids (when possible)
                if reference not in out["references"]:
                    out["references"].append(reference)
            
            if "references" in out:
                out["references"] = sorted(out["references"],key=lambda x : x["source"])

        return out


    '''
    This will create the drug dictionary object suitable for storing in elasticsearch
    from the provided shelf-backed dictionaries of relevant chembl endpoint data
    '''
    def handle_drug(self, ident, mol, indications, mechanisms, all_targets, 
            adverse_events, drugbank_ids):

        drug = {}
        drug["id"] = ident

        if "internal_compound" in mol and mol["internal_compound"] is not None:
            # note, not in chembl
            assert isinstance(mol["internal_compound"], bool), ident
            drug["internal_compound"] = mol["internal_compound"]
        else:
            #default to explicitly false 
            drug["internal_compound"] = False

        if "molecule_type" in mol and mol["molecule_type"] is not None:
            #TODO format check

            #assert isinstance(mol["molecule_type"], str), ident
            drug["type"] = self.str_hook(mol["molecule_type"])
            
        if "pref_name" in mol and mol["pref_name"] is not None:
            #TODO casing? always uppercase, do we inital case, lower case?
            #assert isinstance(mol["pref_name"], str), ident
            drug["pref_name"] = self.str_hook(mol["pref_name"])
            
        if "first_approval" in mol and mol["first_approval"] is not None:
            # assert isinstance(mol["first_approval"], int), ident (???)
            assert isinstance(mol["first_approval"], int)
            assert mol["first_approval"] > 1900
            assert mol["first_approval"] < 2100
            drug["year_first_approved"] = mol["first_approval"]

        if "max_phase" in mol and mol["max_phase"] is not None:
            #check this is 0 1 2 3 4
            # assert isinstance(mol["max_phase"], int), ident
            assert isinstance(mol["max_phase"], int)

            #this should be an integer?
            drug["max_clinical_trial_phase"] = mol["max_phase"]

        if "withdrawn_flag" in mol and mol["withdrawn_flag"] is not None:
            #TODO check always true
            assert isinstance(mol["withdrawn_flag"], bool)
            drug["withdrawn_flag"] = mol["withdrawn_flag"]

        if "withdrawn_reason" in mol and mol["withdrawn_reason"] is not None:
            #TODO check always string
            #TODO check only present when withdrawn_flag
            #note, this is noisy e.g.
            #  "Self-poisonings"
            #  "Self-poisoning"
            #  "Self-Poisonings"
            reasons = set()
            mol_withdraw_reason = self.str_hook(mol["withdrawn_reason"])
            for reason in mol_withdraw_reason.split(";"):
                reasons.add(reason.strip())
            drug["withdrawn_reason"] = sorted(reasons)

        if "withdrawn_year" in mol and mol["withdrawn_year"] is not None:
            assert isinstance(mol["withdrawn_year"], int)
            assert mol["withdrawn_year"] > 1900
            assert mol["withdrawn_year"] < 2100
            drug["withdrawn_year"] = mol["withdrawn_year"]

        if "withdrawn_country" in mol and mol["withdrawn_country"] is not None:
            #TODO check always string
            #TODO check only present when withdrawn_flag
            #split and trim by semicolon
            #TODO casing?
            countries = set()
            mol_withdraw_country = self.str_hook(mol["withdrawn_country"])
            for country in mol_withdraw_country.split(";"):
                countries.add(country.strip())
            drug["withdrawn_country"] = sorted(countries)

        if "withdrawn_class" in mol and mol["withdrawn_class"] is not None:
            #TODO check always string
            #TODO check only present when withdrawn_flag
            #TODO casing?
            classes = set()
            mol_withdraw_class = self.str_hook(mol["withdrawn_class"])
            for clazz in mol_withdraw_class.split(";"):
                classes.add(clazz.strip())
            drug["withdrawn_class"] = sorted(classes)

        if "black_box_warning" in mol and mol["black_box_warning"] is not None:
            #unicode converted to true/false
            #check it comes in as a unicode
            #assert isinstance(mol["black_box_warning"], str), \
            #    "%s black_box_warning = %s " % (ident,repr(mol["black_box_warning"]))
            #convert unicode to an integer - will throw if it can't
            bbw = int(mol["black_box_warning"])
            if bbw == 0:
                drug["black_box_warning"] = False
            elif bbw == 1:
                drug["black_box_warning"] = True
            else:
                raise ValueError("Unexpected value for black_box_warning: %d"%bbw)

        if "molecule_synonyms" in mol and mol["molecule_synonyms"] is not None:
            # use set to avoid duplicates
            synonyms = set() 
            trade_names = set()

            for molecule_synonym in mol["molecule_synonyms"]:
                if "molecule_synonym" in molecule_synonym \
                        and molecule_synonym["molecule_synonym"] is not None \
                        and "syn_type" in molecule_synonym \
                        and molecule_synonym["syn_type"] is not None:

                    syn_type = self.str_hook(molecule_synonym["syn_type"])
                    synonym = self.str_hook(molecule_synonym["molecule_synonym"])

                    if "TRADE_NAME" == syn_type.upper():
                        trade_names.add(synonym)
                    else:
                        synonyms.add(synonym)

            if len(synonyms) > 0:
                drug["synonyms"] = sorted(synonyms)
            if len(synonyms) > 0:
                drug["trade_names"] = sorted(trade_names)

        if "cross_references" in mol and mol["cross_references"] is not None:
            references = {}

            for ref in mol["cross_references"]:
                #TODO warn if one of these is missing
                if "xref_src" in ref and ref["xref_src"] is not None \
                        and "xref_id" in ref and ref["xref_id"] is not None:

                    #don't keep the URL, can build a better one later to handle multi-id
                    ref_type = self.str_hook(ref["xref_src"])
                    ref_id = self.str_hook(ref["xref_id"])

                    #create a set to ensure uniqueness
                    if ref_type not in references:
                        references[ref_type] = set()
                    references[ref_type].add(ref_id)
            
            for ref_type in references:
                if "cross_references" not in drug:
                    drug["cross_references"] = []

                reference = {}
                reference["source"] = ref_type
                reference["ids"] = tuple(sorted(references[ref_type]))
                #TODO build a URL list that can handle multiple ids (when possible)
                drug["cross_references"].append(reference)

        # add a drugbank crossreference if applicable
        if ident in drugbank_ids:
            for drugbank_id in drugbank_ids[ident]:
                reference = {}
                reference["source"] = "drugbank"
                reference["ids"] = (drugbank_id["To src:'2'"],)
                if "cross_references" not in drug:
                    drug["cross_references"] = []
                drug["cross_references"].append(reference)

        if "chebi_par_id" in mol and mol["chebi_par_id"] is not None:
            assert isinstance(mol["chebi_par_id"], int)
            chebi_id = mol["chebi_par_id"]

            if "cross_references" not in drug:
                drug["cross_references"] = []
            reference = {}
            reference["source"] = "ChEBI"
            reference["ids"] = (chebi_id,)
            #TODO build a URL 
            if reference not in drug["cross_references"]:
                drug["cross_references"].append(reference)

        # add smiles
        if "molecule_structures" in mol and \
                mol["molecule_structures"] is not None and \
                "canonical_smiles" in mol["molecule_structures"]:
            # TODO validate ?
            drug["canonical_smiles"] = mol["molecule_structures"]["canonical_smiles"]

        #sort cross references for consistent order after all possible ones have been added
        if "cross_references" in drug:
            drug["cross_references"] = sorted(drug["cross_references"],key=lambda x: x["source"])

        if ident in indications:
            drug["indications"] = []
            for indication in indications[ident]:
                out = self.handle_indication(indication)
                if out is not None:
                    drug["indications"].append(out)

        if ident in mechanisms:
            drug["mechanisms_of_action"] = []
            for mechanism in mechanisms[ident]:
                out = self.handle_mechanism(mechanism, all_targets)
                if out is not None:
                    drug["mechanisms_of_action"].append(out)

        # add adverse events
        if ident in adverse_events:
            drug["adverse_events"] = {}
            drug["adverse_events"]["significant"] = []
            for adverse_event in adverse_events[ident]:
                assert "event" in adverse_event
                assert "count" in adverse_event
                assert "llr" in adverse_event
                assert "critval" in adverse_event

                # critval is the same per-drug for all adverse events
                if "critval" not in drug["adverse_events"]:
                    drug["adverse_events"]["critval"] = float(adverse_event["critval"])
                else:
                    assert drug["adverse_events"]["critval"] == float(adverse_event["critval"])

                drug["adverse_events"]["significant"].append({
                    "event": adverse_event["event"],
                    "count": int(adverse_event["count"]),
                    "llr": float(adverse_event["llr"])
                })
            
            drug["adverse_events"]["significant"].sort(key=lambda x:x["llr"], reverse=True)

        return drug


    def handle_drug_child(self, drug, ident, mol, indications, mechanisms, targets,
            adverse_events, drugbank_ids):
        """
        Build the drug object for a child molecule and fold it into its parent.

        The child is built with handle_drug(), which receives ident, mol and
        the five lookup arguments unchanged. Its result is then merged into
        `drug` in place:

        - the child's "id" is appended to "child_chembl_ids"
        - "synonyms" and "trade_names" gain any values the parent does not
          have yet; the list is re-sorted when something was appended
        - "indications" and "mechanisms_of_action" gain any values the parent
          does not have yet, appended in the order the child lists them
        - "max_clinical_trial_phase" keeps the highest value
        - "year_first_approved" keeps the lowest value
        - "cross_references" gains any new values; a merged result is stored
          as a tuple, and if the parent had none the child's value is used
          as it is

        A list field missing from the parent is only created when the child
        supplies at least one value for it.

        drug -- parent drug dict, modified in place

        Returns nothing.
        """

        #get a drug object for the child, validated and cleaned
        child_drug = self.handle_drug(ident, mol, indications, mechanisms, targets,
            adverse_events, drugbank_ids)

        #add extra information to the drug based on the child

        drug.setdefault("child_chembl_ids", []).append(child_drug["id"])

        # list fields merged without duplicates, flag is whether the parent
        # list is kept sorted
        # membership is tested against the list itself rather than a set
        # because the values may be unhashable (e.g. dicts)
        list_fields = (
            ("synonyms", True),
            ("trade_names", True),
            ("indications", False),
            ("mechanisms_of_action", False),
        )
        for field, keep_sorted in list_fields:
            if field not in child_drug:
                continue
            appended = False
            for value in child_drug[field]:
                if field not in drug:
                    #in child but not parent, add to parent
                    drug[field] = [value]
                elif value not in drug[field]:
                    drug[field].append(value)
                    appended = True
            # sort once per field instead of after every single append
            if keep_sorted and appended:
                drug[field] = sorted(drug[field])

        # TODO add child prefered name as a synonym ?

        if "max_clinical_trial_phase" in child_drug:
            child_phase = child_drug["max_clinical_trial_phase"]
            #take the highest, or the child's if the parent has none
            if "max_clinical_trial_phase" not in drug \
                    or child_phase > drug["max_clinical_trial_phase"]:
                drug["max_clinical_trial_phase"] = child_phase

        if "year_first_approved" in child_drug:
            child_year = child_drug["year_first_approved"]
            #take the lowest, or the child's if the parent has none
            if "year_first_approved" not in drug \
                    or child_year < drug["year_first_approved"]:
                drug["year_first_approved"] = child_year

        if "cross_references" in child_drug:
            if "cross_references" in drug:
                #merge and unique
                # note dict is not hashable so cant use a simple set
                merged_references = list(drug["cross_references"])
                for child_reference in child_drug["cross_references"]:
                    if child_reference not in merged_references:
                        merged_references.append(child_reference)
                drug["cross_references"] = tuple(merged_references)
            else:
                #in child but not parent, add to parent
                drug["cross_references"] = child_drug["cross_references"]


        # TODO withdrawn_year and other withdrawn

        # TODO black box warning

        # TODO adverse events (at the moment there shouldn't be any about child drugs)

        

    def generate(self, es):
        """
        Build all drug documents from the configured input files.

        Sets up self.lookup_data from the given elasticsearch client, loads
        molecules, indications, mechanisms, targets, adverse events and
        drugbank ids into shelves, then walks the molecule shelf. Each entry
        is expected to hold exactly one parent molecule (the one whose id is
        the shelf key) plus any number of children; the parent is turned into
        a drug with handle_drug() and every child is merged into it with
        handle_drug_child(), in order of child id.

        Each drug also gets "number_of_indications" and
        "number_of_mechanisms_of_action", and when it has indications an
        "indication_therapeutic_areas" tuple counting therapeutic areas,
        highest count first. Drugs with neither indications nor mechanisms
        are dropped.

        Returns a dict of drug documents keyed by parent molecule id.
        Raises AssertionError when an entry has no parent, more than one
        parent, or a repeated child.
        """

        # pre-load into indexed shelf dicts

        self.logger.info("Starting pre-loading")

        #create lookup tables
        retriever = LookUpDataRetriever(es,
            gene_index = self.es_index_gene,
            gene_cache_size = self.cache_target,
            gene_cache_u2e_size = self.cache_target_u2e,
            gene_cache_contains_size = self.cache_target_contains,
            efo_index = self.es_index_efo,
            efo_cache_size = self.cache_efo,
            efo_cache_contains_size = self.cache_efo_contains)
        self.lookup_data = retriever.lookup

        #key functions used to index the shelves
        by_molecule_id = lambda record: record["molecule_chembl_id"]
        by_target_id = lambda record: record["target_chembl_id"]

        # these are all separate files
        # intentional, partly because its what chembl API gives us, and partly because
        # it is easier for partners to add information to existing chembl records

        # TODO potentially load these in separate processes?

        self.logger.debug("Loading molecules")
        mols = self.create_shelf_multi(self.chembl_molecule_uris, get_parent_id)
        self.logger.debug("Loaded %d molecules", len(mols))

        self.logger.debug("Loading indications")
        indications = self.create_shelf_multi(self.chembl_indication_uris, by_molecule_id)
        self.logger.debug("Loaded %d indications", len(indications))

        self.logger.debug("Loading mechanisms")
        mechanisms = self.create_shelf_multi(self.chembl_mechanism_uris, by_molecule_id)
        self.logger.debug("Loaded %d mechanisms", len(mechanisms))

        self.logger.debug("Loading targets")
        targets = self.create_shelf(self.chembl_target_uris, by_target_id)
        self.logger.debug("Loaded %d targets", len(targets))

        adverse_events = self.create_shelf_multi_csv(self.adverse_events_uris, "chembl_id", csv.excel)
        self.logger.debug("Loaded %d adverse events", len(adverse_events))

        #technically this can be duplicate e.g. CHEMBL1236107
        drugbank_ids = self.create_shelf_multi_csv(self.drugbank_uris, "From src:'1'", csv.excel_tab)
        self.logger.debug("Loaded %d drugbank ids", len(drugbank_ids))

        self.logger.info("Completed pre-loading")

        #the lookups every handle_drug*() call receives, in positional order
        lookups = (indications, mechanisms, targets, adverse_events, drugbank_ids)

        drugs = {}
        #TODO finish
        for ident in mols:
            parent_mol = None
            child_mols = []

            #split the entry into its single parent and its children
            for mol in mols[ident]:
                mol_id = self.str_hook(mol["molecule_chembl_id"])
                mol["molecule_chembl_id"] = mol_id
                if mol_id == ident:
                    #the molecule carrying the shelf key is the parent
                    assert parent_mol is None
                    parent_mol = mol
                else:
                    #anything else is a child
                    assert mol not in child_mols
                    child_mols.append(mol)

            # ToDo: check with AF
            assert parent_mol is not None, ident

            #TODO sure no grandparenting

            #children are merged in order of their id
            child_mols.sort(key = lambda child: child["molecule_chembl_id"])

            drug = self.handle_drug(ident, parent_mol, *lookups)

            #append information from children
            for child_mol in child_mols:
                self.handle_drug_child(drug, child_mol["molecule_chembl_id"],
                    child_mol, *lookups)

            if "indications" in drug:
                drug["number_of_indications"] = len(drug["indications"])
                # build a summary of therapeutic areas covered by indications
                # TODO avoid repeat EFO lookup by doing inside handle_indication()
                area_counts = defaultdict(int)
                for indication in drug["indications"]:
                    stored_efo = self.lookup_data.available_efos.get_efo(indication["efo_id"])
                    #both codes and labels are needed to describe an area
                    if "therapeutic_codes" not in stored_efo \
                            or "therapeutic_labels" not in stored_efo:
                        continue
                    for area in zip(stored_efo["therapeutic_codes"],
                            stored_efo["therapeutic_labels"]):
                        #area is a (code, label) pair
                        area_counts[area] += 1
                #most frequent areas first
                ranked_areas = sorted(area_counts.items(),
                    key=lambda entry: entry[1], reverse=True)
                drug["indication_therapeutic_areas"] = tuple(
                    {
                        "therapeutic_code": ta_code,
                        "therapeutic_label": ta_label,
                        "count": count,
                    }
                    for (ta_code, ta_label), count in ranked_areas)
            else:
                drug["number_of_indications"] = 0

            drug["number_of_mechanisms_of_action"] = \
                len(drug["mechanisms_of_action"]) if "mechanisms_of_action" in drug else 0

            # only keep those with indications or mechanisms
            if drug["number_of_indications"] != 0 \
                    or drug["number_of_mechanisms_of_action"] != 0:
                drugs[ident] = drug

        return drugs

    def store(self, es, dry_run, data, chunk_size=1000):
        """
        Write drug documents into the drug elasticsearch index.

        The index mappings and settings are read as JSON from
        self.es_mappings and self.es_settings, and the write happens inside
        an ElasticsearchBulkIndexManager context for self.es_index.

        es -- elasticsearch client to write with
        dry_run -- when true no documents are sent; the mappings and settings
            are still read and the index manager context is still entered
        data -- dict of drug documents keyed by document id
        chunk_size -- number of documents per bulk request

        Uses parallel_bulk when self.workers_write is above zero, otherwise
        streaming_bulk.

        Raises RuntimeError if any document is reported as failed. Errors
        raised by the elasticsearch helpers themselves are not caught here.
        """
        self.logger.info("Starting drug storage")
        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
            #write into elasticsearch
            # the actions are generated lazily, so there is no need to copy
            # the items into a list first
            actions = elasticsearch_actions(data.items(), self.es_index)
            if not dry_run:
                # import the submodule explicitly, "import elasticsearch" on
                # its own does not guarantee that helpers has been loaded
                import elasticsearch.helpers

                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write,
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)

                #the helpers are generators, nothing is sent until consumed
                failcount = 0
                for success, details in results:
                    if not success:
                        failcount += 1
                        self.logger.error("Failed to index drug: %s", details)

                if failcount:
                    raise RuntimeError("%s drugs failed to index" % failcount)

        self.logger.debug("Completed storage")


    """
    Run a series of QC tests on the drug elasticsearch index. Returns a dictionary
    of string test names and result objects
    """
    def qc(self, es, index):
        """
        Collect QC metrics for a drug elasticsearch index.

        es -- elasticsearch client to query
        index -- name of the index to inspect

        Returns a dict of metric name to value. The only metric at the moment
        is "drug.count", the number of entries seen while scanning the index
        with a match-all query.
        """
        self.logger.info("Starting QC")

        #Note: try to avoid scanning the index more than once!
        all_drugs = Search().using(es).index(index).query(MatchAll())

        #number of drug entries
        drug_count = sum(1 for _ in all_drugs.scan())

        #put the metrics into a single dict
        metrics = {"drug.count": drug_count}

        self.logger.info("Finished QC")
        return metrics