from builtins import str
from builtins import object
import logging
import csv

import networkx as nx
from networkx.algorithms import all_simple_paths

from mrtarget.common.DataStructure import TreeNode, JSONSerializable
from mrtarget.common.connection import new_es_client
from mrtarget.common.esutil import ElasticsearchBulkIndexManager
from opentargets_urlzsource import URLZSource

import elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MatchAll

import simplejson as json


class ReactomeNode(TreeNode, JSONSerializable):
    def __init__(self, **kwargs):
        super(ReactomeNode, self).__init__(**kwargs)


class ReactomeDataDownloader(object):

    def __init__(self, pathway_data_url, pathway_relation_url):
        self.logger = logging.getLogger(__name__)
        self.allowed_species = ['Homo sapiens']
        self.headers = ["id", "description", "species"]
        self.headers_pathway_rel = ["id", "related_id"]
        self.pathway_data_url = pathway_data_url
        self.pathway_relation_url = pathway_relation_url

    def get_pathway_data(self):
        self.valid_pathway_ids = []
        with URLZSource(self.pathway_data_url).open() as source:
            for i, row in enumerate(csv.DictReader(source, fieldnames=self.headers, dialect='excel-tab'),
                                    start=1):
                if len(row) != 3:
                    raise ValueError('Reactome.py: Pathway file format unexpected at line %d.' % i)

                pathway_id = row["id"]
                pathway_name = row["description"]
                species = row["species"]

                if pathway_id not in self.valid_pathway_ids:
                    if species in self.allowed_species:
                        self.valid_pathway_ids.append(pathway_id)
                        yield dict(id=pathway_id,
                                   name=pathway_name,
                                   species=species,
                                   )
                        if len(self.valid_pathway_ids) % 1000 == 0:
                            self.logger.debug("%i rows parsed for reactome_pathway_data"
                                              % len(self.valid_pathway_ids))
                else:
                    self.logger.warning("Pathway id %s is already loaded, skipping duplicate data" % pathway_id)
        self.logger.info('parsed %i rows for reactome_pathway_data' % len(self.valid_pathway_ids))

    def get_pathway_relations(self):
        # note: relies on self.valid_pathway_ids, so get_pathway_data() must
        # have been fully consumed before this generator is used
        added_relations = []
        with URLZSource(self.pathway_relation_url).open() as source:
            for i, row in enumerate(
                    csv.DictReader(source, fieldnames=self.headers_pathway_rel, dialect='excel-tab'),
                    start=1):
                if len(row) != 2:
                    raise ValueError('Reactome.py: Pathway Relation file format unexpected at line %d.' % i)

                parent_id = row["id"]
                child_id = row["related_id"]
                relation = (parent_id, child_id)
                if relation not in added_relations:
                    if parent_id in self.valid_pathway_ids:
                        yield dict(id=parent_id,
                                   child=child_id,
                                   )
                        added_relations.append(relation)
                        if len(added_relations) % 1000 == 0:
                            self.logger.debug("%i rows parsed from reactome_pathway_relation"
                                              % len(added_relations))
                else:
                    self.logger.warning("Pathway relation %s is already loaded, skipping duplicate data"
                                        % str(relation))
        self.logger.info('parsed %i rows from reactome_pathway_relation' % len(added_relations))
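
# ---------------------------------------------------------------------------
# A minimal sketch of how the two generators above are meant to be consumed:
# pathway rows become nodes, relation rows become edges, and any pathway that
# is never a child is attached to a synthetic 'root' node. This mirrors
# ReactomeProcess.process_all further down. The inline rows are illustrative
# stand-ins for real Reactome TSV content, not actual pathway identifiers.
# Defined but never called, so it does not affect the pipeline.
def _example_build_pathway_graph():
    g = nx.DiGraph(name="reactome-example")
    g.add_node('root', name="", species="")

    # hypothetical rows, shaped like get_pathway_data() output
    pathways = [
        dict(id="R-HSA-1", name="Signal Transduction", species="Homo sapiens"),
        dict(id="R-HSA-2", name="Signaling by WNT", species="Homo sapiens"),
    ]
    # hypothetical rows, shaped like get_pathway_relations() output
    relations = [dict(id="R-HSA-1", child="R-HSA-2")]

    for row in pathways:
        g.add_node(row['id'], name=row['name'], species=row['species'])
    children = set()
    for row in relations:
        g.add_edge(row['id'], row['child'])
        children.add(row['child'])
    # anything that is never a child hangs off the synthetic root
    for node in set(g.nodes()) - children - {'root'}:
        g.add_edge('root', node)
    return g
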
def generate_documents(g):
    """Yield one JSON-serializable document per pathway node in g, carrying
    every root-to-node path plus the node's parents, children and ancestors."""
    for node, node_data in g.nodes(data=True):
        if node != 'root':
            ancestors = set()
            paths = list(all_simple_paths(g, 'root', node))
            for path in paths:
                for p in path:
                    ancestors.add(p)

            # ensure these are real tuples, not generators,
            # otherwise they can't be serialized to json
            children = tuple(g.successors(node))
            parents = tuple(g.predecessors(node))

            body = dict(id=node,
                        label=node_data['name'],
                        path=paths,
                        children=children,
                        parents=parents,
                        is_root=node == 'root',
                        ancestors=list(ancestors)
                        )
            yield body


def elasticsearch_actions(reactions, index):
    """Generates elasticsearch action objects from the results iterator.

    Output suitable for use with elasticsearch.helpers
    """
    for reaction in reactions:
        action = {}
        action["_index"] = index
        action["_id"] = reaction["id"]
        # the elasticsearch client uses https://github.com/elastic/elasticsearch-py/blob/master/elasticsearch/serializer.py#L24
        # to turn objects into JSON bodies. This in turn calls json.dumps()
        # using simplejson if present.
        action["_source"] = reaction
        yield action
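
# ---------------------------------------------------------------------------
# A quick sketch of the action dicts produced above, assuming a toy document
# and a hypothetical index name ("reactome-data"): each action tells the bulk
# helpers which index to write to, which _id to use, and what body to store.
# Defined but never called — purely illustrative.
def _example_actions():
    toy_doc = dict(id="R-HSA-2", label="Signaling by WNT",
                   path=[["root", "R-HSA-1", "R-HSA-2"]],
                   children=(), parents=("R-HSA-1",),
                   is_root=False, ancestors=["root", "R-HSA-1", "R-HSA-2"])
    for action in elasticsearch_actions([toy_doc], "reactome-data"):
        # e.g. {'_index': 'reactome-data', '_id': 'R-HSA-2', '_source': {...}}
        print(action)
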
class ReactomeProcess(object):

    def __init__(self, es_hosts, es_index, es_mappings, es_settings,
                 pathway_data_url, pathway_relation_url,
                 workers_write, queue_write):
        self.es_hosts = es_hosts
        self.es_index = es_index
        self.es_mappings = es_mappings
        self.es_settings = es_settings
        self.downloader = ReactomeDataDownloader(pathway_data_url, pathway_relation_url)
        self.logger = logging.getLogger(__name__)
        self.g = nx.DiGraph(name="reactome")
        self.data = {}
        self.workers_write = workers_write
        self.queue_write = queue_write

    def process_all(self, dry_run):
        # build the pathway graph under a synthetic root node
        self.g.add_node('root', name="", species="")
        for row in self.downloader.get_pathway_data():
            self.g.add_node(row['id'], name=row['name'], species=row['species'])
        children = set()
        for row in self.downloader.get_pathway_relations():
            self.g.add_edge(row['id'], row['child'])
            children.add(row['child'])
        # attach every top-level pathway (one that is never a child) to the root
        nodes_without_parent = set(self.g.nodes()) - children
        for node in nodes_without_parent:
            if node != 'root':
                self.g.add_edge('root', node)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
            # write into elasticsearch
            chunk_size = 1000  # TODO make configurable
            docs = generate_documents(self.g)
            actions = elasticsearch_actions(docs, self.es_index)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                                  thread_count=self.workers_write,
                                                                  queue_size=self.queue_write,
                                                                  chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                                   chunk_size=chunk_size)
                for success, _ in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s failed to index" % failcount)

    def qc(self, es, index):
        """Run a series of QC tests on the Reactome elasticsearch index.

        Returns a dictionary of string test names and result objects
        """
        self.logger.info("Starting QC")

        # number of reactions
        reaction_count = 0
        # Note: try to avoid doing this more than once!
        for _ in Search().using(es).index(index).query(MatchAll()).scan():
            reaction_count += 1

        # put the metrics into a single dict
        metrics = dict()
        metrics["reactome.count"] = reaction_count

        self.logger.info("Finished QC")
        return metrics
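
# ---------------------------------------------------------------------------
# A hedged usage sketch: how this module might be driven standalone. In the
# real pipeline ReactomeProcess is constructed by the mrtarget CLI with
# values from its configuration; the hosts, index name, mappings/settings
# locations and worker counts below are placeholders, not the pipeline's
# actual settings. The two Reactome download URLs follow the public
# reactome.org layout but should be checked against the current release.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    process = ReactomeProcess(
        es_hosts=["http://localhost:9200"],  # assumed local cluster
        es_index="reactome-data",            # hypothetical index name
        es_mappings="file://mappings.json",  # hypothetical local files
        es_settings="file://settings.json",
        pathway_data_url="https://reactome.org/download/current/ReactomePathways.txt",
        pathway_relation_url="https://reactome.org/download/current/ReactomePathwaysRelation.txt",
        workers_write=0,  # 0 selects streaming_bulk instead of parallel_bulk
        queue_write=1,
    )
    # dry_run=True builds the graph but skips the bulk write to Elasticsearch
    process.process_all(dry_run=True)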