#!/usr/bin/python """Provides abstract class for all subclasses (different methods to infer compatible patterns)""" import logging import itertools import os from copy import deepcopy from collections import defaultdict import networkx as nx from networkx.readwrite import json_graph import json __author__ = 'Johannes REITER' __date__ = 'April, 2014' # get logger for application logger = logging.getLogger('treeomics') TREE_ROOT = 'germline' # root node of the evolutionary tree class Phylogeny(object): """ Managing the derivation of perfect (and persistent) trees representing the clonal evolution of cancer """ def __init__(self, patient, mps): # reference to the patient data self.patient = patient # dictionary of mutation patterns with the corresponding set of shared mutations self.mps = mps self._tree_root = None self.node_scores = None self.compatible_nodes = None self.conflicting_nodes = None # only relevant if not the whole solution space is explored self.conflicting_mutations = None # mutation pattern conflict graph self.cf_graph = None # path to generated PNG of inferred tree for HTML report self.tree_plot = None def infer_evolutionary_tree(self, updated_mps, founders, unique_mutations, confidence=None): """ Construct a phylogenetic tree from the maximum set of compatible mutations :param updated_mps: evolutionary-conflict-free mutation patterns from which the tree is inferred :param founders: set of mutations which are present in all samples :param unique_mutations: dictionary of mutations which are only present in a sample :param confidence: confidence value in parsimony-informative branch :return inferred tree """ # create new persistent tree and the root node tree = nx.DiGraph() tree.add_node(TREE_ROOT, name=TREE_ROOT, muts=set()) founding_mp = frozenset([sa_idx for sa_idx in range( len(self.patient.sample_names) if self.patient.sc_names is None else len(self.patient.sc_names))]) tree.add_node(founding_mp, name='', muts=founders) # add edge from the germline to the founding clone tree.add_edge(TREE_ROOT, founding_mp, muts=tree.node[founding_mp]['muts']) # add compatible mutation patterns to the evolutionary tree for mp, muts in updated_mps.items(): if mp == founding_mp: for mut in muts: tree.edge[TREE_ROOT][mp]['muts'].add(mut) elif len(mp) > 1: _add_evolutionary_node(tree, founding_mp, mp, '', muts, confidence=confidence) else: # leaves (private mutations) are considered separately pass # Insert the leaves (samples) into the tree with all its private mutations for sc_idx in range( len(self.patient.sample_names) if self.patient.sc_names is None else len(self.patient.sc_names)): # add leaves to the evolutionary tree _add_evolutionary_node(tree, founding_mp, frozenset([sc_idx]), self.patient.sample_names[sc_idx] if self.patient.sc_names is None else self.patient.sc_names[sc_idx], unique_mutations[sc_idx]) # _add_subclone_mutations(tree, TREE_ROOT, set()) # prepare tree for meaningful output in the figures self._add_evolutionary_information(tree) logger.info('Inferred evolutionary tree from the compatible mutation patterns.') return tree def _add_evolutionary_information(self, tree): """ Traverse tree and assign meaningful names to the nodes """ # use first in first out queue to traverse the tree in level order: start with the root fifo_queue = list() fifo_queue.append((TREE_ROOT, set())) subclone_idx = 0 while len(fifo_queue): # take first node of the first in first out queue and process it cur_node, acquired_mutations = fifo_queue.pop(0) tree.node[cur_node]['muts'] = acquired_mutations # add meaningful names to internal nodes if len(list(tree.successors(cur_node))): # internal node if subclone_idx == 0: tree.node[cur_node]['name'] = 'Germline'+' '+self.patient.name else: # subclones are numbered in level order tree.node[cur_node]['name'] = 'SC '+str(subclone_idx) subclone_idx += 1 # add children of the current node to the queue for child in tree.successors(cur_node): total_mutations = acquired_mutations.copy() fifo_queue.append((child, total_mutations.union(tree[cur_node][child]['muts']))) # else: # this node is a leave node (=sample) logger.debug("Added evolutionary information to the derived tree.") @staticmethod def save_json_tree(filepath, tree): """ Transform inferred phylogeny to JSON object and save it to a file :param filepath: path to the output file :param tree: reconstructed phylogeny given as networkx DiGraph object """ # create simplified JSON tree out_ids = dict() out_tree = nx.DiGraph() for out_id, node in enumerate(tree.nodes()): out_ids[node] = out_id out_tree.add_node(out_id, name=tree.node[node]['name']) for u, v in tree.edges(): out_tree.add_edge(out_ids[u], out_ids[v], value=len(tree.edge[u][v]['muts'])) # create json output from reconstructed phylogeny json_data = json_graph.node_link_data(out_tree) # json object to output file with open(filepath, 'w') as json_file: json.dump(json_data, json_file, indent=4) logger.info('Create JSON file from reconstructed phylogeny: {}.'.format(filepath)) @staticmethod def write_html_file(outfilepath, json_filepath): with open(outfilepath, 'w') as html_file, open(_get_html_template()) as temp_file: for line in temp_file: html_file.write(line.replace('SETFILENAME', json_filepath)) def create_conflict_graph(nodes, weights=None): """ Create a graph where the nodes are given by the mutation patterns and the edges model the evolutionary conflicts among them :param nodes: dictionary of frozensets describing the samples where a set of mutations is present :param weights: each node in the graph is weighted corresponding to the confidence in the sequencing data of the mutation modeled by the reliability scores :return conflict graph """ # create empty conflict graph cf_graph = nx.Graph() incompatible_mp = defaultdict(set) # since each mutation occurs at most once running through this is # in O(m^2 * n) = O(|mutations|*|mutations|*|samples|) as # the number of clones is bounded by the number of distinct mutations for (node1, node2) in itertools.combinations(nodes.keys(), 2): # characters need to appear at least in two samples # otherwise a conflict is not possible if len(node1) < 2 or len(node2) < 2: continue if len(node1.intersection(node2)) > 0: # at least one sample where both characters are present (11) # check if some characters are present in one clone but not in the other and visa versa if len(node1.difference(node2)) > 0 and len(node2.difference(node1)) > 0: # check for 01 and 10 # => conflict exists among characters of clone 1 and clone 2 incompatible_mp[node1].add(node2) incompatible_mp[node2].add(node1) if node1 not in cf_graph: if weights is not None: cf_graph.add_node(node1, weight=weights[node1], muts=deepcopy(nodes[node1])) else: cf_graph.add_node(node1, weight=len(nodes[node1]), muts=deepcopy(nodes[node1])) if node2 not in cf_graph: if weights is not None: cf_graph.add_node(node2, weight=weights[node2], muts=deepcopy(nodes[node2])) else: cf_graph.add_node(node2, weight=len(nodes[node2]), muts=deepcopy(nodes[node2])) # add edge between conflicting clones cf_graph.add_edge(node1, node2) logger.info('Created conflict graph with {} nodes of weight {:.2f} and {} evolutionary conflicts.'.format( cf_graph.order(), sum(data['weight'] for _, data in cf_graph.nodes(data=True)), cf_graph.size())) return cf_graph def compute_graph_nodes(mps, sample_names, mut_names, present_p_values, absent_p_values, coverage=None, min_absent_cov=0): """ Compute the nodes (given by the pattern of the clones) in the conflict graph If the minimum required coverage at non-significantly mutated mutations is 0, the number of clones is identical to the number of nodes (i.e., no unknown positions) :param mps: mutation patterns :param sample_names :param mut_names: mutation keys :param present_p_values: p-values for the confidence that a variant is present :param absent_p_values: p-values for the confidence that a variant is absent :param coverage: dictionary of mutation keys and dictionary of sample names for the coverage :param min_absent_cov: minimum required coverage at non-significantly mutated position otherwise unknown :return dictionary of nodes and corresponding variants, node weights, dictionary of clones which are unknown """ # clones form the basis for the nodes in the graph nodes = deepcopy(mps) node_weights = dict() mut_pattern_scores = dict() unknown_positions = defaultdict(set) # dictionary of mutations which form different mutation patterns due to unknown positions unknown_muts = defaultdict(dict) for node in nodes.keys(): # calculate weight of each mutation pattern by going through all variants having the same pattern node_weights[node] = 0 for mut_idx in nodes[node]: # t = [] weight = 1.0 for sa_idx, sample_name in enumerate(sample_names): if sa_idx in node: # variants are present in these samples # check if data for p-value calculation was provided if mut_names[mut_idx] in present_p_values[sample_name]: weight *= 1.0 - present_p_values[sample_name][mut_names[mut_idx]] else: weight *= 1.0 # t.append(present_p_values[sample_name][mut_names[mut_idx]]) else: # variants are absent in these samples # not enough coverage at this position to sure about the absence # check if the presence, reduces the number of conflicts if coverage is not None and mut_names[mut_idx] in coverage and \ sample_name in coverage[mut_names[mut_idx]] and \ 0 <= coverage[mut_names[mut_idx]][sample_name] < min_absent_cov: unknown_positions[(node, mut_idx)].add(sa_idx) logger.debug('Variant {} has low coverage of {} and absent-p-value {:.3f}'.format( mut_names[mut_idx], coverage[mut_names[mut_idx]][sample_name], min(absent_p_values[sample_name][mut_names[mut_idx]], 0.5))) # p-value can be at most 0.5 (variant is present or absent with 50% chance each) # come positions have coverage 0; presence or absence has to be random # check if data for p-value calculation was provided if mut_names[mut_idx] in absent_p_values[sample_name]: weight *= 1.0 - min(absent_p_values[sample_name][mut_names[mut_idx]], 0.5) # else: # weight *= 0.9 # coverage data is not available! # t.append(absent_p_values[sample_name][mut_names[mut_idx]]) # logger.debug('Variant {} has weight {:.3e} from p-values {}.'.format( # mut_names[mut_idx], weight, ', '.join('{:.2e}'.format(a) for a in t))) mut_pattern_scores[mut_idx] = weight node_weights[node] += weight if (node, mut_idx) in unknown_positions: unknown_muts[mut_idx][node] = weight # run through all unknown positions and generate nodes with all possible patterns for node, mut_idx in unknown_positions.keys(): # generates all combinations (present-absent) of all unknown positions in a given clone for length in range(1, len(unknown_positions[(node, mut_idx)])+1): for unknown_samples in itertools.combinations(unknown_positions[(node, mut_idx)], length): # form new mutation pattern new_node = frozenset(node.union(unknown_samples)) weight = 1.0 for sa_idx, sample_name in enumerate(sample_names): if sa_idx in node: # variants are present in these samples if mut_names[mut_idx] in present_p_values[sample_name]: weight *= 1.0 - present_p_values[sample_name][mut_names[mut_idx]] # t.append(present_p_values[sample_name][mut_names[mut_idx]]) elif sa_idx in unknown_samples: # p-value can be at most 0.5 (variant is present or absent with 50% chance each) if mut_names[mut_idx] in absent_p_values[sample_name]: weight *= min(absent_p_values[sample_name][mut_names[mut_idx]], 0.5) else: # will never be used, since unknowns can only appear if there is coverage data # and if there is coverage data, then there are also p-values weight *= 0.5 else: # variants are absent in these samples # p-value can be at most 0.5 (variant is present or absent with 50% chance each) if mut_names[mut_idx] in absent_p_values[sample_name]: weight *= 1.0 - min(absent_p_values[sample_name][mut_names[mut_idx]], 0.5) # else: # weight *= 0.9 # coverage data is not available! # add mutation to the newly formed pattern nodes[new_node].add(mut_idx) # add weight to the newly formed pattern if new_node in node_weights: node_weights[new_node] += weight else: node_weights[new_node] = weight unknown_muts[mut_idx][new_node] = weight if len(unknown_positions[(node, mut_idx)]) > 3 and length == 3: logger.warn('Variant {} has low coverage in too many samples ({}) such '.format( mut_names[mut_idx], len(unknown_positions[(node, mut_idx)])) + 'that all possible combinations can be explored.') break return nodes, node_weights, unknown_muts, mut_pattern_scores def _add_evolutionary_node(tree, parent, mp, node_name, mutations, confidence=None): """ Find the right place in the tree to insert the new node (clone) There is exactly one such place. Depth-first-search is used. :param tree: :param parent: :param mp: :param node_name: :param mutations: set of mutation uniquely present in this node :param confidence: confidence in branching :return: """ # logger.debug('Parent {} - clone {} - mutations {}'.format(parent, clone, mutations)) # find a node where the given clone is a subset of the node's samples # but none of its children is a superset of the given clone => insertion place is found for child in tree.successors(parent): if mp.issubset(child): if _add_evolutionary_node(tree, child, mp, node_name, mutations, confidence=confidence): return True # clone is not a subset of any existing children # hence, a new node has to be created on this level tree.add_node(mp, name=node_name, muts=mutations.union(tree.node[parent]['muts'])) # add confidence value of this branching if confidence is not None and mp in confidence: tree.node[mp]['conf'] = confidence[mp] # check if any of the existing children of the parent are subsets of the new node new_grandchildren = set() for child in tree.successors(parent): if child.issubset(mp): new_grandchildren.add(child) for grandchild in new_grandchildren: helper_mutations = tree[parent][grandchild]['muts'] tree.remove_edge(parent, grandchild) tree.add_edge(mp, grandchild, muts=helper_mutations) tree.node[grandchild]['muts'] = tree.node[grandchild]['muts'].union(mutations) logger.debug('{} became a child of {}.'.format(grandchild, mp)) # insert edge to the newly added clone (node) tree.add_edge(parent, mp, muts=mutations) logger.debug('Successfully inserted {} with {} mutations as child of {}.'.format(mp, len(tree.node[mp]['muts']), parent)) return True def _get_html_template(): filename = "tree_template.html" input_dir = "input" # derive correct input directory if not os.getcwd().endswith('treeomics'): template_path = os.path.join(input_dir, filename) else: template_path = os.path.join('..', input_dir, filename) logger.debug('Path to HTML template: '.format(template_path)) return template_path