"""" scTDA. Library for topological data analysis of high-throughput single-cell RNA-seq data. Copyright 2017, Pablo G. Camara, Columbia University. All rights reserved. """ __author__ = "Pablo G. Camara" __maintainer__ = "Pablo G. Camara" __email__ = "pablo.g.camara@gmail.com" __credits__ = "Patrick van Nieuwenhuizen, Luis Aparicio, Yan Meng" import json import matplotlib_venn import networkx import numexpr import numpy import numpy.linalg import numpy.random import pandas import pickle import pylab import requests import sakmapper import scipy.cluster.hierarchy as sch import scipy.interpolate import scipy.optimize import scipy.signal import scipy.spatial.distance import scipy.stats import sklearn.cluster import sklearn.metrics.pairwise from mpl_toolkits.mplot3d import Axes3D import warnings warnings.filterwarnings("ignore") pylab.rcParams["patch.force_edgecolor"] = True pylab.rcParams['patch.facecolor'] = 'k' """ GLOBAL METHODS """ def ParseAyasdiGraph(name, source, lab, user, password): """ Parses Ayasdi graph given by 'source' ID and 'lab' ID, and generates files name.gexf, name.json and name.groups.json that are used by SCTDA classes. Arguments 'user' and 'password' specify Ayasdi login credentials. """ headers = {"Content-type": "application/json"} session = requests.Session() session.post('https://platform.ayasdi.com/workbench/login', data={'username': user, 'passphrase': password}, verify=False) r = session.get('https://platform.ayasdi.com/workbench/v0/sources/' + source + '/networks/' + lab) sp = json.loads(r.content) rows = [int(x['id']) for x in sp['nodes']] dic2 = {} for i in rows: payload = {"network_nodes_descriptions": [{"network_id": lab, "node_ids": [i]}]} r = session.post('https://platform.ayasdi.com/workbench/v0/sources/' + source + '/retrieve_row_indices', data=json.dumps(payload), headers=headers) dic2[i] = json.loads(r.content)['row_indices'] with open(name + '.json', 'wb') as handle3: json.dump(dic2, handle3) rowcount = [] with open(name + '.gexf', 'w') as g: g.write('<?xml version="1.0" encoding="UTF-8"?>\n') g.write('<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2">\n') g.write('\t<graph mode="static" defaultedgetype="undirected">\n') g.write('\t\t<nodes>\n') for nod in sp['nodes']: g.write('\t\t\t<node id="' + str(nod['id']) + '" label="' + str(nod['row_count']) + '" />\n') rowcount.append(float(nod['row_count'])) g.write('\t\t</nodes>\n') g.write('\t\t<edges>\n') for n5, edg in enumerate(sp['links']): g.write('\t\t\t<edge id="' + str(n5) + '" source="' + str(edg['from']) + '" target="' + str(edg['to']) + '" />\n') g.write('\t\t</edges>\n') g.write('\t</graph>\n') g.write('</gexf>\n') r = session.get('https://platform.ayasdi.com/workbench/v0/sources/' + source + '/networks/' + lab + '/node_groups') sp = json.loads(r.content) dicgroups = {} for m in sp: dicgroups[m['name']] = m['node_ids'] with open(name + '.groups.json', 'wb') as handle3: json.dump(dicgroups, handle3) def benjamini_hochberg(pvalues): """ Benjamini-Hochberg adjusted p-values for multiple testing. 'pvalues' contains a list of p-values. Adapted from http://stackoverflow.com/questions/7450957/how-to-implement-rs-p-adjust-in-python. Not subjected to copyright. """ pvalues = numpy.array(pvalues) n = float(pvalues.shape[0]) new_pvalues = numpy.empty(int(n)) values = [(pvalue, i) for i, pvalue in enumerate(pvalues)] values.sort() values.reverse() new_values = [] for i, vals in enumerate(values): rank = n - i pvalue, index = vals new_values.append((n/rank) * pvalue) for i in xrange(0, int(n)-1): if new_values[i] < new_values[i+1]: new_values[i+1] = new_values[i] for i, vals in enumerate(values): pvalue, index = vals new_pvalues[index] = new_values[i] return list(new_pvalues) def is_number(s): """ Checks whether a string can be converted into a float """ try: float(s) return True except ValueError: return False def compare_results(table1, table2, threshold=0.05): """ Compares two tables produced by UnrootedGraph.save() or RootedGraph.save(). The parameter 'threshold' indicates the threshold in the q-value for a gene to be considered as significant. It produces the Venn diagram comparing genes that are significant in each of the analysis. If the input tables were produced by RootedGraph.save() it also plots the correlation of the centroid of the genes that are significant in the two data sets, as well as the correlation of the dispersion of the genes that are significant in both data sets. """ f1p = [] f1c = {} f1d = {} f1 = open(table1 + '.genes.tsv', 'r') total = [] for n, line in enumerate(f1): sp = line[:-1].split('\t') if n == 0: if 'Centroid' in sp: rooted = True else: rooted = False else: total.append(sp[0]) if float(sp[7]) < threshold: f1p.append(sp[0]) if rooted: f1c[sp[0]] = float(sp[8]) f1d[sp[0]] = float(sp[9]) f1.close() f2p = [] fx = [] fy = [] gx = [] gy = [] f1 = open(table2 + '.genes.tsv', 'r') for n, line in enumerate(f1): if n > 0: sp = line[:-1].split('\t') total.append(sp[0]) if float(sp[7]) < threshold: f2p.append(sp[0]) if sp[0] in f1p and rooted: fx.append(f1c[sp[0]]) gx.append(f1d[sp[0]]) fy.append(float(sp[8])) gy.append(float(sp[9])) f1.close() total = set(total) pylab.figure() matplotlib_venn.venn2([set(f1p), set(f2p)], set_labels=[table1, table2]) print "Overlap between significant genes (Fisher's exact test p-value): " + str(scipy.stats.fisher_exact( [[len(set(f1p) & set(f2p)), len(set(f1p)) - len(set(f1p) & set(f2p))], [len(set(f2p)) - len(set(f1p) & set(f2p)), len(total) + len(set(f1p) & set(f2p)) - len(set(f1p)) - len(set(f2p))]])[1]) if rooted: pylab.figure() pylab.scatter(fx, fy, alpha=0.2, s=8) xt = numpy.linspace(int(min(fx + fy)), int(max(fx + fy) + 1), 10) pylab.plot(xt, xt, 'k-') pylab.xlim(int(min(fx + fy)), int(max(fx + fy)) + 1) pylab.ylim(int(min(fx + fy)), int(max(fx + fy)) + 1) pylab.xlabel('Centroids ' + table1) pylab.ylabel('Centroids ' + table2) print "Pearson's correlation between centroids: " + str(scipy.stats.pearsonr(fx, fy)) pylab.figure() pylab.scatter(gx, gy, alpha=0.2, s=8) xt = numpy.linspace(int(min(gx + gy)), int(max(gx + gy)) + 1, 10) pylab.plot(xt, xt, 'k-') pylab.xlim(int(min(gx + gy)), int(max(gx + gy)) + 1) pylab.ylim(int(min(gx + gy)), int(max(gx + gy)) + 1) pylab.xlabel('Dispersion ' + table1) pylab.ylabel('Dispersion ' + table2) print "Pearson's correlation between dispersions: " + str(scipy.stats.pearsonr(gx, gy)) pylab.show() def hierarchical_clustering(mat, method='average', cluster_distance=True, labels=None, thres=0.65): """ Performs hierarchical clustering based on distance matrix 'mat' using the method specified by 'method'. Optional argument 'labels' may specify a list of labels. If cluster_distance is True, the clustering is performed on the distance matrix using euclidean distance. Otherwise, mat specifies the distance matrix for clustering. Adapted from http://stackoverflow.com/questions/7664826/how-to-get-flat-clustering-corresponding-to-color-clusters-in-the-dendrogram-cre Not subjected to copyright. """ D = numpy.array(mat) if not cluster_distance: Dtriangle = scipy.spatial.distance.squareform(D) else: Dtriangle = scipy.spatial.distance.pdist(D, metric='euclidean') fig = pylab.figure(figsize=(8, 8)) ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6]) Y = sch.linkage(Dtriangle, method=method) Z1 = sch.dendrogram(Y, orientation='right', color_threshold=thres*max(Y[:, 2])) ax1.set_xticks([]) ax1.set_yticks([]) ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2]) Y = sch.linkage(Dtriangle, method=method) Z2 = sch.dendrogram(Y, color_threshold=thres*max(Y[:, 2])) ax2.set_xticks([]) ax2.set_yticks([]) axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6]) idx1 = Z1['leaves'] idx2 = Z2['leaves'] D = D[idx1, :] D = D[:, idx2] im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.get_cmap('jet_r')) if labels is None: axmatrix.set_xticks([]) axmatrix.set_yticks([]) else: axmatrix.set_xticks(range(len(labels))) lab = [labels[idx1[m]] for m in range(len(labels))] axmatrix.set_xticklabels(lab) axmatrix.set_yticks(range(len(labels))) axmatrix.set_yticklabels(lab) for tick in pylab.gca().xaxis.iter_ticks(): tick[0].label2On = False tick[0].label1On = True tick[0].label1.set_rotation('vertical') for tick in pylab.gca().yaxis.iter_ticks(): tick[0].label2On = True tick[0].label1On = False axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6]) pylab.colorbar(im, cax=axcolor) pylab.show() return Z1 def find_clusters(z): clus = {} for y, m in enumerate(z['dcoord']): for n, q in enumerate(m): if q == 0.0: if z['color_list'][y] not in clus.keys(): clus[z['color_list'][y]] = [z['leaves'][int((z['icoord'][y][n]-5.0)/10.0)]] else: clus[z['color_list'][y]].append(z['leaves'][int((z['icoord'][y][n]-5.0)/10.0)]) return clus """ CLASSES """ class Preprocess(object): """ Class for preparing and filtering data based on RNA spike in read counts. It takes as input one or more files with read counts produced by HTSeq-count. These can be all from a same time point or from multiple time points. It permits to read, filter and organize the data to put it in the appropriate form for SCTDA. """ def __init__(self, files, timepoints, libs, cells, spike='_null_'): """ Initializes the class by providing a list of files ('files'), timepoints ('timepoints') and library id's ('libs'), as well as the number of cells per file ('cells'), which can be a list, and the common identifier for the RNA spike-in reads ('spike'). The order of the genes must be the same in all files. """ self.sigmoid = False self.cdr = [] self.residuals = {} self.subsampled = False self.target_subsample = 0.0 self.data = [] self.spike = spike if type(cells) != list: self.len = [cells]*len(files) else: self.len = cells self.fil = files self.long = list(numpy.repeat(timepoints, self.len)) self.batch = list(numpy.repeat(libs, self.len)) self.cal = [] self.cal2 = [] self.tal = {} totalspikes = {} self.totaltransc = {} self.spikes_ratio = {} self.data_spikes = [] datat = {} qft = {} self.genes = [] self.totaltran = [] for nn, f in enumerate(self.fil): carma = [] totalspikes[f] = numpy.zeros(self.len[nn]) self.totaltransc[f] = numpy.zeros(self.len[nn]) self.spikes_ratio[f] = numpy.zeros(self.len[nn]) datat[f] = [] qft[f] = [] fol = open(f, 'r') qty = 0 for line in fol: sp = line[:-1].split('\t') q = numpy.array(map(lambda x: float(x), sp[1:])) if spike in sp[0]: qft[f].append(q) totalspikes[f] += q if numpy.mean(q) > 0.0: carma.append(q/numpy.mean(q)) qty += 1 else: self.totaltransc[f] += q datat[f].append(q) if nn == 0: self.genes.append(sp[0]) if spike != '_null_': self.spikes_ratio[f] = totalspikes[f]/self.totaltransc[f] self.cal2 += list(self.spikes_ratio[f]) fol.close() factors = [] for k in numpy.transpose(numpy.array(carma)): factors.append(numpy.mean(k)) self.cal += factors self.tal[f] = factors self.totaltran += list(self.totaltransc[f]) self.data += list(numpy.transpose(numpy.array(datat[f]))) self.data_spikes += list(numpy.transpose(numpy.array(qft[f]))) for m in self.data: self.cdr.append(len(m)-list(m).count(0.0)) self.totaltran = numpy.array(self.totaltran) self.data = numpy.transpose(numpy.array(self.data)) self.data_spikes = numpy.transpose(numpy.array(self.data_spikes)) self.which_genes = numpy.ones(len(self.data), dtype=bool) self.which_samples = numpy.ones(len(self.data[0]), dtype=bool) self.which_samples_subsampled = numpy.ones(len(self.data[0]), dtype=bool) self.which_samples_backup = numpy.ones(len(self.data[0]), dtype=bool) self.data_subsampled = self.data self.cdr_subsampled = self.cdr self.q = numpy.log2(1.0+1.0e6*self.data/self.totaltran) self.q = self.q[self.q > 0.0] self.genes = numpy.array(self.genes) def subsample(self, n): """ Subsample counts so that each cell has exactly n total counts. If n = 0 sets the state of the Preprocess instance to the original unsampled situation. If a cell has less than n transcripts, it is removed from the analysis. Preprocess.which_samples indicates the cells that are kept in the analysis. """ if n > 0: self.subsampled = True self.target_subsample = float(n) self.which_samples_subsampled = (self.totaltran >= n) self.which_samples = self.which_samples_subsampled & self.which_samples_backup probs = self.data/self.totaltran for nn2, m in enumerate(list(self.which_samples)): if not m: probs[nn2] = 0.0 self.data_subsampled = [] self.cdr_subsampled = [] for rn in range(self.data.shape[1]): if self.which_samples_subsampled[rn]: self.data_subsampled.append(numpy.random.multinomial(n, probs[:, rn])) else: self.data_subsampled.append(numpy.zeros(len(probs[:, rn]))) m = self.data_subsampled[-1] self.cdr_subsampled.append(len(m)-list(m).count(0.0)) self.data_subsampled = numpy.transpose(numpy.array(self.data_subsampled)) else: self.subsampled = False self.data_subsampled = self.data self.cdr_subsampled = self.cdr self.target_subsample = 0.0 self.which_samples = self.which_samples_backup self.which_samples_subsampled = numpy.ones(len(self.data[0]), dtype=bool) self.sigmoid = False print str(int(list(self.which_samples).count(True))) + ' cells remain after subsmapling' def show_statistics(self): """ Displays some basic statistics of the data. These are summarized in two plots. The first contains the ratio spike-in reads/uniquely mapped reads versus the ratio spike-in reads/average number of spike-in reads in the library. This plot is not displayed when there are not spike-ins in the analysis. The second is a histogram of the read counts expressed as transcripts per million (TPM), in a log_2(1+TPM) scale. The third plot shows the fraction of dropout events against the average expression for each gene. The four plot shows the distribution of the totl number of transcripts per cell. The fifth plot shows the distribution of complexity across cells. When subsampling and/or cell filtering have been performed, the corresponding plots for the subsampled data are overlaid in red. """ if self.spike != '_null_': col = [] for m in list(self.which_samples): if m and list(self.which_samples).count(False) > 0: col.append('r.') else: col.append('b.') pylab.figure() for n in range(len(self.cal)): pylab.plot(self.cal[n], self.cal2[n], col[n], alpha=0.6) pylab.yscale('log') pylab.xlabel('Spike-in reads / average spike-in reads library') pylab.ylabel('Spike-in reads / uniquely mapped reads') pylab.figure() pylab.hist(self.q, 100, alpha=0.6, color='b') if self.subsampled: datatt = self.data_subsampled[:, self.which_samples] else: datatt = self.data[:, self.which_samples] if list(self.which_samples).count(False) > 0 or self.subsampled: if self.subsampled: q2 = numpy.log2(1.0+1.0e6*datatt/self.target_subsample) else: q2 = numpy.log2(1.0+1.0e6*datatt/self.totaltran[self.which_samples]) q2 = q2[q2 > 0.0] pylab.hist(q2, 100, alpha=0.8, color='r') pylab.xlabel('log_2 (1 + TPM)') pylab.figure() x = [] y = [] for m in list(self.data): x.append(float(list(m).count(0.0))/float(len(m))) y.append(numpy.mean(m)) pylab.scatter(y, x, alpha=0.2, s=5, c='b') if list(self.which_samples).count(False) > 0 or self.subsampled: xs = [] ys = [] for m in list(self.data_subsampled[:, self.which_samples]): xs.append(float(list(m).count(0.0))/float(len(m))) ys.append(numpy.mean(m)) pylab.scatter(ys, xs, alpha=0.7, s=5, c='r') if self.spike != '_null_': xsp = [] ysp = [] for m in list(self.data_spikes): xsp.append(float(list(m).count(0.0))/float(len(m))) ysp.append(numpy.mean(m)) pylab.scatter(ysp, xsp, alpha=0.7, s=10, c='y') pylab.xscale('log') pylab.ylim(-0.05, 1.05) pylab.xlim(min(numpy.array(y)[numpy.array(y) > 0]), max(y)) pylab.xlabel('Average transcripts per cell') pylab.ylabel('Fraction of cells with non-detected expression') pylab.figure() pylab.hist(numpy.log10(self.totaltran[self.totaltran > 0.0]), 30, alpha=0.6, color='g') pylab.xlabel('Total number of transcripts in the cell') pylab.figure() pylab.hist(self.cdr, 30, alpha=0.6, color='y') if list(self.which_samples).count(False) > 0 or self.subsampled: pylab.hist(numpy.array(self.cdr_subsampled)[self.which_samples], 30, alpha=0.6, color='r') pylab.xlabel('Cell complexity') print 'Minimum number of transcripts per cell: ' + \ str(int(min(self.totaltran[self.which_samples]))) print 'Minimum cell complexity: ' + str(int(min(numpy.array(self.cdr)[self.which_samples]))) if self.subsampled: print 'Minimum number of transcripts per cell (subsampled): ' + str(int(self.target_subsample)) print 'Minimum cell complexity (subsampled): ' + \ str(int(min(numpy.array(self.cdr_subsampled)[self.which_samples]))) pylab.show() def fit_sigmoid(self, to_spikes=False): """ Fits a sigmoid function to model the dependence of the dropout rate with the average expression and assigns a z-score to each gene by fitting the residuals with a normal distribution, where the standard deviation is estimated from the lower 16th percentile of the data. If subsampled has been performed through subsample, it fits the sigmoid to the subsampled data. If to_spikes is set to True, it uses spike in data to fit the sigmoid. If cells have been filtered out, it is taken into account. """ def sigmoid(xt, x0, k): yt = 1 / (1 + numpy.exp(k*(xt-x0))) return yt if to_spikes and self.spike != '_null_': data = list(numpy.array(self.data_spikes)[:, numpy.array(self.which_samples)]) if self.subsampled: datab = list(numpy.array(self.data_subsampled)[:, numpy.array(self.which_samples)]) else: datab = list(numpy.array(self.data)[:, numpy.array(self.which_samples)]) elif self.subsampled: data = list(numpy.array(self.data_subsampled)[:, self.which_samples]) else: data = numpy.array(self.data)[:, numpy.array(self.which_samples)] pylab.figure() x = [] y = [] for m in list(data): if numpy.mean(m) > 0.0: x.append(float(list(m).count(0.0))/float(len(m))) y.append(numpy.mean(m)) ys_f = numpy.log10(numpy.array(y)) popt, pcov = scipy.optimize.curve_fit(sigmoid, ys_f, numpy.array(x)) xp = numpy.linspace(int(min(ys_f[ys_f > -numpy.infty])), int(max(ys_f))+1, 50) yp = sigmoid(xp, *popt) if to_spikes and self.spike != '_null_': xb = [] yb = [] for m in list(datab): xb.append(float(list(m).count(0.0))/float(len(m))) yb.append(numpy.mean(m)) ys_fb = numpy.log10(numpy.array(yb)) pylab.scatter(ys_fb, xb, alpha=0.2, s=5, c='b') else: pylab.scatter(ys_f, x, alpha=0.2, s=5, c='b') if self.spike != '_null_' and not self.subsampled: xrt = [] yrt = [] for m in list(self.data_spikes[:, self.which_samples]): xrt.append(float(list(m).count(0.0))/float(len(m))) yrt.append(numpy.mean(m)) ys_frt = numpy.log10(numpy.array(yrt)) pylab.scatter(ys_frt, xrt, alpha=0.7, s=10, c='y') pylab.plot(xp, yp, 'r-') pylab.xlabel('Average transcripts per cell') pylab.ylabel('Fraction of cells with non-detected expression') pylab.ylim(-0.05, 1.05) pylab.figure() self.residuals = {} if to_spikes and self.spike != '_null_': for n, (x1, y1) in enumerate(zip(list(ys_fb), xb)): self.residuals[n] = y1 - sigmoid(x1, *popt) else: for n, (x1, y1) in enumerate(zip(list(ys_f), x)): self.residuals[n] = y1 - sigmoid(x1, *popt) pylab.hist(self.residuals.values(), 200, normed=True) xst = numpy.linspace(min(self.residuals.values()), max(self.residuals.values()), 1000) sig = numpy.median(self.residuals.values())-numpy.percentile(self.residuals.values(), 16) pylab.plot(xst, scipy.stats.norm.pdf(xst, numpy.median(self.residuals.values()), sig), 'r-') pylab.xlabel('Residuals') for m in self.residuals.keys(): self.residuals[m] /= sig self.sigmoid = True pylab.show() def select_genes(self, n_cells=0, avg_counts=0.0, min_z=-numpy.infty): """ Selects a set of genes, based on various criteria, that will be used for building the topological representation. Selects for genes that are expressed in at least n_cells, that have at least avg_counts counts across all cells, and that have a minimum z-score of min_z with respect to the sigmoid fit. Needs to be run after fit_sigmoid if min_z is specified. If subsampling has been performed, it considers the subsampled dataset. """ if min_z > -numpy.infty and not self.sigmoid: print 'fit_sigmoid() needs to be run before selecting genes' else: if self.subsampled: data = self.data_subsampled[:, self.which_samples] else: data = self.data[:, self.which_samples] col = [] x = [] y = [] n = 0 for ng, m in enumerate(list(data)): if numpy.mean(m) > 0.0: x.append(float(list(m).count(0.0))/float(len(m))) y.append(numpy.mean(m)) if len(m)-list(m).count(0.0) >= n_cells and numpy.mean(m) >= avg_counts and self.residuals[n] >= min_z: self.which_genes[ng] = True col.append('r') else: self.which_genes[ng] = False col.append('b') n += 1 else: self.which_genes[ng] = False pylab.figure() ys_f = numpy.log10(numpy.array(y)) pylab.scatter(ys_f, x, alpha=0.5, s=5, c=col) if self.spike != '_null_': xrt = [] yrt = [] for m in list(self.data_spikes[:, self.which_samples]): xrt.append(float(list(m).count(0.0))/float(len(m))) yrt.append(numpy.mean(m)) ys_frt = numpy.log10(numpy.array(yrt)) pylab.scatter(ys_frt, xrt, alpha=0.7, s=10, c='y') pylab.xlabel('Average transcripts per cell') pylab.ylabel('Fraction of cells with non-detected expression') pylab.ylim(-0.05, 1.05) print str(int(col.count('r'))) + " genes were selected" pylab.show() def reset_genes(self): """ Includes all genes in the analysis. """ self.which_genes = numpy.ones(len(self.data), dtype=bool) def select_cells(self, min_transcripts=0.0, min_cdr=0.0, filterXlow=0.0, filterXhigh=1.0e8, filterYlow=0.0, filterYhigh=1.0e8): """ Selects a set of cells, based on various criteria, that will be used for subsequent analysis. Parameter 'min_transcripts' sets the minimum total number transcripts. Parameter 'least min_cdr' specifies the minimum cell complexity. Parameters 'filterXlow' and 'filterXhigh' set respectively lower and upper bounds in the ratio between spike-in reads and the average number of spike-in reads in the library. Parameters 'filterYlow' and 'filterYhigh' set respectively lower and upper bounds in the ratio between spike-in reads and uniquely mapped reads. Subsampling is not taken into account in the conditions. """ if (filterXlow != 0.0 or filterXhigh != 1.0e8 or filterYlow != 0.0 or filterYhigh != 1.0e8) and self.spike == '_null_': print 'No spike-ins selected' else: for n, (m, c) in enumerate(zip(list(self.totaltran), self.cdr)): if m >= min_transcripts and c >= min_cdr: self.which_samples[n] = True & self.which_samples_subsampled[n] self.which_samples_backup[n] = True else: self.which_samples[n] = False self.which_samples_backup[n] = False if self.spike != '_null_': n1 = 0 for n, f in enumerate(self.fil): for nok in range(self.len[n]): if (filterXhigh > self.tal[f][nok] > filterXlow and filterYlow < self.spikes_ratio[f][nok] < filterYhigh): self.which_samples[n1] &= True self.which_samples_backup[n1] &= True else: self.which_samples[n1] = False self.which_samples_backup[n1] = False n1 += 1 col = [] for m in self.which_samples: if m: col.append('r.') else: col.append('b.') pylab.figure() for n in range(len(self.cal)): pylab.plot(self.cal[n], self.cal2[n], col[n], alpha=0.6) pylab.yscale('log') pylab.xlabel('spike-in reads / average spike-in reads library') pylab.ylabel('spike-in reads / uniquely mapped reads') pylab.show() print str(int(list(self.which_samples_backup).count(True))) + " cells were selected" if self.subsampled: print '(' + str(int(list(self.which_samples).count(True))) + " cells after subsampling)" def reset_cells(self): """ Includes all cells in the analysis. If subsampling has been performed, it is taken into account. """ self.which_samples = self.which_samples_subsampled self.which_samples_backup = numpy.ones(len(self.data[0]), dtype=bool) def save(self, name): """ Produces two tab separated files, called 'name.all.tsv' and 'name.mapper.tsv', where rows are the cells in self.which_samples. The first column of 'name.all.tsv' contains a unique identifier of the cell, the second column contains the sampling timepoint, the third column contains the library id and the remaining columns contain to log_2(1+TPM) expression values for each gene. If subsampling has been performed, the data refers to the subsampled data, whereas a third file called 'name.no_subsampling.tsv' is also created with the same format, containing non-subsampled data. 'name.mapper.tsv' contains log_2(1+TPM) expression values for each gene in self.which_genes. """ g = open(name + '.all.tsv', 'w') p = 'ID\ttimepoint\tlib\t' if self.subsampled: data = self.data_subsampled else: data = self.data datat = numpy.transpose(data) for l in list(self.genes): p += l + '\t' g.write(p[:-1] + '\n') for n, m in enumerate(list(self.which_samples)): if m: p = 'D' + str(self.long[n]) + '_' + self.batch[n] + '_' + str(n) + '\t' + str(self.long[n]) \ + '\t' + self.batch[n] + '\t' for t in list(datat[n]): if self.subsampled: cv = self.target_subsample else: cv = self.totaltran[n] p += str(numpy.log2(1.0+1000000.0*t/float(cv))) + '\t' g.write(p[:-1] + '\n') g.close() if self.subsampled: g = open(name + '.no_subsampling.tsv', 'w') p = 'ID\ttimepoint\tlib\t' for l in list(self.genes): p += l + '\t' g.write(p[:-1] + '\n') for n, m in enumerate(list(self.which_samples)): if m: p = 'D' + str(self.long[n]) + '_' + self.batch[n] + '_' + str(n) + '\t' + str(self.long[n]) \ + '\t' + self.batch[n] + '\t' for t in list(numpy.transpose(self.data)[n]): if self.subsampled: cv = self.target_subsample else: cv = self.totaltran[n] p += str(numpy.log2(1.0+1000000.0*t/float(cv))) + '\t' g.write(p[:-1] + '\n') g.close() g = open(name + '.mapper.tsv', 'w') p = '' for m in list(self.genes[self.which_genes]): p += m + '\t' g.write(p[:-1] + '\n') for n, m in enumerate(list(self.which_samples)): if m: p = '' for t in list(datat[n, self.which_genes]): if self.subsampled: cv = self.target_subsample else: cv = self.totaltran[n] p += str(numpy.log2(1.0+1000000.0*t/float(cv))) + '\t' g.write(p[:-1] + '\n') g.close() class TopologicalRepresentation(object): """ Class for building a topological representation of the data using SakMapper """ def __init__(self, table, lens='mds', metric='correlation', precomputed=False, **kwargs): """ Initializes the class by providing the mapper input table generated by Preprocess.save(). The parameter 'metric' specifies the metric distance to be used ('correlation', 'euclidean' or 'neighbor'). The parameter 'lens' specifies the dimensional reduction algorithm to be used ('mds' or 'pca'). The rest of the arguments are passed directly to sklearn.manifold.MDS or sklearn.decomposition.PCA. It plots the low-dimensional projection of the data. """ self.df = pandas.read_table(table + '.mapper.tsv') if lens == 'neighbor': self.lens_data_mds = sakmapper.apply_lens(self.df, lens=lens, **kwargs) elif lens == 'mds': if precomputed: self.lens_data_mds = sakmapper.apply_lens(self.df, lens=lens, metric=metric, dissimilarity='precomputed', **kwargs) else: self.lens_data_mds = sakmapper.apply_lens(self.df, lens=lens, metric=metric, **kwargs) else: self.lens_data_mds = sakmapper.apply_lens(self.df, lens=lens, **kwargs) pylab.figure() pylab.scatter(numpy.array(self.lens_data_mds)[:, 0], numpy.array(self.lens_data_mds)[:, 1], s=10, alpha=0.7) pylab.show() def save(self, name, resolution, gain, equalize=True, cluster='agglomerative', statistics='db', max_K=5): """ Generates a topological representation using the Mapper algorithm with resolution and gain specified by the parameters 'resolution' and 'gain'. When equalize is set to True, patches are chosen such that they contain the same number of points. The parameter 'cluster' specifies the clustering method ('agglomerative' or 'kmeans'). The parameter 'statistics' specifies the criterion for choosing the optimal number of clusters ('db' for Davies-Bouildin index, or 'gap' for the gap statistic). The parameter 'max_K' specifies the maximum number of clusters to be considered within each patch. The topological representation is stored in the files 'name.gexf' and 'name.json'. It returns a dictionary with the patches. """ G, all_clusters, patches = sakmapper.mapper_graph(self.df, lens_data=self.lens_data_mds, resolution=resolution, gain=gain, equalize=equalize, clust=cluster, stat=statistics, max_K=max_K) dic = {} for n, rs in enumerate(all_clusters): dic[str(n)] = map(lambda x: int(x), rs) with open(name + '.json', 'wb') as handle3: json.dump(dic, handle3) networkx.write_gexf(G, name + '.gexf') return patches class UnrootedGraph(object): """ Main class for topological analysis of non-longitudinal single cell RNA-seq expression data. """ def __init__(self, name, table, shift=None, log2=True, posgl=False, csv=False, groups=True): """ Initializes the class by providing the the common name ('name') of .gexf and .json files produced by e.g. ParseAyasdiGraph() and the name of the file containing the filtered raw data ('table'), as produced by Preprocess.save(). Optional argument 'shift' can be an integer n specifying that the first n columns of the table should be ignored, or a list of columns that should only be considered. If optional argument 'log2' is False, it is assumed that the filtered raw data is in units of TPM instead of log_2(1+TPM). When optional argument 'posgl' is False, a files name.posg and name.posgl are generated with the positions of the graph nodes for visualization. When 'posgl' is True, instead of generating new positions, the positions stored in files name.posg and name.posgl are used for visualization of the topological graph. If connected is False, all connected components of the network are displayed. When 'csv' is True, the input table is in CSV format. When 'groups' is False, the class is initialized with an empty group dictionary (e.g. required when the topological representation has been generated through TopologicalRepresentation.save()). """ self.name = name self.g = networkx.read_gexf(name + '.gexf') listii = [len(aa.nodes()) for aa in list(networkx.connected_component_subgraphs(self.g))] indexii = listii.index(numpy.max(listii)) self.gl = list(networkx.connected_component_subgraphs(self.g))[indexii] self.pl = self.gl.nodes() self.adj = numpy.array(networkx.to_numpy_matrix(self.gl, nodelist=self.pl)) self.log2 = log2 self.cellID = [] self.libs = [] self.cdr = [] if not posgl: try: from networkx.drawing.nx_agraph import graphviz_layout self.posgl = graphviz_layout(self.gl, 'sfdp', '-Goverlap=false -GK=0.1') self.posg = graphviz_layout(self.g, 'sfdp', '-Goverlap=false -GK=0.1') except: self.posgl = networkx.spring_layout(self.gl) self.posg = networkx.spring_layout(self.g) with open(name + '.posgl', 'w') as handler: pickle.dump(self.posgl, handler) with open(name + '.posg', 'w') as handler: pickle.dump(self.posg, handler) else: with open(name + '.posgl', 'r') as handler: self.posgl = pickle.load(handler) with open(name + '.posg', 'r') as handler: self.posg = pickle.load(handler) with open(name + '.json', 'r') as handler: self.dic = json.load(handler) if groups: with open(name + '.groups.json', 'r') as handler: self.dicgroups = json.load(handler) else: self.dicgroups = {} if csv: cx = ',' else: cx = '\t' with open(table, 'r') as f: self.dicgenes = {} self.geneindex = {} for n, line in enumerate(f): sp2 = numpy.array(line[:-1].split(cx)) if csv: sp = [x.split('|')[0] for x in sp2] else: sp = sp2 if shift is None: if n == 0: poi = sp if 'lib' in list(sp): coyt = list(sp).index('lib') else: coyt = -1 if 'timepoint' in list(sp): ttt = list(sp).index('timepoint') else: ttt = -1 else: poi = [] self.cdr.append(0.0) for n2, mji in enumerate(sp): if is_number(mji): poi.append(mji) if float(mji) > 0.0 and n2 != 0 and n2 != coyt and n2 != ttt: self.cdr[-1] += 1.0 elif n2 == coyt: self.libs.append(mji) poi.append(0.0) elif n2 == 0: self.cellID.append(mji) poi.append(0.0) else: poi.append(0.0) elif type(shift) == int: poi = sp[shift:] else: poi = list(numpy.array(sp)[shift]) if n == 0: for u, q in enumerate(poi): self.dicgenes[q] = [] self.geneindex[u] = q else: for u, q in enumerate(poi): self.dicgenes[self.geneindex[u]].append(float(q)) self.samples = [] for i in self.pl: self.samples += self.dic[i] self.samples = numpy.array(list(set(self.samples))) self.cellID = numpy.array(self.cellID) self.cdr = numpy.array(self.cdr)/float(len(self.cdr)) def get_gene(self, genin, ignore_log=False, con=True): """ Returns a dictionary that asigns to each node id the average value of the column 'genin' in the raw table. 'genin' can be also a list of columns, on which case the average of all columns. The output is normalized such that the sum over all nodes is equal to 1. It also provides as an output the normalization factor, to convert the dictionary to log_2(1+TPM) units (or TPM units). When 'ignore_log' is True it treat entries as being in natural scale, even if self.log2 is True (used internally). When 'con' is False, it uses all nodes, not only the ones in the first connected component of the topological representation (used internally). """ if genin is not None and 'lib_' in genin[:4]: return self.count_gene('lib', genin[genin.index('_')+1:], con=con) elif genin is not None and 'ID_' in genin[:3]: return self.count_gene('ID', genin[genin.index('_')+1:], con=con) elif genin == '_CDR': genecolor = {} lista = [] for i in self.dic.keys(): if con: if str(i) in self.pl: genecolor[str(i)] = 0.0 lista.append(i) else: genecolor[str(i)] = 0.0 lista.append(i) kis = range(len(self.cdr)) for i in sorted(lista): pol = 0.0 for j in self.dic[i]: pol += float(self.cdr[kis[j]]) pol /= float(len(self.dic[i])) genecolor[str(i)] += pol tol = sum(genecolor.values()) if tol > 0.0: for ll in genecolor.keys(): genecolor[ll] = genecolor[ll]/tol return genecolor, tol else: if type(genin) != list: genin = [genin] genecolor = {} lista = [] for i in self.dic.keys(): if con: if str(i) in self.pl: genecolor[str(i)] = 0.0 lista.append(i) else: genecolor[str(i)] = 0.0 lista.append(i) for mju in genin: if mju is None: for i in sorted(lista): genecolor[str(i)] = 0.0 else: geys = self.dicgenes[mju] kis = range(len(geys)) for i in sorted(lista): pol = 0.0 if self.log2 and not ignore_log: for j in self.dic[i]: pol += (numpy.power(2, float(geys[kis[j]]))-1.0) pol = numpy.log2(1.0+(pol/float(len(self.dic[i])))) else: for j in self.dic[i]: pol += float(geys[kis[j]]) pol /= float(len(self.dic[i])) genecolor[str(i)] += pol tol = sum(genecolor.values()) if tol > 0.0: for ll in genecolor.keys(): genecolor[ll] = genecolor[ll]/tol return genecolor, tol def connectivity_pvalue(self, genin, n=500): """ Returns statistical significance of connectivity of gene specified by 'genin', using 'n' permutations. """ if genin is not None: jk = len(self.pl) pm = numpy.zeros((jk, n), dtype='float32') llm = list(numpy.arange(numpy.max(self.samples)+1)[self.samples]) koi = {k: u for u, k in enumerate(llm)} geys = numpy.tile(self.dicgenes[genin], (n, 1))[:, self.samples] map(numpy.random.shuffle, geys) tot = numpy.zeros(n) for k, i in enumerate(self.pl): pk = geys[:, numpy.array(map(lambda x: koi[x], self.dic[i]))] q = pk.shape[1] if self.log2: t1 = numexpr.evaluate('sum(2**pk - 1, 1)')/q pm[k, :] = numexpr.evaluate('log1p(t1)')/0.693147 tot += pm[k, :] else: pm[k, :] = numexpr.evaluate('sum(pk, 1)')/q tot += pm[k, :] pm = pm/tot conn = (float(jk)/float(jk-1))*numpy.einsum('ij,ij->j', numpy.dot(self.adj, pm), pm) return numpy.mean(conn > self.connectivity(genin)) else: return 0.0 def connectivity(self, genis, ind=1): """ Returns the value of order 'ind' connectivity for the gene or list of genes specified by 'genis'. """ dicgen = self.get_gene(genis)[0] ex = [] for uu in self.pl: ex.append(dicgen[uu]) ex = numpy.array(ex) cor = float(len(self.pl))/float(len(self.pl)-1) return cor*numpy.dot(numpy.transpose(ex), numpy.dot(numpy.linalg.matrix_power(self.adj, ind), ex)) def delta(self, genis, group=None): """ Returns the mean, minimum and maximum expression values of the gene or list of genes specified by argument 'genin'. Optional argument 'group' allows to restrict this method to one of the node groups specified in the file name.groups.json. """ per = [] dicgen, tot = self.get_gene(genis) if group is not None: if type(group) == list: for k in group: per += self.dicgroups[k] per = list(set(per)) else: per = self.dicgroups[group] mi = [dicgen[str(node)] for node in per] else: mi = [dicgen[node] for node in self.pl] if numpy.mean(mi) > 0.0: return numpy.mean(mi)*tot, numpy.min(mi)*tot, numpy.max(mi)*tot else: return 0.0, 0.0, 0.0 def expr(self, genis, group=None): """ Returns the number of rows with non-zero expression of gene or list of genes specified by argument 'genin'. Optional argument 'group' allows to restrict this method to one of the node groups specified in the file name.groups.json. """ per = [] if group is not None: if type(group) == list: for k in group: per += self.dicgroups[k] per = list(set(per)) else: per = self.dicgroups[group] po = [] for q in per: po += list(self.dic[str(q)]) po = list(set(po)) pi = sum(1 for i in numpy.array(self.dicgenes[genis])[po] if i > 0.0) else: pi = sum(1 for i in self.dicgenes[genis] if i > 0.0) return pi def save(self, n=500, filtercells=0, filterexp=0.0, annotation={}): """ Computes UnrootedGraph.expr(), UnrootedGraph.delta(), UnrootedGraph.connectivity(), UnrootedGraph.connectivity_pvalue() and Benjamini-Holchberg adjusted q-values for all genes that are expressed in more than 'filtercells' cells and whose maximum expression value is above 'filterexp'. The optional argument 'annotation' allos to include a dictionary with lists of genes to be annotated in the table. The output is stored in a tab separated file called name.genes.txt. """ pol = [] with open(self.name + '.genes.tsv', 'w') as ggg: cul = 'Gene\tCells\tMean\tMin\tMax\tConnectivity\tp_value\tq-value (BH)\t' for m in sorted(annotation.keys()): cul += m + '\t' ggg.write(cul[:-1] + '\n') lp = sorted(self.dicgenes.keys()) for gi in lp: if self.expr(gi) > filtercells and self.delta(gi)[2] > filterexp: pol.append(self.connectivity_pvalue(gi, n=n)) por = benjamini_hochberg(pol) mj = 0 for gi in lp: po = self.expr(gi) m1, m2, m3 = self.delta(gi) if po > filtercells and m3 > filterexp: cul = gi + '\t' + str(po) + '\t' + str(m1) + '\t' + str(m2) + '\t' + str(m3) + '\t' + \ str(self.connectivity(gi)) + '\t' + str(pol[mj]) + '\t' + str(por[mj]) + '\t' for m in sorted(annotation.keys()): if gi in annotation[m]: cul += 'Y' + '\t' else: cul += 'N' + '\t' ggg.write(cul[:-1] + '\n') mj += 1 centr = [] disp = [] centr2 = [] disp2 = [] f = open(self.name + '.genes.tsv', 'r') for n, line in enumerate(f): if n > 0: sp = line[:-1].split('\t') if float(sp[7]) <= 0.05: centr.append(float(sp[1])) disp.append(float(sp[5])) else: centr2.append(float(sp[1])) disp2.append(float(sp[5])) f.close() pylab.scatter(centr2, disp2, alpha=0.2, s=9, c='b') pylab.scatter(centr, disp, alpha=0.3, s=9, c='r') pylab.xlabel('Cells') pylab.ylabel('Connectivity') pylab.yscale('log') pylab.ylim(0.01, 1) pylab.xlim(0, max(centr+centr2)) pylab.show() def JSD_matrix(self, lista, maximum_matrix_entries=5*10**7, verbose=False): """ Returns the Jensen-Shannon distance matrix of the list of genes specified by 'lista'. If 'verbose' is set to True, it prints progress on the screen. 'maximum_matrix_entries' limits memory usage. If the largest matrix constructed as part of the calculation exceeds 'maximum_matrix_entries', the job is divided into multiple jobs, performed in series. The largest matrix has dimension (# nodes)*(# genes tested)^2. """ ge = numpy.array([self.get_gene(genis)[0].values() for genis in lista]) ge2 = numpy.copy(ge) ge2[ge2 == 0] = 1 plogp = numpy.sum(ge2*numpy.log2(ge2), axis=1) plogptile = numpy.tile(plogp, (len(lista), 1)) if len(ge)*len(ge[0])**2 <= maximum_matrix_entries: if verbose: print 'within limits: %s' % (len(ge)*len(ge[0])**2) ge_tile = numpy.tile(ge,(len(lista), 1, 1)) ge_tile_T = numpy.transpose(ge_tile, [1, 0, 2]) pq = 0.5*(ge_tile + ge_tile_T) pq[pq == 0] = 1 pqlogpq = numpy.sum(pq*numpy.log2(pq), axis=2) else: group_number = len(ge)*len(ge[0])**2 / maximum_matrix_entries + 1 group_length = len(ge) / group_number dd = range(0, len(ge), group_length) if dd[-1] != len(ge): dd += [len(ge)] if verbose: print 'outside limits: %s' % (len(ge)*len(ge[0])**2) print 'group_number = %s' % group_number print 'group_length = %s' % group_length print 'dd = %s'%dd for i in range(len(dd)-1): if verbose: print 'i = %s' % i ge_tile = numpy.tile(ge, (dd[i+1] - dd[i], 1, 1)) ge_tile_T = numpy.transpose(numpy.tile(ge[range(dd[i], dd[i+1])], (len(ge), 1, 1)), [1, 0, 2]) pq = 0.5*(ge_tile + ge_tile_T) pq[pq == 0] = 1 sliver = numpy.sum(pq*numpy.log2(pq), axis=2) if i == 0: pqlogpq = sliver else: pqlogpq = numpy.vstack((pqlogpq, sliver)) jsdiv = 0.5 * (plogptile + plogptile.T - 2*pqlogpq) return numpy.sqrt(jsdiv) def cor_matrix(self, lista, c=1): """ Returns correlation distance matrix of the list of genes specified by 'lista'. It uses 'c' cores for the computation. """ geys = numpy.array([self.dicgenes[mju] for mju in lista]) return sklearn.metrics.pairwise.pairwise_distances(geys, metric='correlation', n_jobs=c) def adjacency_matrix(self, lista, ind=1, verbose=False): """ Returns the adjacency matrix of order 'ind' of the list of genes specified by 'lista'. If 'verbose' is set to True, it prints progress on the screen. """ cor = float(len(self.pl))/float(len(self.pl)-1) pol = {} mat = [] for genis in lista: pol[genis] = self.get_gene(genis)[0] for n, m1 in enumerate(lista): if verbose: print n ex1 = [] for uu in self.pl: ex1.append(pol[m1][uu]) ex1 = numpy.array(ex1) mat.append([]) for m2 in lista: ex2 = [] for uu in self.pl: ex2.append(pol[m2][uu]) ex2 = numpy.array(ex2) mat[-1].append(cor*numpy.dot(numpy.transpose(ex1), numpy.dot(numpy.linalg.matrix_power(self.adj, ind), ex2))) return numpy.array(mat) def draw(self, color, connected=True, labels=False, ccmap='jet', weight=8.0, save='', ignore_log=False, table=False, axis=[], a=0.4, dpi=None, figsize=None, lw=1.0): """ Displays topological representation of the data colored according to the expression of a gene, genes or list of genes, specified by argument 'color'. This can be a gene or a list of one, two or three genes or lists of genes, to be respectively mapped to red, green and blue channels. When only one gene or list of genes is specified, it uses color map specified by 'ccmap'. If optional argument 'connected' is set to True, only the largest connected component of the graph is displayed. If argument 'labels' is True, node id's are also displayed. Argument 'weight' allows to set a scaling factor for node sizes. When optional argument 'save' specifies a file name, the figure will be save in the file, in the format specified by its extension, and no plot will be displayed on the screen. When 'ignore_log' is True, it treat expression values as being in natural scale, even if self.log2 is True (used internally). When argument 'table' is True, it displays in addition a table with some statistics of the gene or genes. Optional argument 'axis' allows to specify axis limits in the form [xmin, xmax, ymin, ymax]. Parameter alpha specifies the alpha value of edges. """ if connected: pg = self.gl pos = self.posgl else: pg = self.g pos = self.posg fig = pylab.figure(dpi=dpi, figsize=figsize) networkx.draw_networkx_edges(pg, pos, width=1, alpha=a) sizes = numpy.array([len(self.dic[node]) for node in pg.nodes()])*weight values = [] if type(color) == str: color = [color] if type(color) == list and len(color) == 1: coloru, tol = self.get_gene(color[0], ignore_log=ignore_log, con=connected) values = [coloru[node] for node in pg.nodes()] nol = networkx.draw_networkx_nodes(pg, pos, node_color=values, node_size=sizes, cmap=pylab.get_cmap(ccmap), linewidths=lw) nol.set_edgecolor('k') polca = values elif type(color) == list and len(color) == 2: colorr, tolr = self.get_gene(color[0], ignore_log=ignore_log, con=connected) rmax = float(max(colorr.values())) if rmax == 0.0: rmax = 1.0 colorb, tolb = self.get_gene(color[1], ignore_log=ignore_log, con=connected) bmax = float(max(colorb.values())) if bmax == 0.0: bmax = 1.0 values = [(1.0-colorb[node]/bmax, max(1.0-(colorr[node]/rmax+colorb[node]/bmax), 0.0), 1.0-colorr[node]/rmax) for node in pg.nodes()] nol = networkx.draw_networkx_nodes(pg, pos, node_color=values, node_size=sizes, linewidths=lw) nol.set_edgecolor('k') polca = [(colorr[node], colorb[node]) for node in pg.nodes()] elif type(color) == list and len(color) == 3: colorr, tolr = self.get_gene(color[0], ignore_log=ignore_log, con=connected) rmax = float(max(colorr.values())) if rmax == 0.0: rmax = 1.0 colorg, tolg = self.get_gene(color[1], ignore_log=ignore_log, con=connected) gmax = float(max(colorg.values())) if gmax == 0.0: gmax = 1.0 colorb, tolb = self.get_gene(color[2], ignore_log=ignore_log, con=connected) bmax = float(max(colorb.values())) if bmax == 0.0: bmax = 1.0 values = [(max(1.0-(colorg[node]/gmax+colorb[node]/bmax), 0.0), max(1.0-(colorr[node]/rmax+colorb[node]/bmax), 0.0), max(1.0-(colorr[node]/rmax+colorg[node]/gmax), 0.0)) for node in pg.nodes()] nol = networkx.draw_networkx_nodes(pg, pos, node_color=values, node_size=sizes, linewidths=lw) nol.set_edgecolor('k') polca = [(colorr[node], colorg[node], colorb[node]) for node in pg.nodes()] elif type(color) == list and len(color) == 4: colorr, tolr = self.get_gene(color[0], ignore_log=ignore_log, con=connected) rmax = float(max(colorr.values())) if rmax == 0.0: rmax = 1.0 colorg, tolg = self.get_gene(color[1], ignore_log=ignore_log, con=connected) gmax = float(max(colorg.values())) if gmax == 0.0: gmax = 1.0 colorb, tolb = self.get_gene(color[2], ignore_log=ignore_log, con=connected) bmax = float(max(colorb.values())) if bmax == 0.0: bmax = 1.0 colord, told = self.get_gene(color[3], ignore_log=ignore_log, con=connected) dmax = float(max(colord.values())) if dmax == 0.0: dmax = 1.0 values = [(max(1.0-(colorg[node]/gmax+colorb[node]/bmax), 0.0), max(1.0-(colorr[node]/rmax+colorb[node]/bmax+0.36*colord[node]/dmax), 0.0), max(1.0-(colorr[node]/rmax+colorg[node]/gmax+colord[node]/dmax), 0.0)) for node in pg.nodes()] nol = networkx.draw_networkx_nodes(pg, pos, node_color=values, node_size=sizes, linewidths=lw) nol.set_edgecolor('k') polca = [(colorr[node], colorg[node], colorb[node], colord[node]) for node in pg.nodes()] if labels: networkx.draw_networkx_labels(pg, pos, font_size=5, font_family='sans-serif') frame1 = pylab.gca() frame1.axes.get_xaxis().set_ticks([]) frame1.axes.get_yaxis().set_ticks([]) if table: if type(color) == str: cell_text = [[str(min(values)*tol), str(max(values)*tol), str(numpy.median(values)*tol), str(self.expr(color)), str(self.connectivity(color, ind=1)), str(self.connectivity_pvalue(color, n=500))]] columns = ['Min.', 'Max.', 'Median', 'Cells', 'Connectivity', 'p value'] rows = [color] pylab.table(cellText=cell_text, rowLabels=rows, colLabels=columns, loc='bottom') elif type(color) == list and len(color) == 1: cell_text = [[str(min(values)*tol), str(max(values)*tol), str(numpy.median(values)*tol), str(self.expr(color[0])), str(self.connectivity(color[0], ind=1)), str(self.connectivity_pvalue(color[0], n=500))]] columns = ['Min.', 'Max.', 'Median', 'Cells', 'Connectivity', 'p value'] rows = [color[0]] pylab.table(cellText=cell_text, rowLabels=rows, colLabels=columns, loc='bottom') elif type(color) == list and len(color) == 2: valuesr = colorr.values() valuesb = colorb.values() cell_text = [[str(min(valuesr)*tolr), str(max(valuesr)*tolr), str(numpy.median(valuesr)*tolr), str(self.expr(color[0])), str(self.connectivity(color[0], ind=1)), str(self.connectivity_pvalue(color[0], n=500))], [str(min(valuesb)*tolb), str(max(valuesb)*tolb), str(numpy.median(valuesb)*tolb), str(self.expr(color[1])), str(self.connectivity(color[1], ind=1)), str(self.connectivity_pvalue(color[1], n=500))]] columns = ['Min.', 'Max.', 'Median', 'Cells', 'Connectivity', 'p value'] rows = [color[0], color[1]] pylab.table(cellText=cell_text, rowLabels=rows, colLabels=columns, loc='bottom', rowColours=['r', 'b']) elif type(color) == list and len(color) == 3: valuesr = colorr.values() valuesg = colorg.values() valuesb = colorb.values() cell_text = [[str(min(valuesr)*tolr), str(max(valuesr)*tolr), str(numpy.median(valuesr)*tolr), str(self.expr(color[0])), str(self.connectivity(color[0], ind=1)), str(self.connectivity_pvalue(color[0], n=500))], [str(min(valuesg)*tolg), str(max(valuesg)*tolg), str(numpy.median(valuesg)*tolg), str(self.expr(color[1])), str(self.connectivity(color[1], ind=1)), str(self.connectivity_pvalue(color[1], n=500))], [str(min(valuesb)*tolb), str(max(valuesb)*tolb), str(numpy.median(valuesb)*tolb), str(self.expr(color[2])), str(self.connectivity(color[2], ind=1)), str(self.connectivity_pvalue(color[2], n=500))]] columns = ['Min.', 'Max.', 'Median', 'Cells', 'Connectivity', 'p value'] rows = [color[0], color[1], color[2]] the_table = pylab.table(cellText=cell_text, rowLabels=rows, colLabels=columns, loc='bottom', rowColours=['r', 'g', 'b'], colWidths=[0.08] * 7) the_table.scale(1.787, 1) pylab.subplots_adjust(bottom=0.2) if len(axis) == 4: pylab.axis(axis) if save == '': pylab.show() else: fig.savefig(save) return polca def count_gene(self, genin, cond, con=True): """ Returns a dictionary that assigns to each node id the fraction of cells in the node for which column 'genin' is equal to 'cond'. When optional argument 'con' is False, it uses all nodes, not only the ones in the first connected component of the topological representation (used internally). """ genecolor = {} lista = [] for i in self.dic.keys(): if con: if str(i) in self.pl: genecolor[str(i)] = 0.0 lista.append(i) else: genecolor[str(i)] = 0.0 lista.append(i) if genin is None: for i in sorted(lista): genecolor[str(i)] = 0.0 else: if genin == 'lib': geys = self.libs elif genin == 'ID': geys = list(self.cellID) else: geys = self.dicgenes[genin] for i in sorted(lista): pol = 0.0 for j in self.dic[i]: if geys[j] == cond: pol += 1.0 genecolor[str(i)] = pol/float(len(self.dic[i])) tol = sum(genecolor.values()) if tol > 0.0: for ll in genecolor.keys(): genecolor[ll] = genecolor[ll]/tol return genecolor, tol def show_statistics(self): """ Shows several statistics of the data. The first plot shows the distribution of the number of cells per node in the topological representation. The second plot shows the distribution of the number of common cells between nodes that share an edge in the topological representation. The third plot contains the distribution of the number of nodes that contain the same cell. Finally, the fourth plot shows the distribution of transcripts in log_2(1+TPM) scale, after filtering. """ x = map(len, self.dic.values()) pylab.figure() pylab.hist(x, max(x)-1, alpha=0.6, color='b') pylab.xlabel('Cells per node') x = [] for q in self.g.edges(): x.append(len(set(self.dic[q[0]]).intersection(self.dic[q[1]]))) pylab.figure() pylab.hist(x, max(x)-1, alpha=0.6, color='g') pylab.xlabel('Shared cells between connected nodes') pel = [] for m in self.dic.values(): pel += list(m) q = [] for m in range(max(pel)+1): o = pel.count(m) if o > 0: q.append(o) pylab.figure() pylab.hist(q, max(q)-1, alpha=0.6, color='r') pylab.xlabel('Number of nodes containing the same cell') pylab.figure() r = [] for m in self.dicgenes.keys(): r += list(self.dicgenes[m]) r = [k for k in r if 30 > k > 0.0] pylab.hist(r, 100, alpha=0.6) pylab.xlabel('Expression') pylab.show() def cellular_subpopulations(self, threshold=0.05, min_cells=5, clus_thres=0.65): """ Identifies potential transient cellular subpopulations. The parameter 'threshold' sets an upper bound of the q-value of the genes that are considered in the analysis. The parameter 'min_cells' sets the minimum number of cells on which each of the genes considered in the analysis is expressed. Cellular subpopulations are determined by clustering the Jensen-Shannon distance matrix of the genes that pass all the constraints. The number of clusters is controlled in this case by the parameter 'clus_thres'. In both cases a list with the genes associated to each cluster is returned. It requires the presence of the file 'name.genes.tsv', produced by the method RotedGraph.save(). """ con = [] dis = [] nam = [] f = open(self.name + '.genes.tsv', 'r') for n, line in enumerate(f): if n > 0: sp = line[:-1].split('\t') if float(sp[7]) < threshold and float(sp[1]) > min_cells: nam.append(sp[0]) f.close() mat2 = self.JSD_matrix(nam) return [map(lambda xx: nam[xx], m) for m in find_clusters(hierarchical_clustering(mat2, labels=nam, cluster_distance=True, thres=clus_thres)).values()] class RootedGraph(UnrootedGraph): """ Inherits from UnrootedGraph. Main class for topological analysis of longitudinal single cell RNA-seq expression data. """ def get_distroot(self, root): """ Returns a dictionary of graph distances to node specified by argument 'root' """ distroot = {} for i in sorted(self.pl): distroot[str(i)] = networkx.shortest_path_length(self.gl, str(root), i) return distroot def get_dendrite(self): """ Returns function that for each graph node takes the value of the correlation between the graph distance function to the node and the sampling time fuunction specified by self.rootlane (used internally). """ dendrite = {} daycolor = self.get_gene(self.rootlane, ignore_log=True)[0] for i in self.pl: distroot = self.get_distroot(i) x = [] y = [] for q in distroot.keys(): if distroot[q] != max(distroot.values()): x.append(daycolor[q]-min(daycolor.values())) y.append(distroot[q]) dendrite[str(i)] = -scipy.stats.spearmanr(x, y)[0] return dendrite def find_root(self, dendritem): """ Given the output of RootedGraph.get_dendrite() as 'dendritem', it returns the less and the most differentiated nodes (used internally). """ q = 1000.0 q2 = -1000.0 ind = 0 ind2 = 0 for n in dendritem.keys(): if -2.0 < dendritem[n] < q: q = dendritem[n] ind = n if dendritem[n] > q2 and dendritem[n] > -2.0: q2 = dendritem[n] ind2 = n return ind, ind2 def dendritic_graph(self): """ Builds skeleton of the topological representation (used internally) """ diam = networkx.diameter(self.gl) g3 = networkx.Graph() dicdend = {} for n in range(diam-1): nodedist = [] for k in self.pl: dil = networkx.shortest_path_length(self.gl, self.root, k) if dil == n: nodedist.append(str(k)) g2 = self.gl.subgraph(nodedist) dicdend[n] = sorted(networkx.connected_components(g2)) for n2, yu in enumerate(dicdend[n]): g3.add_node(str(n) + '_' + str(n2)) if n > 0: for n3, yu2 in enumerate(dicdend[n-1]): if networkx.is_connected(self.gl.subgraph(list(yu)+list(yu2))): g3.add_edge(str(n) + '_' + str(n2), str(n-1) + '_' + str(n3)) return g3, dicdend def __init__(self, name, table, rootlane='timepoint', shift=None, log2=True, posgl=False, csv=False, groups=True): """ Initializes the class by providing the the common name ('name') of .gexf and .json files produced by e.g. ParseAyasdiGraph(), the name of the file containing the filtered raw data ('table'), as produced by Preprocess.save(), and the name of the column that contains sampling time points. Optional argument 'shift' can be an integer n specifying that the first n columns of the table should be ignored, or a list of columns that should only be considered. If optional argument 'log2' is False, it is assumed that the filtered raw data is in units of TPM instead of log_2(1+TPM). When optional argument 'posgl' is False, a files name.posg and name.posgl are generated with the positions of the graph nodes for visualization. When 'posgl' is True, instead of generating new positions, the positions stored in files name.posg and name.posgl are used for visualization of the topological graph. """ UnrootedGraph.__init__(self, name, table, shift, log2, posgl, csv, groups) self.rootlane = rootlane self.root, self.leaf = self.find_root(self.get_dendrite()) self.g3, self.dicdend = self.dendritic_graph() self.edgesize = [] self.dicedgesize = {} self.edgesizeprun = [] self.nodesize = [] self.dicmelisa = {} self.nodesizeprun = [] self.dicmelisaprun = {} for ee in self.g3.edges(): yu = self.dicdend[int(ee[0].split('_')[0])][int(ee[0].split('_')[1])] yu2 = self.dicdend[int(ee[1].split('_')[0])][int(ee[1].split('_')[1])] self.edgesize.append(self.gl.subgraph(list(yu)+list(yu2)).number_of_edges()-self.gl.subgraph(yu).number_of_edges() - self.gl.subgraph(yu2).number_of_edges()) self.dicedgesize[ee] = self.edgesize[-1] for ee in self.g3.nodes(): lisa = [] for uu in self.dicdend[int(ee.split('_')[0])][int(ee.split('_')[1])]: lisa += self.dic[uu] self.nodesize.append(len(set(lisa))) self.dicmelisa[ee] = set(lisa) try: from networkx.drawing.nx_agraph import graphviz_layout self.posg3 = graphviz_layout(self.g3, 'sfdp') except: self.posg3 = networkx.spring_layout(self.g3) self.dicdis = self.get_distroot(self.root) pel2, tol = self.get_gene(self.rootlane, ignore_log=True) self.pel = numpy.array([pel2[m] for m in self.pl])*tol dr2 = self.get_distroot(self.root) self.dr = numpy.array([dr2[m] for m in self.pl]) self.po = scipy.stats.linregress(self.pel, self.dr) def select_diff_path(self): """ Returns a linear subgraph of the skeleton of the topological representation that maximizes the number of edges (used internally). """ lista = [] last = '0_0' while True: siz = 0 novel = None for ee in self.dicedgesize.keys(): if ((ee[0] == last and float(ee[1].split('_')[0]) > float(ee[0].split('_')[0])) or (ee[1] == last and float(ee[0].split('_')[0]) > float(ee[1].split('_')[0]))) \ and self.dicedgesize[ee] > siz and ee not in lista: novel = ee siz = self.dicedgesize[ee] if novel is not None: lista.append(novel) if float(novel[1].split('_')[0]) > float(novel[0].split('_')[0]): last = novel[1] else: last = novel[0] else: break return lista def draw_skeleton(self, color, labels=False, ccmap='jet', weight=8.0, save='', ignore_log=False, markpath=False): """ Displays skeleton of topological representation of the data colored according to the expression of a gene, genes or list of genes, specified by argument 'color'. This can be a gene or a list of one, two or three genes or lists of genes, to be respectively mapped to red, green and blue channels. When only one gene or list of genes is specified, it uses color map specified by 'ccmap'. If argument 'labels' is True, node id's are also displayed. Argument 'weight' allows to set a scaling factor for node sizes. When optional argument 'save' specifies a file name, the figure will be save in the file, in the format specified by its extension, and no plot will be displayed on the screen. When 'ignore_log' is True, it treat expression values as being in natural scale, even if self.log2 is True (used internally). When argument 'markpath' is True, it highlights the linear path produced by RootedGraph.select_diff_path(). """ values = [] pg = self.g3 pos = self.posg3 edgesize = self.edgesize nodesize = self.nodesize fig = pylab.figure() networkx.draw_networkx_edges(pg, pos, width=numpy.log2(numpy.array(edgesize)+1)*8.0/float(numpy.log2(1+max(edgesize))), alpha=0.6) if markpath: culer = self.select_diff_path() edgesize2 = [self.dicedgesize[m] for m in culer] networkx.draw_networkx_edges(pg, pos, edgelist=culer, edge_color='r', width=numpy.log2(numpy.array(edgesize2)+1)*8.0/float(numpy.log2(1+max(edgesize))), alpha=0.6) if type(color) == str or (type(color) == list and len(color) == 1): values = [] for _ in pg.nodes(): values.append(0.0) if type(color) == str: color = [[color]] for colorm in color[0]: geys = self.dicgenes[colorm] for llp, ee in enumerate(pg.nodes()): pol = 0.0 if self.log2 and not ignore_log: for uni in self.dicmelisa[ee]: pol += (numpy.power(2, float(geys[uni]))-1.0) pol = numpy.log2(1.0+(pol/float(len(self.dicmelisa[ee])))) else: for uni in self.dicmelisa[ee]: pol += geys[uni] pol /= len(self.dicmelisa[ee]) values[llp] += pol nol = networkx.draw_networkx_nodes(pg, pos, node_color=values, node_size=numpy.array(nodesize)*weight*50.0/float(max(nodesize)), cmap=pylab.get_cmap(ccmap)) nol.set_edgecolor('k') elif type(color) == list and len(color) == 2: geysr = self.dicgenes[color[0]] geysb = self.dicgenes[color[1]] colorr = {} colorb = {} for ee in pg.nodes(): polr = 0.0 polb = 0.0 if self.log2 and not ignore_log: for uni in self.dicmelisa[ee]: polr += (numpy.power(2, float(geysr[uni]))-1.0) polb += (numpy.power(2, float(geysb[uni]))-1.0) polr = numpy.log2(1.0+(polr/float(len(self.dicmelisa[ee])))) polb = numpy.log2(1.0+(polb/float(len(self.dicmelisa[ee])))) else: for uni in self.dicmelisa[ee]: polr += geysr[uni] polb += geysb[uni] polr /= len(self.dicmelisa[ee]) polb /= len(self.dicmelisa[ee]) colorr[ee] = polr colorb[ee] = polb rmax = float(max(colorr.values())) bmax = float(max(colorb.values())) values = [(1.0-colorb[node]/bmax, max(1.0-(colorr[node]/rmax+colorb[node]/bmax), 0.0), 1.0-colorr[node]/rmax) for node in pg.nodes()] nol = networkx.draw_networkx_nodes(pg, pos, node_color=values, node_size=numpy.array(nodesize)*weight*50.0/float(max(nodesize))) nol.set_edgecolor('k') elif type(color) == list and len(color) == 3: geysr = self.dicgenes[color[0]] geysg = self.dicgenes[color[1]] geysb = self.dicgenes[color[2]] colorr = {} colorg = {} colorb = {} for ee in pg.nodes(): polr = 0.0 polg = 0.0 polb = 0.0 if self.log2 and not ignore_log: for uni in self.dicmelisa[ee]: polr += (numpy.power(2, float(geysr[uni]))-1.0) polg += (numpy.power(2, float(geysg[uni]))-1.0) polb += (numpy.power(2, float(geysb[uni]))-1.0) polr = numpy.log2(1.0+(polr/float(len(self.dicmelisa[ee])))) polg = numpy.log2(1.0+(polg/float(len(self.dicmelisa[ee])))) polb = numpy.log2(1.0+(polb/float(len(self.dicmelisa[ee])))) else: for uni in self.dicmelisa[ee]: polr += geysr[uni] polg += geysg[uni] polb += geysb[uni] polr /= len(self.dicmelisa[ee]) polg /= len(self.dicmelisa[ee]) polb /= len(self.dicmelisa[ee]) colorr[ee] = polr colorg[ee] = polg colorb[ee] = polb rmax = float(max(colorr.values())) gmax = float(max(colorg.values())) bmax = float(max(colorb.values())) values = [(max(1.0-(colorg[node]/gmax+colorb[node]/bmax), 0.0), max(1.0-(colorr[node]/rmax+colorb[node]/bmax), 0.0), max(1.0-(colorr[node]/rmax+colorg[node]/gmax), 0.0)) for node in pg.nodes()] nol = networkx.draw_networkx_nodes(pg, pos, node_color=values, node_size=numpy.array(nodesize)*weight*50.0/float(max(nodesize))) nol.set_edgecolor('k') frame1 = pylab.gca() frame1.axes.get_xaxis().set_ticks([]) frame1.axes.get_yaxis().set_ticks([]) if labels: networkx.draw_networkx_labels(pg, pos, font_size=5, font_family='sans-serif') if save == '': pylab.show() else: fig.savefig(save) return values def centroid(self, genin, ignore_log=False): """ Returns the centroid and dispersion of a genes or list of genes specified by argument 'genin'. When 'ignore_log' is True, it treat expression values as being in natural scale, even if self.log2 is True (used internally). """ dicge = self.get_gene(genin, ignore_log=ignore_log)[0] pel1 = 0.0 pel2 = 0.0 pel3 = 0.0 for node in self.pl: pel1 += self.dicdis[node]*dicge[node] pel2 += dicge[node] if pel2 > 0.0: cen = float(pel1)/float(pel2) for node in self.pl: pel3 += numpy.power(self.dicdis[node]-cen, 2)*dicge[node] return [(cen-self.po[1])/self.po[0], (numpy.sqrt(pel3/float(pel2))-self.po[1])/self.po[0]] else: return [None, None] def get_gene(self, genin, ignore_log=False, con=True): """ Returns a dictionary that asigns to each node id the average value of the column 'genin' in the raw table. 'genin' can be also a list of columns, on which case the average of all columns. The output is normalized such that the sum over all nodes is equal to 1. It also provides as an output the normalization factor, to convert the dictionary to log_2(1+TPM) units (or TPM units). When 'ignore_log' is True it treat entries as being in natural scale, even if self.log2 is True (used internally). When 'con' is False, it uses all nodes, not only the ones in the first connected component of the topological representation (used internally). Argument 'genin' may also be equal to the special keyword '_dist_root', on which case it returns the graph distance funtion to the root node. It can be also equal to 'timepoint_xxx', on which case it returns a dictionary with the fraction of cells belonging to timepoint xxx in each node. """ if genin == '_dist_root': return self.get_distroot(self.root), 1.0 elif genin is not None and 'timepoint_' in genin: return self.count_gene(self.rootlane, float(genin[genin.index('_')+1:])) else: return UnrootedGraph.get_gene(self, genin, ignore_log, con) def draw_expr_timeline(self, genin, ignore_log=False, path=False, save='', axis=[], smooth=False): """ It displays the expression of a gene or list of genes, specified by argument 'genin', at different time points, as inferred from the distance to root function. When 'ignore_log' is True, it treat expression values as being in natural scale, even if self.log2 is True (used internally). When optional argument 'save' specifies a file name, the figure will be save in the file, in the format specified by its extension, and no plot will be displayed on the screen. Optional argument 'axis' allows to specify axis limits in the form [xmin, xmax, ymin, ymax]. If argument 'path' is True, expression is computed only across the linear path produced by RootedGraph.select_diff_path(). """ distroot_inv = {} if not path: pel = self.get_distroot(self.root) for m in pel.keys(): if pel[m] not in distroot_inv.keys(): distroot_inv[pel[m]] = [m] else: distroot_inv[pel[m]].append(m) else: pel = self.select_diff_path() cali = [] for mmn in pel: cali.append(mmn[0]) cali.append(mmn[1]) cali = list(set(cali)) for mmn in cali: distroot_inv[int(mmn.split('_')[0])] = self.dicdend[int(mmn.split('_')[0])][int(mmn.split('_')[1])] if type(genin) != list: genin = [genin] polter = {} for qsd in distroot_inv.keys(): genecolor = {} lista = [] for i in self.dic.keys(): if str(i) in distroot_inv[qsd]: genecolor[str(i)] = 0.0 lista += list(self.dic[i]) pol = [] for mju in genin: geys = self.dicgenes[mju] for j in lista: if self.log2 and not ignore_log: pol.append(numpy.power(2, float(geys[j]))-1.0) else: pol.append(float(geys[j])) pol = map(lambda xcv: numpy.log2(1+xcv), pol) polter[qsd] = [numpy.mean(pol)-numpy.std(pol), numpy.mean(pol), numpy.mean(pol)+numpy.std(pol)] x = [] y = [] y1 = [] y2 = [] for m in sorted(polter.keys()): x.append((m-self.po[1])/self.po[0]) y1.append(polter[m][0]) y.append(polter[m][1]) y2.append(polter[m][2]) xnew = numpy.linspace(min(x), max(x), 300) ynew = scipy.interpolate.spline(x, y, xnew) if smooth: ynew = scipy.signal.savgol_filter(ynew, 30, 3) fig = pylab.figure(figsize=(12, 3)) pylab.fill_between(xnew, 0, ynew, alpha=0.5) pylab.ylim(0.0, max(ynew)*1.2) pylab.xlabel(self.rootlane) pylab.ylabel('<log2 (1+x)>') if len(axis) == 2: pylab.xlim(axis) elif len(axis) == 4: pylab.xlim(axis[0], axis[1]) pylab.ylim(axis[2], axis[3]) else: pylab.xlim(min(xnew), max(xnew)) if save == '': pylab.show() else: fig.savefig(save) return polter def plot_CDR_correlation(self, doplot=True): """ Displays correlation between sampling time points and CDR. It returns the two parameters of the linear fit, Pearson's r, p-value and standard error. If optional argument 'doplot' is False, the plot is not displayed. """ pel2, tol = self.get_gene(self.rootlane, ignore_log=True) pel = numpy.array([pel2[m] for m in self.pl])*tol dr2 = self.get_gene('_CDR')[0] dr = numpy.array([dr2[m] for m in self.pl]) po = scipy.stats.linregress(pel, dr) if doplot: pylab.scatter(pel, dr, s=9.0, alpha=0.7, c='r') pylab.xlim(min(pel), max(pel)) pylab.ylim(0, max(dr)*1.1) pylab.xlabel(self.rootlane) pylab.ylabel('CDR') xk = pylab.linspace(min(pel), max(pel), 50) pylab.plot(xk, po[1]+po[0]*xk, 'k--', linewidth=2.0) pylab.show() return po def plot_rootlane_correlation(self): """ Displays correlation between sampling time points and graph distance to root node. It returns the two parameters of the linear fit, Pearson's r, p-value and standard error. """ pylab.scatter(self.pel, self.dr, s=9.0, alpha=0.7, c='r') pylab.xlim(min(self.pel), max(self.pel)) pylab.ylim(0, max(self.dr)+1) pylab.xlabel(self.rootlane) pylab.ylabel('Distance to root node') xk = pylab.linspace(min(self.pel), max(self.pel), 50) pylab.plot(xk, self.po[1]+self.po[0]*xk, 'k--', linewidth=2.0) pylab.show() return self.po def save(self, n=500, filtercells=0, filterexp=0.0, annotation={}): """ Computes RootedGraph.expr(), RootedGraph.delta(), RootedGraph.connectivity(), RootedGraph.connectivity_pvalue(), RootedGraph.centroid() and Benjamini-Holchberg adjusted q-values for all genes that are expressed in more than 'filtercells' cells and whose maximum expression value is above 'filterexp'. The optional argument 'annotation' allos to include a dictionary with lists of genes to be annotated in the table. The output is stored in a tab separated file called name.genes.txt. """ pol = [] with open(self.name + '.genes.tsv', 'w') as ggg: cul = 'Gene\tCells\tMean\tMin\tMax\tConnectivity\tp_value\tq-value (BH)\tCentroid\tDispersion\t' for m in sorted(annotation.keys()): cul += m + '\t' ggg.write(cul[:-1] + '\n') lp = sorted(self.dicgenes.keys()) for gi in lp: if self.expr(gi) > filtercells and self.delta(gi)[2] > filterexp: pol.append(self.connectivity_pvalue(gi, n=n)) por = benjamini_hochberg(pol) mj = 0 for gi in lp: po = self.expr(gi) m1, m2, m3 = self.delta(gi) p1, p2 = self.centroid(gi) if po > filtercells and m3 > filterexp: cul = gi + '\t' + str(po) + '\t' + str(m1) + '\t' + str(m2) + '\t' + str(m3) + '\t' +\ str(self.connectivity(gi)) + '\t' + str(pol[mj]) + '\t' + str(por[mj]) +\ '\t' + str(p1) + '\t' + str(p2) + '\t' for m in sorted(annotation.keys()): if gi in annotation[m]: cul += 'Y' + '\t' else: cul += 'N' + '\t' ggg.write(cul[:-1] + '\n') mj += 1 centr = [] disp = [] centr2 = [] disp2 = [] f = open(self.name + '.genes.tsv', 'r') for n, line in enumerate(f): if n > 0: sp = line[:-1].split('\t') if float(sp[7]) <= 0.05: centr.append(float(sp[1])) disp.append(float(sp[5])) else: centr2.append(float(sp[1])) disp2.append(float(sp[5])) f.close() pylab.scatter(centr2, disp2, alpha=0.2, s=9, c='b') pylab.scatter(centr, disp, alpha=0.3, s=9, c='r') pylab.xlabel('cells') pylab.ylabel('connectivity') pylab.yscale('log') pylab.ylim(0.01, 1) pylab.xlim(0, max(centr+centr2)) fig = pylab.figure() ax = fig.add_subplot(111, projection='3d') f = open(self.name + '.genes.tsv', 'r') for n, line in enumerate(f): if n > 0: sp = line[:-1].split('\t') if float(sp[7]) <= 0.05: ax.scatter(float(sp[8]), float(sp[9]), float(sp[1]), c='k', alpha=0.2, s=10) ax.set_xlabel('Centroid') ax.set_ylabel('Dispersion') ax.set_zlabel('Cells') pylab.show() def cellular_subpopulations(self, min_dispersion, threshold=0.05, min_cells=5, max_K=8, method='centroid', clus_thres=0.65): """ Identifies potential transient cellular subpopulations. The parameter 'min_dispersion' sets an upper bound of the dispersion of the genes that are considered in the analysis. The parameter 'threshold' sets an upper bound of the q-value of the genes that are considered in the analysis. The parameter 'min_cells' sets the minimum number of cells on which each of the genes considered in the analysis is expressed. The parameter 'method' specifies the method used to determine transient populations. When 'method' is set to 'centroid', cellular subpopulations are determined by clustering the centroids of low dispersion genes using the results stored in the file 'name.genes.txt', produced by RootedGraph.save() The parameter 'max_K' sets the maximum number of clusters to be considered in the analysis. Two plots are produced. The first plot shows the dependence of the Davies-Bouldin index with respect to the number of clusters. The second plot displays the dispersion and centroid of genes that satisfy the threshold in the number of cells and the q-value, and shows the optimal clustering of low dispersion genes. When the parameter 'method' is set to 'js', cellular subpopulations are determined by clustering the Jensen-Shannon distance matrix of the genes that pass all the constraints. The number of clusters is controlled in this case by the parameter 'clus_thres'. In both cases a list with the genes associated to each cluster is returned. It requires the presence of the file 'name.genes.tsv', produced by the method RotedGraph.save(). """ con = [] dis = [] contot = [] distot = [] nam = [] f = open(self.name + '.genes.tsv', 'r') for n, line in enumerate(f): if n > 0: sp = line[:-1].split('\t') if float(sp[9]) < min_dispersion and float(sp[7]) < threshold and float(sp[1]) > min_cells: con.append(float(sp[8])) dis.append(float(sp[9])) nam.append(sp[0]) elif float(sp[7]) < threshold and float(sp[1]) > min_cells: contot.append(float(sp[8])) distot.append(float(sp[9])) f.close() if method == 'centroid': y = [] z = [] for m in range(2, max_K): y_pred = sklearn.cluster.KMeans(n_clusters=m, random_state=170).fit_predict(numpy.array( zip(con, [0.0]*len(con)))) clus = [[] for _ in range(m)] for t, q in zip(y_pred, con): clus[t].append(q) rij = [] numpy.array(clus) for n1 in range(m): rij.append([]) for n2 in range(m): if n2 != n1: rij[-1].append( (numpy.var(clus[n1])-numpy.var(clus[n2]))/(abs( numpy.mean(clus[n1])-numpy.mean(clus[n2])))) else: rij[-1].append(0.0) ri = [] for n in range(m): ri.append(max(rij[n])) ssb = 0.0 ssw = 0.0 for n in range(m): ssb += len(clus[n])*(numpy.mean(clus[n])-numpy.mean(con))**2 for q in clus[n]: ssw += (q-numpy.mean(clus[n]))**2 y.append(numpy.mean(ri)) z.append(ssb*(len(con)-m)/(ssw*(m-1))) pylab.figure() pylab.plot(range(2, max_K), y, 'r-', linewidth=2.0) r = numpy.infty rn = -1 for n, m in enumerate(y): if m < r: rn = n r = m y_pred = sklearn.cluster.KMeans(n_clusters=rn+2, random_state=170).fit_predict( numpy.array(zip(con, [0.0]*len(con)))) pylab.figure() pylab.scatter(contot, distot, c='k', alpha=0.1) pylab.scatter(con, dis, c=y_pred, alpha=0.6) pylab.xlabel('Centroid') pylab.ylabel('Dispersion') pylab.show() g = [[] for _ in range(rn + 2)] for m, n in zip(list(y_pred), nam): g[m].append(n) return g elif method == 'js': return UnrootedGraph.cellular_subpopulations(self, threshold=threshold, min_cells=min_cells, clus_thres=clus_thres)