#!/usr/bin/env python """ Module for miRDeep2 wrappers and utilities Created July 2014 Copyright (C) Damien Farrell This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import sys, os, string, types, re import shutil, glob, collections import itertools import subprocess #import pylab as plt import numpy as np import pandas as pd from . import base, utils mirdeep2options = {'base': [('input',''),('adapter','TGGAATTCTCGGGTGCCAAGG'),('filetype','fastq'), ('bowtieindex',''),('refgenome',''),('species','hsa'), ('mature',''), ('hairpin',''), ('other',''),('mirbase',os.getcwd()), ('randfold',1), ('overwrite',1)]} mirdeepcols = ['#miRNA','read_count','mean_norm','miRDeep2 score','chr','seed','precursor', 'freq','precursor coordinate','mirbase seed match','star read count','rfam alert', 'consensus mature sequence','consensus star sequence', 'consensus precursor sequence'] def create_mirbase_files(species,path): """Generate species specific mature/hairpin files for input to mirdeep""" mature = os.path.join(path, 'mature.fa') hairpin = os.path.join(path, 'hairpin.fa') names=[] for f in [mature,hairpin]: fname = os.path.splitext(f)[0]+'_'+species+'.fa' utils.get_subset_fasta(f, labels=[species], outfile=fname) names.append(fname) print ('wrote mirbase files for species %s' %species) return names def create_sample_map(path, ext='fastq'): """Create filename mapping to run all samples at once. This is required.""" os.chdir(path) files = sorted(glob.glob('*.'+ext)) print (files) fname = 'combined.txt' i=1 rows=[] for f in files: rows.append((os.path.basename(f), 's%02d'%i)) i+=1 res=pd.DataFrame(rows) res.to_csv(fname, index=False,sep=' ',header=False) return (fname) def combine_labels(labels, filename): """Combine sample labels with mirdeep combined.txt file so we can match sample ids""" comb = pd.read_csv(filename,sep=' ',names=['filename','id']) comb['name'] = comb.filename.apply (lambda x: x.split('.')[0]) labels = labels.merge(comb,on='name') labels['id'] = labels.id+'(norm)' return labels def run_multiple(**kwargs): """Prepare and run mirdeep2""" if kwargs['filetype'] == 'fasta': ext='fa' else: ext='fastq' path = kwargs['input'] #create filename/id mapping samplemap = create_sample_map(path, ext) #get mirbase subset for species if provided if kwargs['species'] != '': mature, hairpin = create_mirbase_files(kwargs['species'], kwargs['mirbase']) kwargs['mature'] = mature kwargs['hairpin'] = hairpin if kwargs['other'] != '': kwargs['other'], h = create_mirbase_files(kwargs['other'], kwargs['mirbase']) run(samplemap, **kwargs) return def run(infile, refgenome, bowtieindex, mature='', hairpin='', other='', randfold=False, overwrite=True, filetype='fastq', adapter=None, clean=True, outpath=None, **kwargs): """Run all mirdeep2 steps including adapter trimming. Uses a config file even if we only have one sample.""" label = os.path.splitext(os.path.basename(infile))[0] print ('running %s' %label) os.environ["BOWTIE_INDEXES"] = os.path.dirname(bowtieindex) collapsed = 'collapsedreads.fa' if filetype=='fasta': mapparams='-c' else: mapparams='-e -h' if adapter =='': adapter = 'none' if randfold == False: params='-c' else: params = '' if other=='': other = 'none' #if mapping has been done already we can skip it if not os.path.exists('mapped.arf') or overwrite == True: try: os.remove('mapped.arf') os.remove(collapsed) except: pass cmd1 = ('mapper.pl %s -d %s -j -l 18 -m -k %s -s %s' ' -p %s -t mapped.arf -v' %(infile,mapparams,adapter,collapsed,bowtieindex)) print (cmd1) result = subprocess.check_output(cmd1, shell=True, executable='/bin/bash') else: print ('arf file found, skipping mapper step') #mirdeep core cmd2 = ('miRDeep2.pl %s %s mapped.arf' ' %s %s %s -z _%s %s' ' -d > report.log' %(collapsed,refgenome,mature,other,hairpin,label,params)) print (cmd2) result = subprocess.check_output(cmd2, shell=True, executable='/bin/bash') #remove junk if clean == True: tempfiles = glob.glob('dir_*') for f in tempfiles: shutil.rmtree(f) #move results to dest folder if outpath != None: pass #move files.. return def quantifier(path, mature, precursor, star=None, collapsed='collapsedreads.fa', time='novel'): """Run quantifier module using custom known mature/precursors""" current = os.getcwd() os.chdir(path) cmd = 'quantifier.pl -p %s -m %s -r %s -y %s -k -d -g 1 -U' %(precursor,mature,collapsed,time) print (cmd) result = subprocess.check_output(cmd, shell=True, executable='/bin/bash') os.chdir(current) return def get_pdf_path(path): """get path to pdfs""" ppath = glob.glob(os.path.join(path,'pdf*'))[0] return ppath def get_chromosome(x): val = x.split('_')[0] try: return '%02d' %int(val) except: return val def get_coords(x): """Get start/end from precursor coords string""" l = re.split("[\:\..]+",x) return pd.Series(l[:4],index=['chr','start','end','strand']) def get_score_stats(path): """Get mirdeep results from the summary file""" resfile = glob.glob(os.path.join(path,'result*.csv'))[0] if os.path.splitext(resfile)[1] != '.csv': return df = pd.read_csv(resfile, sep='\t',header=0,nrows=22) df = df.dropna() df['known'] = df['known miRNAs detected by miRDeep2'].apply(lambda r: int(r.split()[0])) df['FDR'] = df['novel miRNAs, estimated false positives'].apply(lambda r: int(r.split()[0])) return df def plot_score_stats(df): f,axs=plt.subplots(2,2,figsize=(10,6)) grid=axs.flat df.plot('miRDeep2 score','known',marker='o',ax=grid[0],legend=False) grid[0].set_title('known miRNAs') df.plot('miRDeep2 score','novel miRNAs reported by miRDeep2',marker='o',ax=grid[1],legend=False) grid[1].set_title('novel miRNAs') df.plot('miRDeep2 score','estimated signal-to-noise',marker='o',ax=grid[2],legend=False) grid[2].set_title('signal-to-noise') df.plot('miRDeep2 score','FDR',marker='o',ax=grid[3],legend=False) grid[3].set_title('FDR') plt.tight_layout() f.savefig('mirdeep_score_stats.png') return def read_results_file(infile): """Get mirdeep results from the summary file""" if os.path.splitext(infile)[1] != '.csv': return scol = 'miRDeep2 score' df = pd.read_csv(infile, sep='\t',header=23) df = df.dropna() #remove junk idx = df[df['provisional id']=='tag id'].index[0] df = df.drop(idx) df['novel'] = np.where(df['miRBase miRNA']=='-',True,False) colstorename = {'example miRBase miRNA with the same seed':'mirbase seed match', 'significant randfold p-value': 'randfold'} df = df.rename(columns=colstorename) #df = df.convert_objects(convert_numeric=True) df = df.infer_objects() #df['chr'] = df['provisional id'].apply(get_chromosome) df['seed'] = df['consensus mature sequence'].apply(lambda x: x[1:8]) coords = df['precursor coordinate'].apply(get_coords) df = df.join(coords) #pd.concat([df,coords]) return df def get_results(path): """Process known and novel results from mirdeep run. Combines with expression data and removes redundant entries""" resfile = glob.glob(os.path.join(path,'result*.csv'))[0] df = read_results_file(resfile) #use quantifier module to get novel expression results from predicted precursors #if not done already novelmature = os.path.join(path,'novel_mature.fa') novelprecursor = os.path.join(path,'novel_precursor.fa') res = os.path.join(path, 'expression_novel.html') reads = 'collapsedreads.fa' if not os.path.exists(res): novel = df[df.novel==True] mkey = 'consensus mature sequence' pkey = 'consensus precursor sequence' utils.dataframe_to_fasta(novel, mkey, 'provisional id', outfile=novelmature) utils.dataframe_to_fasta(novel, pkey, 'provisional id', outfile=novelprecursor) quantifier(path, os.path.abspath(novelmature), os.path.abspath(novelprecursor)) #get expression results and merge with prediction results to get other info files = glob.glob(os.path.join(path,'miRNAs_expressed_all_samples*.csv')) res=[] for f in files: if 'novel' in f: key='provisional id' else: key='miRBase miRNA' q = pd.read_csv(f,sep='\t') samples = float(len(q.filter(regex="norm").columns)) #print 'samples: %s' %samples q['freq'] = q.filter(regex="norm").apply(lambda r: len(r.nonzero()[0])/samples,1) #apply 5p id so we can merge with results file and keep star seqs q['id'] = q['#miRNA'].apply(lambda x: x[:-2]+'5p' if str(x).endswith('3p') else x) #loses information on multiple precursors for a mature seq q = q.merge(df,left_on=['id'],right_on=key).drop_duplicates('#miRNA') res.append(q) res = pd.concat(res) res = res.apply( lambda x: pd.to_numeric(x, errors='ignore' )) #get mean normalised count res['mean_norm'] = res.filter(regex="norm").apply(lambda r: r[r.nonzero()[0]].mean(),1) res = res.sort_values(by=['read_count'], ascending=False) res = res.drop_duplicates('#miRNA') res = res.reset_index(drop=True) #res['std'] = res.filter(regex="norm").std(1) #res['cv'] = res['std']/res['mean_norm'] return res def get_column_names(df): """Extract column names for multiple samples""" cols = [i for i in df.columns if (i.startswith('s') and len(i)<=3)] normcols = [i+'(norm)' for i in cols] return cols, normcols def filter_expr_results(df, cols=None, score=0, freq=0.5, mean_norm=0, total_reads=0): """Additional filters for abundances/no. samples""" if cols is None: cols = [i for i in df.columns if (i.startswith('s') and len(i)<=3)] normcols = [i+'(norm)' for i in cols] df = df[(df['miRDeep2 score']>=score)] df = df[df.freq>=freq] df = df[df['read_count']>=total_reads] df = df[df['mean_norm']>=mean_norm] df = df[df['randfold'].isin(['yes','-'])] #n = n[n['rfam alert']=='-'] #df = df.reset_index(drop=True) return df def analyse_results(path, outpath=None, **kwargs): """General analysis of mirdeep results""" if outpath != None: if not os.path.exists(outpath): os.mkdir(outpath) os.chdir(outpath) df = get_results(path) idcols,normcols = get_column_names(df) known = df[df.novel==False] novel = df[df.novel==True] idmap = get_file_ids(path) #cutoffs and freqs need to be configurable.. k = filter_expr_results(known,score=0,freq=.5,total_reads=50) n = filter_expr_results(novel,score=4,freq=.8,total_reads=50) cols = mirdeepcols core = pd.concat([k,n]) utils.dataframe_to_fasta(core, 'consensus mature sequence', '#miRNA', 'mirdeep_core.fa') k[cols].to_csv('known_mirdeep.csv') n[cols].to_csv('novel_mirdeep.csv') utils.create_html(n[cols],'novel_mirdeep') k['perc'] = k['read_count']/k['read_count'].sum() print (k[cols[:8]]) print print (n[cols[:9]]) print ('mirdeep summary') print ('-------------------------------') print (('%s (%s novel) identified' %(len(df),len(novel)))) print ('quantifier results after filtering:') print ('%s/%s known' %(len(k),len(known))) print ('%s/%s novel' %(len(n),len(novel))) print ('top 10 known account for %2.2f' %k['perc'][:10].sum()) print ('%s are 3p strand' %len(k[k['#miRNA'].str.contains('3p')])) #print df[(df.mean_norm>300) & (df.freq<0.5)][cols[:7]] print k=k.set_index('#miRNA') n=n.set_index('#miRNA') fig, ax = plt.subplots(figsize=(8,6)) k['read_count'][:10].plot(kind='barh',colormap='Spectral',ax=ax,log=True) plt.title('miRDeep2 top 10') plt.tight_layout() fig.savefig('mirdeep_top_known.png',dpi=150) fig, ax = plt.subplots(figsize=(8,8)) df.plot('freq','mean_norm',kind='scatter',ax=ax,logy=True,alpha=0.8) fig.savefig('mirdeep_freqsvcounts.png') fig=plt.figure() fig = plot_read_count_dists(n,h=5) fig.savefig('mirdeep_novel_counts.png') fig = plot_read_count_dists(k) fig.savefig('mirdeep_known_counts.png') #perSampleDists(k) fig,ax = plt.subplots(figsize=(10,6)) core[idcols].sum().plot(kind='bar',ax=ax) plt.title('total miRNA counts per sample (unnormalised)') plt.tight_layout() fig.savefig('mirdeep_total_persample.png') #found per chromosome fig = plt.figure(figsize=(8,6)) x = core.sort('chr').groupby('chr').size() x.plot(kind='bar') plt.title('miRNA per chromosome') fig.savefig('mirdeep_chromosome_dist.png') ss = get_score_stats(path) plot_score_stats(ss) #plt.show() #plt.close() return df,k,n def plot_read_count_dists(df,h=8): """Boxplots of read count distributions per miRNA - use seaborn?""" w=int(h*(len(df)/60.0))+4 fig, ax = plt.subplots(figsize=(w,h)) cols,normcols = get_column_names(df) df = df[normcols] t=df.T t.index = cols try: import sns sns.boxplot(t,linewidth=1.0,saturation=0.2,palette='coolwarm_r') sns.despine(trim=True) except: t.plot(kind='box',color='black',grid=False,whis=1.0,ax=ax) ax.set_yscale('log') plt.setp(ax.xaxis.get_majorticklabels(), rotation=90) plt.ylabel('read count') plt.tight_layout() return fig def get_file_ids(path): """Get file<->mirdeep2 id mapping""" idmap = pd.read_csv(os.path.join(path, 'combined.txt'),sep=' ', header=None,names=['filename','id']) return idmap def get_label_map(path, labels): """Get results labels mapped to labels with the filenames""" condmap = pd.read_csv(labels) idmap = get_file_ids(path) def matchname(x): r = idmap[idmap['filename'].str.contains(x)] if len(r)>0: return r.id.squeeze() else: return np.nan condmap['id'] = condmap.filename.apply(lambda x: matchname(x),1) condmap = condmap.dropna() return condmap def test_quantifier(path): resfile = glob.glob(os.path.join(path,'result*.csv'))[0] df = read_results_file(resfile) novelmature = os.path.join(path, 'novel_mature.fa') novelstar = os.path.join(path, 'novel_star.fa') novelprecursor = os.path.join(path, 'novel_precursor.fa') reads = '../results_mirdeep_combined/collapsedreads.fa' #base.dataframe2Fasta(df[df.novel==True], 'consensus star sequence', 'provisional id', # outfile=novelstar) quantifier(path, os.path.abspath(novelmature), os.path.abspath(novelprecursor), os.path.abspath(novelstar), reads) return def check_quantifier_results(path): """Check quantifier vs results file in case of miscounts""" resfile = glob.glob(os.path.join(path,'result*.csv'))[0] df = read_results_file(resfile) files = glob.glob(os.path.join(path,'miRNAs_expressed_all_samples*.csv')) q = pd.read_csv(files[0],sep='\t') key='provisional id' m=q.merge(df,left_on='#miRNA',right_on=key).drop_duplicates('#miRNA') m.sc = m['miRDeep2 score'] m['err'] = abs(m['read_count']-m['total read count']) cols=['#miRNA','total read count','read_count','miRDeep2 score'] print (m[m.err>400].sort('total read count',ascending=False)[cols]) m['size'] = np.select([m.sc < 2, m.sc < 3, m.sc < 4], [20,40,50], 80) f,ax=plt.subplots(1,1) plt.xscale('log') plt.yscale('log') m.plot(x='total read count',y='read_count', kind='scatter',s=60,alpha=0.6,ax=ax) #ax.plot([0, 1], [0, 1], transform=ax.transAxes,color='red',alpha=0.7) plt.show() return def main(): from optparse import OptionParser parser = OptionParser() parser.add_option("-r", "--run", dest="run", action='store_true', help="run predictions") parser.add_option("-i", "--input", dest="input", help="input path or file") parser.add_option("-c", "--config", dest="config", help="config file") parser.add_option("-a", "--analyse", dest="analyse", help="analyse results of mirdeep2") opts, remainder = parser.parse_args() pd.set_option('display.width', 800) if opts.run == True: #all other options are stored in config file if opts.config == None: print ('No config file provided.') base.write_default_config('mirdeep2.conf', defaults=mirdeep2options) return cp = base.parse_config(opts.config) if opts.input != None: conf.input = os.path.abspath(opts.input) options = base.get_options(cp) run_multiple(**options) elif opts.analyse != None: analyse_results(opts.analyse) elif opts.test == True: test(opts.input) if __name__ == '__main__': main()