python source code of analysis

#!/usr/bin/env python

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""
smallrnaseq additional analysis routines
Created June 2014
Copyright (C) Damien Farrell
"""

from __future__ import absolute_import, print_function
import sys, os, string, types, re, csv
import shutil, glob, collections
import itertools
import subprocess
import pylab as plt
import seaborn as sns
import numpy as np
import pandas as pd
try:
    import HTSeq
except:
    'HTSeq not present'

from . import base, utils

def read_length_dist(df):

    df['length'] = df.seq.str.len()
    bins = np.linspace(1,df.length.max(),df.length.max())
    x = np.histogram(df.length,bins=bins)
    return x

def summarise_reads(path):
    """Count reads in all files in path"""

    resultfile = os.path.join(path, 'read_stats.csv')
    files = glob.glob(os.path.join(path,'*.fastq'))
    vals=[]
    rl=[]
    for f in files:
        label = os.path.splitext(os.path.basename(f))[0]
        s = utils.fastq_to_dataframe(f)
        l = len(s)
        vals.append([label,l])
        print (label, l)

    df = pd.DataFrame(vals,columns=['path','total reads'])
    df.to_csv(resultfile)
    df.plot(x='path',y='total reads',kind='barh')
    plt.tight_layout()
    plt.savefig(os.path.join(path,'total_reads.png'))
    #df = pd.concat()
    return df

def remove_known_rnas(path, adapters=[], outpath='RNAremoved'):
    """Map to annotated RNAs and put remaining reads in output dir"""

    index = 'bosTau6-tRNAs'
    params = '-v 0 --best'
    files = glob.glob(os.path.join(path,'*.fastq'))
    #files = ['test.fastq']
    if not os.path.exists(outpath):
        os.mkdir(outpath)
    for f in files:
        print (f)
        label = os.path.splitext(os.path.basename(f))[0]
        trimAdapters(f, adapters)
        fastq2fasta('cut.fastq')
        rem = os.path.join(outpath, label+'.fa')
        #samfile = os.path.join(outpath, '%s_%s_mapped.sam' %(label,index))
        samfile = 'out.sam'
        base.bowtie_align('cut.fa', index, outfile=samfile, params=params, remaining=rem)

    return

def ks_test(df, ids):
    """KS test to determine rc freq distributions of replicates
    threshold count is the one which corresponds to the first minimum.
    i.e.  when the distributions of reads amongst replicates begin to be similar
    see http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822534/
    ids are pairs of replicate column ids for mirna results
    """

    def getMin(x):
        r=0
        x=list(x)
        m=x[0]
        for i in range(len(x)):
            if x[i]<m and x[i+1]>x[i] and abs(m-x[i])>0.02:
                r=readcuts[i]
                break
        return r

    from scipy.stats import ks_2samp
    readcuts = np.arange(0,300,2)
    result=[]
    testdata=[]
    for i in ids:
        stats=[]
        for rc in readcuts:
            d = mdp.filterExprResults(df,score=-500,freq=0,meanreads=rc,totalreads=0)
            testdata.append(d)
            s1,s2=i
            x=d[s1+'(norm)']
            y=d[s2+'(norm)']
            val = ks_2samp(x, y)
            #print len(x),len(y),rc, val
            stats.append(val[0])
        result.append(stats)
    result = pd.DataFrame(np.array(result).T)
    result=result.set_index(readcuts)
    result.plot(lw=2,colormap='Set1',legend=False)
    tmins = result.apply(getMin)
    print (tmins)
    mean=tmins.mean()
    std=tmins.std()

    plt.axvspan(mean-std/2,mean+std/2,color='g',alpha=0.3)
    plt.xlabel('read count threshold')
    plt.ylabel('KS')
    plt.savefig('KS_test.png',dpi=150)

    return

def mirna_discovery_test(sourcefile):
    """Test miRNAs found per read file size. Assumes runs have been
     done and placed in benchmarking dir"""

    path = 'benchmarking'
    #sizes = np.arange(5e5,7.e6,5e5)
    #sizes = np.insert(sizes,0,[5e4,1e5,2e5])
    #print sizes
    #base.createRandomFastqFiles(sourcefile, path, sizes)

    #mirdeep
    outpath = 'benchmarking/mirdeep'
    a = mdp.getResults(outpath)
    k1 = a[a.novel==False]
    k1=k1.set_index('#miRNA')
    k1 = k1[k1.s16>5]
    mp1 = mdp.get_file_ids(outpath)
    cols,ncols = mdp.get_column_names(k1)
    #srnabench
    outpath = 'benchmarking/srnabench'
    #k2,n,iso = srb.get_results(outpath)
    k2 = k2.set_index('name')
    k2 = k2[k2.s16>5]
    mp2 = srb.getFileIDs(outpath).sort('filename')

    def getFound(k,mapping,rc=5):
        tot = float(len(k))
        f=[]
        results=[]
        for i,r in mapping.iterrows():
            fn = r.filename
            i = float(re.findall("\d+.\d+",fn)[0])
            df = k[k[r.id]>rc]
            n = df[-df.index.isin(f)]
            f.extend(n.index)
            results.append((i,len(f),len(f),n.total.max()))
        r = pd.DataFrame(results,columns=['reads','total','frac','rcm'])
        return r

    r1 = getFound(k1, mp1)
    r2 = getFound(k2, mp2)

    fig=plt.figure(figsize=(8,6))
    ax=fig.add_subplot(111)
    l1=ax.plot(r1.reads,r1.frac,'bo-',color='blue',lw=2)
    l2=ax.plot(r2.reads,r2.frac,'bo-',color='red',lw=2)
    ax.set_xlabel('reads (million)')
    ax.set_ylabel('total miRNA found')
    ax.legend(l1+l2,['mirdeep2','srnabench'],loc=2)
    ax3 = plt.axes([.5, .25, .3, .3])
    ax3.semilogy(r1.reads,r1.rcm,color='blue',ls='-')
    ax3.semilogy(r2.reads,r2.rcm,color='red',ls='-')
    ax3.set_title('mean abundance (new miRNAs)')
    plt.tight_layout()
    fig.savefig('benchmark_discovery.png')

    '''fig,ax=plt.subplots(1,1)
    x=k1[cols]
    from matplotlib.colors import LogNorm
    x=x.replace(0,.1)
    ax.pcolor(x,cmap='Blues',
              norm=LogNorm(vmin=x.min().min(), vmax=x.max().max()))
    plt.yticks(np.arange(0.5, len(x.index), 1), x.index)
    plt.xticks(np.arange(0.5, len(x.columns), 1), x.columns)'''

    plt.show()
    return

def do_pca(X, c=3):
    """Do PCA"""

    from sklearn import preprocessing
    from sklearn.decomposition.pca import PCA, RandomizedPCA
    #do PCA
    #S = standardize_data(X)
    S = pd.DataFrame(preprocessing.scale(X),columns = X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    print (pca.explained_variance_ratio_)
    #print pca.components_
    w = pd.DataFrame(pca.components_,columns=S.columns)#,index=['PC1','PC2'])
    #print w.T.max(1).sort_values()
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX,index=X.index)
    return pX

def plot_pca(pX, palette='Spectral', labels=None, ax=None, colors=None):
    """Plot PCA result, input should be a dataframe"""

    if ax==None:
        fig,ax=plt.subplots(1,1,figsize=(6,6))
    cats = pX.index.unique()
    colors = sns.mpl_palette(palette, len(cats)+1)
    print (len(cats), len(colors))
    for c, i in zip(colors, cats):
        #print (i, len(pX.ix[i]))
        #if not i in pX.index: continue
        ax.scatter(pX.ix[i, 0], pX.ix[i, 1], color=c, s=90, label=i,
                   lw=.8, edgecolor='black', alpha=0.8)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    i=0
    if labels is not None:
        for n, point in pX.iterrows():
            l=labels[i]
            ax.text(point[0]+.1, point[1]+.1, str(l),fontsize=(9))
            i+=1
    ax.legend(fontsize=10,bbox_to_anchor=(1.5, 1.05))
    sns.despine()
    plt.tight_layout()
    return

def do_mds(X):
    """Do MDS"""

    from sklearn import manifold
    seed = np.random.RandomState(seed=3)
    mds = manifold.MDS(n_components=3, max_iter=3000, eps=1e-9, random_state=seed,
                        n_jobs=1)
    pX = mds.fit(X.values).embedding_
    pX = pd.DataFrame(pX,index=X.index)
    return pX

def get_aligned_reads_lengths(path, label, refs):
    """Get read length distributions mapped to sam files representing
      different ref annotations. Returns dataframe of read length freqs
      with column label for db mapped to"""

    cols=[]
    for ref in refs:
        f = os.path.join(path,'%s_%s.sam' %(label,ref))
        reads = base.get_aligned(f)
        x = read_length_dist(reads)
        x = pd.Series(x[0],index=x[1][:-1],name=ref)
        x = x/x.sum()
        #print x
        cols.append(x)
    x = pd.DataFrame(cols).T
    x.index = x.index.astype(int)
    x.index.name = 'length'
    return x

def classify(X, y, cl, name=''):
    """Classification using gene features"""

    from sklearn.metrics import classification_report, accuracy_score
    np.random.seed()
    ind = np.random.permutation(len(X))

    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest  = train_test_split(X, y, test_size=0.4)
    #print X
    cl.fit(Xtrain, ytrain)
    ypred = cl.predict(Xtest)

    print (classification_report(ytest, ypred))
    #print accuracy_score(ytest, ypred)
    from sklearn import cross_validation
    yl = pd.Categorical(y).labels
    sc = cross_validation.cross_val_score(cl, X, yl, scoring='roc_auc', cv=5)
    print("AUC: %0.2f (+/- %0.2f)" % (sc.mean(), sc.std() * 2))
    return cl

def read_length_distributions(path, refs):
    """Get read lengths for all aligned files in path mapped
       to ref db names, assumes the fasta files and sam files are present in the
       target folder
    """

    files = glob.glob(path+'/*.fa')
    fnames = [os.path.splitext(os.path.basename(f))[0] for f in files]
    fnames = [i for i in fnames if not i.endswith('_r')]
    print (fnames)

    res = []
    for n in fnames:
        x = get_aligned_reads_lengths(path, n, refs)
        x = x.ix[10:40]
        x['file'] = n
        res.append(x)
    res = pd.concat(res)
    res = res.reset_index()
    #print res[:60]
    #plot histograms
    m = pd.melt(res, id_vars=['file','length'], value_vars=refs,
                var_name='ref', value_name='freq')
    g = sns.factorplot(y='freq',x='length', data=m, row='ref', kind="bar",
                        size=2,aspect=4,sharex=False,palette='Blues_d')
    plt.savefig(os.path.join(path,'read_lengths.png'))
    return res

def get_trna_fragments(samfile, fastafile, truecounts, bedfile=None):
    """Get trf5/3/i fragments from a set reads aligned to a trna sequences.
    This finds the locations of all reads inside the aligned parent and
    classifies the fragments using a scheme similar to Telonis at al."""

    refs = utils.fasta_to_dataframe(fastafile)
    #get rna structure for reference seqs?
    #refs['structure'] = refs.apply(lambda r: utils.rnafold(r.sequence)[0],1)
    a = utils.get_aligned_reads(samfile, truecounts)
    #a = a.merge(truecounts, on='seq')
    print ('%s total sequences with %s counts' %(len(a),a.reads.sum()))
    if len(a) == 0:
        return

    def get_type(x, refs):
        #classify by position in parent seqs
        n = x['name']
        seq = refs.ix[n].sequence
        if x.start<=2:
            return 'trf5'
        elif len(seq)-x.end == 0 and x.seq.endswith('CCA'):
            return 'trf3'
        else:
            return 'itrf'

    def get_loop(x):
        if x.start>11 and x.start<17:
            return 'D'
        elif x.start>30 and x.start<40:
            return 'A'

    #remove sequence redundancy by grouping into unique fragments then get classes
    #first sort reads by sequence and name so we get consistent ids for different samples..
    a = a.sort_values(['seq','name'])
    f = a.groupby('seq').agg({'name':base.first, 'reads':np.max, 'read':np.size,
                                 'start':base.first,'end':base.first})
    f=f.rename(columns={'read':'loci'})
    f = f.reset_index()
    f['anticodon'] = f.apply(lambda x: x['name'].split('-')[1],1)
    f['aa'] = f.anticodon.str[:3]
    f['perc'] = (f.reads/f.reads.sum()*100).round(3)
    f['length'] = f.seq.str.len()

    f['trf'] = f.apply(lambda x: get_type(x, refs), 1)
    f['loop'] = f.apply(lambda x: get_loop(x), 1)
    f['id'] = f.apply(lambda x:'%s_%s.%s.%s' % (x['name'],x.length,x.start,x.end),1)

    print ('%s unique fragments' %len(f))
    print ('%s total counts in fragments' %f.reads.sum())
    return f

if __name__ == '__main__':
    main()