python source code of insertsizes

import collections
import logging
import os
import tempfile
import numpy

from svviz.utilities import mean, stddev
from svviz.kde import gaussian_kde
from svviz import plotting

def removeOutliers(data, m = 10.):
    """ a method of trimming outliers from a list/array using 
    outlier-safe methods of calculating the center and variance;
    only removes the upper tail, not the lower tail """
    if len(data) < 2:
        return data
        
    data = numpy.array(data)
    d_abs = numpy.abs(data - numpy.median(data))
    d = data - numpy.median(data)
    mdev = numpy.median(d_abs)
    s = d/mdev if mdev else 0.
    return data[s<m]

def chooseOrientation(orientations):
    logging.info("  counts +/-:{:<6} -/+:{:<6} +/+:{:<6} -/-:{:<6} unpaired:{:<6}".format(orientations[False, True], 
                                                    orientations[True, False], 
                                                    orientations[True, True],
                                                    orientations[False, False],
                                                    orientations["unpaired"]))
    ranked = sorted(orientations, key=lambda x: orientations[x])
    chosenOrientations = [ranked.pop()]
    while len(ranked) > 0:
        candidate = ranked.pop()
        if orientations[chosenOrientations[-1]] < 2* orientations[candidate]:
            chosenOrientations.append(candidate)
        else:
            break
    if chosenOrientations[0] == "unpaired":
        chosenOrientations = "any"
    else:
        d = {False: "+", True:"-"}
        chosenOrientations = ["".join(d[x] for x in o) for o in chosenOrientations]
    return chosenOrientations

def getSearchRegions(bam, minLength=0):
    # get the chromosomes and move X, Y, M/MT to the end
    chromosomes = []
    for i in range(bam.nreferences):
        if bam.lengths[i] > minLength:
            chromosomes.append(bam.getrname(i))

    for start, end in [(2500000, 50000000), (None, None)]:
        for chrom in sorted(chromosomes):
            yield chrom, start, end

def sampleInsertSizes(bam, maxreads=50000, skip=0, minmapq=40, maxExpectedSize=20000, keepReads=False):
    """ get the insert size distribution, cutting off the tail at the high end, 
    and removing most oddly mapping pairs

    50,000 reads seems to be sufficient to get a nice distribution, and higher values
        don't tend to change the distribution much """

    inserts = []
    readLengths  = []
    
    count = 0
    reads = []

    orientations = collections.Counter()
    NMs = []
    INDELs = []

    for chrom, start, end in getSearchRegions(bam):
        for read in bam.fetch(chrom, start, end):
            if skip > 0:
                skip -= 1
                continue

            try:
                NMs.append(read.opt("NM")/float(len(read.seq)))
            except KeyError:
                pass

            try:
                INDELs.append(sum(1 for i in zip(*read.cigartuples)[1] if i in[1,2])/float(len(read.seq)))
            except TypeError:
                pass

            if orientations["unpaired"] > 2500 and count < 1000:
                # bail out early if it looks like it's single-ended
                break

            if not read.is_paired:
                orientations["unpaired"] += 1
                readLengths.append(len(read.seq))
                continue
                
            if not read.is_read1:
                continue
            
            if not read.is_proper_pair:
                continue
            if read.is_unmapped or read.mate_is_unmapped:
                continue
            if read.tid != read.rnext:
                continue
            if read.mapq < minmapq:
                continue
            
            # if abs(read.isize) < 20000:
            inserts.append(abs(read.isize))

            curOrient = (read.is_reverse, read.mate_is_reverse)
            if read.reference_start > read.next_reference_start:
                curOrient = not curOrient[0], not curOrient[1]
            orientations[curOrient] += 1
            readLengths.append(len(read.seq))

            if keepReads:
                reads.append(read)

            count += 1
            if count > maxreads:
                break
        if count >= maxreads:
            break

    chosenOrientations = chooseOrientation(orientations)

    return removeOutliers(inserts), reads, chosenOrientations, numpy.array(readLengths)


class ReadStatistics(object):
    def __init__(self, bam, keepReads=False):
        self.insertSizes = []
        self.readLengths = []
        self.orientations = []
        self._insertSizeKDE = None
        self.singleEnded = False

        self._insertSizeScores = {} # cache

        try:
            self.insertSizes, self.reads, self.orientations, self.readLengths = sampleInsertSizes(bam, keepReads=keepReads)
            if len(self.insertSizes) > 1:
                logging.info("  insert size mean: {:.2f} std: {:.2f}".format(numpy.mean(self.insertSizes), numpy.std(self.insertSizes)))
        except ValueError as e:
            print("*"*100, "here")
            print("ERROR:", e)

    def hasInsertSizeDistribution(self):
        if len(self.insertSizes) > 1000:
            return True
        return False

    def meanInsertSize(self):
        if self.hasInsertSizeDistribution():
            return mean(self.insertSizes)
        return None

    def stddevInsertSize(self):
        if self.hasInsertSizeDistribution():
            return stddev(self.insertSizes)
        return None


    def scoreInsertSize(self, isize):
        if not self.hasInsertSizeDistribution():
            return 0

        if self._insertSizeKDE is None:
            self._insertSizeKDE = gaussian_kde(self.insertSizes)

        # the gaussian kde call is pretty slow with ~50,000 data points in it, so we'll cache the result for a bit of a speed-up
        isize = abs(isize)
        if not isize in self._insertSizeScores:
            self._insertSizeScores[isize] = self._insertSizeKDE(isize)

        return self._insertSizeScores[isize]


    def hasReadLengthDistribution(self):
        if len(self.readLengths) > 1000:
            return True
        return False

    def meanReadLength(self):
        if self.hasReadLengthDistribution():
            return mean(self.readLengths)
        return None

    def stddevReadLength(self):
        if self.hasReadLengthDistribution():
            return stddev(self.readLengths)
        return None

    def readLengthUpperQuantile(self):
        if self.hasReadLengthDistribution():
            return numpy.percentile(self.readLengths, 99)
        return None


def plotInsertSizeDistribution(isd, sampleName, dataHub):
    try:
        from rpy2 import robjects as ro

        d = tempfile.mkdtemp()
        filename = os.path.join(d, sampleName)

        if not filename.endswith(".png"):
            filename += ".png"

        ro.r.png(filename, res=250, width=1200, height=1200)

        alleles = ["alt", "ref", "amb"]
        others = [[len(chosenSet) for chosenSet in dataHub.samples[sampleName].chosenSets(allele)] for allele in alleles]
        plotting.ecdf([isd.insertSizes]+others, ["average"]+alleles, xlab="Insert size (bp)", main=sampleName, legendWhere="bottomright", lwd=2)
        
        ro.r["dev.off"]()

        data = open(filename).read()
        return data
    except ImportError:
        return None