org.apache.commons.math3.stat.descriptive.rank.Median Java Examples

The following examples show how to use org.apache.commons.math3.stat.descriptive.rank.Median, the Apache Commons Math univariate statistic that computes the median of a double array. Each example comes from an open-source project; the source file, project, and license are noted above each snippet so you can follow it back to its original context.
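Before the project examples, here is a minimal, self-contained sketch of the core API (values are illustrative). Median extends Percentile with the percentile fixed at 50, and offers both a whole-array evaluate(double[]) and a sub-range evaluate(double[], int begin, int length):

import org.apache.commons.math3.stat.descriptive.rank.Median;

public class MedianQuickStart {
    public static void main(String[] args) {
        final double[] values = {3.0, 1.0, 4.0, 1.0, 5.0};
        final Median median = new Median();
        // Median of the whole array: sorted order is {1, 1, 3, 4, 5}, so this prints 3.0
        System.out.println(median.evaluate(values));
        // Median of the 3-element sub-range starting at index 1 ({1, 4, 1}): prints 1.0
        System.out.println(median.evaluate(values, 1, 3));
    }
}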
Example #1
Source File: HDF5PCACoveragePoNCreationUtilsUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider="readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] targetMedians = IntStream.range(0, counts.getRowDimension())
            .mapToDouble(i -> median.evaluate(counts.getRow(i))).toArray();
    final double threshold = new Percentile(percentile).evaluate(targetMedians);
    final Boolean[] toBeKept = DoubleStream.of(targetMedians)
            .mapToObj(d -> d >= threshold).toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
    Assert.assertEquals(result.getRight().length, toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
        }
    }
}
 
Example #2
Source File: ReadCountCollectionUtilsUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider="readCountAndPercentileData")
public void testExtremeMedianColumnsData(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
            .mapToDouble(i -> median.evaluate(counts.getColumn(i))).toArray();
    final double top = new Percentile(100 - percentile).evaluate(columnMedians);
    final double bottom = new Percentile(percentile).evaluate(columnMedians);
    final Boolean[] toBeKept = DoubleStream.of(columnMedians)
            .mapToObj(d -> d <= top && d >= bottom).toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final ReadCountCollection result = ReadCountCollectionUtils.removeColumnsWithExtremeMedianCounts(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.columnNames().size(), toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            int index = result.columnNames().indexOf(readCount.columnNames().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getColumn(i), result.counts().getColumn(index));
        } else {
            Assert.assertEquals(result.columnNames().indexOf(readCount.columnNames().get(i)), -1);
        }
    }
}
 
Example #3
Source File: HDF5PCACoveragePoNCreationUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Calculates the median of the column medians and subtracts it from all counts.
 * @param readCounts the input counts to center.
 * @return the median of medians that has been subtracted from all counts.
 */
@VisibleForTesting
static double subtractMedianOfMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final Median medianCalculator = new Median();
    final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);

    final double medianOfMedians = medianCalculator.evaluate(columnMedians);
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value - medianOfMedians;
        }
    });
    logger.info(String.format("Counts centered around the median of medians %.2f", medianOfMedians));
    return medianOfMedians;
}
 
Example #4
Source File: HDF5PCACoveragePoNCreationUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Final pre-panel normalization that divides each count by the median of
 * its column and takes the log base 2.
 * <p>
 *     The normalization occurs in-place.
 * </p>
 *
 * @param readCounts the input counts to normalize.
 */
@VisibleForTesting
static void normalizeAndLogReadCounts(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final Median medianCalculator = new Median();

    final double[] medians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(col -> medianCalculator.evaluate(counts.getColumn(col))).toArray();
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return Math.log(Math.max(EPSILON, value / medians[column])) * INV_LN_2;
        }
    });

    logger.info("Counts normalized by the column median and log2'd.");
}
 
Example #5
Source File: CoveragePoNQCUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 *  Given a single-sample tangent normalization (or other coverage profile), determine whether any contig looks like
 *   it has an arm-level event, defined as at least 25% of the contig being amplified or deleted.
 *
 * @param singleSampleTangentNormalized Tangent normalized data for a single sample.
 * @return {@code true} if some contig looks like it has an arm-level event; never {@code null}.
 */
private static Boolean hasSuspiciousContigs(final ReadCountCollection singleSampleTangentNormalized, final Map<String, Double> contigToMedian) {
    final List<String> allContigsPresent = retrieveAllContigsPresent(singleSampleTangentNormalized);
    for (String contig: allContigsPresent) {
        final ReadCountCollection oneContigReadCountCollection = singleSampleTangentNormalized.subsetTargets(singleSampleTangentNormalized.targets().stream().filter(t -> t.getContig().equals(contig)).collect(Collectors.toSet()));
        final RealVector counts = oneContigReadCountCollection.counts().getColumnVector(0);
        for (int i = 0; i < 4; i++) {
            final RealVector partitionCounts = counts.getSubVector(i * counts.getDimension() / 4, counts.getDimension() / 4);
            final double[] partitionArray = DoubleStream.of(partitionCounts.toArray()).map(d -> Math.pow(2, d)).sorted().toArray();
            double median = new Median().evaluate(partitionArray);
            final double medianShiftInCRSpace = contigToMedian.getOrDefault(contig, 1.0) - 1.0;
            median -= medianShiftInCRSpace;
            if ((median > AMP_THRESHOLD) || (median < DEL_THRESHOLD)) {
                logger.info("Suspicious contig: " + singleSampleTangentNormalized.columnNames().get(0) + " " + contig + " (" + median + " -- " + i + ")");
                return true;
            }
        }
    }
    return false;
}
 
Example #6
Source File: HDF5PCACoveragePoNCreationUtilsUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "readCountOnlyData")
public void testSubtractMedianOfMedians(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
            .mapToDouble(i -> median.evaluate(counts.getColumn(i))).toArray();
    final double center = median.evaluate(columnMedians);
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] -= center;
        }
    }
    HDF5PCACoveragePoNCreationUtils.subtractMedianOfMedians(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}
 
Example #7
Source File: MetaScores.java    From metron with Apache License 2.0
public MetaScores(List<Double> scores) {
  // A meta alert could consist entirely of alerts with no score values.
  DoubleSummaryStatistics stats = scores
      .stream()
      .mapToDouble(a -> a)
      .summaryStatistics();
  metaScores.put("max", stats.getMax());
  metaScores.put("min", stats.getMin());
  metaScores.put("average", stats.getAverage());
  metaScores.put("count", stats.getCount());
  metaScores.put("sum", stats.getSum());

  // median isn't in the stats summary
  double[] arr = scores
      .stream()
      .mapToDouble(d -> d)
      .toArray();
  metaScores.put("median", new Median().evaluate(arr));
}
 
Example #8
Source File: MatrixTools.java    From Juicebox with MIT License
public static double getMedian(float[] values) {
    double[] array = new double[values.length];
    for (int k = 0; k < values.length; k++) {
        array[k] = values[k];
    }
    Median median = new Median();
    return median.evaluate(array);
}
 
Example #9
Source File: HDF5PCACoveragePoNCreationUtilsUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "readCountOnlyData")
public void testNormalizeAndLogReadCounts(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
            .mapToDouble(i -> median.evaluate(counts.getColumn(i))).toArray();
    final double epsilon = HDF5PCACoveragePoNCreationUtils.EPSILON;
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] /= columnMedians[j];
            if (expected[i][j] < epsilon) {
                expected[i][j] = epsilon;
            }
            expected[i][j] = Math.log(expected[i][j]) / Math.log(2);
        }
    }
    HDF5PCACoveragePoNCreationUtils.normalizeAndLogReadCounts(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}
 
Example #10
Source File: RunGA.java    From quaerite with Apache License 2.0
private void reportFinal(GADB gaDb, ExperimentFactory experimentFactory, int num)
        throws SQLException {

    System.out.println("--------------------------------");
    System.out.println("FINAL RESULTS ON TESTING:");
    List<ExperimentNameScorePair> scores = gaDb.getNBestExperimentNames(
            TEST_PREFIX, num,
            experimentFactory.getTestScorer().getPrimaryStatisticName());

    SummaryStatistics summaryStatistics = new SummaryStatistics();
    double[] vals = new double[gaConfig.getNFolds()];
    int i = 0;
    for (ExperimentNameScorePair esp : scores) {
        System.out.println("experiment '" + esp.getExperimentName() + "': "
                + threePlaces.format(esp.getScore()));
        vals[i++] = esp.getScore();
        summaryStatistics.addValue(esp.getScore());
    }
    if (scores.size() > 1) {
        Median median = new Median();
        median.setData(vals);
        System.out.println("");

        System.out.println("mean: " +
                threePlaces.format(summaryStatistics.getMean()));
        System.out.println("median: " +
                threePlaces.format(median.evaluate()));
        System.out.println("stdev:" +
                threePlaces.format(summaryStatistics.getStandardDeviation()));
    }
}
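Example #10 uses the stateful calling convention inherited from AbstractUnivariateStatistic: setData(double[]) stores the sample, and the no-argument evaluate() computes over it. A minimal sketch (illustrative values) showing it is interchangeable with the one-shot style used in the other examples:

double[] vals = {0.71, 0.68, 0.74};
Median median = new Median();
median.setData(vals);
double viaStoredData = median.evaluate();        // evaluates the data passed to setData(...)
double viaOneShot = new Median().evaluate(vals); // one-shot, no stored state
// viaStoredData == viaOneShot (0.71 here)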
 
Example #11
Source File: EDCoWThreshold.java    From SONDY with GNU General Public License v3.0
public double mad(double[] autoCorrelationValues) {
    double[] tempTable = new double[autoCorrelationValues.length];
    Median m = new Median();
    double medianValue = m.evaluate(autoCorrelationValues);
    for (int i = 0; i < autoCorrelationValues.length; i++) {
        tempTable[i] = Math.abs(autoCorrelationValues[i] - medianValue);
    }
    return m.evaluate(tempTable); // the median of tempTable, equation (13) in the paper
}
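A quick worked check of this mad helper (illustrative data, and assuming EDCoWThreshold has a default constructor): for {1, 2, 3, 4, 100} the median is 3 and the absolute deviations are {2, 1, 0, 1, 97}, whose median is 1.0. Note this is the raw median absolute deviation, without the 1.4826 consistency constant sometimes applied to approximate a standard deviation.

EDCoWThreshold threshold = new EDCoWThreshold(); // constructor assumed for illustration
double madValue = threshold.mad(new double[]{1, 2, 3, 4, 100}); // 1.0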
 
Example #12
Source File: MeanAboveMedian.java    From housing-model with MIT License
@Override
public double evaluate(double[] arg0) throws MathIllegalArgumentException {
	double median = (new Median()).evaluate(arg0);
	double totalAboveMedian = 0.0;
	int countAboveMedian = 0;
	for(double val : arg0) {
		if(val > median) {
			totalAboveMedian += val;
			++countAboveMedian;
		}
	}
	return(totalAboveMedian/countAboveMedian);
}
 
Example #13
Source File: MeanAboveMedian.java    From housing-model with MIT License
@Override
public double evaluate(double[] data, int begin, int length)
		throws MathIllegalArgumentException {
	double median = (new Median()).evaluate(data,begin,length);
	double totalAboveMedian = 0.0;
	int countAboveMedian = 0;
	for(int i=begin; i<begin+length; ++i) {
		if(data[i] > median) {
			totalAboveMedian += data[i];
			++countAboveMedian;
		}
	}
	return(totalAboveMedian/countAboveMedian);
}
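These two overrides match the commons-math UnivariateStatistic contract, so MeanAboveMedian can be plugged in wherever such a statistic is expected. A minimal usage sketch (illustrative values, assuming the default constructor): for {1, 2, 3, 4, 5} the median is 3, so the mean of the values strictly above it is (4 + 5) / 2 = 4.5.

double meanAbove = new MeanAboveMedian().evaluate(new double[]{1, 2, 3, 4, 5}); // 4.5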
 
Example #14
Source File: MatrixSummaryUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Return an array containing the median for each column in the given matrix.
 * @param m Not {@code null}.  Size MxN, where neither dimension is zero.  If any entry is NaN, it is disregarded
 *          in the calculation.
 * @return array of size N.  Never {@code null}
 */
public static double[] getColumnMedians(final RealMatrix m) {
    Utils.nonNull(m, "Cannot calculate medians on a null matrix.");
    final Median medianCalculator = new Median();
    return IntStream.range(0, m.getColumnDimension()).boxed()
            .mapToDouble(i -> medianCalculator.evaluate(m.getColumn(i))).toArray();
}
 
Example #15
Source File: MatrixSummaryUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Return an array containing the median for each row in the given matrix.
 * @param m Not {@code null}.  Size MxN.    If any entry is NaN, it is disregarded
 *          in the calculation.
 * @return array of size M.  Never {@code null}
 */
public static double[] getRowMedians(final RealMatrix m) {
    Utils.nonNull(m, "Cannot calculate medians on a null matrix.");
    final Median medianCalculator = new Median();
    return IntStream.range(0, m.getRowDimension()).boxed()
            .mapToDouble(i -> medianCalculator.evaluate(m.getRow(i))).toArray();
}
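A minimal sketch of both utilities on a concrete matrix (illustrative values). With commons-math's default estimation type, the median of a two-element row interpolates halfway between the values:

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

RealMatrix m = new Array2DRowRealMatrix(new double[][]{
        {1.0, 10.0},
        {2.0, 20.0},
        {3.0, 30.0}
});
double[] columnMedians = MatrixSummaryUtils.getColumnMedians(m); // {2.0, 20.0}
double[] rowMedians = MatrixSummaryUtils.getRowMedians(m);       // {5.5, 11.0, 16.5}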
 
Example #16
Source File: AlleleLikelihoods.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Updates the likelihood of the NON_REF allele (if present) based on the likelihoods of a set of non-symbolic
 * alleles.
 * <p>
 *     For each unit of evidence, the NON_REF likelihood is set to the median of the likelihoods of the
 *     alleles under consideration that score strictly below the best allele for that evidence.
 * </p>
 *
 * @param allelesToConsider the alleles whose likelihoods qualify for the NON_REF update.
 */
@SuppressWarnings("unchecked")  // for the cast (A) Allele.NON_REF_ALLELE below
public void updateNonRefAlleleLikelihoods(final AlleleList<A> allelesToConsider) {
    final int nonRefAlleleIndex = indexOfAllele((A) Allele.NON_REF_ALLELE);
    if ( nonRefAlleleIndex < 0) {
        return;
    }
    final int alleleCount = alleles.numberOfAlleles();
    final int nonSymbolicAlleleCount = alleleCount - 1;
    // likelihood buffer reused across evidence:
    final double[] qualifiedAlleleLikelihoods = new double[nonSymbolicAlleleCount];
    final Median medianCalculator = new Median();
    for (int s = 0; s < samples.numberOfSamples(); s++) {
        final double[][] sampleValues = valuesBySampleIndex[s];
        final int evidenceCount = evidenceBySampleIndex.get(s).size();
        for (int r = 0; r < evidenceCount; r++) {
            final BestAllele bestAllele = searchBestAllele(s, r, true);
            int numberOfQualifiedAlleleLikelihoods = 0;
            for (int i = 0; i < alleleCount; i++) {
                final double alleleLikelihood = sampleValues[i][r];
                if (i != nonRefAlleleIndex && alleleLikelihood < bestAllele.likelihood
                        && !Double.isNaN(alleleLikelihood) && allelesToConsider.indexOfAllele(alleles.getAllele(i)) != MISSING_INDEX) {
                    qualifiedAlleleLikelihoods[numberOfQualifiedAlleleLikelihoods++] = alleleLikelihood;
                }
            }
            final double nonRefLikelihood = medianCalculator.evaluate(qualifiedAlleleLikelihoods, 0, numberOfQualifiedAlleleLikelihoods);
            // when the median is NaN, every applicable likelihood equals the best likelihood, so the
            // evidence is not informative given the existing alleles; in that case the NON-REF likelihood
            // is set to the best likelihood. The exception is when there are one or zero concrete
            // alleles, in which case the NON-REF likelihood is left as NaN.
            sampleValues[nonRefAlleleIndex][r] = !Double.isNaN(nonRefLikelihood) ? nonRefLikelihood
                    : nonSymbolicAlleleCount <= 1 ? Double.NaN : bestAllele.likelihood;
        }
    }
}
 
Example #17
Source File: SVDDenoisingUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Preprocess (i.e., transform to fractional coverage, correct GC bias, filter, impute, and truncate)
 * and standardize read counts from a panel of normals.
 * All inputs are assumed to be valid.
 * The dimensions of {@code readCounts} should be samples x intervals.
 * To reduce memory footprint, {@code readCounts} is modified in place when possible.
 * Filtering is performed by using boolean arrays to keep track of intervals and samples
 * that have been filtered at any step and masking {@code readCounts} with them appropriately.
 * If {@code intervalGCContent} is null, GC-bias correction will not be performed.
 */
static PreprocessedStandardizedResult preprocessAndStandardizePanel(final RealMatrix readCounts,
                                                                    final double[] intervalGCContent,
                                                                    final double minimumIntervalMedianPercentile,
                                                                    final double maximumZerosInSamplePercentage,
                                                                    final double maximumZerosInIntervalPercentage,
                                                                    final double extremeSampleMedianPercentile,
                                                                    final boolean doImputeZeros,
                                                                    final double extremeOutlierTruncationPercentile) {
    //preprocess (transform to fractional coverage, correct GC bias, filter, impute, truncate) and return copy of submatrix
    logger.info("Preprocessing read counts...");
    final PreprocessedStandardizedResult preprocessedStandardizedResult = preprocessPanel(readCounts, intervalGCContent,
            minimumIntervalMedianPercentile, maximumZerosInSamplePercentage, maximumZerosInIntervalPercentage,
            extremeSampleMedianPercentile, doImputeZeros, extremeOutlierTruncationPercentile);
    logger.info("Panel read counts preprocessed.");

    //standardize in place
    logger.info("Standardizing read counts...");
    divideBySampleMedianAndTransformToLog2(preprocessedStandardizedResult.preprocessedStandardizedValues);
    logger.info("Subtracting median of sample medians...");
    final double[] sampleLog2Medians = MatrixSummaryUtils.getRowMedians(preprocessedStandardizedResult.preprocessedStandardizedValues);
    final double medianOfSampleMedians = new Median().evaluate(sampleLog2Medians);
    preprocessedStandardizedResult.preprocessedStandardizedValues
            .walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int sampleIndex, int intervalIndex, double value) {
            return value - medianOfSampleMedians;
        }
    });
    logger.info("Panel read counts standardized.");

    return preprocessedStandardizedResult;
}
 
Example #18
Source File: CalculateContamination.java    From gatk with BSD 3-Clause "New" or "Revised" License
private List<PileupSummary> filterSitesByCoverage(final List<PileupSummary> allSites) {
    // Just in case the intervals given to GetPileupSummaries contained un-covered sites, we remove them
    // so that a bunch of zeroes don't throw off the median coverage
    final List<PileupSummary> coveredSites = allSites.stream().filter(s -> s.getTotalCount() > MIN_COVERAGE).collect(Collectors.toList());
    final double[] coverage = coveredSites.stream().mapToDouble(PileupSummary::getTotalCount).toArray();
    final double medianCoverage = new Median().evaluate(coverage);
    final double meanCoverage = new Mean().evaluate(coverage);
    final double lowCoverageThreshold = medianCoverage * lowCoverageRatioThreshold;
    final double highCoverageThreshold = meanCoverage * highCoverageRatioThreshold;
    return coveredSites.stream()
            .filter(ps -> ps.getTotalCount() > lowCoverageThreshold && ps.getTotalCount() < highCoverageThreshold)
            .collect(Collectors.toList());
}
 
Example #19
Source File: CoveragePoNQCUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
@VisibleForTesting
static Map<String, Double> getContigToMedianCRMap(final ReadCountCollection readCountCollection) {
    final List<String> allContigsPresent = retrieveAllContigsPresent(readCountCollection);
    final Map<String, Double> contigToMedian = new LinkedHashMap<>();
    for (String contig: allContigsPresent) {
        final ReadCountCollection oneContigReadCountCollection = readCountCollection.subsetTargets(readCountCollection.targets().stream().filter(t -> t.getContig().equals(contig)).collect(Collectors.toSet()));
        final double[] flatCounts = Doubles.concat(oneContigReadCountCollection.counts().getData());

        // Put into CRSpace
        final double[] flatCountsInCRSpace = DoubleStream.of(flatCounts).map(d -> Math.pow(2, d)).toArray();
        contigToMedian.put(contig, new Median().evaluate(flatCountsInCRSpace));
    }
    return contigToMedian;
}
 
Example #20
Source File: Percentile.java    From morpheus-core with Apache License 2.0
public static void main(String[] args) {
    final double[] values = new java.util.Random().doubles(5000).toArray();
    final Percentile stat1 = new Percentile(0.5);
    final Median stat2 = new Median();
    for (double value : values) stat1.add(value);
    final double result1 = stat1.getValue();
    final double result2 = stat2.evaluate(values);
    if (result1 != result2) {
        throw new RuntimeException("Error: " + result1 + " != " + result2);
    }
}
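Note that the Percentile above is morpheus-core's incremental statistic, which appears to take a fraction (0.5). commons-math3's own Percentile takes a percentage, so the equivalent in-library cross-check (illustrative) would be:

double[] values = new java.util.Random().doubles(5000).toArray();
double viaPercentile = new org.apache.commons.math3.stat.descriptive.rank.Percentile(50.0).evaluate(values);
double viaMedian = new Median().evaluate(values);
// viaPercentile == viaMedian: Median is simply Percentile fixed at p = 50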
 
Example #21
Source File: MatrixSummaryUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Return an array containing the median for each column in the given matrix.
 * @param m Not {@code null}.  Size MxN, where neither dimension is zero.  If any entry is NaN, it is disregarded
 *          in the calculation.
 * @return array of size N.  Never {@code null}
 */
public static double[] getColumnMedians(final RealMatrix m) {
    Utils.nonNull(m, "Cannot calculate medians on a null matrix.");
    final Median medianCalculator = new Median();
    return IntStream.range(0, m.getColumnDimension()).boxed()
            .mapToDouble(i -> medianCalculator.evaluate(m.getColumn(i))).toArray();
}
 
Example #22
Source File: MatrixSummaryUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Return an array containing the median for each row in the given matrix.
 * @param m Not {@code null}.  Size MxN.    If any entry is NaN, it is disregarded
 *          in the calculation.
 * @return array of size M.  Never {@code null}
 */
public static double[] getRowMedians(final RealMatrix m) {
    Utils.nonNull(m, "Cannot calculate medians on a null matrix.");
    final Median medianCalculator = new Median();
    return IntStream.range(0, m.getRowDimension()).boxed()
            .mapToDouble(i -> medianCalculator.evaluate(m.getRow(i))).toArray();
}
 
Example #23
Source File: ReadCountCollectionUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Impute zero counts to the median of non-zero values in the enclosing target row.
 *
 * <p>The imputation is done in-place; the input matrix will be modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 */
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {

    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();

    final Median medianCalculator = new Median();
    int totalCounts = counts.getColumnDimension() * counts.getRowDimension();

    // Get the number of zeroes contained in the counts.
    final long totalZeroCounts = IntStream.range(0, targetCount)
            .mapToLong(t -> DoubleStream.of(counts.getRow(t))
                    .filter(c -> c == 0.0).count()).sum();

    // Get the median of each row, not including any entries that are zero.
    final double[] medians = IntStream.range(0, targetCount)
            .mapToDouble(t -> medianCalculator.evaluate(
                    DoubleStream.of(counts.getRow(t))
                            .filter(c -> c != 0.0)
                            .toArray()
            )).toArray();

    // Change any zeros in the counts to the median for the row.
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value != 0 ? value : medians[row];
        }
    });

    if (totalZeroCounts > 0) {
        final double totalZeroCountsPercentage = 100.0 * totalZeroCounts / totalCounts;
        logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target " +
                "non-zero median", totalZeroCounts, totalCounts, totalZeroCountsPercentage));
    } else {
        logger.info("No count is 0.0 thus no count needed to be imputed.");
    }
}
 
Example #24
Source File: SegmentMergeUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Calculates the distance between two data sets based on the Hodges-Lehmann estimator.
 * @param data1     first data set
 * @param data2     second data set
 * @return          distance between data sets based on the Hodges-Lehmann estimator
 */
private static double hodgesLehmannDistance(final double[] data1, final double[] data2) {
    double[] differences = new double[data1.length * data2.length];
    for (int i = 0; i < data1.length; i++) {
        for (int j = 0; j < data2.length; j++) {
            differences[i * data2.length + j] = data1[i] - data2[j];
        }
    }
    return new Median().evaluate(differences);
}
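A worked mini-example (illustrative; the method is private, so this would run from inside SegmentMergeUtils): for data1 = {5, 7} and data2 = {1, 2} the pairwise differences are {4, 3, 6, 5}, and their median, 4.5, is the estimated shift between the two data sets. The two-sample Hodges-Lehmann estimator is exactly this median of all pairwise differences.

double shift = hodgesLehmannDistance(new double[]{5, 7}, new double[]{1, 2}); // 4.5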
 
Example #25
Source File: ReadCountCollectionUtilsUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "tooManyZerosData")
public void testImputeZeroCounts(final ReadCountCollection readCounts) {
    final Median median = new Median();
    final RealMatrix counts = readCounts.counts();
    final double[] targetNonZeroMedians = IntStream.range(0, counts.getRowDimension())
            .mapToDouble(i -> median.evaluate(DoubleStream.of(counts.getRow(i)).filter(d -> d != 0.0).toArray())).toArray();
    final double[][] expected = new double[counts.getRowDimension()][];
    final double[][] original = counts.getData();
    for (int i = 0; i < expected.length; i++) {
        final double[] rowCounts = counts.getRow(i).clone();
        expected[i] = rowCounts;
        for (int j = 0; j < expected[i].length; j++) {
            if (expected[i][j] == 0.0) {
                expected[i][j] = targetNonZeroMedians[i];
            }
        }
    }
    ReadCountCollectionUtils.imputeZeroCountsAsTargetMedians(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], "i,j == " + i + "," + j + " " + original[i][j]);
        }
    }
}
 
Example #26
Source File: GCCorrector.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
private static double medianOrDefault(final List<Double> list) {
    return list.size() > 0 ? new Median().evaluate(list.stream().mapToDouble(d->d).toArray()) : DUMMY_VALUE_NEVER_USED;
}
 
Example #27
Source File: EDCoWThreshold.java    From SONDY with GNU General Public License v3.0
public double theta1(double[] autoCorrelationValues, double gama) {
    Median m = new Median();
    return m.evaluate(autoCorrelationValues) + gama * mad(autoCorrelationValues);
}
 
Example #28
Source File: EDCoWThreshold.java    From SONDY with GNU General Public License v3.0
public double theta2(double[][] crossCorrelationValues, double gama) {
    double[] vecCrossCorrelation = transformMatrix(crossCorrelationValues);
    Median m = new Median();
    return m.evaluate(vecCrossCorrelation) + gama * mad(vecCrossCorrelation);
}
 
Example #29
Source File: GATKProtectedMathUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
public static int median(final int[] values) {
    return (int) FastMath.round(new Median().evaluate(Arrays.stream(values).mapToDouble(n -> n).toArray()));
}
 
Example #30
Source File: CalculateContamination.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }
    final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
    final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();

    for (final PileupSummary site : sites) {
        final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
        final List<PileupSummary> nearbySites = tc.targets(nearbySpan);

        final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount()/averageCoverage).average().orElseGet(() -> 0);
        smoothedCopyRatios.add(averageNearbyCopyRatio);

        final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2*x*(1-x)).sum();
        final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
        final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
        hetRatios.add(hetRatio);
    }

    final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x->x).toArray());
    final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size())
            .filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio || smoothedCopyRatios.get(n) > 2 *medianSmoothedCopyRatio)
            .boxed().collect(Collectors.toList());

    final double meanHetRatio = hetRatios.stream().mapToDouble(x->x).average().getAsDouble();
    final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size())
            .filter(n -> hetRatios.get(n) < meanHetRatio * 0.5)
            .boxed().collect(Collectors.toList());

    //TODO: as extra security, filter out sites that are near too many hom alts

    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity",
            indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));

    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));

    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(indicesWithAnomalousCopyRatio);
    badSites.addAll(indicesWithLossOfHeterozygosity);

    return IntStream.range(0, sites.size())
            .filter(n -> !badSites.contains(n))
            .mapToObj(sites::get)
            .filter(s -> s.getAltFraction() > 0.8)
            .collect(Collectors.toList());
}