Java Code Examples for htsjdk.variant.variantcontext.VariantContext#getGenotypes()

The following examples show how to use htsjdk.variant.variantcontext.VariantContext#getGenotypes(). The examples are drawn from open-source projects; the source file, originating project, and license are noted above each example.
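As a quick orientation before the project examples, here is a minimal, self-contained sketch of the basic pattern: getGenotypes() returns a GenotypesContext that can be iterated to visit one Genotype per sample. The class name GetGenotypesDemo and the path example.vcf.gz are placeholders for illustration, not code from any project below.

import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;

import java.io.File;

public final class GetGenotypesDemo {
    public static void main(final String[] args) {
        // placeholder path; the second argument disables the index requirement
        try (final VCFFileReader reader = new VCFFileReader(new File("example.vcf.gz"), false)) {
            for (final VariantContext vc : reader) {
                // getGenotypes() returns a GenotypesContext, iterable over one Genotype per sample
                for (final Genotype g : vc.getGenotypes()) {
                    System.out.printf("%s:%d %s %s GQ=%s%n",
                            vc.getContig(), vc.getStart(), g.getSampleName(),
                            g.getGenotypeString(), g.hasGQ() ? g.getGQ() : ".");
                }
            }
        }
    }
}

Sample-subsetting overloads of getGenotypes() also exist; Examples 9 and 19 below pass a collection of sample names to restrict the returned GenotypesContext.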
Example 1
Source File: AS_RMSMappingQuality.java    From gatk with BSD 3-Clause "New" or "Revised" License
private Map<Allele, Integer> getADcounts(final VariantContext vc) {
    final GenotypesContext genotypes = vc.getGenotypes();
    if ( genotypes == null || genotypes.isEmpty() ) {
        genotype_logger.warn("VC does not have genotypes -- annotations were calculated in wrong order");
        return null;
    }

    final Map<Allele, Integer> variantADs = new HashMap<>();
    for (final Allele a : vc.getAlleles()) {
        variantADs.put(a, 0);
    }

    for (final Genotype gt : genotypes) {
        if(gt.hasAD()) {
            final int[] ADs = gt.getAD();
            for (int i = 1; i < vc.getNAlleles(); i++) {
                final Allele alt = vc.getAlternateAllele(i - 1); // i - 1 reconciles the allele index with the alt-allele index
                variantADs.put(alt, variantADs.get(alt) + ADs[i]);
            }
        }
    }
    return variantADs;
}
 
Example 2
Source File: XHMMSegmentGenotyperIntegrationTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
private void assertVariantsAreCoveredBySegments(final List<VariantContext> variants,
                                                final List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> variantSegments) {
    for (final VariantContext variant : variants) {
        final List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> matches =
                variantSegments.stream()
                .filter(s -> new SimpleInterval(variant).equals(s.getSegment().getInterval()))
                .collect(Collectors.toList());
        Assert.assertFalse(matches.isEmpty());
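        // a genotype flagged as a discovery call must have a matching segment for the same sample; non-discovery genotypes must not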
        for (final Genotype genotype : variant.getGenotypes()) {
            final boolean discovery = genotype.getExtendedAttribute(XHMMSegmentGenotyper.DISCOVERY_KEY).toString().equals(XHMMSegmentGenotyper.DISCOVERY_TRUE);
            if (discovery) {
                Assert.assertTrue(matches.stream().anyMatch(s -> s.getSampleName().equals(genotype.getSampleName())));
            } else {
                Assert.assertTrue(matches.stream().noneMatch(s -> s.getSampleName().equals(genotype.getSampleName())));
            }
        }
    }
}
 
Example 3
Source File: StrandBiasTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
// template method for calculating strand bias annotations using the three different methods
@Override
public Map<String, Object> annotate(final ReferenceContext ref,
                                    final VariantContext vc,
                                    final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
    Utils.nonNull(vc);
    if ( !vc.isVariant() ) {
        return Collections.emptyMap();
    }

    // if the genotype and strand bias are provided, calculate the annotation from the Genotype (GT) field
    if ( vc.hasGenotypes() ) {
        for (final Genotype g : vc.getGenotypes()) {
            if (g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) {
                return calculateAnnotationFromGTfield(vc.getGenotypes());
            }
        }
    }

    if (likelihoods != null) {
        return calculateAnnotationFromLikelihoods(likelihoods, vc);
    }
    return Collections.emptyMap();
}
 
Example 4
Source File: XHMMSegmentGenotyperIntegrationTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
private void assertVariantsPlGtAndGQAreConsistent(final List<VariantContext> variants) {
    for (final VariantContext vc :variants) {
        for (final Genotype gt : vc.getGenotypes()) {
            final int[] PL = gt.getPL();
            Assert.assertNotNull(PL);

            // rank the PL indices by value: the smallest PL marks the called genotype and the runner-up determines the GQ
            final int[] twoLowestPLIndices = IntStream.range(0, PL.length)
                    .boxed()
                    .sorted((a, b) -> Integer.compare(PL[a], PL[b]))
                    .limit(2)
                    .mapToInt(n -> n)
                    .toArray();
            final int minPLIndex = twoLowestPLIndices[0];
            final int secondPLIndex = twoLowestPLIndices[1];
            Assert.assertEquals(vc.getAlleles().indexOf(gt.getAlleles().get(0)), minPLIndex);
            final int expectedGQ = Math.min(XHMMSegmentGenotyper.MAX_GQ, PL[secondPLIndex] - PL[minPLIndex]);
            Assert.assertEquals(gt.getGQ(), expectedGQ);
        }
    }
}
 
Example 5
Source File: AlleleFrequencyCalculator.java    From gatk with BSD 3-Clause "New" or "Revised" License
private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
    final int numAlleles = vc.getNAlleles();
    Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
    final double[] log10Result = new double[numAlleles];
    Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
    for (final Genotype g : vc.getGenotypes()) {
        if (!g.hasLikelihoods()) {
            continue;
        }
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);

        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

        // accumulate each allele's posterior expected count over all genotypes, working in log10 space
        new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex ->
            glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
                    log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.log10(count))));
    }
    return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
}
 
Example 6
Source File: SelectVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Initialize cache of allele anyploid indices
 *
 * Initialize the cache of PL index to a list of alleles for each ploidy.
 *
 * @param vc    Variant Context
 */
private void initalizeAlleleAnyploidIndicesCache(final VariantContext vc) {
    if (vc.getType() != VariantContext.Type.NO_VARIATION) { // Bypass if not a variant
        for (final Genotype g : vc.getGenotypes()) {
            // Make a new entry if we have not yet cached a PL to allele indices map for this ploidy and allele count
            // skip if there are no PLs -- this avoids hanging on high-allelic somatic samples, for example, where
            // there's no need for the PL indices since they don't exist
            if (g.getPloidy() != 0 && (!ploidyToNumberOfAlleles.containsKey(g.getPloidy()) || ploidyToNumberOfAlleles.get(g.getPloidy()) < vc.getNAlleles())) {
                if (vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods)) {
                    GenotypeLikelihoods.initializeAnyploidPLIndexToAlleleIndices(vc.getNAlleles() - 1, g.getPloidy());
                    ploidyToNumberOfAlleles.put(g.getPloidy(), vc.getNAlleles());
                }
            }
        }
    }
}
 
Example 7
Source File: GenotypeSummaries.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public Map<String, Object> annotate(final ReferenceContext ref,
                                    final VariantContext vc,
                                    final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
    Utils.nonNull(vc);
    if ( ! vc.hasGenotypes() ) {
        return Collections.emptyMap();
    }

    final Map<String,Object> returnMap = new LinkedHashMap<>();
    returnMap.put(GATKVCFConstants.NOCALL_CHROM_KEY, vc.getNoCallCount());

    final DescriptiveStatistics stats = new DescriptiveStatistics();
    for( final Genotype g : vc.getGenotypes() ) {
        if( g.hasGQ() ) {
            stats.addValue(g.getGQ());
        }
    }
    if( stats.getN() > 0L ) {
        returnMap.put(GATKVCFConstants.GQ_MEAN_KEY, String.format("%.2f", stats.getMean()));
        if( stats.getN() > 1L ) {
            returnMap.put(GATKVCFConstants.GQ_STDEV_KEY, String.format("%.2f", stats.getStandardDeviation()));
        }
    }

    return returnMap;
}
 
Example 8
Source File: RMSMappingQuality.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 *
 * @return the number of reads at the given site, calculated as the INFO field {@link VCFConstants#DEPTH_KEY} minus the
 * FORMAT field {@link GATKVCFConstants#MIN_DP_FORMAT_KEY} or DP of each of the HomRef genotypes at that site
 * @throws UserException.BadInput if the {@link VCFConstants#DEPTH_KEY} is missing or if the calculated depth is <= 0
 */
@VisibleForTesting
private static int getNumOfReads(final VariantContext vc) {
    if(vc.hasAttribute(GATKVCFConstants.MAPPING_QUALITY_DEPTH_DEPRECATED)) {
        int mqDP = vc.getAttributeAsInt(GATKVCFConstants.MAPPING_QUALITY_DEPTH_DEPRECATED, 0);
        if (mqDP > 0) {
            return mqDP;
        }
    }

    //don't use the full depth because we don't calculate MQ for reference blocks
    //don't count spanning deletion calls towards number of reads
    int numOfReads = vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, -1);
    if(vc.hasGenotypes()) {
        for (final Genotype gt : vc.getGenotypes()) {
            if (hasReferenceDepth(gt) || hasSpanningDeletionAllele(gt)) {
                //site-level DP contribution will come from MIN_DP for gVCF-called reference variants or DP for BP resolution
                if (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) {
                    numOfReads -= Integer.parseInt(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
                } else if (gt.hasDP()) {
                    numOfReads -= gt.getDP();
                }
            }
        }
    }
    if (numOfReads <= 0){
        numOfReads = -1;  //return -1 to result in a NaN
    }
    return numOfReads;
}
 
Example 9
Source File: SelectVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Checks if vc has a variant call for (at least one of) the samples.
 *
 * @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
 * @param compVCs the comparison VariantContexts (discordance)
 * @return true if the VariantContexts are discordant, false otherwise
 */
private boolean isDiscordant (final VariantContext vc, final Collection<VariantContext> compVCs) {
    if (vc == null) {
        return false;
    }

    // if we're not looking at specific samples then the absence of a compVC means discordance
    if (noSamplesSpecified) {
        return (compVCs == null || compVCs.isEmpty());
    }

    // check if we find it in the variant rod
    final GenotypesContext genotypes = vc.getGenotypes(samples);
    for (final Genotype g : genotypes) {
        if (sampleHasVariant(g)) {
            // There is a variant call (or a filtered call, when the exclude-filtered option is not set) that is not HomRef for at least one of the samples.
            if (compVCs == null) {
                return true;
            }
            // Look for this sample in all VCs of the comp ROD track.
            boolean foundVariant = false;
            for (final VariantContext compVC : compVCs) {
                if (haveSameGenotypes(g, compVC.getGenotype(g.getSampleName()))) {
                    foundVariant = true;
                    break;
                }
            }
            // if this sample's variant was not found in any VC of the comp ROD, we have discordance
            if (!foundVariant) {
                return true;
            }
        }
    }
    return false; // we only get here if all samples have a variant in the comp rod.
}
 
Example 10
Source File: FilteredHaplotypeFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void accumulateDataForLearning(final VariantContext vc, final ErrorProbabilities errorProbabilities, final Mutect2FilteringEngine filteringEngine) {
    // we record the maximum non-sequencing artifact probability, excluding this filter itself
    final double artifactProbability = errorProbabilities.getProbabilitiesByFilter().entrySet().stream()
            .filter(e -> e.getKey().errorType() != ErrorType.SEQUENCING)
            .filter(e -> !e.getKey().filterName().equals(filterName()))
            .flatMap(e -> e.getValue().stream())  // each value is a list of doubles; we take the max over all of them
            .max(Double::compareTo).orElse(0.0);

    for (final Genotype tumorGenotype : vc.getGenotypes()) {
        if (!filteringEngine.isTumor(tumorGenotype)) {
            continue;
        }

        final Optional<String> phasingString = makePhasingString(tumorGenotype);

        if (!phasingString.isPresent()) {
            continue;
        }

        accumulatingPhasedProbabilities
                .computeIfAbsent(phasingString.get(), k -> new ArrayList<>())
                .add(ImmutablePair.of(vc.getStart(), artifactProbability));
    }
}
 
Example 11
Source File: VCFRecordWriter.java    From Hadoop-BAM with MIT License
protected void writeRecord(VariantContext vc) {
	final GenotypesContext gc = vc.getGenotypes();
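	// lazily-parsed genotypes need the header data cache installed before they can be decoded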
	if (gc instanceof LazyParsingGenotypesContext) {
		((LazyParsingGenotypesContext)gc).getParser().setHeaderDataCache(
			gc instanceof LazyVCFGenotypesContext ? vcfHeaderDataCache
			                                      : bcfHeaderDataCache);
	}

	writer.add(vc);
}
 
Example 12
Source File: AS_RankSumTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public Map<String, Object> annotate(final ReferenceContext ref,
                                    final VariantContext vc,
                                    final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
    Utils.nonNull(vc, "vc is null");

    final GenotypesContext genotypes = vc.getGenotypes();
    if (genotypes == null || genotypes.isEmpty()) {
        return Collections.emptyMap();
    }

    final List<Double> refQuals = new ArrayList<>();
    final List<Double> altQuals = new ArrayList<>();

    if( likelihoods != null) {
        fillQualsFromLikelihood(vc, likelihoods, refQuals, altQuals);
    }

    if ( refQuals.isEmpty() && altQuals.isEmpty() ) {
        return Collections.emptyMap();
    }

    final MannWhitneyU mannWhitneyU = new MannWhitneyU();

    // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
    final MannWhitneyU.Result result = mannWhitneyU.test(Doubles.toArray(altQuals), Doubles.toArray(refQuals), MannWhitneyU.TestType.FIRST_DOMINATES);
    final double zScore = result.getZ();

    if (Double.isNaN(zScore)) {
        return Collections.emptyMap();
    } else {
        return Collections.singletonMap(getKeyNames().get(0), String.format("%.3f", zScore));
    }
}
 
Example 13
Source File: GenotypeGVCFsIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test
public void testGenotypingForSomaticGVCFs() throws IOException {
    final List<Integer> starPositions = new ArrayList<>();
    starPositions.add(319);
    starPositions.add(321);
    starPositions.add(322);
    starPositions.add(323);

    final File output = createTempFile("tmp", ".vcf");
    final ArgumentsBuilder args = new ArgumentsBuilder()
            .addVCF(getTestFile("threeSamples.MT.g.vcf"))
            .addReference(new File(b37Reference))
            .addOutput(output)
            .add(CombineGVCFs.SOMATIC_INPUT_LONG_NAME, true);
    runCommandLine(args);

    //compared with the combined GVCF, this output should have called GTs and no alts with LODs less than TLOD_THRESHOLD
    //uncalled alleles should be removed

    final List<VariantContext> results = VariantContextTestUtils.getVariantContexts(output);

    //qualitative match
    for (final VariantContext vc : results) {
        Assert.assertFalse(vc.getAlleles().contains(Allele.NON_REF_ALLELE));
        Assert.assertTrue(vc.getAlternateAlleles().size() >= 1);
        Assert.assertTrue(vc.filtersWereApplied());  //filtering should happen during combine, but make sure filters aren't dropped

        for (final Genotype g : vc.getGenotypes()) {
            Assert.assertTrue(g.isCalled());
            //homRef sites are sometimes filtered because they had low quality, filtered alleles that GGVCFs genotyped out
            //ideally if the site wasn't homRef and had a good allele it would be PASS, but htsjdk will only output genotype filters if at least one genotype is filtered
        }

        //MT:302 has an alphabet soup of alleles in the GVCF -- make sure the ones we keep are good
        if (vc.getStart() == 302) {
            Assert.assertEquals(vc.getNAlleles(), 6);
            double[] sample0LODs = VariantContextGetters.getAttributeAsDoubleArray(vc.getGenotype(0), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0);
            double[] sample1LODs = VariantContextGetters.getAttributeAsDoubleArray(vc.getGenotype(1), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0);
            double[] sample2LODs = VariantContextGetters.getAttributeAsDoubleArray(vc.getGenotype(2), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0);
            for (int i = 0; i < vc.getNAlleles() - 1; i++) {
                Assert.assertTrue(sample0LODs[i] > TLOD_THRESHOLD || sample1LODs[i] > TLOD_THRESHOLD || sample2LODs[i] > TLOD_THRESHOLD);
            }
        }

        //make sure phasing is retained
        if (vc.getStart() == 317 || vc.getStart() == 320) {
            VariantContextTestUtils.assertGenotypeIsPhasedWithAttributes(vc.getGenotype(2));
        }

        //make sure *-only variants are dropped
        Assert.assertFalse(starPositions.contains(vc.getStart()));

        //sample 0 is also phased
        if (vc.getStart() == 4713 || vc.getStart() == 4720) {
            VariantContextTestUtils.assertGenotypeIsPhasedWithAttributes(vc.getGenotype(0));
        }

        //MT:4769 in combined GVCF has uncalled alleles
        if (vc.getStart() == 4769) {
            Assert.assertEquals(vc.getNAlleles(), 2);
            Assert.assertTrue(vc.getAlternateAlleles().get(0).basesMatch("G"));
        }
    }

    //exact match
    final File expectedFile = getTestFile("threeSamples.MT.vcf");
    final List<VariantContext> expected = VariantContextTestUtils.getVariantContexts(expectedFile);
    final VCFHeader header = VCFHeaderReader.readHeaderFrom(new SeekablePathStream(IOUtils.getPath(expectedFile.getAbsolutePath())));
    assertForEachElementInLists(results, expected,
            (a, e) -> VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, ATTRIBUTES_TO_IGNORE, ATTRIBUTES_WITH_JITTER, header));

}
 
Example 14
Source File: AlleleFrequencyCalculator.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
 *
 * @param vc the VariantContext holding the alleles and sample information.  The VariantContext
 *           must have at least 1 alternative allele
 * @return result (for programming convenience)
 */
public AFCalculationResult calculate(final VariantContext vc, final int defaultPloidy) {
    Utils.nonNull(vc, "VariantContext cannot be null");
    final int numAlleles = vc.getNAlleles();
    final List<Allele> alleles = vc.getAlleles();
    Utils.validateArg( numAlleles > 1, () -> "VariantContext has only a single reference allele, but getLog10PNonRef requires at least one alt allele " + vc);

    final double[] priorPseudocounts = alleles.stream()
            .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == vc.getReference().length() ? snpPseudocount : indelPseudocount)).toArray();

    double[] alleleCounts = new double[numAlleles];
    final double flatLog10AlleleFrequency = -MathUtils.log10(numAlleles); // log10(1/numAlleles)
    double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);

    for (double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE; ) {
        final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies);
        alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble();
        alleleCounts = newAlleleCounts;
        final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts);

        // first iteration uses flat prior in order to avoid local minimum where the prior + no pseudocounts gives such a low
        // effective allele frequency that it overwhelms the genotype likelihood of a real variant
        // basically, we want a chance to get non-zero pseudocounts before using a prior that's biased against a variant
        log10AlleleFrequencies = new Dirichlet(posteriorPseudocounts).log10MeanWeights();
    }

    double[] log10POfZeroCountsByAllele = new double[numAlleles];
    double log10PNoVariant = 0;

    final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
    final Map<Integer, int[]> nonVariantIndicesByPloidy = new Int2ObjectArrayMap<>();

    // re-usable buffers of the log10 genotype posteriors of genotypes missing each allele
    final List<DoubleArrayList> log10AbsentPosteriors = IntStream.range(0,numAlleles).mapToObj(n -> new DoubleArrayList()).collect(Collectors.toList());
    for (final Genotype g : vc.getGenotypes()) {
        if (!g.hasLikelihoods()) {
            continue;
        }
        final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);

        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

        //the total probability
        if (!spanningDeletionPresent) {
            log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
        } else {
            nonVariantIndicesByPloidy.computeIfAbsent(ploidy, p -> genotypeIndicesWithOnlyRefAndSpanDel(p, alleles));
            final int[] nonVariantIndices = nonVariantIndicesByPloidy.get(ploidy);
            final double[] nonVariantLog10Posteriors = MathUtils.applyToArray(nonVariantIndices, n -> log10GenotypePosteriors[n]);
            // when the only alt allele is the spanning deletion the probability that the site is non-variant
            // may be so close to 1 that finite precision error in log10SumLog10 yields a positive value,
            // which is bogus.  Thus we cap it at 0.
            log10PNoVariant += Math.min(0,MathUtils.log10SumLog10(nonVariantLog10Posteriors));
        }

        // if the VC is biallelic the allele-specific qual equals the variant qual
        if (numAlleles == 2) {
            continue;
        }

        // for each allele, we collect the log10 probabilities of genotypes in which the allele is absent, then add (in log space)
        // to get the log10 probability that the allele is absent in this sample
        log10AbsentPosteriors.forEach(DoubleArrayList::clear);  // clear the buffers.  Note that this is O(1) due to the primitive backing array
        for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
            final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
            glCalc.genotypeAlleleCountsAt(genotype).forEachAbsentAlleleIndex(a -> log10AbsentPosteriors.get(a).add(log10GenotypePosterior), numAlleles);
        }

        final double[] log10PNoAllele = log10AbsentPosteriors.stream()
                .mapToDouble(buffer -> MathUtils.log10SumLog10(buffer.toDoubleArray()))
                .map(x -> Math.min(0, x)).toArray();    // if prob of non hom ref > 1 due to finite precision, short-circuit to avoid NaN

        // multiply the cumulative probabilities of alleles being absent, which is addition of logs
        MathUtils.addToArrayInPlace(log10POfZeroCountsByAllele, log10PNoAllele);
    }

    // for biallelic the allele-specific qual equals the variant qual, and we short-circuited the calculation above
    if (numAlleles == 2) {
        log10POfZeroCountsByAllele[1] = log10PNoVariant;
    }

    // unfortunately AFCalculationResult expects integers for the MLE.  We really should emit the EM no-integer values
    // which are valuable (eg in CombineGVCFs) as the sufficient statistics of the Dirichlet posterior on allele frequencies
    final int[] integerAlleleCounts = Arrays.stream(alleleCounts).mapToInt(x -> (int) Math.round(x)).toArray();
    final int[] integerAltAlleleCounts = Arrays.copyOfRange(integerAlleleCounts, 1, numAlleles);

    //skip the ref allele (index 0)
    final Map<Allele, Double> log10PRefByAllele = IntStream.range(1, numAlleles).boxed()
            .collect(Collectors.toMap(alleles::get, a -> log10POfZeroCountsByAllele[a]));

    return new AFCalculationResult(integerAltAlleleCounts, alleles, log10PNoVariant, log10PRefByAllele);
}
 
Example 15
Source File: LiftoverUtils.java    From picard with MIT License
/**
 * method to swap the reference and alt alleles of a bi-allelic SNP
 *
 * @param vc                   the {@link VariantContext} (bi-allelic SNP) that needs to have its REF and ALT alleles swapped.
 * @param annotationsToReverse INFO field annotations (of double value) that will be reversed (x->1-x)
 * @param annotationsToDrop    INFO field annotations that will be dropped from the result since they are invalid when REF and ALT are swapped
 * @return a new {@link VariantContext} with alleles swapped, INFO fields modified and in the genotypes, GT, AD and PL corrected appropriately
 */
public static VariantContext swapRefAlt(final VariantContext vc, final Collection<String> annotationsToReverse, final Collection<String> annotationsToDrop) {

    if (!vc.isBiallelic() || !vc.isSNP()) {
        throw new IllegalArgumentException("swapRefAlt can only process biallelic SNPs, found " + vc.toString());
    }

    final VariantContextBuilder swappedBuilder = new VariantContextBuilder(vc);

    swappedBuilder.attribute(SWAPPED_ALLELES, true);

    // Use getBaseString() (rather than the Allele itself) in order to create new Alleles with swapped
    // reference and non-variant attributes
    swappedBuilder.alleles(Arrays.asList(vc.getAlleles().get(1).getBaseString(), vc.getAlleles().get(0).getBaseString()));

    final Map<Allele, Allele> alleleMap = new HashMap<>();

    // A mapping from the old allele to the new allele, to be used when fixing the genotypes
    alleleMap.put(vc.getAlleles().get(0), swappedBuilder.getAlleles().get(1));
    alleleMap.put(vc.getAlleles().get(1), swappedBuilder.getAlleles().get(0));

    final GenotypesContext swappedGenotypes = GenotypesContext.create(vc.getGenotypes().size());
    for (final Genotype genotype : vc.getGenotypes()) {
        final List<Allele> swappedAlleles = new ArrayList<>();
        for (final Allele allele : genotype.getAlleles()) {
            if (allele.isNoCall()) {
                swappedAlleles.add(allele);
            } else {
                swappedAlleles.add(alleleMap.get(allele));
            }
        }
        // Flip AD
        final GenotypeBuilder builder = new GenotypeBuilder(genotype).alleles(swappedAlleles);
        if (genotype.hasAD() && genotype.getAD().length == 2) {
            final int[] ad = ArrayUtils.clone(genotype.getAD());
            ArrayUtils.reverse(ad);
            builder.AD(ad);
        } else {
            builder.noAD();
        }

        //Flip PL
        if (genotype.hasPL() && genotype.getPL().length == 3) {
            final int[] pl = ArrayUtils.clone(genotype.getPL());
            ArrayUtils.reverse(pl);
            builder.PL(pl);
        } else {
            builder.noPL();
        }
        swappedGenotypes.add(builder.make());
    }
    swappedBuilder.genotypes(swappedGenotypes);

    for (final String key : vc.getAttributes().keySet()) {
        if (annotationsToDrop.contains(key)) {
            swappedBuilder.rmAttribute(key);
        } else if (annotationsToReverse.contains(key) && !vc.getAttributeAsString(key, "").equals(VCFConstants.MISSING_VALUE_v4)) {
            final double attributeToReverse = vc.getAttributeAsDouble(key, -1);

            if (attributeToReverse < 0 || attributeToReverse > 1) {
                log.warn("Trying to reverse attribute " + key +
                        " but found value that isn't between 0 and 1: (" + attributeToReverse + ") in variant " + vc + ". Results might be wrong.");
            }
            swappedBuilder.attribute(key, 1 - attributeToReverse);
        }
    }

    return swappedBuilder.make();
}
 
Example 16
Source File: RMSMappingQuality.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 *
 * @return the number of reads at the given site, trying first {@link GATKVCFConstants#RAW_MAPPING_QUALITY_WITH_DEPTH_KEY},
 * falling back to calculating the value as InfoField {@link VCFConstants#DEPTH_KEY} minus the
 * format field {@link GATKVCFConstants#MIN_DP_FORMAT_KEY} or DP of each of the HomRef genotypes at that site.
 * If neither of those is possible, will fall back to calculating the reads from the likelihoods data if provided.
 * @throws UserException.BadInput if the {@link VCFConstants#DEPTH_KEY} is missing or if the calculated depth is <= 0
 */
@VisibleForTesting
static long getNumOfReads(final VariantContext vc,
                         final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
    if(vc.hasAttribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY)) {
        List<Long> mqTuple = VariantContextGetters.getAttributeAsLongList(vc, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY,0L);
        if (mqTuple.get(TOTAL_DEPTH_INDEX) > 0) {
            return mqTuple.get(TOTAL_DEPTH_INDEX);
        }
    }

    long numOfReads = 0;
    if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) {
        numOfReads = VariantContextGetters.getAttributeAsLong(vc, VCFConstants.DEPTH_KEY, -1L);
        if(vc.hasGenotypes()) {
            for(final Genotype gt : vc.getGenotypes()) {
                if(gt.isHomRef()) {
                    //site-level DP contribution will come from MIN_DP for gVCF-called reference variants or DP for BP resolution
                    if (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) {
                        numOfReads -= Long.parseLong(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
                    } else if (gt.hasDP()) {
                        numOfReads -= gt.getDP();
                    }
                }
            }
        }

    // If there is no depth key, try to compute from the likelihoods
    } else if (likelihoods != null && likelihoods.numberOfAlleles() != 0) {
        for (int i = 0; i < likelihoods.numberOfSamples(); i++) {
            for (GATKRead read : likelihoods.sampleEvidence(i)) {
                if (read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) {
                    numOfReads++;
                }
            }
        }
    }
    if (numOfReads <= 0) {
        numOfReads = -1;  //return -1 to result in a NaN
    }
    return numOfReads;
}
 
Example 17
Source File: ContaminationFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public List<Double> calculateErrorProbabilityForAlleles(final VariantContext vc, final Mutect2FilteringEngine filteringEngine, ReferenceContext referenceContext) {
    // for every alt allele, a list of the depth and posterior pairs of each sample
    final List<List<ImmutablePair<Integer, Double>>> depthsAndPosteriorsPerAllele = new ArrayList<>();
    new IndexRange(0, vc.getNAlleles()-1).forEach(i -> depthsAndPosteriorsPerAllele.add(new ArrayList<>()));

    for (final Genotype tumorGenotype : vc.getGenotypes()) {
        if (filteringEngine.isNormal(tumorGenotype)) {
            continue;
        }

        final double contaminationFromFile = contaminationBySample.getOrDefault(tumorGenotype.getSampleName(), defaultContamination);
        final double contamination = Math.max(0, Math.min(contaminationFromFile, 1 - EPSILON)); // handle file with contamination == 1
        final int[] ADs = tumorGenotype.getAD();
        final int totalAD = (int) MathUtils.sum(ADs);
        final int[] altADs = Arrays.copyOfRange(ADs, 1, ADs.length); // get ADs of alts only
        // POPAF has only alt allele data
        final double[] negativeLog10AlleleFrequencies = VariantContextGetters.getAttributeAsDoubleArray(vc,
                GATKVCFConstants.POPULATION_AF_KEY, () -> new double[]{Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY}, Double.POSITIVE_INFINITY);
        final double[] alleleFrequencies = MathUtils.applyToArray(negativeLog10AlleleFrequencies, x -> Math.pow(10,-x));

        final SomaticClusteringModel model = filteringEngine.getSomaticClusteringModel();
        final double[] logSomaticLikelihoodPerAllele = Arrays.stream(altADs).mapToDouble(altCount -> model.logLikelihoodGivenSomatic(totalAD, altCount)).toArray();

        // model the contaminant reads as coming either from one contaminating sample (het or hom-alt)
        // or from many independent contaminants at the population allele frequency, keeping the likelier model
        final double[] singleContaminantLikelihoodPerAllele = new double[alleleFrequencies.length];
        final double[] manyContaminantLikelihoodPerAllele = new double[alleleFrequencies.length];
        final double[] logContaminantLikelihoodPerAllele = new double[alleleFrequencies.length];
        final double[] logOddsOfRealVsContaminationPerAllele = new double[alleleFrequencies.length];
        final double[] posteriorProbOfContaminationPerAllele = new double[alleleFrequencies.length];
        new IndexRange(0,alleleFrequencies.length).forEach(i -> {
            singleContaminantLikelihoodPerAllele[i] = 2 * alleleFrequencies[i] * (1 - alleleFrequencies[i]) * MathUtils.binomialProbability(totalAD,  altADs[i], contamination /2)
                    + MathUtils.square(alleleFrequencies[i]) * MathUtils.binomialProbability(totalAD,  altADs[i], contamination);
            manyContaminantLikelihoodPerAllele[i] = MathUtils.binomialProbability(totalAD, altADs[i], contamination * alleleFrequencies[i]);
            logContaminantLikelihoodPerAllele[i] = Math.log(Math.max(singleContaminantLikelihoodPerAllele[i], manyContaminantLikelihoodPerAllele[i]));
            logOddsOfRealVsContaminationPerAllele[i] = logSomaticLikelihoodPerAllele[i] - logContaminantLikelihoodPerAllele[i];
        });

        new IndexRange(0,alleleFrequencies.length).forEach(i -> {
            posteriorProbOfContaminationPerAllele[i] = filteringEngine.posteriorProbabilityOfError(vc, logOddsOfRealVsContaminationPerAllele[i], i);
            depthsAndPosteriorsPerAllele.get(i).add(ImmutablePair.of(altADs[i], posteriorProbOfContaminationPerAllele[i]));
        });

    }

    return depthsAndPosteriorsPerAllele.stream().map(alleleData -> alleleData.isEmpty() ? Double.NaN : weightedMedianPosteriorProbability(alleleData)).collect(Collectors.toList());
}
 
Example 18
Source File: VariantContextTestUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
public static void assertGenotypePosteriorsAttributeWasRemoved(final VariantContext actual, final VariantContext expected) {
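    // the phred-scaled genotype posteriors attribute should have been removed from every genotype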
    for (final Genotype g : actual.getGenotypes()) {
        Assert.assertFalse(g.hasExtendedAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY));
    }
}
 
Example 19
Source File: PedigreeAnnotation.java    From gatk with BSD 3-Clause "New" or "Revised" License
protected GenotypesContext getFounderGenotypes(VariantContext vc) {
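    // lazily initialize the pedigree-derived founder set the first time it is needed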
    if ((pedigreeFile != null) && (!hasAddedPedigreeFounders)) {
        initializeSampleDBAndSetFounders(pedigreeFile);
    }
    return (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(new HashSet<>(founderIds));
}