Java Code Examples for htsjdk.variant.variantcontext.VariantContext#getAlleles()

The following examples show how to use htsjdk.variant.variantcontext.VariantContext#getAlleles(). Each example links to the original project and source file it was taken from.
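Before the examples, here is a minimal, self-contained sketch (contig, position, and alleles are invented for illustration) showing what getAlleles() returns on a freshly built VariantContext: the reference allele first, followed by the alternate alleles.

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;

import java.util.Arrays;
import java.util.List;

public class GetAllelesDemo {
    public static void main(String[] args) {
        // A bi-allelic SNP: reference A, alternate T.
        final Allele ref = Allele.create("A", true);   // true marks the reference allele
        final Allele alt = Allele.create("T", false);
        final VariantContext vc = new VariantContextBuilder(
                "demo", "chr1", 100, 100, Arrays.asList(ref, alt)).make();

        final List<Allele> alleles = vc.getAlleles();  // reference first, then alternates
        System.out.println(alleles);                   // [A*, T]
        System.out.println(vc.getReference());         // A*
        System.out.println(vc.getAlternateAlleles());  // [T]
    }
}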
Example 1
Source File: HomRefBlockUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test
public void testToVariantContext(){
    final VariantContext vc = getVariantContext();
    final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, vc.getAlleles());
    final Genotype genotype1 = gb.GQ(15).DP(6).PL(new int[]{0, 10, 100}).make();
    final Genotype genotype2 = gb.GQ(17).DP(10).PL(new int[]{0, 5, 80}).make();

    final HomRefBlock block = getHomRefBlock(vc);
    block.add(vc.getEnd(), genotype1);
    block.add(vc.getEnd() + 1, genotype2);

    final VariantContext newVc = block.toVariantContext(SAMPLE_NAME, false);
    Assert.assertEquals(newVc.getGenotypes().size(), 1);
    final Genotype genotype = newVc.getGenotypes().get(0);
    Assert.assertEquals(genotype.getDP(),8); //dp should be median of the added DPs
    Assert.assertEquals(genotype.getGQ(), 5); //GQ should have been recalculated with the minPls
    Assert.assertTrue(genotype.getAlleles().stream().allMatch(a -> a.equals(REF)));
}
 
Example 2
Source File: VariantContextTestUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
public static Boolean alleleSpecificAnnotationEquals(VariantContext actual, VariantContext expected, String annotation) {
    List<Allele> Aalleles = actual.getAlleles();
    String[] actualAnnotation = String.join(",",actual.getAttributeAsStringList(annotation, "")).split("\\|",-1);
    String[] expectedAnnotation = String.join(",",expected.getAttributeAsStringList(annotation, "")).split("\\|",-1);
    if (Arrays.equals(actualAnnotation, expectedAnnotation)) {
        return true;
    }
    if (actualAnnotation.length != expectedAnnotation.length) {
        return false;
    }
    for (int i = 0; i < Aalleles.size(); i++) {
        Allele al = Aalleles.get(i);

        int k = expected.getAlleleIndex(al);

        if (!actualAnnotation[i].equals(expectedAnnotation[k])) {
            return false;
        }
    }
    return true;
}
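A typical call site looks like the following sketch; the annotation key and the two VariantContext values (actualVc, expectedVc) are assumptions for illustration.

// Hypothetical usage: compare an allele-specific annotation between an actual
// and an expected record, tolerating different allele orderings.
final boolean matches = VariantContextTestUtils.alleleSpecificAnnotationEquals(
        actualVc, expectedVc, "AS_RAW_MQ");
Assert.assertTrue(matches);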
 
Example 3
Source File: StrandBiasUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Allocate and fill an Nx2 strand contingency table where N is the number of alleles.  In the end, it'll look something like this:
 *             fwd      rev
 *   allele1   #        #
 *   allele2   #        #
 *
 *   NOTE: Only informative reads are used
 *
 * @param vc    VariantContext from which to get alleles
 * @param likelihoods per-read allele likelihoods to determine if each read is informative
 * @param perAlleleValues modified to store the output counts
 * @param minCount minimum threshold of counts to use
 */
public static void getStrandCountsFromLikelihoodMap( final VariantContext vc,
                                              final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                                              final ReducibleAnnotationData<List<Integer>> perAlleleValues,
                                              final int minCount) {
    if( likelihoods == null || vc == null ) {
        return;
    }

    final Allele ref = vc.getReference();
    final List<Allele> allAlts = vc.getAlternateAlleles();

    for (final String sample : likelihoods.samples()) {
        final ReducibleAnnotationData<List<Integer>> sampleTable = new AlleleSpecificAnnotationData<>(vc.getAlleles(),null);
        likelihoods.bestAllelesBreakingTies(sample).stream()
                .filter(ba -> ba.isInformative())
                .forEach(ba -> updateTable(ba.allele, ba.evidence, ref, allAlts, sampleTable));
        if (passesMinimumThreshold(sampleTable, minCount)) {
            combineAttributeMap(sampleTable, perAlleleValues);
        }
    }
}
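To make the Nx2 table from the javadoc concrete, here is a small self-contained sketch (counts invented) of the per-allele [forward, reverse] layout that perAlleleValues ends up holding:

import htsjdk.variant.variantcontext.Allele;

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class StrandTableSketch {
    public static void main(String[] args) {
        // Each allele maps to a two-element list: [forward count, reverse count].
        final Map<Allele, List<Integer>> table = new LinkedHashMap<>();
        table.put(Allele.create("A", true),  Arrays.asList(20, 18)); // ref: balanced strands
        table.put(Allele.create("T", false), Arrays.asList(1, 15));  // alt: strand-biased
        table.forEach((allele, counts) ->
                System.out.println(allele + "\tfwd=" + counts.get(0) + "\trev=" + counts.get(1)));
    }
}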
 
Example 4
Source File: AS_StrandBiasTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Parses the raw data strings of combined contingency matrix data and calls the client's calculateReducedData(myData)
 * implementation to generate a reduced digest of the provided allele information, which is stored in '|'-delimited lists.
 *
 * @param vc -- contains the final set of alleles, possibly subset by GenotypeGVCFs
 * @param originalVC -- used to get all the alleles for all gVCFs
 * @return the finalized key and value as well as the raw key and value
 */
@Override
public Map<String, Object> finalizeRawData(final VariantContext vc, final VariantContext originalVC) {
    if (!vc.hasAttribute(getPrimaryRawKey())) {
        return new HashMap<>();
    }
    String rawContingencyTableData = vc.getAttributeAsString(getPrimaryRawKey(),null);
    if (rawContingencyTableData == null) {
        return new HashMap<>();
    }
    AlleleSpecificAnnotationData<List<Integer>> myData = new AlleleSpecificAnnotationData<>(originalVC.getAlleles(), rawContingencyTableData);
    parseRawDataString(myData);

    Map<Allele, Double> perAltRankSumResults = calculateReducedData(myData);

    String annotationString = makeReducedAnnotationString(vc, perAltRankSumResults);
    String rawAnnotationsString = StrandBiasUtils.makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
    Map<String, Object> returnMap = new HashMap<>();
    returnMap.put(getKeyNames().get(0), annotationString);
    returnMap.put(getPrimaryRawKey(), rawAnnotationsString);  //this is in case raw annotations are requested
    return returnMap;
}
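The '|'-delimited raw format referenced above can be illustrated with a tiny parse; the raw string below is invented (one "fwd,rev" pair per allele):

public class RawAnnotationSplitSketch {
    public static void main(String[] args) {
        final String raw = "20,18|1,15";                    // ref pair | alt pair
        for (final String perAllele : raw.split("\\|", -1)) {
            final String[] counts = perAllele.split(",");
            System.out.println("fwd=" + counts[0] + " rev=" + counts[1]);
        }
    }
}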
 
Example 5
Source File: AS_RMSMappingQuality.java    From gatk with BSD 3-Clause "New" or "Revised" License
private Map<Allele, Integer> getADcounts(final VariantContext vc) {
    final GenotypesContext genotypes = vc.getGenotypes();
    if ( genotypes == null || genotypes.size() == 0 ) {
        genotype_logger.warn("VC does not have genotypes -- annotations were calculated in wrong order");
        return null;
    }

    final Map<Allele, Integer> variantADs = new HashMap<>();
    for(final Allele a : vc.getAlleles())
        variantADs.put(a,0);

    for (final Genotype gt : vc.getGenotypes()) {
        if(gt.hasAD()) {
            final int[] ADs = gt.getAD();
            for (int i = 1; i < vc.getNAlleles(); i++) {
                variantADs.put(vc.getAlternateAllele(i - 1), variantADs.get(vc.getAlternateAllele(i - 1)) + ADs[i]); //here -1 is to reconcile allele index with alt allele index
            }
        }
    }
    return variantADs;
}
 
Example 6
Source File: AS_RMSMappingQuality.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Takes combined raw annotation data sums and calculates, per allele, the average root mean squared mapping quality
 * from the raw data using the expected Allele Depth counts.
 *
 * Will output '|'-delimited doubles in the format: sqrt(TotalAllele1RMS/Allele1Depth)|sqrt(TotalAllele2RMS/Allele2Depth)|...
 */
@Override
@SuppressWarnings({"unchecked", "rawtypes"})//FIXME generics here blow up
public Map<String, Object> finalizeRawData(final VariantContext vc, final VariantContext originalVC) {
    if (!vc.hasAttribute(getPrimaryRawKey())) {
        return new HashMap<>();
    }
    final String rawMQdata = vc.getAttributeAsString(getPrimaryRawKey(),null);
    if (rawMQdata == null) {
        return new HashMap<>();
    }

    final Map<String,Object> annotations = new HashMap<>();
    final ReducibleAnnotationData myData = new AlleleSpecificAnnotationData<Double>(originalVC.getAlleles(), rawMQdata);
    parseRawDataString(myData);

    final String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap());
    annotations.put(getKeyNames().get(0), annotationString);
    annotations.put(getPrimaryRawKey(), makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap()));
    return annotations;
}
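The per-allele reduction described in the javadoc boils down to sqrt(sumOfSquaredMappingQualities / alleleDepth); a minimal arithmetic sketch with invented sums:

public class RmsMappingQualitySketch {
    public static void main(String[] args) {
        // Per-allele totals of squared mapping qualities and their read depths (invented).
        final double[] sumSquaredMQ = {72000.0, 14400.0};
        final int[] depth = {20, 4};
        final StringBuilder out = new StringBuilder();
        for (int i = 0; i < sumSquaredMQ.length; i++) {
            if (i > 0) { out.append('|'); }
            out.append(Math.sqrt(sumSquaredMQ[i] / depth[i]));
        }
        System.out.println(out); // 60.0|60.0
    }
}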
 
Example 7
Source File: AS_RankSumTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Parses the raw data histograms generated for each allele and outputs the median value for each histogram
 * as a representative double value.
 *
 * @param vc -- contains the final set of alleles, possibly subset by GenotypeGVCFs
 * @param originalVC -- used to get all the alleles for all gVCFs
 * @return the finalized key and value as well as the raw key and value
 */
public Map<String, Object> finalizeRawData(final VariantContext vc, final VariantContext originalVC) {
    if (!vc.hasAttribute(getPrimaryRawKey())) {
        return new HashMap<>();
    }
    final String rawRankSumData = vc.getAttributeAsString(getPrimaryRawKey(),null);
    if (rawRankSumData == null) {
        return new HashMap<>();
    }
    final Map<String,Object> annotations = new HashMap<>();
    final AlleleSpecificAnnotationData<Histogram> myData = new AlleleSpecificAnnotationData<>(originalVC.getAlleles(), rawRankSumData);
    parseRawDataString(myData);

    final Map<Allele, Double> perAltRankSumResults = calculateReducedData(myData.getAttributeMap(), myData.getRefAllele());
    //shortcut for no ref values
    if (perAltRankSumResults.isEmpty()) {
        return annotations;
    }
    final String annotationString = makeReducedAnnotationString(vc, perAltRankSumResults);
    annotations.put(getKeyNames().get(0), annotationString);
    annotations.put(getPrimaryRawKey(), makeCombinedAnnotationString(vc.getAlleles(), myData.getAttributeMap()));
    return annotations;
}
 
Example 8
Source File: DepthPerAlleleBySample.java    From gatk with BSD 3-Clause "New" or "Revised" License
protected int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele> alleles, final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
    final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
    for ( final Allele allele : vc.getAlleles() ) {
        alleleCounts.put(allele, 0);
    }
    final Map<Allele, List<Allele>> alleleSubset = alleles.stream().collect(Collectors.toMap(a -> a, Arrays::asList));
    final AlleleLikelihoods<GATKRead, Allele> subsettedLikelihoods = likelihoods.marginalize(alleleSubset);
    subsettedLikelihoods.bestAllelesBreakingTies(g.getSampleName()).stream()
            .filter(ba -> ba.isInformative())
            .forEach(ba -> alleleCounts.compute(ba.allele, (allele, prevCount) -> prevCount + 1));

    final int[] counts = new int[alleleCounts.size()];
    counts[0] = alleleCounts.get(vc.getReference()); //first one in AD is always ref
    for (int i = 0; i < vc.getNAlleles() - 1; i++) {
        counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
    }

    return counts;
}
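The returned counts array follows VCF AD ordering: the reference allele first, then alternates in VariantContext order. A short self-contained sketch (sample name and depths invented) showing that ordering on a Genotype:

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;

import java.util.Arrays;

public class AdOrderingSketch {
    public static void main(String[] args) {
        final Allele ref = Allele.create("A", true);
        final Allele alt = Allele.create("T", false);
        // AD follows the allele order of the VariantContext: ref first, then alts.
        final Genotype g = new GenotypeBuilder("sample1", Arrays.asList(ref, alt))
                .AD(new int[]{20, 5})
                .make();
        System.out.println(g.getAD()[0]); // 20 -> reference depth
        System.out.println(g.getAD()[1]); // 5  -> first alternate depth
    }
}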
 
Example 9
Source File: DepthPerAlleleBySample.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void annotate(final ReferenceContext ref,
                     final VariantContext vc,
                     final Genotype g,
                     final GenotypeBuilder gb,
                     final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
    Utils.nonNull(gb, "gb is null");
    Utils.nonNull(vc, "vc is null");

    if ( g == null || !g.isCalled() || likelihoods == null) {
        return;
    }
    final Set<Allele> alleles = new LinkedHashSet<>(vc.getAlleles());

    // make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
    Utils.validateArg(likelihoods.alleles().containsAll(alleles), () -> "VC alleles " + alleles + " not a subset of AlleleLikelihoods alleles " + likelihoods.alleles());

    gb.AD(annotateWithLikelihoods(vc, g, alleles, likelihoods));
}
 
Example 10
Source File: GermlineCNVIntervalVariantComposerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "alleleDeterminationTestData")
public void testAlleleDetermination(final int refAutosomalCopyNumber,
                                    final Set<String> allosomalContigs,
                                    final int baselineCopyNumber,
                                    final Allele expectedAllele) {
    final File outputFile = createTempFile("test", ".vcf");
    final VariantContextWriter outputWriter = GATKVariantContextUtils.createVCFWriter(outputFile.toPath(), null, false);
    final GermlineCNVIntervalVariantComposer variantComposer = new GermlineCNVIntervalVariantComposer(
            outputWriter, "TEST_SAMPLE_NAME", new IntegerCopyNumberState(refAutosomalCopyNumber), allosomalContigs);
    final VariantContext var = variantComposer.composeVariantContext(
            new IntervalCopyNumberGenotypingData(TEST_INTERVAL, TEST_DISTRIBUTION,
                    new IntegerCopyNumberState(baselineCopyNumber)));
    final List<Allele> allAlleles = var.getAlleles();
    Assert.assertEquals(allAlleles, GermlineCNVIntervalVariantComposer.ALL_ALLELES);
    final Genotype gen = var.getGenotype(TEST_SAMPLE_NAME);
    final Allele genAllele = gen.getAlleles().get(0);
    Assert.assertEquals(genAllele, expectedAllele);
}
 
Example 11
Source File: StrelkaAllelicDepth.java    From hmftools with GNU General Public License v3.0
@NotNull
public static VariantContext enrich(@NotNull final VariantContext context) {
    if (!context.isIndel() && !context.isSNP() ) {
        return context;
    }

    final VariantContextBuilder contextBuilder = new VariantContextBuilder(context).noGenotypes();

    final List<Allele> alleles = context.getAlleles();
    final Function<Allele, String> alleleKey = alleleKey(context);

    final List<Genotype> updatedGenotypes = Lists.newArrayList();
    for (Genotype genotype : context.getGenotypes()) {
        if (!genotype.hasAD() && hasRequiredAttributes(genotype, alleles, alleleKey)) {
            final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype).AD(readAD(genotype, alleles, alleleKey));
            updatedGenotypes.add(genotypeBuilder.make());
        } else {
            updatedGenotypes.add(genotype);
        }
    }

    return contextBuilder.genotypes(updatedGenotypes).make();
}
 
Example 12
Source File: DepthPerSampleHC.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void annotate( final ReferenceContext ref,
                      final VariantContext vc,
                      final Genotype g,
                      final GenotypeBuilder gb,
                      final AlleleLikelihoods<GATKRead, Allele> likelihoods ) {
    Utils.nonNull(vc);
    Utils.nonNull(g);
    Utils.nonNull(gb);

    if ( likelihoods == null || !g.isCalled() ) {
        logger.warn("Annotation will not be calculated, genotype is not called or alleleLikelihoodMap is null");
        return;
    }

    // check that there are reads
    final String sample = g.getSampleName();
    if (likelihoods.sampleEvidenceCount(likelihoods.indexOfSample(sample)) == 0) {
        gb.DP(0);
        return;
    }

    final Set<Allele> alleles = new LinkedHashSet<>(vc.getAlleles());

    // make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
    if ( !likelihoods.alleles().containsAll(alleles) ) {
        logger.warn("VC alleles " + alleles + " not a strict subset of AlleleLikelihoods alleles " + likelihoods.alleles());
        return;
    }

    // the depth for the HC is the sum of the informative alleles at this site.  It's not perfect (as we cannot
    // differentiate between reads that align over the event but aren't informative vs. those that aren't even
    // close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
    final Map<Allele, List<Allele>> alleleSubset = alleles.stream().collect(Collectors.toMap(a -> a, a -> Arrays.asList(a)));
    final AlleleLikelihoods<GATKRead, Allele> subsettedLikelihoods = likelihoods.marginalize(alleleSubset);
    final int depth = (int) subsettedLikelihoods.bestAllelesBreakingTies(sample).stream().filter(ba -> ba.isInformative()).count();
    gb.DP(depth);
}
 
Example 13
Source File: VariantAnnotator.java    From gatk with BSD 3-Clause "New" or "Revised" License
private AlleleLikelihoods<GATKRead, Allele> makeLikelihoods(final VariantContext vc, final ReadsContext readsContext) {
    final List<GATKRead> reads = Utils.stream(readsContext).collect(Collectors.toList());
    final AlleleLikelihoods<GATKRead, Allele> result = new AlleleLikelihoods<>(variantSamples,
            new IndexedAlleleList<>(vc.getAlleles()), AssemblyBasedCallerUtils.splitReadsBySample(variantSamples, getHeaderForReads(), reads));

    final ReadPileup readPileup = new ReadPileup(vc, readsContext);
    final Map<String, ReadPileup> pileupsBySample = readPileup.splitBySample(getHeaderForReads(), "__UNKNOWN__");

    final Allele refAllele = vc.getReference();
    final List<Allele> altAlleles = vc.getAlternateAlleles();
    final int numAlleles = vc.getNAlleles();

    // manually fill each sample's likelihoods one read at a time, assigning probability 1 (0 in log space) to the matching allele, if present
    for (final Map.Entry<String, ReadPileup> samplePileups : pileupsBySample.entrySet()) {
        final LikelihoodMatrix<GATKRead, Allele> sampleMatrix = result.sampleMatrix(result.indexOfSample(samplePileups.getKey()));

        final MutableInt readIndex = new MutableInt(0);
        for (final PileupElement pe : samplePileups.getValue()) {
            // initialize all alleles as equally unlikely
            IntStream.range(0, numAlleles).forEach(a -> sampleMatrix.set(a, readIndex.intValue(), Double.NEGATIVE_INFINITY));

            // if present set the matching allele as likely
            final Allele pileupAllele = GATKVariantContextUtils.chooseAlleleForRead(pe, refAllele, altAlleles, minBaseQualityScore);
            if (pileupAllele != null) {
                sampleMatrix.set(sampleMatrix.indexOfAllele(pileupAllele), readIndex.intValue(), 0);
            }

            readIndex.increment();
        }
    }

    return result;
}
 
Example 14
Source File: LiftoverUtils.java    From picard with MIT License
protected static VariantContextBuilder reverseComplementVariantContext(final VariantContext source, final Interval target, final ReferenceSequence refSeq) {
    if (target.isPositiveStrand()) {
        throw new IllegalArgumentException("This should only be called for negative strand liftovers");
    }
    final int start = target.getStart();
    final int stop = target.getEnd();

    final List<Allele> origAlleles = new ArrayList<>(source.getAlleles());
    final VariantContextBuilder vcb = new VariantContextBuilder(source);

    vcb.rmAttribute(VCFConstants.END_KEY)
            .chr(target.getContig())
            .start(start)
            .stop(stop)
            .alleles(reverseComplementAlleles(origAlleles));

    if (isIndelForLiftover(source)) {
        // check that the reverse complemented bases match the new reference
        if (referenceAlleleDiffersFromReferenceForIndel(vcb.getAlleles(), refSeq, start, stop)) {
            return null;
        }
        leftAlignVariant(vcb, start, stop, vcb.getAlleles(), refSeq);
    }

    vcb.genotypes(fixGenotypes(source.getGenotypes(), origAlleles, vcb.getAlleles()));

    return vcb;
}
 
Example 15
Source File: StrandBiasUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
public static Map<String, Object> computeSBAnnotation(VariantContext vc, AlleleLikelihoods<GATKRead, Allele> likelihoods, String key) {
    // calculate the annotation from the likelihoods
    // likelihoods can come from HaplotypeCaller or Mutect2 call to VariantAnnotatorEngine
    final Map<String, Object> annotations = new HashMap<>();
    final ReducibleAnnotationData<List<Integer>> myData = new AlleleSpecificAnnotationData<>(vc.getAlleles(),null);
    getStrandCountsFromLikelihoodMap(vc, likelihoods, myData, MIN_COUNT);
    Map<Allele, List<Integer>> perAlleleValues = new LinkedHashMap<>(myData.getAttributeMap());
    perAlleleValues.values().removeIf(Objects::isNull);
    final String annotationString = makeRawAnnotationString(vc.getAlleles(), perAlleleValues);
    annotations.put(key, annotationString);
    return annotations;
}
 
Example 16
Source File: VcfToVariant.java    From genomewarp with Apache License 2.0
private static Map<Allele, Integer> buildAlleleMap(VariantContext vc) {
  final Map<Allele, Integer> alleleMap = new HashMap<>(vc.getAlleles().size() + 1);
  alleleMap.put(Allele.NO_CALL, -1);

  int alleleNum = 0;
  for (Allele currAllele : vc.getAlleles()) {
    alleleMap.put(currAllele, alleleNum++);
  }

  return alleleMap;
}
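A hypothetical companion method (not in the original file) that could sit alongside buildAlleleMap in the same class, rendering a genotype's alleles as the integer indices the VCF GT field uses:

// Hypothetical helper: translate a genotype into GT-style indices using the
// map built above (-1 for no-call, 0 for ref, 1+ for alternates).
private static String toGtIndices(VariantContext vc, Genotype g) {
    final Map<Allele, Integer> alleleMap = buildAlleleMap(vc);
    final StringBuilder sb = new StringBuilder();
    for (Allele a : g.getAlleles()) {
        if (sb.length() > 0) {
            sb.append('/');
        }
        sb.append(alleleMap.get(a));
    }
    return sb.toString(); // e.g. "0/1" for a het call
}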
 
Example 17
Source File: OrientationBiasReadCounts.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 *  Annotate the given variant context with the OxoG read count attributes, directly from the read pileup.
 *
 *  This method may be slow and should be considered EXPERIMENTAL, especially with regard to indels and complex/mixed
 *    variants.
 *
 * @param vc variant context for the genotype.  Necessary so that we can see all alleles.
 * @param gb genotype builder to put the annotations into.
 * @param readPileup pileup of the reads at this vc.  Note that this pileup does not have to match the
 *                   genotype.  In other words, this tool does not check that the pileup was generated from the
 *                   genotype sample.
 */
public static void annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb,
                                         final ReadPileup readPileup, int meanBaseQualityCutoff) {
    Utils.nonNull(gb, "gb is null");
    Utils.nonNull(vc, "vc is null");

    // Create a list of unique alleles
    final List<Allele> variantAllelesWithDupes = vc.getAlleles();
    final Set<Allele> alleleSet = new LinkedHashSet<>(variantAllelesWithDupes);
    final List<Allele> variantAlleles = new ArrayList<>(alleleSet);

    // Initialize the mappings
    final Map<Allele, MutableInt> f1r2Counts = variantAlleles.stream()
            .collect(Collectors.toMap(Function.identity(), a -> new MutableInt(0)));

    final Map<Allele, MutableInt> f2r1Counts = variantAlleles.stream()
            .collect(Collectors.toMap(Function.identity(), a -> new MutableInt(0)));

    final List<Allele> referenceAlleles = variantAlleles.stream().filter(a -> a.isReference() && !a.isSymbolic()).collect(Collectors.toList());
    final List<Allele> altAlleles = variantAlleles.stream().filter(a -> a.isNonReference() && !a.isSymbolic()).collect(Collectors.toList());

    if (referenceAlleles.size() != 1) {
        logger.warn("Number of reference alleles does not equal 1 for VC: " + vc);
    }

    // We MUST have exactly 1 non-symbolic reference allele and a read pileup.
    if ((referenceAlleles.size() == 1) && (readPileup != null) && !referenceAlleles.get(0).isSymbolic()) {
        final Allele referenceAllele = referenceAlleles.get(0);
        Utils.stream(readPileup)
                .filter(pe -> isUsableRead(pe.getRead()))
                .forEach(pe -> incrementCounts(pe, f1r2Counts, f2r1Counts, referenceAllele, altAlleles, meanBaseQualityCutoff));
    }

    final int[] f1r2 = variantAlleles.stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray();

    final int[] f2r1 = variantAlleles.stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray();

    gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2);
    gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
}
 
Example 18
Source File: CompOverlap.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Returns true if any allele in eval is missing from comp
 *
 * @param eval  eval context
 * @param comp  db context
 * @return true if eval and comp are discordant
 */
public boolean discordantP(VariantContext eval, VariantContext comp) {
    for (Allele a : eval.getAlleles()) {
        if (!comp.hasAllele(a, true))
            return true;
    }

    return false;
}
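A minimal self-contained sketch (alleles invented) of the hasAllele(allele, true) check that drives the comparison; passing true for ignoreRefState matches alleles by their bases regardless of ref/alt status:

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;

import java.util.Arrays;

public class DiscordanceSketch {
    public static void main(String[] args) {
        final Allele ref = Allele.create("A", true);
        final VariantContext comp = new VariantContextBuilder(
                "comp", "chr1", 100, 100, Arrays.asList(ref, Allele.create("G", false))).make();

        // An eval alt of T is absent from comp's alleles {A, G} -> the sites are discordant.
        System.out.println(comp.hasAllele(Allele.create("T", false), true)); // false
    }
}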
 
Example 19
Source File: AlleleFrequencyCalculator.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
 *
 * @param vc the VariantContext holding the alleles and sample information.  The VariantContext
 *           must have at least 1 alternative allele
 * @return result (for programming convenience)
 */
public AFCalculationResult calculate(final VariantContext vc, final int defaultPloidy) {
    Utils.nonNull(vc, "VariantContext cannot be null");
    final int numAlleles = vc.getNAlleles();
    final List<Allele> alleles = vc.getAlleles();
    Utils.validateArg( numAlleles > 1, () -> "VariantContext has only a single reference allele, but getLog10PNonRef requires at least one alt allele " + vc);

    final double[] priorPseudocounts = alleles.stream()
            .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == vc.getReference().length() ? snpPseudocount : indelPseudocount)).toArray();

    double[] alleleCounts = new double[numAlleles];
    final double flatLog10AlleleFrequency = -MathUtils.log10(numAlleles); // log10(1/numAlleles)
    double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);

    for (double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE; ) {
        final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies);
        alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble();
        alleleCounts = newAlleleCounts;
        final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts);

        // first iteration uses flat prior in order to avoid local minimum where the prior + no pseudocounts gives such a low
        // effective allele frequency that it overwhelms the genotype likelihood of a real variant
        // basically, we want a chance to get non-zero pseudocounts before using a prior that's biased against a variant
        log10AlleleFrequencies = new Dirichlet(posteriorPseudocounts).log10MeanWeights();
    }

    double[] log10POfZeroCountsByAllele = new double[numAlleles];
    double log10PNoVariant = 0;

    final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
    final Map<Integer, int[]> nonVariantIndicesByPloidy = new Int2ObjectArrayMap<>();

    // re-usable buffers of the log10 genotype posteriors of genotypes missing each allele
    final List<DoubleArrayList> log10AbsentPosteriors = IntStream.range(0,numAlleles).mapToObj(n -> new DoubleArrayList()).collect(Collectors.toList());
    for (final Genotype g : vc.getGenotypes()) {
        if (!g.hasLikelihoods()) {
            continue;
        }
        final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);

        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

        //the total probability
        if (!spanningDeletionPresent) {
            log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
        } else {
            nonVariantIndicesByPloidy.computeIfAbsent(ploidy, p -> genotypeIndicesWithOnlyRefAndSpanDel(p, alleles));
            final int[] nonVariantIndices = nonVariantIndicesByPloidy.get(ploidy);
            final double[] nonVariantLog10Posteriors = MathUtils.applyToArray(nonVariantIndices, n -> log10GenotypePosteriors[n]);
            // when the only alt allele is the spanning deletion the probability that the site is non-variant
            // may be so close to 1 that finite precision error in log10SumLog10 yields a positive value,
            // which is bogus.  Thus we cap it at 0.
            log10PNoVariant += Math.min(0,MathUtils.log10SumLog10(nonVariantLog10Posteriors));
        }

        // if the VC is biallelic the allele-specific qual equals the variant qual
        if (numAlleles == 2) {
            continue;
        }

        // for each allele, we collect the log10 probabilities of genotypes in which the allele is absent, then add (in log space)
        // to get the log10 probability that the allele is absent in this sample
        log10AbsentPosteriors.forEach(DoubleArrayList::clear);  // clear the buffers.  Note that this is O(1) due to the primitive backing array
        for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
            final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
            glCalc.genotypeAlleleCountsAt(genotype).forEachAbsentAlleleIndex(a -> log10AbsentPosteriors.get(a).add(log10GenotypePosterior), numAlleles);
        }

        final double[] log10PNoAllele = log10AbsentPosteriors.stream()
                .mapToDouble(buffer -> MathUtils.log10SumLog10(buffer.toDoubleArray()))
                .map(x -> Math.min(0, x)).toArray();    // if prob of non hom ref > 1 due to finite precision, short-circuit to avoid NaN

        // multiply the cumulative probabilities of alleles being absent, which is addition of logs
        MathUtils.addToArrayInPlace(log10POfZeroCountsByAllele, log10PNoAllele);
    }

    // for biallelic the allele-specific qual equals the variant qual, and we short-circuited the calculation above
    if (numAlleles == 2) {
        log10POfZeroCountsByAllele[1] = log10PNoVariant;
    }

    // unfortunately AFCalculationResult expects integers for the MLE.  We really should emit the EM no-integer values
    // which are valuable (eg in CombineGVCFs) as the sufficient statistics of the Dirichlet posterior on allele frequencies
    final int[] integerAlleleCounts = Arrays.stream(alleleCounts).mapToInt(x -> (int) Math.round(x)).toArray();
    final int[] integerAltAlleleCounts = Arrays.copyOfRange(integerAlleleCounts, 1, numAlleles);

    //skip the ref allele (index 0)
    final Map<Allele, Double> log10PRefByAllele = IntStream.range(1, numAlleles).boxed()
            .collect(Collectors.toMap(alleles::get, a -> log10POfZeroCountsByAllele[a]));

    return new AFCalculationResult(integerAltAlleleCounts, alleles, log10PNoVariant, log10PRefByAllele);
}