Java Code Examples for htsjdk.variant.variantcontext.VariantContext#getNAlleles()

The following examples show how to use htsjdk.variant.variantcontext.VariantContext#getNAlleles() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SelectVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Initialize cache of allele anyploid indices
 *
 * Initialize the cache of PL index to a list of alleles for each ploidy.
 *
 * @param vc    Variant Context
 */
private void initalizeAlleleAnyploidIndicesCache(final VariantContext vc) {
    if (vc.getType() != VariantContext.Type.NO_VARIATION) { // Bypass if not a variant
        for (final Genotype g : vc.getGenotypes()) {
            // Make a new entry if the we have not yet cached a PL to allele indices map for this ploidy and allele count
            // skip if there are no PLs -- this avoids hanging on high-allelic somatic samples, for example, where
            // there's no need for the PL indices since they don't exist
            if (g.getPloidy() != 0 && (!ploidyToNumberOfAlleles.containsKey(g.getPloidy()) || ploidyToNumberOfAlleles.get(g.getPloidy()) < vc.getNAlleles())) {
                if (vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods)) {
                    GenotypeLikelihoods.initializeAnyploidPLIndexToAlleleIndices(vc.getNAlleles() - 1, g.getPloidy());
                    ploidyToNumberOfAlleles.put(g.getPloidy(), vc.getNAlleles());
                }
            }
        }
    }

}
 
Example 2
Source File: AlleleFrequencyCalculator.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
    final int numAlleles = vc.getNAlleles();
    Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
    final double[] log10Result = new double[numAlleles];
    Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
    for (final Genotype g : vc.getGenotypes()) {
        if (!g.hasLikelihoods()) {
            continue;
        }
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);

        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

        new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex ->
            glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
                    log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.log10(count))));
    }
    return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
}
 
Example 3
Source File: DepthPerAlleleBySample.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
protected int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele> alleles, final AlleleLikelihoods<GATKRead, Allele> likelihoods) {

        final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
        for ( final Allele allele : vc.getAlleles() ) {
            alleleCounts.put(allele, 0);
        }
        final Map<Allele, List<Allele>> alleleSubset = alleles.stream().collect(Collectors.toMap(a -> a, Arrays::asList));
        final AlleleLikelihoods<GATKRead, Allele> subsettedLikelihoods = likelihoods.marginalize(alleleSubset);
        subsettedLikelihoods.bestAllelesBreakingTies(g.getSampleName()).stream()
                .filter(ba -> ba.isInformative())
                .forEach(ba -> alleleCounts.compute(ba.allele, (allele,prevCount) -> prevCount + 1));

        final int[] counts = new int[alleleCounts.size()];
        counts[0] = alleleCounts.get(vc.getReference()); //first one in AD is always ref
        for (int i = 0; i < vc.getNAlleles() -1; i++) {
            counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
        }

        return counts;
    }
 
Example 4
Source File: AS_RMSMappingQuality.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private Map<Allele, Integer> getADcounts(final VariantContext vc) {
    final GenotypesContext genotypes = vc.getGenotypes();
    if ( genotypes == null || genotypes.size() == 0 ) {
        genotype_logger.warn("VC does not have genotypes -- annotations were calculated in wrong order");
        return null;
    }

    final Map<Allele, Integer> variantADs = new HashMap<>();
    for(final Allele a : vc.getAlleles())
        variantADs.put(a,0);

    for (final Genotype gt : vc.getGenotypes()) {
        if(gt.hasAD()) {
            final int[] ADs = gt.getAD();
            for (int i = 1; i < vc.getNAlleles(); i++) {
                variantADs.put(vc.getAlternateAllele(i - 1), variantADs.get(vc.getAlternateAllele(i - 1)) + ADs[i]); //here -1 is to reconcile allele index with alt allele index
            }
        }
    }
    return variantADs;
}
 
Example 5
Source File: ValidationReport.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private SiteStatus calcSiteStatus(VariantContext vc) {
    if ( vc == null ) return SiteStatus.NO_CALL;
    if ( vc.isFiltered() ) return SiteStatus.FILTERED;
    if ( vc.isMonomorphicInSamples() ) return SiteStatus.MONO;
    if ( vc.hasGenotypes() ) return SiteStatus.POLY;  // must be polymorphic if isMonomorphicInSamples was false and there are genotypes

    if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
        int ac = 0;
        if ( vc.getNAlleles() > 2 ) {
            return SiteStatus.POLY;
        }
        else
            ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
        return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
    } else {
        return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do
    }
}
 
Example 6
Source File: AS_QualByDepth.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public static List<Integer> parseQualList(final VariantContext vc) {
    final List<Integer> alleleQualList = new ArrayList<>();
    if (vc.hasAttribute(GATKVCFConstants.AS_QUAL_KEY)) {
        //Parse the VC's allele-specific qual values
        List<Object> alleleQualObjList = vc.getAttributeAsList(GATKVCFConstants.AS_QUAL_KEY);
        if (alleleQualObjList.size() != vc.getNAlleles() - 1) {
            throw new IllegalStateException("Number of AS_QUAL values doesn't match the number of alternate alleles.");
        }
        for (final Object obj : alleleQualObjList) {
            alleleQualList.add(Integer.parseInt(obj.toString()));
        }
    }
    else if (vc.hasAttribute(GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY)) {
        String asQuals = vc.getAttributeAsString(GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY, "").replaceAll("\\[\\]\\s","");
        String[] values = asQuals.split(AnnotationUtils.ALLELE_SPECIFIC_SPLIT_REGEX, -1); //allow for empty tokens at the end
        if (values.length != vc.getNAlleles()) {
            throw new IllegalStateException("Number of AS_QUALapprox values doesn't match the number of alleles in the variant context.");
        }
        for (int i = 1; i < vc.getNAlleles(); i++) {
            if (!values[i].equals("")) {
                alleleQualList.add(Integer.parseInt(values[i]));
            } else {
                alleleQualList.add(0);
            }
        }
    }
    return alleleQualList;
}
 
Example 7
Source File: SVContext.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static void assertIsStructuralVariantContext(final VariantContext vc) {
    Utils.nonNull(vc, "the input variant context must not be null");
    Utils.nonNull(vc.getStructuralVariantType(), "the input variant-context is not structural; the SVTYPE annotation is missing");
    if (vc.getNAlleles() != 2) {
        throw new IllegalArgumentException("structural variant context must be biallelic");
    } else if (vc.getAlleles().get(0).isNonReference()) {
        throw new IllegalArgumentException("the allele with index 0 must be reference");
    } else if (vc.getAlleles().get(1).isReference()) {
        throw new IllegalArgumentException("the allele with index 1 must be non-reference");
    }
}
 
Example 8
Source File: CreateSomaticPanelOfNormals.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public void apply(final VariantContext vc, final ReadsContext rc, final ReferenceContext ref, final FeatureContext fc) {

    // if there are no non-span-del alt alleles, there's no artifact
    if (vc.getNAlleles() == 1) {
        return;
    } else if (vc.getNAlleles() == 2 && vc.getAlternateAllele(0).basesMatch(Allele.SPAN_DEL)) {
        return;
    }

    final List<VariantContext> germline = fc.getValues(germlineResource, new SimpleInterval(vc));
    final double germlineAF = germline.isEmpty() ? 0 : Mutect2Engine.getAttributeAsDoubleList(germline.get(0), VCFConstants.ALLELE_FREQUENCY_KEY, 0.0)
            .stream().mapToDouble(af -> af).sum();

    // note: if at this site some input vcfs had a variant and some had only a spanning deletion from an upstream event,
    // GenomicsDBImport removes the spanning deletion from the ADs and so the altCount logic works and counts only
    // real variants, not spanning deletions.
    final List<Genotype> variantGenotypes = vc.getGenotypes().stream()
            .filter(g -> hasArtifact(g, germlineAF)).collect(Collectors.toList());

    if (variantGenotypes.size() < minSampleCount) {
        return;
    }

    final double fraction = (double) variantGenotypes.size() / numSamples;

    final List<int[]> altAndRefCounts = variantGenotypes.stream()
            .map(g -> new int[] {altCount(g), g.getAD()[0]})
            .collect(Collectors.toList());

    final BetaDistributionShape betaDistributionShape = fitBeta(altAndRefCounts);

    final VariantContext outputVc = new VariantContextBuilder(vc.getSource(), vc.getContig(), vc.getStart(), vc.getEnd(), vc.getAlleles())
            .attribute(FRACTION_INFO_FIELD, fraction)
            .attribute(BETA_SHAPE_INFO_FIELD, new double[] { betaDistributionShape.getAlpha(), betaDistributionShape.getBeta()})
            .make();

    vcfWriter.add(outputVc);
}
 
Example 9
Source File: Mutect2FilteringEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public double[] weightedAverageOfTumorAFs(final VariantContext vc) {
    final MutableDouble totalWeight = new MutableDouble(0);
    final double[] AFs = new double[vc.getNAlleles() - 1];
    vc.getGenotypes().stream().filter(this::isTumor).forEach(g ->  {
        final double weight = MathUtils.sum(g.getAD());
        totalWeight.add(weight);
        final double[] sampleAFs = VariantContextGetters.getAttributeAsDoubleArray(g, VCFConstants.ALLELE_FREQUENCY_KEY,
                () -> new double[] {0.0}, 0.0);
        MathArrays.scaleInPlace(weight, sampleAFs);
        MathUtils.addToArrayInPlace(AFs, sampleAFs);
    });
    MathArrays.scaleInPlace(1/totalWeight.getValue(), AFs);
    return AFs;
}
 
Example 10
Source File: Mutect2VariantFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Converts the single probability calculated for the site to be the probability for each allele
 * @param vc
 * @param filteringEngine
 * @param referenceContext
 * @return
 */
@Override
public List<Double> errorProbabilities(final VariantContext vc, final Mutect2FilteringEngine filteringEngine, ReferenceContext referenceContext) {
    int numAltAlleles = vc.getNAlleles() - 1;
    final double result = Mutect2FilteringEngine.roundFinitePrecisionErrors(requiredInfoAnnotations().stream().allMatch(vc::hasAttribute) ?
            calculateErrorProbability(vc, filteringEngine, referenceContext) : 0.0);
    return Collections.nCopies(numAltAlleles, result);
}
 
Example 11
Source File: VariantAnnotator.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private AlleleLikelihoods<GATKRead, Allele> makeLikelihoods(final VariantContext vc, final ReadsContext readsContext) {
    final List<GATKRead> reads = Utils.stream(readsContext).collect(Collectors.toList());
    final AlleleLikelihoods<GATKRead, Allele> result = new AlleleLikelihoods<>(variantSamples,
            new IndexedAlleleList<>(vc.getAlleles()), AssemblyBasedCallerUtils.splitReadsBySample(variantSamples, getHeaderForReads(), reads));

    final ReadPileup readPileup = new ReadPileup(vc, readsContext);
    final Map<String, ReadPileup> pileupsBySample = readPileup.splitBySample(getHeaderForReads(), "__UNKNOWN__");

    final Allele refAllele = vc.getReference();
    final List<Allele> altAlleles = vc.getAlternateAlleles();
    final int numAlleles = vc.getNAlleles();

    // manually fill each sample's likelihoods one read at a time, assigning probability 1 (0 in log space) to the matching allele, if present
    for (final Map.Entry<String, ReadPileup> samplePileups : pileupsBySample.entrySet()) {
        final LikelihoodMatrix<GATKRead, Allele> sampleMatrix = result.sampleMatrix(result.indexOfSample(samplePileups.getKey()));

        final MutableInt readIndex = new MutableInt(0);
        for (final PileupElement pe : samplePileups.getValue()) {
            // initialize all alleles as equally unlikely
            IntStream.range(0, numAlleles).forEach(a -> sampleMatrix.set(a, readIndex.intValue(), Double.NEGATIVE_INFINITY));

            // if present set the matching allele as likely
            final Allele pileupAllele = GATKVariantContextUtils.chooseAlleleForRead(pe, refAllele, altAlleles, minBaseQualityScore);
            if (pileupAllele != null) {
                sampleMatrix.set(sampleMatrix.indexOfAllele(pileupAllele), readIndex.intValue(), 0);
            }


            readIndex.increment();
        }
    }

    return result;
}
 
Example 12
Source File: SelectVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
 *
 * @param vc       the VariantContext record to subset
 * @param preserveAlleles should we trim constant sequence from the beginning and/or end of all alleles, or preserve it?
 * @param removeUnusedAlternates removes alternate alleles with AC=0
 * @return the subsetted VariantContext
 */
private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) {
    //subContextFromSamples() always decodes the vc, which is a fairly expensive operation.  Avoid if possible
    if (noSamplesSpecified && !removeUnusedAlternates) {
        return vc;
    }

    // strip out the alternate alleles that aren't being used
    final VariantContext sub = vc.subContextFromSamples(samples, removeUnusedAlternates);

    // If no subsetting happened, exit now
    if (sub.getNSamples() == vc.getNSamples() && sub.getNAlleles() == vc.getNAlleles()) {
        return vc;
    }

    // fix the PL and AD values if sub has fewer alleles than original vc and remove a fraction of the genotypes if needed
    final GenotypesContext oldGs = sub.getGenotypes();
    GenotypesContext newGC = sub.getNAlleles() == vc.getNAlleles() ? oldGs :
            AlleleSubsettingUtils.subsetAlleles(oldGs, 0, vc.getAlleles(), sub.getAlleles(), GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0));

    if (fractionGenotypes > 0) {
        final List<Genotype> genotypes = newGC.stream().map(genotype -> randomGenotypes.nextDouble() > fractionGenotypes ? genotype :
                new GenotypeBuilder(genotype).alleles(getNoCallAlleles(genotype.getPloidy())).noGQ().make()).collect(Collectors.toList());
        newGC = GenotypesContext.create(new ArrayList<>(genotypes));
    }

    // since the VC has been subset (either by sample or allele), we need to strip out the MLE tags
    final VariantContextBuilder builder = new VariantContextBuilder(sub);
    builder.rmAttributes(Arrays.asList(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
    builder.genotypes(newGC);
    addAnnotations(builder, vc, sub.getSampleNames());
    final VariantContext subset = builder.make();

    return preserveAlleles? subset : GATKVariantContextUtils.trimAlleles(subset,true,true);
}
 
Example 13
Source File: ASEReadCounter.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Override
public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
    final String contig = alignmentContext.getContig();
    final long position = alignmentContext.getPosition();

    final char refAllele = (char) referenceContext.getBase();

    final List<VariantContext> VCs = featureContext.getValues(variants);
    if (VCs != null && VCs.size() > 1) {
        throw new UserException("More then one variant context at position: " + contig + ":" + position);
    }
    if (VCs == null || VCs.isEmpty()) {
        return;
    }

    final VariantContext vc = VCs.get(0);
    if (!vc.isBiallelic()) {
        logger.warn("Ignoring site: cannot run ASE on non-biallelic sites: " + vc.toString());
        return;
    }

    if (vc.getHetCount() < 1) {
        logger.warn("Ignoring site: variant is not het at postion: " + contig + ":" + position);
        return;
    }

    if (vc.getNAlleles() == 1 || vc.getAlternateAllele(0).getBases().length == 0) {
        throw new UserException("The file of variant sites must contain heterozygous sites and cannot be a GVCF file containing <NON_REF> alleles.");
    }

    final char altAllele = (char) vc.getAlternateAllele(0).getBases()[0];

    final String siteID = vc.getID();
    final ReadPileup pileup = filterPileup(alignmentContext.getBasePileup(), countType);

    // count up the depths of all and QC+ bases
    final String line = calculateLineForSite(pileup, siteID, refAllele, altAllele);
    if (line != null) {
        outputStream.println(line);
    }
}
 
Example 14
Source File: ApplyVQSR.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Calculate the allele-specific filter status of vc
 * @param vc
 * @param recals
 * @param builder   is modified by adding attributes
 * @return a String with the filter status for this site
 */
private String doAlleleSpecificFiltering(final VariantContext vc, final List<VariantContext> recals, final VariantContextBuilder builder) {
    double bestLod = VariantRecalibratorEngine.MIN_ACCEPTABLE_LOD_SCORE;
    final List<String> culpritStrings = new ArrayList<>();
    final List<String> lodStrings = new ArrayList<>();
    final List<String> AS_filterStrings = new ArrayList<>();

    String[] prevCulpritList = null;
    String[] prevLodList = null;
    String[] prevASfiltersList = null;

    //get VQSR annotations from previous run of ApplyRecalibration, if applicable
    if(foundINDELTranches || foundSNPTranches) {
        final String prevCulprits = vc.getAttributeAsString(GATKVCFConstants.AS_CULPRIT_KEY,"");
        prevCulpritList = prevCulprits.isEmpty()? new String[0] : prevCulprits.split(listPrintSeparator);
        final String prevLodString = vc.getAttributeAsString(GATKVCFConstants.AS_VQS_LOD_KEY,"");
        prevLodList = prevLodString.isEmpty()? new String[0] : prevLodString.split(listPrintSeparator);
        final String prevASfilters = vc.getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY,"");
        prevASfiltersList = prevASfilters.isEmpty()? new String[0] : prevASfilters.split(listPrintSeparator);
    }

    //for each allele in the current VariantContext
    for (int altIndex = 0; altIndex < vc.getNAlleles()-1; altIndex++) {
        final Allele allele = vc.getAlternateAllele(altIndex);

        //if the current allele is not part of this recalibration mode, add its annotations to the list and go to the next allele
        if (!VariantDataManager.checkVariationClass(vc, allele, MODE)) {
            updateAnnotationsWithoutRecalibrating(altIndex, prevCulpritList, prevLodList, prevASfiltersList, culpritStrings, lodStrings, AS_filterStrings);
            continue;
        }

        //if the current allele does need to have recalibration applied...

        //initialize allele-specific VQSR annotation data with values for spanning deletion
        String alleleLodString = emptyFloatValue;
        String alleleFilterString = emptyStringValue;
        String alleleCulpritString = emptyStringValue;

        //if it's not a spanning deletion, replace those allele strings with the real values
        if (!GATKVCFConstants.isSpanningDeletion(allele)) {
            VariantContext recalDatum = getMatchingRecalVC(vc, recals, allele);
            if (recalDatum == null) {
                throw new UserException("Encountered input allele which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants with flag -AS. First seen at: " + vc);
            }

            //compare VQSLODs for all alleles in the current mode for filtering later
            final double lod = recalDatum.getAttributeAsDouble(GATKVCFConstants.VQS_LOD_KEY, VariantRecalibratorEngine.MIN_ACCEPTABLE_LOD_SCORE);
            if (lod > bestLod)
                bestLod = lod;

            alleleLodString = String.format("%.4f", lod);
            alleleFilterString = generateFilterString(lod);
            alleleCulpritString = recalDatum.getAttributeAsString(GATKVCFConstants.CULPRIT_KEY, ".");

            if(recalDatum != null) {
                if (recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY))
                    builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true);
                if (recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY))
                    builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true);
            }
        }

        //append per-allele VQSR annotations
        lodStrings.add(alleleLodString);
        AS_filterStrings.add(alleleFilterString);
        culpritStrings.add(alleleCulpritString);
    }

    // Annotate the new record with its VQSLOD, AS_FilterStatus, and the worst performing annotation
    if(!AS_filterStrings.isEmpty() )
        builder.attribute(GATKVCFConstants.AS_FILTER_STATUS_KEY, AnnotationUtils.encodeStringList(AS_filterStrings));
    if(!lodStrings.isEmpty())
        builder.attribute(GATKVCFConstants.AS_VQS_LOD_KEY, AnnotationUtils.encodeStringList(lodStrings));
    if(!culpritStrings.isEmpty())
        builder.attribute(GATKVCFConstants.AS_CULPRIT_KEY, AnnotationUtils.encodeStringList(culpritStrings));

    return generateFilterStringFromAlleles(vc, bestLod);
}
 
Example 15
Source File: Mutect2FilteringEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
public int[] sumADsOverSamples(final VariantContext vc, final boolean includeTumor, final boolean includeNormal) {
    final int[] ADs = new int[vc.getNAlleles()];
    vc.getGenotypes().stream().filter(g -> (includeTumor && isTumor(g)) || (includeNormal && isNormal(g)))
            .map(Genotype::getAD).forEach(ad -> new IndexRange(0, vc.getNAlleles()).forEach(n -> ADs[n] += ad[n]));
    return ADs;
}
 
Example 16
Source File: AlleleFrequencyCalculator.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
 *
 * @param vc the VariantContext holding the alleles and sample information.  The VariantContext
 *           must have at least 1 alternative allele
 * @return result (for programming convenience)
 */
public AFCalculationResult calculate(final VariantContext vc, final int defaultPloidy) {
    Utils.nonNull(vc, "VariantContext cannot be null");
    final int numAlleles = vc.getNAlleles();
    final List<Allele> alleles = vc.getAlleles();
    Utils.validateArg( numAlleles > 1, () -> "VariantContext has only a single reference allele, but getLog10PNonRef requires at least one at all " + vc);

    final double[] priorPseudocounts = alleles.stream()
            .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == vc.getReference().length() ? snpPseudocount : indelPseudocount)).toArray();

    double[] alleleCounts = new double[numAlleles];
    final double flatLog10AlleleFrequency = -MathUtils.log10(numAlleles); // log10(1/numAlleles)
    double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);

    for (double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE; ) {
        final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies);
        alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble();
        alleleCounts = newAlleleCounts;
        final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts);

        // first iteration uses flat prior in order to avoid local minimum where the prior + no pseudocounts gives such a low
        // effective allele frequency that it overwhelms the genotype likelihood of a real variant
        // basically, we want a chance to get non-zero pseudocounts before using a prior that's biased against a variant
        log10AlleleFrequencies = new Dirichlet(posteriorPseudocounts).log10MeanWeights();
    }

    double[] log10POfZeroCountsByAllele = new double[numAlleles];
    double log10PNoVariant = 0;

    final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
    final Map<Integer, int[]> nonVariantIndicesByPloidy = new Int2ObjectArrayMap<>();

    // re-usable buffers of the log10 genotype posteriors of genotypes missing each allele
    final List<DoubleArrayList> log10AbsentPosteriors = IntStream.range(0,numAlleles).mapToObj(n -> new DoubleArrayList()).collect(Collectors.toList());
    for (final Genotype g : vc.getGenotypes()) {
        if (!g.hasLikelihoods()) {
            continue;
        }
        final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);

        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

        //the total probability
        if (!spanningDeletionPresent) {
            log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
        } else {
            nonVariantIndicesByPloidy.computeIfAbsent(ploidy, p -> genotypeIndicesWithOnlyRefAndSpanDel(p, alleles));
            final int[] nonVariantIndices = nonVariantIndicesByPloidy.get(ploidy);
            final double[] nonVariantLog10Posteriors = MathUtils.applyToArray(nonVariantIndices, n -> log10GenotypePosteriors[n]);
            // when the only alt allele is the spanning deletion the probability that the site is non-variant
            // may be so close to 1 that finite precision error in log10SumLog10 yields a positive value,
            // which is bogus.  Thus we cap it at 0.
            log10PNoVariant += Math.min(0,MathUtils.log10SumLog10(nonVariantLog10Posteriors));
        }

        // if the VC is biallelic the allele-specific qual equals the variant qual
        if (numAlleles == 2) {
            continue;
        }

        // for each allele, we collect the log10 probabilities of genotypes in which the allele is absent, then add (in log space)
        // to get the log10 probability that the allele is absent in this sample
        log10AbsentPosteriors.forEach(DoubleArrayList::clear);  // clear the buffers.  Note that this is O(1) due to the primitive backing array
        for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
            final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
            glCalc.genotypeAlleleCountsAt(genotype).forEachAbsentAlleleIndex(a -> log10AbsentPosteriors.get(a).add(log10GenotypePosterior), numAlleles);
        }

        final double[] log10PNoAllele = log10AbsentPosteriors.stream()
                .mapToDouble(buffer -> MathUtils.log10SumLog10(buffer.toDoubleArray()))
                .map(x -> Math.min(0, x)).toArray();    // if prob of non hom ref > 1 due to finite precision, short-circuit to avoid NaN

        // multiply the cumulative probabilities of alleles being absent, which is addition of logs
        MathUtils.addToArrayInPlace(log10POfZeroCountsByAllele, log10PNoAllele);
    }

    // for biallelic the allele-specific qual equals the variant qual, and we short-circuited the calculation above
    if (numAlleles == 2) {
        log10POfZeroCountsByAllele[1] = log10PNoVariant;
    }

    // unfortunately AFCalculationResult expects integers for the MLE.  We really should emit the EM no-integer values
    // which are valuable (eg in CombineGVCFs) as the sufficient statistics of the Dirichlet posterior on allele frequencies
    final int[] integerAlleleCounts = Arrays.stream(alleleCounts).mapToInt(x -> (int) Math.round(x)).toArray();
    final int[] integerAltAlleleCounts = Arrays.copyOfRange(integerAlleleCounts, 1, numAlleles);

    //skip the ref allele (index 0)
    final Map<Allele, Double> log10PRefByAllele = IntStream.range(1, numAlleles).boxed()
            .collect(Collectors.toMap(alleles::get, a -> log10POfZeroCountsByAllele[a]));

    return new AFCalculationResult(integerAltAlleleCounts, alleles, log10PNoVariant, log10PRefByAllele);
}
 
Example 17
Source File: SelectVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final Set<String> selectedSampleNames) {
    if (fullyDecode) {
        return; // TODO -- annotations are broken with fully decoded data
    }

    if (keepOriginalChrCounts) {
        final int[] indexOfOriginalAlleleForNewAllele;
        final List<Allele> newAlleles = builder.getAlleles();
        final int numOriginalAlleles = originalVC.getNAlleles();

        // if the alleles already match up, we can just copy the previous list of counts
        if (numOriginalAlleles == newAlleles.size()) {
            indexOfOriginalAlleleForNewAllele = null;
        }
        // otherwise we need to parse them and select out the correct ones
        else {
            indexOfOriginalAlleleForNewAllele = new int[newAlleles.size() - 1];
            Arrays.fill(indexOfOriginalAlleleForNewAllele, -1);

            // note that we don't care about the reference allele at position 0
            for (int newI = 1; newI < newAlleles.size(); newI++) {
                final Allele newAlt = newAlleles.get(newI);
                for (int oldI = 0; oldI < numOriginalAlleles - 1; oldI++) {
                    if (newAlt.equals(originalVC.getAlternateAllele(oldI), false)) {
                        indexOfOriginalAlleleForNewAllele[newI - 1] = oldI;
                        break;
                    }
                }
            }
        }

        if (originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
            builder.attribute(GATKVCFConstants.ORIGINAL_AC_KEY,
                    getReorderedAttributes(originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY), indexOfOriginalAlleleForNewAllele));
        }
        if (originalVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) {
            builder.attribute(GATKVCFConstants.ORIGINAL_AF_KEY,
                    getReorderedAttributes(originalVC.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY), indexOfOriginalAlleleForNewAllele));
        }
        if (originalVC.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY)) {
            builder.attribute(GATKVCFConstants.ORIGINAL_AN_KEY, originalVC.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));
        }
    }

    VariantContextUtils.calculateChromosomeCounts(builder, false);

    if (keepOriginalDepth && originalVC.hasAttribute(VCFConstants.DEPTH_KEY)) {
        builder.attribute(GATKVCFConstants.ORIGINAL_DP_KEY, originalVC.getAttribute(VCFConstants.DEPTH_KEY));
    }

    boolean sawDP = false;
    int depth = 0;
    for (final String sample : selectedSampleNames ) {
        final Genotype g = originalVC.getGenotype(sample);
        if (!g.isFiltered()) {
            if (g.hasDP()) {
                depth += g.getDP();
                sawDP = true;
            }
        }
    }

    if (sawDP) {
        builder.attribute(VCFConstants.DEPTH_KEY, depth);
    }
}
 
Example 18
Source File: GenotypeGVCFsIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Test
public void testGenotypingForSomaticGVCFs() throws IOException{
    final List<Integer> starPositions = new ArrayList<>();
    starPositions.add(319);
    starPositions.add(321);
    starPositions.add(322);
    starPositions.add(323);

    final File output = createTempFile("tmp", ".vcf");
    ArgumentsBuilder args =   new ArgumentsBuilder()
            .addVCF(getTestFile("threeSamples.MT.g.vcf"))
            .addReference(new File(b37Reference))
            .addOutput(output)
            .add(CombineGVCFs.SOMATIC_INPUT_LONG_NAME, true);
    runCommandLine(args);

    //compared with the combined GVCF, this output should have called GTs and no alts with LODs less than TLOD_THRESHOLD
    //uncalled alleles should be removed

    final List<VariantContext> results = VariantContextTestUtils.getVariantContexts(output);

    //qualitative match
    for (final VariantContext vc : results) {
        Assert.assertTrue(!vc.getAlleles().contains(Allele.NON_REF_ALLELE));
        Assert.assertTrue(vc.getAlternateAlleles().size() >= 1);
        Assert.assertTrue(vc.filtersWereApplied());  //filtering should happen during combine, but make sure filters aren't dropped

        for (final Genotype g : vc.getGenotypes()) {
            Assert.assertTrue(g.isCalled());
            //homRef sites are sometimes filtered because they had low quality, filtered alleles that GGVCFs genotyped out
            //ideally if the site wasn't homRef and had a good allele it would be PASS, but htsjdk will only output genotype filters if at least one genotype is filtered
        }

        //MT:302 has an alphabet soup of alleles in the GVCF -- make sure the ones we keep are good
        if (vc.getStart() == 302) {
            Assert.assertEquals(vc.getNAlleles(), 6);
            double[] sample0LODs = VariantContextGetters.getAttributeAsDoubleArray(vc.getGenotype(0), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0);
            double[] sample1LODs = VariantContextGetters.getAttributeAsDoubleArray(vc.getGenotype(1), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0);
            double[] sample2LODs = VariantContextGetters.getAttributeAsDoubleArray(vc.getGenotype(2), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0);
            for (int i = 0; i < vc.getNAlleles() - 1; i++) {
                Assert.assertTrue(sample0LODs[i] > TLOD_THRESHOLD || sample1LODs[i] > TLOD_THRESHOLD || sample2LODs[i] > TLOD_THRESHOLD);
            }
        }

        //make sure phasing is retained
        if (vc.getStart() == 317 || vc.getStart() == 320) {
            VariantContextTestUtils.assertGenotypeIsPhasedWithAttributes(vc.getGenotype(2));
        }

        //make sure *-only variants are dropped
        Assert.assertFalse(starPositions.contains(vc.getStart()));

        //sample 0 is also phased
        if (vc.getStart() == 4713 || vc.getStart() == 4720) {
            VariantContextTestUtils.assertGenotypeIsPhasedWithAttributes(vc.getGenotype(0));
        }

        //MT:4769 in combined GVCF has uncalled alleles
        if (vc.getStart() == 4769) {
            Assert.assertEquals(vc.getNAlleles(), 2);
            Assert.assertTrue(vc.getAlternateAlleles().get(0).basesMatch("G"));
        }
    }

    //exact match
    final File expectedFile = getTestFile("threeSamples.MT.vcf");
    final List<VariantContext> expected = VariantContextTestUtils.getVariantContexts(expectedFile);
    final VCFHeader header = VCFHeaderReader.readHeaderFrom(new SeekablePathStream(IOUtils.getPath(expectedFile.getAbsolutePath())));
    assertForEachElementInLists(results, expected,
            (a, e) -> VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, ATTRIBUTES_TO_IGNORE, ATTRIBUTES_WITH_JITTER, header));

}