htsjdk.samtools.SAMSequenceDictionary Java Examples

The following examples show how to use htsjdk.samtools.SAMSequenceDictionary. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PicardIndexedFastaSequenceFile.java    From chipster with MIT License 6 votes vote down vote up
/**
 * Do some basic checking to make sure the dictionary and the index match.
 * @param fastaFile Used for error reporting only.
 * @param sequenceDictionary sequence dictionary to check against the index.
 * @param index index file to check against the dictionary.
 */
protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
                                                        final SAMSequenceDictionary sequenceDictionary,
                                                        final FastaSequenceIndex index) {
    // Make sure dictionary and index are the same size.
    if( sequenceDictionary.getSequences().size() != index.size() )
        throw new SAMException("Sequence dictionary and index contain different numbers of contigs");

    Iterator<SAMSequenceRecord> sequenceIterator = sequenceDictionary.getSequences().iterator();
    Iterator<FastaSequenceIndexEntry> indexIterator = index.iterator();

    while(sequenceIterator.hasNext() && indexIterator.hasNext()) {
        SAMSequenceRecord sequenceEntry = sequenceIterator.next();
        FastaSequenceIndexEntry indexEntry = indexIterator.next();

        if(!sequenceEntry.getSequenceName().equals(indexEntry.getContig())) {
            throw new SAMException(String.format("Mismatch between sequence dictionary fasta index for %s, sequence '%s' != '%s'.",
                    fastaFile, sequenceEntry.getSequenceName(),indexEntry.getContig()));
        }

        // Make sure sequence length matches index length.
        if( sequenceEntry.getSequenceLength() != indexEntry.getSize())
            throw new SAMException("Index length does not match dictionary length for contig: " + sequenceEntry.getSequenceName() );
    }
}
 
Example #2
Source File: CopyRatioModellerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testMCMC() {
    final double variance = 0.01;
    final double outlierProbability = 0.05;
    final int numSegments = 100;
    final double averageIntervalsPerSegment = 100.;
    final int numSamples = 150;
    final int numBurnIn = 50;
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));

    final SampleLocatableMetadata metadata = new SimpleSampleLocatableMetadata(
            "test-sample",
            new SAMSequenceDictionary(IntStream.range(0, numSegments)
                    .mapToObj(i -> new SAMSequenceRecord("chr" + i + 1, 10000))
                    .collect(Collectors.toList())));
    final CopyRatioSimulatedData simulatedData = new CopyRatioSimulatedData(
            metadata, variance, outlierProbability, numSegments, averageIntervalsPerSegment, rng);

    final CopyRatioModeller modeller = new CopyRatioModeller(simulatedData.getData().getCopyRatios(), simulatedData.getData().getSegments());
    modeller.fitMCMC(numSamples, numBurnIn);

    assertCopyRatioPosteriorCenters(modeller, simulatedData);
}
 
Example #3
Source File: Shard.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Divide an interval into ShardBoundaries. Each shard will cover up to shardSize bases, include shardPadding
 * bases of extra padding on either side, and begin shardStep bases after the previous shard.
 *
 * @param interval interval to shard; must be on the contig according to the provided dictionary
 * @param shardSize desired shard size; intervals larger than this will be divided into shards of up to this size
 * @param shardStep each shard will begin this many bases away from the previous shard
 * @param shardPadding desired shard padding; each shard's interval will be padded on both sides by this number of bases (may be 0)
 * @param dictionary sequence dictionary for reads
 * @return List of {@link ShardBoundary} objects spanning the interval
 */
static List<ShardBoundary> divideIntervalIntoShards(final SimpleInterval interval, final int shardSize, final int shardStep, final int shardPadding, final SAMSequenceDictionary dictionary) {
    Utils.nonNull(interval);
    Utils.nonNull(dictionary);
    Utils.validateArg(shardSize >= 1, "shardSize must be >= 1");
    Utils.validateArg(shardStep >= 1, "shardStep must be >= 1");
    Utils.validateArg(shardPadding >= 0, "shardPadding must be >= 0");

    Utils.validateArg(IntervalUtils.intervalIsOnDictionaryContig(interval, dictionary), () ->
            "Interval " + interval + " not within the bounds of a contig in the provided dictionary");

    final List<ShardBoundary> shards = new ArrayList<>();
    int start = interval.getStart();

    while ( start <= interval.getEnd() ) {
        final int end = Math.min(start + shardSize - 1, interval.getEnd());
        final SimpleInterval nextShardInterval = new SimpleInterval(interval.getContig(), start, end);
        final SimpleInterval nextShardIntervalPadded = nextShardInterval.expandWithinContig(shardPadding, dictionary);
        shards.add(new ShardBoundary(nextShardInterval, nextShardIntervalPadded));
        start += shardStep;
    }

    return shards;
}
 
Example #4
Source File: GtcToVcf.java    From picard with MIT License 6 votes vote down vote up
@Override
protected String[] customCommandLineValidation() {

    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(EXTENDED_ILLUMINA_MANIFEST);
    IOUtil.assertFileIsReadable(ILLUMINA_BEAD_POOL_MANIFEST_FILE);
    IOUtil.assertFileIsWritable(OUTPUT);
    refSeq = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
    final SAMSequenceDictionary sequenceDictionary = refSeq.getSequenceDictionary();
    final String assembly = sequenceDictionary.getSequence(0).getAssembly();
    if (!assembly.equals("GRCh37")) {
        return new String[]{"The selected reference sequence ('" + assembly + "') is not supported.  This tool is currently only implemented to support NCBI Build 37 / HG19 Reference Sequence."};
    }

    if (FINGERPRINT_GENOTYPES_VCF_FILE != null) {
        IOUtil.assertFileIsReadable(FINGERPRINT_GENOTYPES_VCF_FILE);
    }
    if (GENDER_GTC != null) {
        IOUtil.assertFileIsReadable(GENDER_GTC);
    }

    return super.customCommandLineValidation();
}
 
Example #5
Source File: GtcToVcf.java    From picard with MIT License 6 votes vote down vote up
/**
 * Writes out the VariantContext objects in the order presented to the supplied output file
 * in VCF format.
 */
private void writeVcf(final SortingCollection<VariantContext> variants,
                      final File output,
                      final SAMSequenceDictionary dict,
                      final VCFHeader vcfHeader) {

    try (final VariantContextWriter writer = new VariantContextWriterBuilder()
            .setOutputFile(output)
            .setReferenceDictionary(dict)
            .setOptions(VariantContextWriterBuilder.DEFAULT_OPTIONS)
            .build()) {

        writer.writeHeader(vcfHeader);

        for (final VariantContext variant : variants) {
            if (variant.getAlternateAlleles().size() > 1) {
                variant.getCommonInfo().addFilter(InfiniumVcfFields.TRIALLELIC);
            }

            writer.add(variant);
        }
    }
}
 
Example #6
Source File: VariantWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Override
public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
    final SAMSequenceDictionary dictFromDrivingVariants = getHeaderForVariants().getSequenceDictionary();
    if (dictFromDrivingVariants != null){
        //If this dictionary looks like it was synthesized from a feature index, see
        //if there is a better dictionary available from another source (i.e., the reference)
        if (IndexUtils.isSequenceDictionaryFromIndex(dictFromDrivingVariants)) {
            final SAMSequenceDictionary otherDictionary = super.getBestAvailableSequenceDictionary();
            return otherDictionary != null ?
                    otherDictionary :
                    dictFromDrivingVariants;
        } else {
            return dictFromDrivingVariants;
        }
    }
    return super.getBestAvailableSequenceDictionary();
}
 
Example #7
Source File: BGZF_ReferenceSequenceFile.java    From cramtools with Apache License 2.0 6 votes vote down vote up
public BGZF_ReferenceSequenceFile(File file) throws FileNotFoundException {
	if (!file.canRead())
		throw new RuntimeException("Cannot find or read fasta file: " + file.getAbsolutePath());

	File indexFile = new File(file.getAbsolutePath() + ".fai");
	if (!indexFile.canRead())
		throw new RuntimeException("Cannot find or read fasta index file: " + indexFile.getAbsolutePath());

	Scanner scanner = new Scanner(indexFile);
	int seqID = 0;
	dictionary = new SAMSequenceDictionary();
	while (scanner.hasNextLine()) {
		String line = scanner.nextLine();
		FAIDX_FastaIndexEntry entry = FAIDX_FastaIndexEntry.fromString(seqID++, line);
		index.put(entry.getName(), entry);
		dictionary.addSequence(new SAMSequenceRecord(entry.getName(), entry.getLen()));
	}
	scanner.close();

	if (index.isEmpty())
		log.warn("No entries in the index: " + indexFile.getAbsolutePath());

	is = new BlockCompressedInputStream(new SeekableFileStream(file));
}
 
Example #8
Source File: ReferenceUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Given an InputStream connected to a fasta dictionary, returns its sequence dictionary
 *
 * Note: does not close the InputStream it's passed
 *
 * @param fastaDictionaryStream InputStream connected to a fasta dictionary
 * @return the SAMSequenceDictionary from the fastaDictionaryStream
 */
public static SAMSequenceDictionary loadFastaDictionary( final InputStream fastaDictionaryStream ) {
    // Don't close the reader when we're done, since we don't want to close the client's InputStream for them
    final BufferedLineReader reader = new BufferedLineReader(fastaDictionaryStream);

    final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
    final SAMFileHeader header = codec.decode(reader, fastaDictionaryStream.toString());

    // Make sure we have a valid sequence dictionary before continuing:
    if (header.getSequenceDictionary() == null || header.getSequenceDictionary().isEmpty()) {
        throw new UserException.MalformedFile (
                "Could not read sequence dictionary from given fasta stream " +
                        fastaDictionaryStream
        );
    }

    return header.getSequenceDictionary();
}
 
Example #9
Source File: IntervalUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testCompareLocatables() throws Exception {
    final SAMSequenceDictionary dict = new SAMSequenceDictionary();
    dict.addSequence(new SAMSequenceRecord("1", 1000));
    dict.addSequence(new SAMSequenceRecord("2", 1000));
    final SimpleInterval chr1_1_100 = new SimpleInterval("1:1-100");
    final SimpleInterval chr1_5_100 = new SimpleInterval("1:5-100");
    final SimpleInterval chr2_1_100 = new SimpleInterval("2:1-100");

    // equal intervals comparison return 0
    Assert.assertEquals(IntervalUtils.compareLocatables(chr1_1_100, chr1_1_100, dict), 0);
    Assert.assertEquals(IntervalUtils.compareLocatables(chr1_5_100, chr1_5_100, dict), 0);
    Assert.assertEquals(IntervalUtils.compareLocatables(chr2_1_100, chr2_1_100, dict), 0);
    // first < second return negative
    Assert.assertTrue(IntervalUtils.compareLocatables(chr1_1_100, chr1_5_100, dict) < 0);
    Assert.assertTrue(IntervalUtils.compareLocatables(chr1_1_100, chr2_1_100, dict) < 0);
    Assert.assertTrue(IntervalUtils.compareLocatables(chr1_5_100, chr2_1_100, dict) < 0);
    // first > second return positive
    Assert.assertTrue(IntervalUtils.compareLocatables(chr2_1_100, chr1_1_100, dict) > 0);
    Assert.assertTrue(IntervalUtils.compareLocatables(chr2_1_100, chr1_5_100, dict) > 0);
    Assert.assertTrue(IntervalUtils.compareLocatables(chr1_5_100, chr1_1_100, dict) > 0);
}
 
Example #10
Source File: CollectAllelicCountsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Override
protected void processAlignments(JavaRDD<LocusWalkerContext> rdd, JavaSparkContext ctx) {
    validateArguments();

    final SampleLocatableMetadata metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
    final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    //this check is currently redundant, since the master dictionary is taken from the reads;
    //however, if any other dictionary is added in the future, such a check should be performed
    if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
        logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
    }
    final Broadcast<SampleLocatableMetadata> sampleMetadataBroadcast = ctx.broadcast(metadata);

    final AllelicCountCollector finalAllelicCountCollector =
            rdd.mapPartitions(distributedCount(sampleMetadataBroadcast, minimumBaseQuality))
            .reduce((a1, a2) -> combineAllelicCountCollectors(a1, a2, sampleMetadataBroadcast.getValue()));

    logger.info(String.format("Writing allelic counts to %s...", outputAllelicCountsFile.getAbsolutePath()));
    finalAllelicCountCollector.getAllelicCounts().write(outputAllelicCountsFile);

    logger.info(String.format("%s complete.", getClass().getSimpleName()));
}
 
Example #11
Source File: IntervalTagComparator.java    From Drop-seq with MIT License 6 votes vote down vote up
public static int compare (final Interval i1, final Interval i2, final SAMSequenceDictionary dict) {
 	int result = 0;
 	// if there's a sequence dictionary, compare on the index of the sequence instead of the contig name.
 	if (dict!=null) {
 		int seqIdx1 = dict.getSequenceIndex(i1.getContig());
 		int seqIdx2 = dict.getSequenceIndex(i2.getContig());
 		result = seqIdx1 - seqIdx2;
 	} else
result = i1.getContig().compareTo(i2.getContig());
 	// same as Interval.compareTo
 	if (result == 0) {
         if (i1.getStart() == i2.getStart())
	result = i1.getEnd() - i2.getEnd();
else
	result = i1.getStart() - i2.getStart();
         // added bonus, sort on interval names to tie break, if both intervals have names.
         if (result==0) {
         	String n1 = i1.getName();
         	String n2 = i2.getName();
         	if (n1!=null && n2!=null)
		result = n1.compareTo(n2);
         }
     }
 	return result;
 }
 
Example #12
Source File: VariantDataManager.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
public void writeOutRecalibrationTable(final VariantContextWriter recalWriter, final SAMSequenceDictionary seqDictionary) {
    // we need to sort in coordinate order in order to produce a valid VCF
    Collections.sort( data, VariantDatum.getComparator(seqDictionary) );

    // create dummy alleles to be used
    List<Allele> alleles = Arrays.asList(Allele.create("N", true), Allele.create("<VQSR>", false));

    for( final VariantDatum datum : data ) {
        if (VRAC.useASannotations)
            alleles = Arrays.asList(datum.referenceAllele, datum.alternateAllele); //use the alleles to distinguish between multiallelics in AS mode
        VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getEnd(), alleles);
        builder.attribute(VCFConstants.END_KEY, datum.loc.getEnd());
        builder.attribute(GATKVCFConstants.VQS_LOD_KEY, String.format("%.4f", datum.lod));
        builder.attribute(GATKVCFConstants.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));

        if ( datum.atTrainingSite ) builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true);
        if ( datum.atAntiTrainingSite ) builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true);

        recalWriter.add(builder.make());
    }
}
 
Example #13
Source File: VariantWalkerBase.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Overriding the superclass method to preferentially
 * choose the sequence dictionary from the driving source of variants.
 */
@Override
public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
    final SAMSequenceDictionary dictFromDrivingVariants = getSequenceDictionaryForDrivingVariants();
    if (dictFromDrivingVariants != null) {
        //If this dictionary looks like it was synthesized from a feature index, see
        //if there is a better dictionary available from another source (i.e., the reference)
        if (IndexUtils.isSequenceDictionaryFromIndex(dictFromDrivingVariants)) {
            final SAMSequenceDictionary otherDictionary = super.getBestAvailableSequenceDictionary();
            return otherDictionary != null ?
                    otherDictionary :
                    dictFromDrivingVariants;
        } else {
            return dictFromDrivingVariants;
        }
    }
    return super.getBestAvailableSequenceDictionary();
}
 
Example #14
Source File: TagReadWithGeneExonFunctionTest.java    From Drop-seq with MIT License 6 votes vote down vote up
@Test
public void testTagCodingUTR () {
	SamReader inputSam = SamReaderFactory.makeDefault().open(testBAMFile);
	SAMFileHeader header = inputSam.getFileHeader();
	SAMSequenceDictionary bamDict = header.getSequenceDictionary();
	final OverlapDetector<Gene> geneOverlapDetector = GeneAnnotationReader.loadAnnotationsFile(annotationsFile, bamDict);

	String recName="H575CBGXY:2:21206:5655:8103";
	SAMRecord r = getRecord(testBAMFile, recName);

	TagReadWithGeneExonFunction tagger = new TagReadWithGeneExonFunction();
	r=tagger.setAnnotations(r, geneOverlapDetector);

	String geneName = r.getStringAttribute("GE");
	String geneStrand = r.getStringAttribute("GS");

	Assert.assertEquals(geneName, "Elp2");
	Assert.assertEquals(geneStrand, "+");
	Assert.assertEquals(r.getStringAttribute("XF"), LocusFunction.UTR.name());
}
 
Example #15
Source File: CollectSamErrorMetrics.java    From picard with MIT License 6 votes vote down vote up
/**
 * Interprets intervals from the input INTERVALS, if there's a file with that name, it opens the file, otherwise
 * it tries to parse it, checks that their dictionaries all agree with the input SequenceDictionary and returns the
 * intersection of all the lists.
 *
 * @param sequenceDictionary a {@link SAMSequenceDictionary} to parse the intervals against.
 * @return the intersection of the interval lists pointed to by the input parameter INTERVALS
 */
private IntervalList getIntervals(final SAMSequenceDictionary sequenceDictionary) {

    IntervalList regionOfInterest = null;

    for (final File intervalListFile : INTERVALS) {

        if (!intervalListFile.exists()) {
            throw new IllegalArgumentException("Input file " + intervalListFile + " doesn't seem to exist. ");
        }

        log.info("Reading IntervalList ", intervalListFile, ".");
        final IntervalList temp = IntervalList.fromFile(intervalListFile);
        sequenceDictionary.assertSameDictionary(temp.getHeader().getSequenceDictionary());

        if (regionOfInterest == null) {
            regionOfInterest = temp;
        } else {
            log.info("Intersecting interval_list: ", intervalListFile, ".");
            regionOfInterest = IntervalList.intersection(regionOfInterest, temp);
        }
    }
    return regionOfInterest;
}
 
Example #16
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Call Variants using HaplotypeCaller on Spark and write out a VCF file.
 *
 * This may be called from any spark pipeline in order to call variants from an RDD of GATKRead
 * @param ctx the spark context
 * @param reads the reads variants should be called from
 * @param header the header that goes with the reads
 * @param reference the full path to the reference file (must have been added via {@code SparkContext#addFile()})
 * @param intervalShards the interval shards to restrict calling to
 * @param hcArgs haplotype caller arguments
 * @param shardingArgs arguments to control how the assembly regions are sharded
 * @param output the output path for the VCF
 * @param logger
 * @param strict whether to use the strict implementation (slower) for finding assembly regions to match walker version
 * @param createOutputVariantIndex create a variant index (tabix for bgzipped VCF only)
 */
public static void callVariantsWithHaplotypeCallerAndWriteOutput(
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final SAMSequenceDictionary sequenceDictionary,
        final String reference,
        final List<ShardBoundary> intervalShards,
        final HaplotypeCallerArgumentCollection hcArgs,
        final AssemblyRegionReadShardArgumentCollection shardingArgs,
        final AssemblyRegionArgumentCollection assemblyRegionArgs,
        final String output,
        final Collection<Annotation> annotations,
        final Logger logger,
        final boolean strict,
        final boolean createOutputVariantIndex) {

    final Path referencePath = IOUtils.getPath(reference);
    final String referenceFileName = referencePath.getFileName().toString();
    Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast = assemblyRegionEvaluatorSupplierBroadcast(ctx, hcArgs, assemblyRegionArgs, header, reference, annotations);
    JavaRDD<AssemblyRegionWalkerContext> assemblyRegions = strict ?
            FindAssemblyRegionsSpark.getAssemblyRegionsStrict(ctx, reads, header, sequenceDictionary, referenceFileName, null, intervalShards, assemblyRegionEvaluatorSupplierBroadcast, shardingArgs, assemblyRegionArgs, false) :
            FindAssemblyRegionsSpark.getAssemblyRegionsFast(ctx, reads, header, sequenceDictionary, referenceFileName, null, intervalShards, assemblyRegionEvaluatorSupplierBroadcast, shardingArgs, assemblyRegionArgs, false);
    processAssemblyRegions(assemblyRegions, ctx, header, reference, hcArgs, assemblyRegionArgs, output, annotations, logger, createOutputVariantIndex);
}
 
Example #17
Source File: IntervalUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Tests the sorting as well when using combining intervals
 */
@Test(dataProvider = "combineIntervalsTesting")
public void testCombineIntervalsWithShuffling(List<Locatable> locatables1, List<Locatable> locatables2, List<Locatable> gtOutput) {

    if (locatables1 != null) {
        Collections.shuffle(locatables1, new Random(4040));
    }
    if (locatables2 != null) {
        Collections.shuffle(locatables2, new Random(4040));
    }

    final SAMSequenceDictionary dictionary = getSamSequenceDictionaryForCombineIntervalTests();

    final List<Locatable> outputs = IntervalUtils.combineAndSortBreakpoints(locatables1, locatables2, dictionary);
    Assert.assertEquals(outputs.size(), gtOutput.size());
    Assert.assertEquals(outputs, gtOutput);
}
 
Example #18
Source File: SparkSharder.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * @return <code>true</code> if the locatable is to the right of the given interval
 */
private static <I extends Locatable, L extends Locatable> boolean toRightOf(I interval, L locatable, SAMSequenceDictionary sequenceDictionary) {
    int intervalContigIndex = sequenceDictionary.getSequenceIndex(interval.getContig());
    int locatableContigIndex = sequenceDictionary.getSequenceIndex(locatable.getContig());
    return (intervalContigIndex == locatableContigIndex && interval.getEnd() < locatable.getStart()) // locatable on same contig, to the right
            || intervalContigIndex < locatableContigIndex; // locatable on subsequent contig
}
 
Example #19
Source File: DiscoverVariantsFromContigAlignmentsSAMSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
protected void runTool(final JavaSparkContext ctx) {

    validateParams();

    final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
            StructuralVariationDiscoveryPipelineSpark.broadcastCNVCalls(ctx, getHeaderForReads(),
                    discoverStageArgs.cnvCallsFile);

    final String vcfOutputPath = getVcfOutputPath();

    final SvDiscoveryInputMetaData svDiscoveryInputMetaData =
            new SvDiscoveryInputMetaData(ctx, discoverStageArgs, null, vcfOutputPath,
                    null, null, null,
                    cnvCallsBroadcast,
                    getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);

    final JavaRDD<AlignedContig> parsedContigAlignments =
            new SvDiscoverFromLocalAssemblyContigAlignmentsSpark
                    .SAMFormattedContigAlignmentParser(getReads(),
                                                       svDiscoveryInputMetaData.getSampleSpecificData().getHeaderBroadcast().getValue(), true)
                    .getAlignedContigs();

    // assembly-based breakpoints
    List<VariantContext> annotatedVariants =
            ContigChimericAlignmentIterativeInterpreter
                    .discoverVariantsFromChimeras(svDiscoveryInputMetaData, parsedContigAlignments);

    final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
    SVVCFWriter.writeVCF(annotatedVariants, vcfOutputPath, refSeqDictionary, getDefaultToolVCFHeaderLines(), localLogger);
}
 
Example #20
Source File: CollectAllelicCounts.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public void onTraversalStart() {
    validateArguments();

    final SampleLocatableMetadata metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
    final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    //this check is currently redundant, since the master dictionary is taken from the reads;
    //however, if any other dictionary is added in the future, such a check should be performed
    if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
        logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
    }
    allelicCountCollector = new AllelicCountCollector(metadata);
    logger.info("Collecting allelic counts...");
}
 
Example #21
Source File: BreakpointsInference.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static boolean isFirstInPartner(final SimpleChimera ca, final SAMSequenceDictionary referenceDictionary) {
    switch (ca.strandSwitch) {
        case NO_SWITCH: return 0 > IntervalUtils.compareContigs(ca.regionWithLowerCoordOnContig.referenceSpan,
                ca.regionWithHigherCoordOnContig.referenceSpan, referenceDictionary);
        case FORWARD_TO_REVERSE: case REVERSE_TO_FORWARD:
            return ca.isForwardStrandRepresentation;
        default:
            throw new GATKException.ShouldNeverReachHereException("Unseen strand switch case for: " + ca.toString());
    }
}
 
Example #22
Source File: SageHotspotAnnotationTest.java    From hmftools with GNU General Public License v3.0 5 votes vote down vote up
@Test
public void testComparatorUsesAlleles() {
    final String line1 = "15\t12345678\trs1;UCSC\tC\tA,G\t2\tPASS\tinfo;\tGT:AD:DP\t0/1:60,60:121";
    final String line2 = "15\t12345678\trs1;UCSC\tC\tA,GT\t2\tPASS\tinfo;\tGT:AD:DP\t0/1:60,60:121";

    SAMSequenceDictionary samSequenceDictionary = new SAMSequenceDictionary();
    samSequenceDictionary.addSequence(new SAMSequenceRecord("15", 123456789));

    SageHotspotAnnotation.VCComparator comparator = new SageHotspotAnnotation.VCComparator(samSequenceDictionary);
    assertEquals(0, comparator.compare(codec.decode(line1), codec.decode(line1)));
    assertNotEquals(0, comparator.compare(codec.decode(line1), codec.decode(line2)));
}
 
Example #23
Source File: SequenceDictionaryUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test( dataProvider = "SequenceDictionaryDataProvider" )
public void testSequenceDictionaryValidation( final List<SAMSequenceRecord> firstDictionaryContigs,
                                              final List<SAMSequenceRecord> secondDictionaryContigs,
                                              final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, //not needed by this test
                                              final Class<? extends UserException> expectedExceptionUponValidation,
                                              final boolean requireSuperset,
                                              final boolean checkContigOrdering) {
    final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs);
    final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs);
    final String testDescription = String.format("First dictionary: %s  Second dictionary: %s",
            SequenceDictionaryUtils.getDictionaryAsString(firstDictionary),
            SequenceDictionaryUtils.getDictionaryAsString(secondDictionary));
    Exception exceptionThrown = null;
    try {
        SequenceDictionaryUtils.validateDictionaries(
                "firstDictionary",
                firstDictionary,
                "secondDictionary",
                secondDictionary,
                requireSuperset,
                checkContigOrdering);
    }
    catch ( Exception e ) {
        exceptionThrown = e;
    }
    if ( expectedExceptionUponValidation != null ) {
        Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown),
                String.format("Expected exception %s but saw %s instead. %s",
                        expectedExceptionUponValidation.getSimpleName(),
                        exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(),
                        testDescription));
    }
    else {
        Assert.assertTrue(exceptionThrown == null,
                String.format("Expected no exception but saw exception %s instead. %s",
                        exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none",
                        testDescription));
    }
}
 
Example #24
Source File: FeatureDataSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Returns the sequence dictionary for this source of Features.
 * Uses the dictionary from the VCF header (if present) for variant inputs,
 * otherwise attempts to create a sequence dictionary from the index file (if present).
 * Returns null if no dictionary could be created from either the header or the index.
 */
public SAMSequenceDictionary getSequenceDictionary() {
    SAMSequenceDictionary dict = null;
    final Object header = getHeader();
    if (header instanceof VCFHeader) {
        dict = ((VCFHeader) header).getSequenceDictionary();
    }
    if (dict != null && !dict.isEmpty()) {
        return dict;
    }
    if (hasIndex) {
        return IndexUtils.createSequenceDictionaryFromFeatureIndex(new File(featureInput.getFeaturePath()));
    }
    return null;
}
 
Example #25
Source File: PlottingUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
static void validateSequenceDictionarySubset(final SAMSequenceDictionary sequenceDictionary,
                                             final SAMSequenceDictionary sequenceDictionarySubset) {
    Utils.nonNull(sequenceDictionary);
    Utils.nonNull(sequenceDictionarySubset);
    final Set<SAMSequenceRecord> sequences = sequenceDictionary.getSequences().stream()
            .map(s -> new SAMSequenceRecord(s.getMd5(), s.getSequenceLength()))
            .collect(Collectors.toCollection(LinkedHashSet::new));
    final Set<SAMSequenceRecord> sequencesSubset = sequenceDictionary.getSequences().stream()
            .map(s -> new SAMSequenceRecord(s.getMd5(), s.getSequenceLength()))
            .collect(Collectors.toCollection(LinkedHashSet::new));
    Utils.validateArg(sequences.containsAll(sequencesSubset),
            String.format("Sequence dictionary (%s) must be a subset of those contained in other input files (%s).",
                    sequencesSubset, sequences));
}
 
Example #26
Source File: SamCompareUtilsTest.java    From rtg-tools with BSD 2-Clause "Simplified" License 5 votes vote down vote up
public void test() {
  final SAMFileHeader header = new SAMFileHeader();
  header.setSequenceDictionary(new SAMSequenceDictionary(Arrays.asList(new SAMSequenceRecord("raga", 100), new SAMSequenceRecord("yaga", 100), new SAMSequenceRecord("zaga", 100))));
  final SAMRecord rec1 = new SAMRecord(header);
  rec1.setReferenceIndex(1);
  final SAMRecord rec2 = new SAMRecord(header);
  rec2.setReferenceIndex(2);
  assertEquals(-1, SamCompareUtils.compareSamRecords(rec1, rec2));
  assertEquals(1, SamCompareUtils.compareSamRecords(rec2, rec1));
  rec1.setReferenceIndex(2);
  rec1.setAlignmentStart(50);
  rec2.setAlignmentStart(25);
  assertEquals(1, SamCompareUtils.compareSamRecords(rec1, rec2));
  assertEquals(-1, SamCompareUtils.compareSamRecords(rec2, rec1));
  rec1.setReadPairedFlag(true);
  rec2.setReadPairedFlag(true);
  rec1.setProperPairFlag(true);
  rec2.setProperPairFlag(false);
  rec1.setAlignmentStart(25);
  assertEquals(-1, SamCompareUtils.compareSamRecords(rec1, rec2));
  assertEquals(1, SamCompareUtils.compareSamRecords(rec2, rec1));
  rec2.setProperPairFlag(true);
  rec1.setReadUnmappedFlag(true);
  assertEquals(1, SamCompareUtils.compareSamRecords(rec1, rec2));
  assertEquals(-1, SamCompareUtils.compareSamRecords(rec2, rec1));
  rec2.setReadUnmappedFlag(true);
  assertEquals(0, SamCompareUtils.compareSamRecords(rec1, rec2));
  assertEquals(0, SamCompareUtils.compareSamRecords(rec2, rec1));
  rec1.setReferenceIndex(-1);
  assertEquals(1, SamCompareUtils.compareSamRecords(rec1, rec2));
  assertEquals(-1, SamCompareUtils.compareSamRecords(rec2, rec1));
  rec2.setReferenceIndex(-1);
  assertEquals(0, SamCompareUtils.compareSamRecords(rec2, rec1));
}
 
Example #27
Source File: ShardUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test(dataProvider = "DivideIntervalIntoShardsTestData")
public void testDivideIntervalIntoShards( final SimpleInterval originalInterval, final int shardSize, final int shardStep, final int shardPadding, final SAMSequenceDictionary dictionary, List<ShardBoundary> expectedShards  ) {
    final List<ShardBoundary> shards = Shard.divideIntervalIntoShards(originalInterval, shardSize, shardStep, shardPadding, dictionary);

    Assert.assertEquals(shards.size(), expectedShards.size(), "Wrong number of shards");
    for ( int i = 0; i < shards.size(); ++i ) {
        final ShardBoundary shard = shards.get(i);
        Assert.assertEquals(shard.getInterval(), expectedShards.get(i).getInterval(), "Shard has wrong size");
        Assert.assertEquals(shard.getPaddedSpan(), expectedShards.get(i).getPaddedSpan(), "Shard has wrong amount of padding");
    }
}
 
Example #28
Source File: GenotypeGVCFs.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Get the largest interval per contig that contains the intervals specified on the command line.
 * @param getIntervals intervals to be transformed
 * @param sequenceDictionary used to validate intervals
 * @return a list of one interval per contig spanning the input intervals after processing and validation
 */
@Override
protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInterval> getIntervals, final SAMSequenceDictionary sequenceDictionary) {
    if (mergeInputIntervals) {
        return IntervalUtils.getSpanningIntervals(getIntervals, sequenceDictionary);
    } else {
        return getIntervals;
    }
}
 
Example #29
Source File: SortVcf.java    From picard with MIT License 5 votes vote down vote up
private void collectFileReadersAndHeaders(final List<String> sampleList, SAMSequenceDictionary samSequenceDictionary) {
    for (final File input : INPUT) {
        final VCFFileReader in = new VCFFileReader(input, false);
        final VCFHeader header = in.getFileHeader();
        final SAMSequenceDictionary dict = in.getFileHeader().getSequenceDictionary();
        if (dict == null || dict.isEmpty()) {
            if (null == samSequenceDictionary) {
                throw new IllegalArgumentException("Sequence dictionary was missing or empty for the VCF: " + input.getAbsolutePath() + " Please add a sequence dictionary to this VCF or specify SEQUENCE_DICTIONARY.");
            }
            header.setSequenceDictionary(samSequenceDictionary);
        } else {
            if (null == samSequenceDictionary) {
                samSequenceDictionary = dict;
            } else {
                try {
                    samSequenceDictionary.assertSameDictionary(dict);
                } catch (final AssertionError e) {
                    throw new IllegalArgumentException(e);
                }
            }
        }
        if (sampleList.isEmpty()) {
            sampleList.addAll(header.getSampleNamesInOrder());
        } else {
            if (!sampleList.equals(header.getSampleNamesInOrder())) {
                throw new IllegalArgumentException("Input file " + input.getAbsolutePath() + " has sample names that don't match the other files.");
            }
        }
        inputReaders.add(in);
        inputHeaders.add(header);
    }
}
 
Example #30
Source File: SimpleNovelAdjacencyInterpreter.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static Tuple2<SimpleNovelAdjacencyAndChimericAlignmentEvidence, List<SvType>> inferType
        (final Tuple2<NovelAdjacencyAndAltHaplotype, Iterable<SimpleChimera>> noveltyAndEvidence,
         final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast,
         final Broadcast<ReferenceMultiSparkSource> referenceBroadcast) {

    final NovelAdjacencyAndAltHaplotype novelAdjacencyAndAltHaplotype = noveltyAndEvidence._1;
    final Iterable<SimpleChimera> simpleChimeras = noveltyAndEvidence._2;
    final List<SvType> inferredTypes =
            novelAdjacencyAndAltHaplotype.toSimpleOrBNDTypes(referenceBroadcast.getValue(),
                    referenceSequenceDictionaryBroadcast.getValue());
    final SimpleNovelAdjacencyAndChimericAlignmentEvidence simpleNovelAdjacencyAndChimericAlignmentEvidence
            = new SimpleNovelAdjacencyAndChimericAlignmentEvidence(novelAdjacencyAndAltHaplotype, simpleChimeras);
    return new Tuple2<>(simpleNovelAdjacencyAndChimericAlignmentEvidence, inferredTypes);
}