Java Code Examples for htsjdk.variant.vcf.VCFHeader#getSequenceDictionary()

The following examples show how to use htsjdk.variant.vcf.VCFHeader#getSequenceDictionary() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CreateSomaticPanelOfNormals.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public Object doWork() {
    final List<File> inputVcfs = new ArrayList<>(vcfs);
    final Collection<CloseableIterator<VariantContext>> iterators = new ArrayList<>(inputVcfs.size());
    final Collection<VCFHeader> headers = new HashSet<>(inputVcfs.size());
    final VCFHeader headerOfFirstVcf = new VCFFileReader(inputVcfs.get(0), false).getFileHeader();
    final SAMSequenceDictionary sequenceDictionary = headerOfFirstVcf.getSequenceDictionary();
    final VariantContextComparator comparator = headerOfFirstVcf.getVCFRecordComparator();


    for (final File vcf : inputVcfs) {
        final VCFFileReader reader = new VCFFileReader(vcf, false);
        iterators.add(reader.iterator());
        final VCFHeader header = reader.getFileHeader();
        Utils.validateArg(comparator.isCompatible(header.getContigLines()), () -> vcf.getAbsolutePath() + " has incompatible contigs.");
        headers.add(header);
    }

    final VariantContextWriter writer = GATKVariantContextUtils.createVCFWriter(outputVcf, sequenceDictionary, false, Options.INDEX_ON_THE_FLY);
    writer.writeHeader(new VCFHeader(VCFUtils.smartMergeHeaders(headers, false)));

    final MergingIterator<VariantContext> mergingIterator = new MergingIterator<>(comparator, iterators);
    SimpleInterval currentPosition = new SimpleInterval("FAKE", 1, 1);
    final List<VariantContext> variantsAtThisPosition = new ArrayList<>(20);
    while (mergingIterator.hasNext()) {
        final VariantContext vc = mergingIterator.next();
        if (!currentPosition.overlaps(vc)) {
            processVariantsAtSamePosition(variantsAtThisPosition, writer);
            variantsAtThisPosition.clear();
            currentPosition = new SimpleInterval(vc.getContig(), vc.getStart(), vc.getStart());
        }
        variantsAtThisPosition.add(vc);
    }
    mergingIterator.close();
    writer.close();

    return "SUCCESS";
}
 
Example 2
Source File: VcfFormatConverter.java    From picard with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
    final ProgressLogger progress = new ProgressLogger(LOG, 10000);
    
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final VCFFileReader reader = new VCFFileReader(INPUT, REQUIRE_INDEX);
    final VCFHeader header = new VCFHeader(reader.getFileHeader());
    final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary();
    if (CREATE_INDEX && sequenceDictionary == null) {
        throw new PicardException("A sequence dictionary must be available in the input file when creating indexed output.");
    }

    final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(sequenceDictionary);
    if (CREATE_INDEX)
        builder.setOption(Options.INDEX_ON_THE_FLY);
    else
        builder.unsetOption(Options.INDEX_ON_THE_FLY);
    final VariantContextWriter writer = builder.build();
    writer.writeHeader(header);
    final CloseableIterator<VariantContext> iterator = reader.iterator();

    while (iterator.hasNext()) {
        final VariantContext context = iterator.next();
        writer.add(context);
        progress.record(context.getContig(), context.getStart());
    }

    CloserUtil.close(iterator);
    CloserUtil.close(reader);
    writer.close();

    return 0;
}
 
Example 3
Source File: VcfFileSegmentGenerator.java    From picard with MIT License 5 votes vote down vote up
private static List<SAMSequenceRecord> readSequences(final File vcf) {
    final VCFFileReader reader = new VCFFileReader(vcf);
    final VCFHeader header = reader.getFileHeader();
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    reader.close();
    return dict.getSequences();
}
 
Example 4
Source File: MultiVariantDataSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * We want the headers that are used to create the merged header to have the same sequence dictionary
 * that was  returned from the data source and used during validation (which may or may not be the
 * one that was embedded in the input file itself), so get the embedded one from the data source and
 * update it to include the actual sequence dictionary.
 */
private VCFHeader getHeaderWithUpdatedSequenceDictionary(final FeatureDataSource<VariantContext> dataSource) {
    final VCFHeader header = (VCFHeader) dataSource.getHeader();
    if (header.getSequenceDictionary() == null && dataSource.getSequenceDictionary() != null) {
        header.setSequenceDictionary(dataSource.getSequenceDictionary());
    }
    return header;
}
 
Example 5
Source File: UpdateVCFSequenceDictionary.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public void onTraversalStart() {
    VCFHeader inputHeader = getHeaderForVariants();
    VCFHeader outputHeader = inputHeader == null ?
            new VCFHeader() :
            new VCFHeader(inputHeader.getMetaDataInInputOrder(), inputHeader.getGenotypeSamples()) ;
    getDefaultToolVCFHeaderLines().forEach(line -> outputHeader.addMetaDataLine(line));
    sourceDictionary = getBestAvailableSequenceDictionary();

    // If -replace is set, do not need to check the sequence dictionary for validity here -- it will still be
    // checked in our normal sequence dictionary validation. Warn and require opt-in via -replace if we're about to
    // clobber a valid sequence dictionary. Check the input file directly via the header rather than using the
    // engine, since it might dig one up from an index.
    if (!replace) {
        SAMSequenceDictionary oldDictionary =
                inputHeader == null ? null : inputHeader.getSequenceDictionary();
        if (oldDictionary != null && !oldDictionary.getSequences().isEmpty())  {
            throw new CommandLineException.BadArgumentValue(
                    String.format(
                            "The input variant file %s already contains a sequence dictionary. " +
                                    "Use %s to force the dictionary to be replaced.",
                            getDrivingVariantsFeatureInput().getName(),
                            REPLACE_ARGUMENT_NAME
                    )
            );
        }

    }

    outputHeader.setSequenceDictionary(sourceDictionary);
    vcfWriter = createVCFWriter(new File(outFile));
    vcfWriter.writeHeader(outputHeader);
}
 
Example 6
Source File: UpdateVCFSequenceDictionaryIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test(dataProvider="UpdateGoodSequenceDictionaryData")
public void testGoodUpdateSequenceDictionary(
        final File inputVariantsFile,
        final File inputSourceFile,
        final File inputReferenceFile,
        final String masterSequenceDictionary,
        final boolean replace,
        final boolean disableSequenceDictionaryValidation) throws FileNotFoundException, URISyntaxException {
    final SAMSequenceDictionary resultingDictionary =
            updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, masterSequenceDictionary, replace, disableSequenceDictionaryValidation);

    // get the original sequence dictionary from the source for comparison
    SAMSequenceDictionary sourceDictionary =
            SAMSequenceDictionaryExtractor.extractDictionary(
                    inputSourceFile == null ?
                            inputReferenceFile.toPath() : inputSourceFile.toPath()
            );

    // Some sequence dictionary sources will contain optional attributes (i.e., if the source is a .dict file,
    // or if only a reference is presented to the tool using -R, which will in turn cause the framework to retrieve
    // the dictionary from the .dict file accompanying the reference, the dictionary will likely include md5 and
    // UR attributes). However, htsjdk doesn't propagate these attributes to the VCF header properly
    // (https://github.com/samtools/htsjdk/issues/730), and many are stripped out. In order to ensure the
    // roundtrip comparison succeeds, roundtrip it through a VCF header to match what htsjdk will have written out.
    VCFHeader sourceVCFHeader = new VCFHeader();
    sourceVCFHeader.setSequenceDictionary(sourceDictionary);
    sourceDictionary = sourceVCFHeader.getSequenceDictionary();
    Assert.assertEquals(sourceDictionary, resultingDictionary);
}
 
Example 7
Source File: CreateSnpIntervalFromVcf.java    From Drop-seq with MIT License 4 votes vote down vote up
public IntervalList processData(final File vcfFile, final File sdFile, final Set<String> sample, int GQThreshold, final boolean hetSNPsOnly) {

		final VCFFileReader reader = new VCFFileReader(vcfFile, false);
		if (!VCFUtils.GQInHeader(reader)) {
			GQThreshold=-1;
			log.info("Genotype Quality [GQ] not found in header.  Disabling GQ_THRESHOLD parameter");
		}
		
		final VCFHeader inputVcfHeader = new VCFHeader(reader.getFileHeader().getMetaDataInInputOrder());
		SAMSequenceDictionary sequenceDictionary = inputVcfHeader.getSequenceDictionary();
		Set<String> sampleListFinal = sample;
		if (sample==null || sample.isEmpty()) {
			ArrayList<String> s = reader.getFileHeader().getSampleNamesInOrder();
			sampleListFinal=new TreeSet<String>(s);
		}

		if (sdFile != null)
			sequenceDictionary = getSequenceDictionary(sdFile);

		final ProgressLogger progress = new ProgressLogger(this.log, 500000);

		final SAMFileHeader samHeader = new SAMFileHeader();
		samHeader.setSequenceDictionary(sequenceDictionary);
		IntervalList result = new IntervalList(samHeader);

		// Go through the input, find sites we want to keep.
		final PeekableIterator<VariantContext> iterator = new PeekableIterator<>(reader.iterator());

		validateRequestedSamples (iterator, sampleListFinal);

		while (iterator.hasNext()) {
			final VariantContext site = iterator.next();
			progress.record(site.getContig(), site.getStart());
			// for now drop any filtered site.
			if (site.isFiltered())
				continue;
			// move onto the next record if the site is not a SNP or the samples aren't all heterozygous.
			if (!site.isSNP())
				continue;
			if (!sitePassesFilters(site, sampleListFinal, GQThreshold, hetSNPsOnly))
				continue;
			Interval varInt = new Interval(site.getContig(), site.getStart(),
					site.getEnd(), true, site.getID());

			// final Interval site = findHeterozygousSites(full, SAMPLE);
			result.add(varInt);

		}

		CloserUtil.close(iterator);
		CloserUtil.close(reader);
		return (result);
	}
 
Example 8
Source File: VariantContextCollectionImpl.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
public VariantContextCollectionImpl(@NotNull final VCFHeader header) {
    variantContexts = new TreeSet<>(new VCComparator(header.getSequenceDictionary()));
}
 
Example 9
Source File: MakeSitesOnlyVcf.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final VCFFileReader reader = new VCFFileReader(INPUT, false);
    final VCFHeader inputVcfHeader = new VCFHeader(reader.getFileHeader().getMetaDataInInputOrder());
    final SAMSequenceDictionary sequenceDictionary = inputVcfHeader.getSequenceDictionary();

    if (CREATE_INDEX && sequenceDictionary == null) {
        throw new PicardException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
    }

    final ProgressLogger progress = new ProgressLogger(Log.getInstance(MakeSitesOnlyVcf.class), 10000);

    // Setup the site-only file writer
    final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(sequenceDictionary);
    if (CREATE_INDEX)
        builder.setOption(Options.INDEX_ON_THE_FLY);
    else
        builder.unsetOption(Options.INDEX_ON_THE_FLY);
    final VariantContextWriter writer = builder.build();

    final VCFHeader header = new VCFHeader(inputVcfHeader.getMetaDataInInputOrder(), SAMPLE);
    writer.writeHeader(header);

    // Go through the input, strip the records and write them to the output
    final CloseableIterator<VariantContext> iterator = reader.iterator();
    while (iterator.hasNext()) {
        final VariantContext full = iterator.next();
        final VariantContext site = subsetToSamplesWithOriginalAnnotations(full, SAMPLE);
        writer.add(site);
        progress.record(site.getContig(), site.getStart());
    }

    CloserUtil.close(iterator);
    CloserUtil.close(reader);
    writer.close();

    return 0;
}
 
Example 10
Source File: MergeVcfs.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    final ProgressLogger progress = new ProgressLogger(log, 10000);
    final List<String> sampleList = new ArrayList<String>();
    INPUT = IOUtil.unrollFiles(INPUT, IOUtil.VCF_EXTENSIONS);
    final Collection<CloseableIterator<VariantContext>> iteratorCollection = new ArrayList<CloseableIterator<VariantContext>>(INPUT.size());
    final Collection<VCFHeader> headers = new HashSet<VCFHeader>(INPUT.size());
    VariantContextComparator variantContextComparator = null;
    SAMSequenceDictionary sequenceDictionary = null;

    if (SEQUENCE_DICTIONARY != null) {
        sequenceDictionary = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(SEQUENCE_DICTIONARY).getFileHeader().getSequenceDictionary();
    }

    for (final File file : INPUT) {
        IOUtil.assertFileIsReadable(file);
        final VCFFileReader fileReader = new VCFFileReader(file, false);
        final VCFHeader fileHeader = fileReader.getFileHeader();
        if (fileHeader.getContigLines().isEmpty()) {
            if (sequenceDictionary == null) {
                throw new IllegalArgumentException(SEQ_DICT_REQUIRED);
            } else {
                fileHeader.setSequenceDictionary(sequenceDictionary);
            }
        }

        if (variantContextComparator == null) {
            variantContextComparator = fileHeader.getVCFRecordComparator();
        } else {
            if (!variantContextComparator.isCompatible(fileHeader.getContigLines())) {
                throw new IllegalArgumentException(
                        "The contig entries in input file " + file.getAbsolutePath() + " are not compatible with the others.");
            }
        }

        if (sequenceDictionary == null) sequenceDictionary = fileHeader.getSequenceDictionary();

        if (sampleList.isEmpty()) {
            sampleList.addAll(fileHeader.getSampleNamesInOrder());
        } else {
            if (!sampleList.equals(fileHeader.getSampleNamesInOrder())) {
                throw new IllegalArgumentException("Input file " + file.getAbsolutePath() + " has sample entries that don't match the other files.");
            }
        }
        
        // add comments in the first header
        if (headers.isEmpty()) {
            COMMENT.stream().forEach(C -> fileHeader.addMetaDataLine(new VCFHeaderLine("MergeVcfs.comment", C)));
        }

        headers.add(fileHeader);
        iteratorCollection.add(fileReader.iterator());
    }

    if (CREATE_INDEX && sequenceDictionary == null) {
        throw new PicardException(String.format("Index creation failed. %s", SEQ_DICT_REQUIRED));
    }

    final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(sequenceDictionary);

    if (CREATE_INDEX) {
        builder.setOption(Options.INDEX_ON_THE_FLY);
    } else {
        builder.unsetOption(Options.INDEX_ON_THE_FLY);
    }
    final VariantContextWriter writer = builder.build();

    writer.writeHeader(new VCFHeader(VCFUtils.smartMergeHeaders(headers, false), sampleList));

    final MergingIterator<VariantContext> mergingIterator = new MergingIterator<VariantContext>(variantContextComparator, iteratorCollection);
    while (mergingIterator.hasNext()) {
        final VariantContext context = mergingIterator.next();
        writer.add(context);
        progress.record(context.getContig(), context.getStart());
    }

    CloserUtil.close(mergingIterator);
    writer.close();
    return 0;
}
 
Example 11
Source File: SplitVcfs.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    final ProgressLogger progress = new ProgressLogger(log, 10000);

    final VCFFileReader fileReader = new VCFFileReader(INPUT, false);
    final VCFHeader fileHeader = fileReader.getFileHeader();

    final SAMSequenceDictionary sequenceDictionary =
            SEQUENCE_DICTIONARY != null
                    ? SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY).getSequenceDictionary()
                    : fileHeader.getSequenceDictionary();
    if (CREATE_INDEX && sequenceDictionary == null) {
        throw new PicardException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
    }

    final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
            .setReferenceDictionary(sequenceDictionary)
            .clearOptions();
    if (CREATE_INDEX)
        builder.setOption(Options.INDEX_ON_THE_FLY);

    final VariantContextWriter snpWriter = builder.setOutputFile(SNP_OUTPUT).build();
    final VariantContextWriter indelWriter = builder.setOutputFile(INDEL_OUTPUT).build();
    snpWriter.writeHeader(fileHeader);
    indelWriter.writeHeader(fileHeader);

    int incorrectVariantCount = 0;

    final CloseableIterator<VariantContext> iterator = fileReader.iterator();
    while (iterator.hasNext()) {
        final VariantContext context = iterator.next();
        if (context.isIndel()) indelWriter.add(context);
        else if (context.isSNP()) snpWriter.add(context);
        else {
            if (STRICT) throw new IllegalStateException("Found a record with type " + context.getType().name());
            else incorrectVariantCount++;
        }

        progress.record(context.getContig(), context.getStart());
    }

    if (incorrectVariantCount > 0) {
        log.debug("Found " + incorrectVariantCount + " records that didn't match SNP or INDEL");
    }

    CloserUtil.close(iterator);
    CloserUtil.close(fileReader);
    snpWriter.close();
    indelWriter.close();

    return 0;
}
 
Example 12
Source File: MultiVariantDataSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
private SAMSequenceDictionary getMergedSequenceDictionary(VCFHeader header) {
    return header != null ? header.getSequenceDictionary() : null;
}
 
Example 13
Source File: GatherVcfsCloud.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/** Validates that all headers contain the same set of genotyped samples and that files are in order by position of first record. */
private static void assertSameSamplesAndValidOrdering(final List<Path> inputFiles, final boolean disableContigOrderingCheck) {
    final VCFHeader firstHeader = getHeader(inputFiles.get(0));
    final SAMSequenceDictionary dict = firstHeader.getSequenceDictionary();
    if ( dict == null) {
        throw new UserException.BadInput("The first VCF specified is missing the required sequence dictionary. " +
                                                 "This is required to perform validation.  You can skip this validation " +
                                                 "using --"+IGNORE_SAFETY_CHECKS_LONG_NAME +" but ignoring safety checks " +
                                                 "can result in invalid output.");
    }
    final VariantContextComparator comparator = new VariantContextComparator(dict);
    final List<String> samples = firstHeader.getGenotypeSamples();

    Path lastFile = null;
    VariantContext lastContext = null;

    for (final Path f : inputFiles) {
        final FeatureReader<VariantContext> in = getReaderFromVCFUri(f, 0);
        final VCFHeader header = (VCFHeader)in.getHeader();
        dict.assertSameDictionary(header.getSequenceDictionary());
        final List<String> theseSamples = header.getGenotypeSamples();

        if (!samples.equals(theseSamples)) {
            final SortedSet<String> s1 = new TreeSet<>(samples);
            final SortedSet<String> s2 = new TreeSet<>(theseSamples);
            s1.removeAll(theseSamples);
            s2.removeAll(samples);

            throw new IllegalArgumentException("VCFs do not have identical sample lists." +
                    " Samples unique to first file: " + s1 + ". Samples unique to " + f.toUri().toString() + ": " + s2 + ".");
        }

        try(final CloseableIterator<VariantContext> variantIterator = in.iterator()) {
            if (variantIterator.hasNext()) {
                final VariantContext currentContext = variantIterator.next();
                if (lastContext != null) {
                    if ( disableContigOrderingCheck ) {
                        if ( lastContext.getContig().equals(currentContext.getContig()) && lastContext.getStart() >= currentContext.getStart() ) {
                            throw new IllegalArgumentException(
                                    "First record in file " + f.toUri().toString() + " is not after first record in " +
                                            "previous file " + lastFile.toUri().toString());
                        }
                    }
                    else {
                        if ( comparator.compare(lastContext, currentContext) >= 0 ) {
                            throw new IllegalArgumentException(
                                    "First record in file " + f.toUri().toString() + " is not after first record in " +
                                            "previous file " + lastFile.toUri().toString());
                        }
                    }
                }

                lastContext = currentContext;
                lastFile = f;
            }
        } catch (final IOException e) {
            throw new UserException.CouldNotReadInputFile(f, e.getMessage(), e);
        }

        CloserUtil.close(in);
    }
}